In [14]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
import collections

In [6]:
# Load the raw investment records (semicolon-delimited CSV).
investments = pd.read_csv("syntheticDataGenerators/investment/invest_data.csv", sep=';')

# Rating = the share of a user's total invested amount that this row represents,
# so each user's ratings sum to 1 (when their total is non-zero).
# `map` broadcasts each user's total onto that user's rows, which replaces the
# much slower row-by-row `apply(axis=1)` lookup with one vectorised division.
user_totals = investments.groupby('user_id')['investment_amount'].sum()
investments['rating'] = (
    investments['investment_amount'] / investments['user_id'].map(user_totals)
)

# Encode user and basket IDs as contiguous integers, numbered in order of
# first appearance in the file; these are the embedding-table row indices.
user2idx = {user_id: idx for idx, user_id in enumerate(investments['user_id'].unique())}
basket2idx = {basket: idx for idx, basket in enumerate(investments['basket_name'].unique())}

investments['user_idx'] = investments['user_id'].map(user2idx)
investments['basket_idx'] = investments['basket_name'].map(basket2idx)

In [7]:
class RecommenderNet(nn.Module):
    """Embedding-based scorer for (user, item) index pairs.

    Each user and each item gets a learned embedding vector. The two vectors
    are concatenated and passed through a small MLP ending in a sigmoid, so
    every score lies in [0, 1].

    Args:
        num_users: Number of rows in the user embedding table.
        num_items: Number of rows in the item embedding table.
        embedding_size: Dimensionality of both embedding tables.
    """

    def __init__(self, num_users, num_items, embedding_size=50):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_size)
        self.item_embedding = nn.Embedding(num_items, embedding_size)
        self.fc = nn.Sequential(
            nn.Linear(embedding_size * 2, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )

    def forward(self, user_ids, item_ids):
        """Score each (user, item) pair.

        Args:
            user_ids: 1-D integer tensor of user indices.
            item_ids: 1-D integer tensor of item indices, same length.

        Returns:
            1-D tensor with one score per input pair.
        """
        user_vecs = self.user_embedding(user_ids)
        item_vecs = self.item_embedding(item_ids)
        combined = torch.cat([user_vecs, item_vecs], dim=1)
        # Drop only the trailing size-1 feature axis. A bare `.squeeze()` would
        # also drop the batch axis when the batch holds a single pair, yielding
        # a 0-d tensor that breaks `torch.topk` and loss shape matching.
        return self.fc(combined).squeeze(-1)

In [10]:
# Prepare data: full (user_idx, basket_idx) matrix and rating vector over all
# interactions. The index columns read here, together with `user2idx` and
# `basket2idx`, are built by the data-loading cell, so they are reused below
# instead of being rebuilt a second time.
X = investments[['user_idx', 'basket_idx']].values
y = investments['rating'].values.astype(np.float32)

# Get unique list of users and sort them so the split is deterministic
unique_users = investments['user_id'].unique()
sorted_users = np.sort(unique_users)

# Calculate the split point (80% of users)
split_idx = int(len(sorted_users) * 0.8)

# Split users into train and test sets
# NOTE(review): this is a user-level split, so no test user appears in the
# training data. Their rows of the user embedding table receive no gradient
# signal, and validation loss / ranking metrics for them are based on
# randomly initialised user embeddings. Consider a per-interaction split.
train_users_ids = sorted_users[:split_idx]
test_users_ids = sorted_users[split_idx:]

# Filter data by user groups
train_data = investments[investments['user_id'].isin(train_users_ids)]
test_data = investments[investments['user_id'].isin(test_users_ids)]

# Prepare features and targets for PyTorch
X_train = train_data[['user_idx', 'basket_idx']].values
y_train = train_data['rating'].values.astype(np.float32)

X_test = test_data[['user_idx', 'basket_idx']].values
y_test = test_data['rating'].values.astype(np.float32)

# Convert to tensors; embedding lookups need integer indices, so the dtype is
# pinned explicitly rather than inherited from the platform's NumPy default.
train_users = torch.tensor(X_train[:, 0], dtype=torch.long)
train_items = torch.tensor(X_train[:, 1], dtype=torch.long)
train_ratings = torch.tensor(y_train)

test_users = torch.tensor(X_test[:, 0], dtype=torch.long)
test_items = torch.tensor(X_test[:, 1], dtype=torch.long)
test_ratings = torch.tensor(y_test)

# Seed the RNG so weight initialisation (and therefore the losses printed
# below) is reproducible across kernel restarts.
SEED = 42
torch.manual_seed(SEED)

# Model
model = RecommenderNet(num_users=len(user2idx), num_items=len(basket2idx))
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Training loop: full-batch gradient descent, one optimizer step per epoch
epochs = 10
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(train_users, train_items)
    loss = criterion(outputs, train_ratings)
    loss.backward()
    optimizer.step()

    # Validation pass on the held-out users (no gradients tracked)
    model.eval()
    with torch.no_grad():
        val_preds = model(test_users, test_items)
        val_loss = criterion(val_preds, test_ratings)

    # The reported train loss was computed before this epoch's optimizer step
    print(f"Epoch {epoch+1}: Train Loss = {loss.item():.4f}, Val Loss = {val_loss.item():.4f}")

Epoch 1: Train Loss = 0.1315, Val Loss = 0.0732
Epoch 2: Train Loss = 0.0691, Val Loss = 0.0376
Epoch 3: Train Loss = 0.0334, Val Loss = 0.0203
Epoch 4: Train Loss = 0.0173, Val Loss = 0.0140
Epoch 5: Train Loss = 0.0125, Val Loss = 0.0129
Epoch 6: Train Loss = 0.0125, Val Loss = 0.0136
Epoch 7: Train Loss = 0.0140, Val Loss = 0.0145
Epoch 8: Train Loss = 0.0155, Val Loss = 0.0152
Epoch 9: Train Loss = 0.0163, Val Loss = 0.0154
Epoch 10: Train Loss = 0.0164, Val Loss = 0.0151


In [11]:
def recommend_for_users(user_ids, top_n=5):
    """Print the highest-scoring baskets each user has not yet invested in.

    Uses the notebook-level `model`, `investments`, `user2idx` and
    `basket2idx`. Nothing is returned; results are printed.

    Args:
        user_ids: Iterable of raw user IDs; each must be a key of `user2idx`.
        top_n: Maximum number of recommendations per user. Fewer are printed
            when the user has fewer than `top_n` unseen baskets.

    Raises:
        KeyError: If a user ID is not present in `user2idx`.
    """
    model.eval()
    all_baskets = list(basket2idx.keys())

    for user_id in user_ids:
        user_idx = user2idx[user_id]

        # A set gives O(1) membership checks while filtering the candidates
        already_invested = set(
            investments.loc[investments['user_id'] == user_id, 'basket_name']
        )
        unseen_baskets = [b for b in all_baskets if b not in already_invested]

        print(f"\n🔍 Top-{top_n} Recommendations for User {user_id}:")

        # Nothing left to rank: an empty index tensor cannot be scored
        if not unseen_baskets:
            print("→ (no unseen baskets to recommend)")
            continue

        basket_indices = torch.tensor(
            [basket2idx[b] for b in unseen_baskets], dtype=torch.long
        )
        user_tensor = torch.tensor([user_idx] * len(unseen_baskets), dtype=torch.long)

        with torch.no_grad():
            # Flatten so a single-candidate prediction is still 1-D for topk
            preds = model(user_tensor, basket_indices).reshape(-1)

        # torch.topk raises when k exceeds the number of candidates
        k = min(top_n, len(unseen_baskets))
        top_indices = torch.topk(preds, k).indices.tolist()
        top_baskets = [(unseen_baskets[i], preds[i].item()) for i in top_indices]

        for basket, score in top_baskets:
            print(f"→ {basket} (score: {score:.2f})")


In [20]:
# Draw ten distinct user IDs at random (sampling without replacement).
# NOTE(review): `random_user_ids` is not passed to the call below, which uses
# a fixed list of user IDs instead.
random_user_ids = np.random.choice(list(user2idx), size=10, replace=False)
recommend_for_users(user_ids=[1001, 1002, 1003, 1004, 1005])


🔍 Top-5 Recommendations for User 1001:
→ Danish contracts (score: 0.22)
→ European fishing (score: 0.22)
→ Nordic healthcare Facilities (score: 0.21)
→ Pers 14 (score: 0.21)
→ Great World software (score: 0.19)

🔍 Top-5 Recommendations for User 1002:
→ Danish contracts (score: 0.38)
→ Great World software (score: 0.36)
→ Aussi Fin extra (score: 0.33)
→ Consumer loans (score: 0.32)
→ Healthcare southern Europé (score: 0.31)

🔍 Top-5 Recommendations for User 1003:
→ Danish contracts (score: 0.37)
→ Great World software (score: 0.34)
→ Nordic healthcare Facilities (score: 0.31)
→ Nordic Construction Companies (score: 0.30)
→ Aussi Fin extra (score: 0.29)

🔍 Top-5 Recommendations for User 1004:
→ Danish contracts (score: 0.32)
→ Aussi Fin extra (score: 0.29)
→ Why? Dont get it (score: 0.25)
→ Healtcare Global valuers (score: 0.23)
→ Nordic healthcare Facilities (score: 0.19)

🔍 Top-5 Recommendations for User 1005:
→ Danish contracts (score: 0.29)
→ Aussi Fin extra (score: 0.21)
→ European

In [18]:
def evaluate_recommendations(user_ids=None, top_n_values=(2, 5, 10)):
    """
    Evaluate the model using precision, recall, and F1 score
    for multiple values of top_n recommendations.

    Uses the notebook-level `model`, `train_data`, `test_data`, `user2idx`
    and `basket2idx`. For each evaluated user, every basket the user does not
    hold in `train_data` is scored, the top-k are taken as recommendations,
    and they are compared with the user's baskets in `test_data`.

    Args:
        user_ids: Iterable of raw user IDs to evaluate. Defaults to every
            user in `test_data`. IDs absent from `test_data` are skipped.
        top_n_values: Iterable of positive cut-offs k. When k exceeds the
            number of candidate baskets, all candidates are recommended and
            precision is still divided by the requested k.

    Returns:
        dict with keys 'precision', 'recall' and 'f1', each mapping k to the
        list of per-user values (one entry per evaluated user).

    Raises:
        KeyError: If an evaluated user ID is not present in `user2idx`.
    """
    model.eval()
    all_baskets = list(basket2idx.keys())

    # Built once: re-running `.unique()` for every user made the loop quadratic
    test_user_ids = set(test_data['user_id'].unique())

    # Use all test users if no specific users are provided
    if user_ids is None:
        user_ids = test_data['user_id'].unique()

    # Dictionaries to store metrics
    precision_at_k = collections.defaultdict(list)
    recall_at_k = collections.defaultdict(list)
    f1_at_k = collections.defaultdict(list)

    for user_id in user_ids:
        # Skip if user is not in test set
        if user_id not in test_user_ids:
            continue

        # Get user index
        user_idx = user2idx[user_id]

        # Baskets already seen in training (empty set if the user has none)
        train_baskets = set(
            train_data.loc[train_data['user_id'] == user_id, 'basket_name']
        )

        # Get ground truth baskets from test set
        test_baskets = set(
            test_data.loc[test_data['user_id'] == user_id, 'basket_name'].unique()
        )

        # Skip if no test baskets
        if not test_baskets:
            continue

        # Candidates = all baskets minus training baskets. The original code
        # computed the training baskets but then ranked every basket.
        candidate_baskets = [b for b in all_baskets if b not in train_baskets]
        if not candidate_baskets:
            continue

        basket_indices = torch.tensor(
            [basket2idx[b] for b in candidate_baskets], dtype=torch.long
        )
        user_tensor = torch.tensor(
            [user_idx] * len(candidate_baskets), dtype=torch.long
        )

        # Get predictions (flattened so a single candidate is still 1-D)
        with torch.no_grad():
            preds = model(user_tensor, basket_indices).reshape(-1)

        # Evaluate for different k values
        for k in top_n_values:
            # torch.topk raises when k exceeds the number of candidates
            k_eff = min(k, len(candidate_baskets))
            top_indices = torch.topk(preds, k_eff).indices.tolist()
            recommended_baskets = {candidate_baskets[i] for i in top_indices}

            # Calculate true positives (recommendations that are in test set)
            true_positives = len(recommended_baskets & test_baskets)

            # Calculate metrics
            precision = true_positives / k
            recall = true_positives / len(test_baskets)
            f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

            # Store metrics
            precision_at_k[k].append(precision)
            recall_at_k[k].append(recall)
            f1_at_k[k].append(f1)

    # Calculate average metrics
    print("\n----- EVALUATION RESULTS -----")
    for k in top_n_values:
        if not precision_at_k[k]:
            print(f"No data for k={k}")
            continue

        avg_precision = np.mean(precision_at_k[k])
        avg_recall = np.mean(recall_at_k[k])
        avg_f1 = np.mean(f1_at_k[k])

        print(f"\nMetrics at k={k}:")
        print(f"Precision@{k}: {avg_precision:.4f}")
        print(f"Recall@{k}: {avg_recall:.4f}")
        print(f"F1@{k}: {avg_f1:.4f}")

    return {
        'precision': precision_at_k,
        'recall': recall_at_k,
        'f1': f1_at_k
    }

# Evaluate every user in the test split (no `user_ids` given) at k = 2, 5 and 10;
# the returned dict of per-user metric lists is kept in `metrics`.
metrics = evaluate_recommendations(top_n_values=[2, 5, 10])


----- EVALUATION RESULTS -----

Metrics at k=2:
Precision@2: 0.0125
Recall@2: 0.0048
F1@2: 0.0068

Metrics at k=5:
Precision@5: 0.0150
Recall@5: 0.0152
F1@5: 0.0149

Metrics at k=10:
Precision@10: 0.0105
Recall@10: 0.0203
F1@10: 0.0136
