In [12]:
import pandas as pd
from surprise import SVD, Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split
from surprise import accuracy
import numpy as np
from surprise.model_selection import GridSearchCV
import collections
import os
import collections


In [13]:
# Load data
train_data = pd.read_csv('y_train.csv')
test_data = pd.read_csv('y_test.csv')
investments = pd.read_csv('../syntheticDataGenerators/investment/invest_data.csv', sep=';')

# Create a basket encoder
from sklearn.preprocessing import LabelEncoder

# Extract all unique basket names from both datasets
all_basket_names = np.union1d(train_data['basket_name'].unique(), 
                              test_data['basket_name'].unique())

# Create and fit the encoder
basket_encoder = LabelEncoder()
basket_encoder.fit(all_basket_names)

# Apply the encoding to get encoded basket values
train_data['basket_encoded'] = basket_encoder.transform(train_data['basket_name'])
test_data['basket_encoded'] = basket_encoder.transform(test_data['basket_name'])

# Get all unique users and baskets (using the encoded values)
unique_users = investments['user_id'].unique()
unique_baskets = np.union1d(train_data['basket_encoded'].unique(), 
                           test_data['basket_encoded'].unique())

# Process training data
# Convert to user-item-rating format
train_ratings = []
for user in train_data['user_id'].unique():
    # Get all baskets this user has invested in
    invested_baskets = train_data[train_data['user_id'] == user]['basket_encoded'].values
    
    # For each basket, add a rating of 1 (user invested)
    for basket in invested_baskets:
        train_ratings.append({'user_id': user, 'basket_encoded': basket, 'binary_rating': 1})
    
    # For baskets the user hasn't invested in, add a rating of 0
    non_invested = [b for b in unique_baskets if b not in invested_baskets]
    for basket in non_invested:
        train_ratings.append({'user_id': user, 'basket_encoded': basket, 'binary_rating': 0})

train_df = pd.DataFrame(train_ratings)

# Process test data similarly
test_ratings = []
for user in test_data['user_id'].unique():
    # Get all baskets this user has invested in
    invested_baskets = test_data[test_data['user_id'] == user]['basket_encoded'].values
    
    # For each basket, add a rating of 1 (user invested)
    for basket in invested_baskets:
        test_ratings.append({'user_id': user, 'basket_encoded': basket, 'binary_rating': 1})
    
    # For baskets the user hasn't invested in, add a rating of 0
    non_invested = [b for b in unique_baskets if b not in invested_baskets]
    for basket in non_invested:
        test_ratings.append({'user_id': user, 'basket_encoded': basket, 'binary_rating': 0})

test_df = pd.DataFrame(test_ratings)

# Create Surprise datasets
reader = Reader(rating_scale=(0, 1))

trainset = Dataset.load_from_df(
    train_df[['user_id', 'basket_encoded', 'binary_rating']], 
    reader
).build_full_trainset()

# For the testset, convert to (user, item, rating) tuples
testset = [(uid, iid, r) for uid, iid, r in 
           test_df[['user_id', 'basket_encoded', 'binary_rating']].itertuples(index=False)]

#  Grid search for hyperparameter tuning
param_grid = {
    "n_epochs": [10, 20, 40], 
    "lr_all": [0.001, 0.002, 0.005], 
    "reg_all": [0.001, 0.002, 0.004]
}

# Create a full dataset for grid search
full_dataset = Dataset.load_from_df(
    pd.concat([train_df, test_df])[['user_id', 'basket_encoded', 'binary_rating']], 
    reader
)

# Run grid search
gs = GridSearchCV(SVD, param_grid, measures=["mae"], cv=3, refit=True)
gs.fit(full_dataset)

# Get best parameters
best_params = gs.best_params["mae"]
print(f"Best parameters: {best_params}")

# Initialize SVD with best parameters
model_SVD = SVD(
    n_epochs=best_params["n_epochs"],
    lr_all=best_params["lr_all"],
    reg_all=best_params["reg_all"]
)

# Use default parameters for quick testing
# model_SVD = SVD(n_epochs=20, lr_all=0.005, reg_all=0.002)

# Fit the model
model_SVD.fit(trainset)

# Test model
predictions = model_SVD.test(testset)

# Evaluate with RMSE and MAE
print("\nSVD model with binary data:")
print("RMSE: Root Mean Squared Error. Lower values mean better accuracy.")
rmse = accuracy.rmse(predictions)
print(f"RMSE: {rmse}")
print("MAE: Mean Absolute Error. Lower values mean better accuracy.")
mae = accuracy.mae(predictions)
print(f"MAE: {mae}")

# Make recommendation for a specific user
user_id = 1001  # Choose a user ID to make recommendations for

# Get baskets the user has already invested in
already_invested = train_data[train_data['user_id'] == user_id]['basket_encoded'].unique()
if len(already_invested) == 0:  # Check if user is in test data
    already_invested = test_data[test_data['user_id'] == user_id]['basket_encoded'].unique()

# Get baskets the user hasn't invested in yet
baskets_to_predict = [b for b in unique_baskets if b not in already_invested]

# Predict user's interests for new baskets
user_predictions = [model_SVD.predict(user_id, basket) for basket in baskets_to_predict]
# Sort by predicted rating (highest first)
top_recommendations = sorted(user_predictions, key=lambda x: x.est, reverse=True)[:5]

print(f"\nTop 5 recommended baskets for user {user_id}:")
for pred in top_recommendations:
    # Get original basket name for better readability
    basket_name = basket_encoder.inverse_transform([pred.iid])[0]
    print(f"→ Basket {basket_name} (predicted rating: {pred.est:.4f})")

# Function to get recommendations for any user
def get_recommendations(user_id, num_recommendations=5):
    """Get top basket recommendations for a specific user."""
    # Check if user exists in our data
    if user_id not in unique_users:
        print(f"User {user_id} not found in the dataset.")
        return []
    
    # Get baskets the user has already invested in
    already_invested = train_data[train_data['user_id'] == user_id]['basket_encoded'].unique()
    if len(already_invested) == 0:  # Check if user is in test data
        already_invested = test_data[test_data['user_id'] == user_id]['basket_encoded'].unique()
    
    # Get baskets the user hasn't invested in yet
    baskets_to_predict = [b for b in unique_baskets if b not in already_invested]
    
    # Predict user's interests for new baskets
    user_predictions = [model_SVD.predict(user_id, basket) for basket in baskets_to_predict]
    # Sort by predicted rating (highest first)
    top_recommendations = sorted(user_predictions, key=lambda x: x.est, reverse=True)[:num_recommendations]
    
    # Return both encoded id and original name for better usability
    return [(pred.iid, basket_encoder.inverse_transform([pred.iid])[0], pred.est) 
            for pred in top_recommendations]

# Example usage of the recommendation function
print("\nTesting recommendation function:")
for test_user in list(unique_users)[:3]:  # Get recommendations for first 3 users
    recommendations = get_recommendations(test_user)
    print(f"Recommendations for user {test_user}:")
    for basket_id, basket_name, rating in recommendations:
        print(f"→ Basket {basket_name} (predicted rating: {rating:.4f})")
    print()

Best parameters: {'n_epochs': 40, 'lr_all': 0.005, 'reg_all': 0.004}

SVD model with binary data:
RMSE: Root Mean Squared Error. Lower values mean better accuracy.
RMSE: 0.1140
RMSE: 0.11397805102974332
MAE: Mean Absolute Error. Lower values mean better accuracy.
MAE:  0.0344
MAE: 0.034359644808922736

Top 5 recommended baskets for user 1001:
→ Basket Food world 7 (predicted rating: 0.3828)
→ Basket Well traded stocks (predicted rating: 0.2147)
→ Basket Techs going upward (predicted rating: 0.1396)
→ Basket Software in Sweden (predicted rating: 0.1228)
→ Basket Sustainable electric equipment (predicted rating: 0.1212)

Testing recommendation function:
Recommendations for user 1001:
→ Basket Food world 7 (predicted rating: 0.3828)
→ Basket Well traded stocks (predicted rating: 0.2147)
→ Basket Techs going upward (predicted rating: 0.1396)
→ Basket Software in Sweden (predicted rating: 0.1228)
→ Basket Sustainable electric equipment (predicted rating: 0.1212)

Recommendations for user 10

In [28]:
# Dictionary to store precision, recall, and F1 values
precision_at_k = collections.defaultdict(list)
recall_at_k = collections.defaultdict(list)
f1_at_k = collections.defaultdict(list)

# Get all unique test users
test_user_ids = test_data['user_id'].unique()

# For each user in the test set
for user_id in test_user_ids:
    # Find baskets this user has invested in from test data (ground truth)
    user_positive_test_baskets = set(test_data[test_data['user_id'] == user_id]['basket_encoded'])
    
    # If no positive test baskets, skip this user
    if len(user_positive_test_baskets) == 0:
        continue
    
    # Find baskets the user has already invested in from train data
    user_invested_train_baskets = set(train_data[train_data['user_id'] == user_id]['basket_encoded'])
    
    # Baskets to predict (all baskets minus those already invested in from training)
    # Evaluate against ALL possible baskets, including ones they already invested in:
    # baskets_to_predict = [b for b in unique_baskets]
    
    # Exclude baskets they've already invested in from training:
    baskets_to_predict = [b for b in unique_baskets if b not in user_invested_train_baskets]
    
    # Make predictions for all candidate baskets
    user_predictions = [model_SVD.predict(user_id, basket) for basket in baskets_to_predict]
    sorted_predictions = sorted(user_predictions, key=lambda x: x.est, reverse=True)
    
    # Calculate precision and recall at different k values
    for k in [1, 2, 3]:
        # Ensure k doesn't exceed number of predictions
        effective_k = min(k, len(sorted_predictions))
        
        # Skip if no predictions
        if effective_k == 0:
            continue
        
        # Get top-k recommended baskets
        top_k_recs = [pred.iid for pred in sorted_predictions[:effective_k]]
        
        # Calculate relevant items among top-k recommendations (positive baskets in test set)
        true_positives = len(set(top_k_recs) & user_positive_test_baskets)
        
        # Precision = relevant recommended / all recommended
        precision = true_positives / effective_k
        
        # Recall = relevant recommended / all relevant
        recall = true_positives / len(user_positive_test_baskets)
        
        # F1 score = 2 * (precision * recall) / (precision + recall)
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        
        precision_at_k[k].append(precision)
        recall_at_k[k].append(recall)
        f1_at_k[k].append(f1)

# Calculate average precision, recall, and F1 for each k
print("\nEvaluation Metrics for Top-K Recommendations:")
for k in [1, 2, 3]:
    avg_precision = np.mean(precision_at_k[k]) if precision_at_k[k] else 0
    avg_recall = np.mean(recall_at_k[k]) if recall_at_k[k] else 0
    avg_f1 = np.mean(f1_at_k[k]) if f1_at_k[k] else 0
    
    print(f"\nMetrics for k={k}:")
    print(f"Precision@{k}: {avg_precision:.4f}")
    print(f"Recall@{k}: {avg_recall:.4f}")
    print(f"F1@{k}: {avg_f1:.4f}")
    
    print(f"Number of users evaluated: {len(precision_at_k[k])}")


Evaluation Metrics for Top-K Recommendations:

Metrics for k=1:
Precision@1: 0.1529
Recall@1: 0.1137
F1@1: 0.1268
Number of users evaluated: 994

Metrics for k=2:
Precision@2: 0.1127
Recall@2: 0.1655
F1@2: 0.1303
Number of users evaluated: 994

Metrics for k=3:
Precision@3: 0.0915
Recall@3: 0.2017
F1@3: 0.1227
Number of users evaluated: 994


In [15]:
# 1. A more efficient KNN implementation
from surprise import KNNWithMeans

# 2. Pre-compute similarities and use a smaller neighborhood
sim_options = {
    'name': 'cosine',
    'user_based': True,  # User-based filtering
    'min_support': 3     # Minimum number of common items before considering similarity
}

# 3. Use pre-computed similarities
model_KNN = KNNWithMeans(k=10, sim_options=sim_options, verbose=False)

# 4. Fit the model (this will now pre-compute similarities)
model_KNN.fit(trainset)

# 5. For faster prediction on large sets of items, use this approach
predictions = []
unique_baskets_list = list(unique_baskets)

# Process in smaller batches instead of one by one
batch_size = 100
for i in range(0, len(unique_baskets_list), batch_size):
    batch = unique_baskets_list[i:i+batch_size]
    batch_predictions = [model_KNN.predict(user_id, basket) for basket in batch]
    predictions.extend(batch_predictions)
    
# 6. Sort predictions afterwards
top_recommendations = sorted(predictions, key=lambda x: x.est, reverse=True)[:5]

In [11]:
# Dictionary to store precision, recall, and F1 values
precision_at_k = collections.defaultdict(list)
recall_at_k = collections.defaultdict(list)
f1_at_k = collections.defaultdict(list)

# Get all unique test users
test_user_ids = test_data['user_id'].unique()

# For each user in the test set
for user_id in test_user_ids:
    # Find baskets this user has invested in from test data (ground truth)
    user_positive_test_baskets = set(test_data[test_data['user_id'] == user_id]['basket_encoded'])
    
    # If no positive test baskets, skip this user
    if len(user_positive_test_baskets) == 0:
        continue
    
    # Find baskets the user has already invested in from train data
    user_invested_train_baskets = set(train_data[train_data['user_id'] == user_id]['basket_encoded'])
    
    # Baskets to predict (all baskets minus those already invested in from training)
    # To evaluate against ALL possible baskets, including ones they already invested in:
    # baskets_to_predict = [b for b in unique_baskets]
    
    # Exclude baskets they've already invested in from training:
    baskets_to_predict = [b for b in unique_baskets if b not in user_invested_train_baskets]
    
    # Make predictions for all candidate baskets
    user_predictions = [model_KNN.predict(user_id, basket) for basket in baskets_to_predict]
    sorted_predictions = sorted(user_predictions, key=lambda x: x.est, reverse=True)
    
    # Calculate precision and recall at different k values
    for k in [1, 2, 3]:
        # Ensure k doesn't exceed number of predictions
        effective_k = min(k, len(sorted_predictions))
        
        # Skip if no predictions
        if effective_k == 0:
            continue
        
        # Get top-k recommended baskets
        top_k_recs = [pred.iid for pred in sorted_predictions[:effective_k]]
        
        # Calculate relevant items among top-k recommendations (positive baskets in test set)
        true_positives = len(set(top_k_recs) & user_positive_test_baskets)
        
        # Precision = relevant recommended / all recommended
        precision = true_positives / effective_k
        
        # Recall = relevant recommended / all relevant
        recall = true_positives / len(user_positive_test_baskets)
        
        # F1 score = 2 * (precision * recall) / (precision + recall)
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        
        precision_at_k[k].append(precision)
        recall_at_k[k].append(recall)
        f1_at_k[k].append(f1)

# Calculate average precision, recall, and F1 for each k
print("\nEvaluation Metrics for Top-K Recommendations:")
for k in [1, 2, 3]:
    avg_precision = np.mean(precision_at_k[k]) if precision_at_k[k] else 0
    avg_recall = np.mean(recall_at_k[k]) if recall_at_k[k] else 0
    avg_f1 = np.mean(f1_at_k[k]) if f1_at_k[k] else 0
    
    print(f"\nMetrics for k={k}:")
    print(f"Precision@{k}: {avg_precision:.4f}")
    print(f"Recall@{k}: {avg_recall:.4f}")
    print(f"F1@{k}: {avg_f1:.4f}")
    
    print(f"Number of users evaluated: {len(precision_at_k[k])}")


Evaluation Metrics for Top-K Recommendations:

Metrics for k=1:
Precision@1: 0.1861
Recall@1: 0.1343
F1@1: 0.1516
Number of users evaluated: 994

Metrics for k=2:
Precision@2: 0.1459
Recall@2: 0.2108
F1@2: 0.1675
Number of users evaluated: 994

Metrics for k=3:
Precision@3: 0.1234
Recall@3: 0.2681
F1@3: 0.1647
Number of users evaluated: 994
