In [4]:
import pandas as pd
from surprise import SVD, Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split
from surprise import accuracy
import numpy as np
from surprise.model_selection import GridSearchCV
import collections

In [5]:
#USING MANUAL TRAIN-TEST DATASET DIVISION: 80% user data used for tarining, 20% user used for test
# Data loading
investments = pd.read_csv('syntheticDataGenerators/investment/invest_data.csv', sep=';')
baskets = pd.read_csv('data/company_basket.csv', sep=';')

# Get all unique users and baskets
unique_users = investments['user_id'].unique()
unique_baskets = baskets['basket_name'].unique()

# Create an empty dataframe with all possible user-basket combinations
all_combinations = []
for user in unique_users:
    for basket in unique_baskets:
        all_combinations.append({'user_id': user, 'basket_name': basket})

# Convert to DataFrame
complete_matrix = pd.DataFrame(all_combinations)

# Create a set of (user_id, basket_name) tuples for quick lookup
invested_pairs = set(zip(investments['user_id'], investments['basket_name']))

# Add binary rating column (1 if user invested in basket, 0 otherwise)
complete_matrix['binary_rating'] = complete_matrix.apply(
    lambda row: 1 if (row['user_id'], row['basket_name']) in invested_pairs else 0, 
    axis=1
)

# Verify our matrix
print(f"Complete matrix shape: {complete_matrix.shape}")
print(f"Number of 1s (investments): {complete_matrix['binary_rating'].sum()}")
print(f"Number of 0s (no investments): {len(complete_matrix) - complete_matrix['binary_rating'].sum()}")
print(f"Matrix sparsity: {(1 - complete_matrix['binary_rating'].mean()) * 100:.2f}%")

# Now use these binary ratings with Surprise
reader = Reader(rating_scale=(0, 1))

# Get unique list of users
sorted_users = np.sort(unique_users)

# Calculate the split point (80% of users)
split_idx = int(len(sorted_users) * 0.8)

# Get training and testing user sets
train_users = sorted_users[:split_idx]
test_users = sorted_users[split_idx:]

# Filter data by user groups
train_data = complete_matrix[complete_matrix['user_id'].isin(train_users)]
test_data = complete_matrix[complete_matrix['user_id'].isin(test_users)]

# Now create the Surprise datasets with the binary ratings
trainset = Dataset.load_from_df(
    train_data[['user_id', 'basket_name', 'binary_rating']], 
    reader
).build_full_trainset()

# For the testset, we need to convert it to the proper format for testing
# (user, item, rating) tuples
testset = [(uid, iid, r) for uid, iid, r in 
           test_data[['user_id', 'basket_name', 'binary_rating']].itertuples(index=False)]

# Optional: Grid search for hyperparameter tuning
param_grid = {
    "n_epochs": [10, 20, 40, 80], 
    "lr_all": [0.0005, 0.001, 0.002, 0.005], 
    "reg_all": [0.0005, 0.001, 0.002, 0.004]
}

# Create a full dataset for grid search
full_dataset = Dataset.load_from_df(
    complete_matrix[['user_id', 'basket_name', 'binary_rating']], 
    reader
)

# Run grid search
gs = GridSearchCV(SVD, param_grid, measures=["mae"], cv=3, refit=True)
gs.fit(full_dataset)

# Get best parameters
best_params = gs.best_params["mae"]
print(f"Best parameters: {best_params}")

# Initialize SVD with best parameters
model_SVD = SVD(
    n_epochs=best_params["n_epochs"],
    lr_all=best_params["lr_all"],
    reg_all=best_params["reg_all"]
)
# Alternative: Use default parameters
# model_SVD = SVD()

# Fit the model
model_SVD.fit(trainset)

# Test model
predictions = model_SVD.test(testset)

# Evaluate with RMSE and MAE
print("SVD model with binary data:")
print("RMSE: Root Mean Squared Error. Lower values mean better accuracy.")
rmse = accuracy.rmse(predictions)
print(f"RMSE: {rmse}")
print("MAE: Mean Absolute Error. Lower values mean better accuracy.")
mae = accuracy.mae(predictions)
print(f"MAE: {mae}")

# Make recommendation to a specific user
user_id = 1001  # Choose a user ID to make recommendations for
# Get baskets the user has already invested in
already_invested = investments[investments['user_id'] == user_id]['basket_name'].unique()
# Get baskets the user hasn't invested in yet
baskets_to_predict = [b for b in unique_baskets if b not in already_invested]

# Predict user's interests for new baskets
user_predictions = [model_SVD.predict(user_id, basket) for basket in baskets_to_predict]
# Sort by predicted rating (highest first)
top_recommendations = sorted(user_predictions, key=lambda x: x.est, reverse=True)[:5]

print(f"\nTop 5 recommended baskets for user {user_id}:")
for pred in top_recommendations:
    print(f"→ {pred.iid} (expected rating: {pred.est:.2f})")

Complete matrix shape: (197000, 3)
Number of 1s (investments): 5281
Number of 0s (no investments): 191719
Matrix sparsity: 97.32%
Best parameters: {'n_epochs': 80, 'lr_all': 0.005, 'reg_all': 0.004}
SVD model with binary data:
RMSE: Root Mean Squared Error. Lower values mean better accuracy.
RMSE: 0.1504
RMSE: 0.1504384074603101
MAE: Mean Absolute Error. Lower values mean better accuracy.
MAE:  0.0450
MAE: 0.04503373910130982

Top 5 recommended baskets for user 1001:
→ Financial World  nu funds (expected rating: 0.28)
→ Australian Health (expected rating: 0.20)
→ Consumer Global INDEX (expected rating: 0.17)
→ Swedish climbers (expected rating: 0.14)
→ Real estate Europe (expected rating: 0.12)


In [11]:
trainset

<surprise.trainset.Trainset at 0x1226a6f90>

In [12]:
# Dictionary to store precision and recall values
precision_at_k = collections.defaultdict(list)
recall_at_k = collections.defaultdict(list)
f1_at_k = collections.defaultdict(list)

# Get all unique test users
test_user_ids = np.unique([uid for uid, _, _ in testset])

# Get all unique baskets
all_baskets = unique_baskets  # Use the unique_baskets we defined earlier

# For each user in the test set
for user_id in test_user_ids:    
    # Find baskets this user has invested in (binary_rating=1) in test data (ground truth)
    user_test_data = test_data[test_data['user_id'] == user_id]
    user_positive_test_baskets = set(user_test_data[user_test_data['binary_rating'] == 1]['basket_name'])
    
    # If no positive test baskets, skip this user
    if len(user_positive_test_baskets) == 0:
        continue
        
    # Find baskets the user has already invested in (from both train and test data)
    user_invested_baskets = set(
        investments[investments['user_id'] == user_id]['basket_name']
    )
    
    # Find baskets to predict for (all baskets minus those already invested in)
    # Comment out the following line if you want to recommend any basket, including those the user already invested in
    # baskets_to_predict = [b for b in all_baskets if b not in user_invested_baskets]
    
    # Or predict for all baskets
    baskets_to_predict = [b for b in all_baskets]
    
    # Make predictions for all baskets
    user_predictions = [model_SVD.predict(user_id, basket) for basket in baskets_to_predict]
    sorted_predictions = sorted(user_predictions, key=lambda x: x.est, reverse=True)
    
    # Calculate precision and recall at different k values
    for k in [2, 5, 10]:
        # Ensure k doesn't exceed number of predictions
        effective_k = min(k, len(sorted_predictions))
        
        # Skip if no predictions
        if effective_k == 0:
            continue
        
        # Get top-k recommended baskets
        top_k_recs = [pred.iid for pred in sorted_predictions[:effective_k]]
        
        # Calculate relevant items among top-k recommendations (positive baskets in test set)
        true_positives = len(set(top_k_recs) & user_positive_test_baskets)
        
        # Precision = relevant recommended / all recommended
        precision = true_positives / effective_k
        
        # Recall = relevant recommended / all relevant
        recall = true_positives / len(user_positive_test_baskets) 

        # F1 score = 2 * (precision * recall) / (precision + recall)
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        
        precision_at_k[k].append(precision)
        recall_at_k[k].append(recall)
        f1_at_k[k].append(f1)

# Calculate average precision and recall
for k in [1, 2, 5]:
    avg_precision = np.mean(precision_at_k[k]) if precision_at_k[k] else 0
    avg_recall = np.mean(recall_at_k[k]) if recall_at_k[k] else 0
    avg_f1 = np.mean(f1_at_k[k]) if f1_at_k[k] else 0
    
    print(f"Precision@{k}: {avg_precision:.4f}")
    print(f"Recall@{k}: {avg_recall:.4f}")
    print(f"F1@{k}: {avg_f1:.4f}")

Precision@1: 0.0000
Recall@1: 0.0000
F1@1: 0.0000
Precision@2: 0.3875
Recall@2: 0.1502
F1@2: 0.2143
Precision@5: 0.3070
Recall@5: 0.2947
F1@5: 0.2966


In [7]:
# Train KNN-model
model_KNN = KNNBasic(k=5)
model_KNN.fit(trainset)

# Test model
predictions = model_KNN.test(testset)

# Evaluate with RMSE, MAE and FCP
print("KNN model:")
print("RMSE: Root Mean Squared Error. Lower values mean better accuracy.")
rmse = accuracy.rmse(predictions)
print("MAE: Mean Absolute Error. Lower values mean better accuracy.")
mae = accuracy.mae(predictions)

# Make recommendation to a specific user
user_id = 1001
all_baskets = investments['basket_name'].unique()
already_invested = investments[investments['user_id'] == user_id]['basket_name'].unique()
baskets_to_predict = [b for b in all_baskets if b not in already_invested]

# Predict user's intresses for new baskets
user_predictions = [model_KNN.predict(user_id, basket) for basket in baskets_to_predict]
top_recommendations = sorted(user_predictions, key=lambda x: x.est, reverse=True)[:5]

print(f"\nTopp 5 rekommenderade baskets för användare {user_id}:")
for pred in top_recommendations:
    print(f"→ {pred.iid} (förväntad rating: {pred.est:.2f})")

Computing the msd similarity matrix...
Done computing similarity matrix.
KNN model:
RMSE: Root Mean Squared Error. Lower values mean better accuracy.
RMSE: 0.1618
MAE: Mean Absolute Error. Lower values mean better accuracy.
MAE:  0.0522

Topp 5 rekommenderade baskets för användare 1001:
→ Financial World  nu funds (förväntad rating: 0.60)
→ Tech online services World (förväntad rating: 0.20)
→ Australia tech index (förväntad rating: 0.00)
→ BIG INDEX Global (förväntad rating: 0.00)
→ Real estate Europe (förväntad rating: 0.00)


In [11]:
# Dictionary to store precision and recall values
precision_at_k = collections.defaultdict(list)
recall_at_k = collections.defaultdict(list)
f1_at_k = collections.defaultdict(list)

# Get all unique test users
test_user_ids = np.unique([uid for uid, _, _ in testset])

# Get all unique baskets
all_baskets = unique_baskets  # Use the unique_baskets we defined earlier

# For each user in the test set
for user_id in test_user_ids:    
    # Find baskets this user has invested in (binary_rating=1) in test data (ground truth)
    user_test_data = test_data[test_data['user_id'] == user_id]
    user_positive_test_baskets = set(user_test_data[user_test_data['binary_rating'] == 1]['basket_name'])
    
    # If no positive test baskets, skip this user
    if len(user_positive_test_baskets) == 0:
        continue
        
    # Find baskets the user has already invested in (from both train and test data)
    user_invested_baskets = set(
        investments[investments['user_id'] == user_id]['basket_name']
    )
    
    # Find baskets to predict for (all baskets minus those already invested in)
    # Comment out the following line if you want to recommend any basket, including those the user already invested in
    # baskets_to_predict = [b for b in all_baskets if b not in user_invested_baskets]
    
    # Or predict for all baskets
    baskets_to_predict = [b for b in all_baskets]
    
    # Make predictions for all baskets
    user_predictions = [model_KNN.predict(user_id, basket) for basket in baskets_to_predict]
    sorted_predictions = sorted(user_predictions, key=lambda x: x.est, reverse=True)
    
    # Calculate precision and recall at different k values
    for k in [2, 5, 10]:
        # Ensure k doesn't exceed number of predictions
        effective_k = min(k, len(sorted_predictions))
        
        # Skip if no predictions
        if effective_k == 0:
            continue
        
        # Get top-k recommended baskets
        top_k_recs = [pred.iid for pred in sorted_predictions[:effective_k]]
        
        # Calculate relevant items among top-k recommendations (positive baskets in test set)
        true_positives = len(set(top_k_recs) & user_positive_test_baskets)
        
        # Precision = relevant recommended / all recommended
        precision = true_positives / effective_k
        
        # Recall = relevant recommended / all relevant
        recall = true_positives / len(user_positive_test_baskets) 

        # F1 score = 2 * (precision * recall) / (precision + recall)
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        
        precision_at_k[k].append(precision)
        recall_at_k[k].append(recall)
        f1_at_k[k].append(f1)

# Calculate average precision and recall
for k in [2, 5, 10]:
    avg_precision = np.mean(precision_at_k[k]) if precision_at_k[k] else 0
    avg_recall = np.mean(recall_at_k[k]) if recall_at_k[k] else 0
    avg_f1 = np.mean(f1_at_k[k]) if f1_at_k[k] else 0
    
    print(f"Precision@{k}: {avg_precision:.4f}")
    print(f"Recall@{k}: {avg_recall:.4f}")
    print(f"F1@{k}: {avg_f1:.4f}")

Precision@2: 0.0050
Recall@2: 0.0023
F1@2: 0.0031
Precision@5: 0.0030
Recall@5: 0.0028
F1@5: 0.0028
Precision@10: 0.0120
Recall@10: 0.0237
F1@10: 0.0158
