In [140]:
import pandas as pd
from surprise import SVD, Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split
from surprise import accuracy
import numpy as np
import collections

In [129]:
#USING DEFAULT TRAIN-TEST DATASET DIVISION IN SURPRISE
# 1. Data loading
investments = pd.read_csv('syntheticDataGenerators/investment/invest_data.csv', sep=';')

# 2. Calculate total investment amount for each user
user_totals = investments.groupby('user_id')['investment_amount'].sum()

# 3. Create a new column for normalized ratings
investments['normalized_rating'] = investments.apply(
    lambda row: row['investment_amount'] / user_totals[row['user_id']], 
    axis=1
)

# 4. Now use these normalized ratings with Surprise
reader = Reader(rating_scale=(0, 1))  # Since our ratings are now between 0 and 1
data = Dataset.load_from_df(investments[['user_id', 'basket_name', 'normalized_rating']], reader)

# 5. Divide data into training och test(20%) dataset
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# 6. Train SVD-model
model_SVD = SVD()
model_SVD.fit(trainset)

# 7. Test model
predictions = model_SVD.test(testset)

# 8. Evaluate with RMSE, MAE and FCP
print("SVD model:")
print("RMSE: Root Mean Squared Error. Lower values mean better accuracy.")
rmse = accuracy.rmse(predictions)
print("MAE: Mean Absolute Error. Lower values mean better accuracy.")
mae = accuracy.mae(predictions)
print("FCP:Fraction of Concordant Pairs. Higher values mean better accuracy.")
fcp = accuracy.fcp(predictions)

# 9. Make recommendation to a specific user
user_id = 1001
all_baskets = investments['basket_name'].unique()
already_invested = investments[investments['user_id'] == user_id]['basket_name'].unique()
baskets_to_predict = [b for b in all_baskets if b not in already_invested]

# 10. Predict user's intresses for new baskets
user_predictions = [model_SVD.predict(user_id, basket) for basket in baskets_to_predict]
top_recommendations = sorted(user_predictions, key=lambda x: x.est, reverse=True)[:5]

print(f"\nTopp 5 rekommenderade baskets för användare {user_id}:")
for pred in top_recommendations:
    print(f"→ {pred.iid} (förväntad rating: {pred.est:.2f})")

SVD model:
RMSE: Root Mean Squared Error. Lower values mean better accuracy.
RMSE: 0.0966
MAE: Mean Absolute Error. Lower values mean better accuracy.
MAE:  0.0758
FCP:Fraction of Concordant Pairs. Higher values mean better accuracy.
FCP:  0.5121

Topp 5 rekommenderade baskets för användare 1001:
→ High profits with growth (förväntad rating: 0.43)
→ German Broadcasting (förväntad rating: 0.41)
→ Australian Health 2 (förväntad rating: 0.41)
→ Software Australian small comp (förväntad rating: 0.40)
→ Techs going upward (förväntad rating: 0.39)


In [130]:
from surprise.model_selection import GridSearchCV

#USING MANUAL TRAIN-TEST DATASET DIVISION: 80% user data used for tarining, 20% user used for test
# Data loading
investments = pd.read_csv('syntheticDataGenerators/investment/invest_data.csv', sep=';')

# Calculate total investment amount for each user
user_totals = investments.groupby('user_id')['investment_amount'].sum()

# Create a new column for normalized ratings
investments['normalized_rating'] = investments.apply(
    lambda row: row['investment_amount'] / user_totals[row['user_id']], axis=1
)

# Now use these normalized ratings with Surprise
reader = Reader(rating_scale=(0, 1))   # Since our ratings are now between 0 and 1

# Get unique list of users
unique_users = investments['user_id'].unique()

# Sort users
sorted_users = np.sort(unique_users)

# Calculate the split point (80% of users)
split_idx = int(len(sorted_users) * 0.8)

# Get training and testing user sets
train_users = sorted_users[:split_idx]
test_users = sorted_users[split_idx:]

# Filter data by user groups
train_data = investments[investments['user_id'].isin(train_users)]
test_data = investments[investments['user_id'].isin(test_users)]

# Now create the Surprise datasets with the normalized ratings
trainset = Dataset.load_from_df(
    train_data[['user_id', 'basket_name', 'normalized_rating']], 
    reader
).build_full_trainset()

# For the testset, we need to convert it to the proper format for testing
# (user, item, rating) tuples
testset = [(uid, iid, r) for uid, iid, r in 
           test_data[['user_id', 'basket_name', 'normalized_rating']].itertuples(index=False)]

param_grid = {"n_epochs": [10, 20, 40, 80], "lr_all": [0.0005, 0.001, 0.002, 0.005], "reg_all": [0.0005, 0.001, 0.002, 0.004]}
gs = GridSearchCV(SVD, param_grid, measures=["mae"], cv=3, refit=True)
gs.fit(data)

model_SVD = SVD()
model_SVD.fit(trainset)

# Test model
predictions = model_SVD.test(testset)

# Evaluate with RMSE, MAE and FCP
print("SVD model:")
print("RMSE: Root Mean Squared Error. Lower values mean better accuracy.")
rmse = accuracy.rmse(predictions)
print("MAE: Mean Absolute Error. Lower values mean better accuracy.")
mae = accuracy.mae(predictions)

# Make recommendation to a specific user
user_id = 1001
all_baskets = investments['basket_name'].unique()
already_invested = investments[investments['user_id'] == user_id]['basket_name'].unique()
baskets_to_predict = [b for b in all_baskets if b not in already_invested]

# Predict user's intresses for new baskets
user_predictions = [model_SVD.predict(user_id, basket) for basket in baskets_to_predict]
top_recommendations = sorted(user_predictions, key=lambda x: x.est, reverse=True)[:5]

print(f"\nTopp 5 rekommenderade baskets för användare {user_id}:")
for pred in top_recommendations:
    print(f"→ {pred.iid} (förväntad rating: {pred.est:.2f})")

SVD model:
RMSE: Root Mean Squared Error. Lower values mean better accuracy.
RMSE: 0.0679
MAE: Mean Absolute Error. Lower values mean better accuracy.
MAE:  0.0537

Topp 5 rekommenderade baskets för användare 1001:
→ Cross over well performing (förväntad rating: 0.41)
→ Software Americas, small mcap (förväntad rating: 0.37)
→ Swedish tech (förväntad rating: 0.34)
→ French companies (förväntad rating: 0.33)
→ French steel (förväntad rating: 0.33)


In [124]:
# Dictionary to store precision and recall values
precision_at_k = collections.defaultdict(list)
recall_at_k = collections.defaultdict(list)
f1_at_k = collections.defaultdict(list)

# Get all unique test users
test_user_ids = np.unique([uid for uid, _, _ in testset])

# Get all unique baskets
all_baskets = investments['basket_name'].unique()

# For each user in the test set
for user_id in test_user_ids:    
    # Find baskets this user has interacted with in test data (ground truth)
    user_test_data = test_data[test_data['user_id'] == user_id]
    user_test_baskets = set(user_test_data['basket_name'].unique())
    
    # If no test baskets, skip this user
    if len(user_test_baskets) == 0:
        continue
        
    # Find baskets to predict for
    baskets_to_predict = [b for b in all_baskets]
    
    # Make predictions for all baskets
    user_predictions = [model_SVD.predict(user_id, basket) for basket in baskets_to_predict]
    sorted_predictions = sorted(user_predictions, key=lambda x: x.est, reverse=True)
    
    # Calculate precision and recall at different k values
    for k in [2, 5, 8]:
        # Get top-k recommended baskets
        top_k_recs = [pred.iid for pred in sorted_predictions[:k]]
        
        # Calculate relevant items among top-k recommendations (items that appear in test set)
        true_positives = len(set(top_k_recs) & user_test_baskets)
        
        # Precision = relevant recommended / all recommended
        precision = true_positives / k
        
        # Recall = relevant recommended / all relevant
        recall = true_positives / len(user_test_baskets) 

        # F1 score = 2 * (precision * recall) / (precision + recall)
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        
        precision_at_k[k].append(precision)
        recall_at_k[k].append(recall)
        f1_at_k[k].append(f1)

# Calculate average precision and recall
for k in [2, 5, 8]:
    avg_precision = np.mean(precision_at_k[k]) if precision_at_k[k] else 0
    avg_recall = np.mean(recall_at_k[k]) if recall_at_k[k] else 0
    avg_f1 = np.mean(f1_at_k[k]) if f1_at_k[k] else 0
    
    print(f"Precision@{k}: {avg_precision:.4f}")
    print(f"Recall@{k}: {avg_recall:.4f}")
    print(f"F1@{k}: {avg_f1:.4f}")

Precision@2: 0.1734
Recall@2: 0.0630
F1@2: 0.0916
Precision@5: 0.1487
Recall@5: 0.1408
F1@5: 0.1426
Precision@8: 0.1149
Recall@8: 0.1737
F1@8: 0.1364


In [131]:
# 6. Train KNN-model
model_KNN = KNNBasic(k=10)
model_KNN.fit(trainset)

# 7. Test model
predictions = model_KNN.test(testset)

# 8. Evaluate with RMSE, MAE and FCP
print("KNN model:")
print("RMSE: Root Mean Squared Error. Lower values mean better accuracy.")
rmse = accuracy.rmse(predictions)
print("MAE: Mean Absolute Error. Lower values mean better accuracy.")
mae = accuracy.mae(predictions)
print("FCP:Fraction of Concordant Pairs. Higher values mean better accuracy.")
fcp = accuracy.fcp(predictions)

# 9. Make recommendation to a specific user
user_id = 1001
all_baskets = investments['basket_name'].unique()
already_invested = investments[investments['user_id'] == user_id]['basket_name'].unique()
baskets_to_predict = [b for b in all_baskets if b not in already_invested]

# 10. Predict user's intresses for new baskets
user_predictions = [model_KNN.predict(user_id, basket) for basket in baskets_to_predict]
top_recommendations = sorted(user_predictions, key=lambda x: x.est, reverse=True)[:5]

print(f"\nTopp 5 rekommenderade baskets för användare {user_id}:")
for pred in top_recommendations:
    print(f"→ {pred.iid} (förväntad rating: {pred.est:.2f})")

Computing the msd similarity matrix...
Done computing similarity matrix.
KNN model:
RMSE: Root Mean Squared Error. Lower values mean better accuracy.
RMSE: 0.0755
MAE: Mean Absolute Error. Lower values mean better accuracy.
MAE:  0.0602
FCP:Fraction of Concordant Pairs. Higher values mean better accuracy.
FCP:  0.0000

Topp 5 rekommenderade baskets för användare 1001:
→ MooseBit underdog (förväntad rating: 0.30)
→ Technology stars of value (förväntad rating: 0.28)
→ Well traded with profit margin (förväntad rating: 0.28)
→ Great World software (förväntad rating: 0.25)
→ Well performed companies (förväntad rating: 0.25)


In [119]:
# Dictionary to store precision and recall values
precision_at_k = collections.defaultdict(list)
recall_at_k = collections.defaultdict(list)
f1_at_k = collections.defaultdict(list)

# Get all unique test users
test_user_ids = np.unique([uid for uid, _, _ in testset])

# Get all unique baskets
all_baskets = investments['basket_name'].unique()

# For each user in the test set
for user_id in test_user_ids:    
    # Find baskets this user has interacted with in test data (ground truth)
    user_test_data = test_data[test_data['user_id'] == user_id]
    user_test_baskets = set(user_test_data['basket_name'].unique())
    
    # If no test baskets, skip this user
    if len(user_test_baskets) == 0:
        continue
        
    # Find baskets to predict for
    baskets_to_predict = [b for b in all_baskets]
    
    # Make predictions for all baskets
    user_predictions = [model_KNN.predict(user_id, basket) for basket in baskets_to_predict]
    sorted_predictions = sorted(user_predictions, key=lambda x: x.est, reverse=True)
    
    # Calculate precision and recall at different k values
    for k in [1, 2, 3]:
        # Get top-k recommended baskets
        top_k_recs = [pred.iid for pred in sorted_predictions[:k]]
        
        # Calculate relevant items among top-k recommendations (items that appear in test set)
        true_positives = len(set(top_k_recs) & user_test_baskets)
        
        # Precision = relevant recommended / all recommended
        precision = true_positives / k
        
        # Recall = relevant recommended / all relevant
        recall = true_positives / len(user_test_baskets) 

        # F1 score = 2 * (precision * recall) / (precision + recall)
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        
        precision_at_k[k].append(precision)
        recall_at_k[k].append(recall)
        f1_at_k[k].append(f1)

# Calculate average precision and recall
for k in [1, 2, 3]:
    avg_precision = np.mean(precision_at_k[k]) if precision_at_k[k] else 0
    avg_recall = np.mean(recall_at_k[k]) if recall_at_k[k] else 0
    avg_f1 = np.mean(f1_at_k[k]) if f1_at_k[k] else 0
    
    print(f"Precision@{k}: {avg_precision:.4f}")
    print(f"Recall@{k}: {avg_recall:.4f}")
    print(f"F1@{k}: {avg_f1:.4f}")

Precision@1: 0.1500
Recall@1: 0.0283
F1@1: 0.0471
Precision@2: 0.1050
Recall@2: 0.0412
F1@2: 0.0582
Precision@3: 0.1983
Recall@3: 0.1165
F1@3: 0.1448
