In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer

# --- Load data ---
investments = pd.read_csv("syntheticDataGenerators/investment/invest_data.csv", sep=';')
user_data = pd.read_csv("syntheticDataGenerators/user/swedish_users.csv")
basket_features = pd.read_csv("basket_features.csv")

# --- Create positive samples ---
positive_samples = investments[['user_id', 'basket_name']].copy()
positive_samples['label'] = 1

# --- Create negative samples ---
all_users = user_data['user_id'].unique()
all_baskets = investments['basket_name'].unique()

import itertools
all_user_basket_pairs = pd.DataFrame(itertools.product(all_users, all_baskets), columns=['user_id', 'basket_name'])

# Left join to find missing (negative) user-basket pairs
merged = pd.merge(all_user_basket_pairs, positive_samples[['user_id', 'basket_name']], 
                  on=['user_id', 'basket_name'], how='left', indicator=True)

negative_samples = merged[merged['_merge'] == 'left_only'][['user_id', 'basket_name']]
negative_samples['label'] = 0

# Sample same number of negatives as positives
negative_samples = negative_samples.sample(n=len(positive_samples), random_state=42)

# Combine and shuffle
dataset = pd.concat([positive_samples, negative_samples]).sample(frac=1, random_state=42).reset_index(drop=True)

# --- Merge user features ---
dataset = dataset.merge(user_data, on='user_id', how='left')

# --- Merge basket features ---
dataset = dataset.merge(basket_features, on='basket_name', how='left')

# --- Preprocess features ---
# Categorical and numerical feature names
categorical_cols = ['gender', 'education', 'invest_goal']
numerical_cols = ['age'] + list(basket_features.columns[1:])  # assume all other basket feature columns are numeric

# One-hot encode categorical, pass through numerical
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ('num', 'passthrough', numerical_cols)
])

# --- Split data ---
X = dataset[categorical_cols + numerical_cols]
y = dataset['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Train Random Forest ---
clf = make_pipeline(preprocessor, RandomForestClassifier(n_estimators=100, random_state=42))
clf.fit(X_train, y_train)

# --- Evaluate ---
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.87      0.83      0.85      1064
           1       0.84      0.87      0.85      1056

    accuracy                           0.85      2120
   macro avg       0.85      0.85      0.85      2120
weighted avg       0.85      0.85      0.85      2120



In [2]:
# --- Recommend baskets for each user ---

# 1. Create all possible (user, basket) combinations
user_ids = user_data['user_id'].unique()
basket_names = basket_features['basket_name'].unique()
all_combinations = pd.DataFrame(itertools.product(user_ids, basket_names), columns=['user_id', 'basket_name'])

# 2. Remove baskets user already invested in
invested_pairs = investments[['user_id', 'basket_name']]
recommendation_candidates = pd.merge(all_combinations, invested_pairs, 
                                     on=['user_id', 'basket_name'], how='left', indicator=True)
recommendation_candidates = recommendation_candidates[recommendation_candidates['_merge'] == 'left_only']
recommendation_candidates = recommendation_candidates.drop(columns=['_merge'])

# 3. Add user + basket features
rec_data = recommendation_candidates.merge(user_data, on='user_id', how='left')
rec_data = rec_data.merge(basket_features, on='basket_name', how='left')

# 4. Predict investment probabilities
X_rec = rec_data[categorical_cols + numerical_cols]
proba = clf.predict_proba(X_rec)[:, 1]  # probability of class '1' (will invest)

rec_data['predicted_score'] = proba

# 5. Get top 5 baskets for each user
top_recommendations = rec_data.groupby('user_id').apply(lambda df: df.nlargest(5, 'predicted_score')).reset_index(drop=True)

# 6. Display
for user_id in top_recommendations['user_id'].unique():
    user_recs = top_recommendations[top_recommendations['user_id'] == user_id]
    print(f"\n🔍 Recommendations for user {user_id}:")
    for _, row in user_recs.iterrows():
        print(f"→ {row['basket_name']} (score: {row['predicted_score']:.2f})")


  top_recommendations = rec_data.groupby('user_id').apply(lambda df: df.nlargest(5, 'predicted_score')).reset_index(drop=True)



🔍 Recommendations for user 1001:
→ Australian Health (score: 1.00)
→ Global healthcare I (score: 1.00)
→ Financial World  nu funds (score: 0.99)
→ Real estate Europe (score: 0.89)
→ Austria blended companies (score: 0.87)

🔍 Recommendations for user 1002:
→ Swedish Top Mix (score: 0.99)
→ Swedish climbers (score: 0.99)
→ Global healthcare (score: 0.97)
→ Austria blended companies (score: 0.95)
→ Renewable energy world (score: 0.93)

🔍 Recommendations for user 1003:
→ Austria blended companies (score: 1.00)
→ Food world 5 (score: 0.99)
→ Food world 6 (score: 0.99)
→ Food world 7 (score: 0.99)
→ Renewable energy world ALL (score: 0.99)

🔍 Recommendations for user 1004:
→ Most traded stocks (score: 1.00)
→ Pers Superblend my best choice (score: 1.00)
→ Swedish mibile tech adjusted (score: 1.00)
→ Finnish tech (score: 0.98)
→ Blockchain and crypto (score: 0.97)

🔍 Recommendations for user 1005:
→ Pers Superblend my best choice (score: 1.00)
→ Growth Rockets (score: 0.99)
→ Technology star