In [64]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
import collections
from torch.utils.data import TensorDataset, DataLoader

In [22]:
# Load your data
investments = pd.read_csv('syntheticDataGenerators/investment/invest_data.csv', sep=';')
baskets = pd.read_csv('data/company_basket.csv', sep=';')

# Get all unique users and baskets
unique_users = investments['user_id'].unique()
unique_baskets = baskets['basket_name'].unique()

# Create an empty dataframe with all possible user-basket combinations
all_combinations = []
for user in unique_users:
    for basket in unique_baskets:
        all_combinations.append({'user_id': user, 'basket_name': basket})

# Convert to DataFrame
complete_matrix = pd.DataFrame(all_combinations)

# Create a set of (user_id, basket_name) tuples for quick lookup
invested_pairs = set(zip(investments['user_id'], investments['basket_name']))

# Add binary rating column (1 if user invested in basket, 0 otherwise)
complete_matrix['binary_rating'] = complete_matrix.apply(
    lambda row: 1 if (row['user_id'], row['basket_name']) in invested_pairs else 0, 
    axis=1
)

# Encode user and basket IDs to integers
user2idx = {user_id: idx for idx, user_id in enumerate(complete_matrix['user_id'].unique())}
basket2idx = {basket: idx for idx, basket in enumerate(complete_matrix['basket_name'].unique())}

complete_matrix['user_idx'] = complete_matrix['user_id'].map(user2idx)
complete_matrix['basket_idx'] = complete_matrix['basket_name'].map(basket2idx)

# Single model training

In [54]:
class RecommenderNet(nn.Module):
    def __init__(self, num_users, num_items, embedding_size=32):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_size)
        self.item_embedding = nn.Embedding(num_items, embedding_size)
        self.fc = nn.Sequential(
            nn.Linear(embedding_size*2, embedding_size*8),
            nn.ReLU(),
            nn.Linear(embedding_size*8, embedding_size),
            nn.ReLU(),
            nn.Linear(embedding_size, embedding_size//4),
            nn.ReLU(),
            nn.Linear(embedding_size//4, 1),
            nn.Sigmoid()
        )

    def forward(self, user_ids, item_ids):
        user_vecs = self.user_embedding(user_ids)
        item_vecs = self.item_embedding(item_ids)
        combined = torch.cat([user_vecs, item_vecs], dim=1)
        return self.fc(combined).squeeze()

In [75]:
# Prepare data
X = complete_matrix[['user_idx', 'basket_idx']].values
y = complete_matrix['binary_rating'].values.astype(np.float32)

# Create user and basket index mappings
user2idx = {user: idx for idx, user in enumerate(complete_matrix['user_id'].unique())}
basket2idx = {basket: idx for idx, basket in enumerate(complete_matrix['basket_name'].unique())}

# Add index columns to your dataframe
complete_matrix['user_idx'] = complete_matrix['user_id'].map(user2idx)
complete_matrix['basket_idx'] = complete_matrix['basket_name'].map(basket2idx)

# Get unique list of users and sort them
unique_users = complete_matrix['user_id'].unique()
sorted_users = np.sort(unique_users)

# Calculate the split point (80% of users)
split_idx = int(len(sorted_users) * 0.8)

# Split users into train and test sets
train_users_ids = sorted_users[:split_idx]
test_users_ids = sorted_users[split_idx:]

# Filter data by user groups
train_data = complete_matrix[complete_matrix['user_id'].isin(train_users_ids)]
test_data = complete_matrix[complete_matrix['user_id'].isin(test_users_ids)]

# Prepare features and targets for PyTorch
X_train = train_data[['user_idx', 'basket_idx']].values
y_train = train_data['binary_rating'].values.astype(np.float32)

X_test = test_data[['user_idx', 'basket_idx']].values
y_test = test_data['binary_rating'].values.astype(np.float32)

# Convert to tensors
train_users = torch.tensor(X_train[:, 0])
train_items = torch.tensor(X_train[:, 1])
train_ratings = torch.tensor(y_train)

test_users = torch.tensor(X_test[:, 0])
test_items = torch.tensor(X_test[:, 1])
test_ratings = torch.tensor(y_test)

# Model
model = RecommenderNet(num_users=len(user2idx), num_items=len(basket2idx))
criterion = nn.MSELoss()
optimizer = optim.AdamW(model.parameters(), lr=0.0001)

# Create dataset and dataloader
batch_size = 128  # or any other value you want

train_dataset = TensorDataset(train_users, train_items, train_ratings)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = TensorDataset(test_users, test_items, test_ratings)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Training loop
epochs = 10
for epoch in range(epochs):
    model.train()
    total_loss = 0
    num_batches = 0
    
    for batch_users, batch_items, batch_ratings in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_users, batch_items)
        loss = criterion(outputs, batch_ratings)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        num_batches += 1
    
    avg_train_loss = total_loss / num_batches
    
    # Validation
    model.eval()
    val_loss = 0
    val_batches = 0
    
    with torch.no_grad():
        for batch_users, batch_items, batch_ratings in test_loader:
            val_preds = model(batch_users, batch_items)
            val_loss += criterion(val_preds, batch_ratings).item()
            val_batches += 1
    
    avg_val_loss = val_loss / val_batches
    
    print(f"Epoch {epoch+1}: Train Loss = {avg_train_loss:.4f}, Val Loss = {avg_val_loss:.4f}")

Epoch 1: Train Loss = 0.0476, Val Loss = 0.0257
Epoch 2: Train Loss = 0.0248, Val Loss = 0.0244
Epoch 3: Train Loss = 0.0238, Val Loss = 0.0238
Epoch 4: Train Loss = 0.0232, Val Loss = 0.0234
Epoch 5: Train Loss = 0.0227, Val Loss = 0.0231
Epoch 6: Train Loss = 0.0224, Val Loss = 0.0230
Epoch 7: Train Loss = 0.0222, Val Loss = 0.0230
Epoch 8: Train Loss = 0.0221, Val Loss = 0.0230
Epoch 9: Train Loss = 0.0219, Val Loss = 0.0230
Epoch 10: Train Loss = 0.0218, Val Loss = 0.0230


In [90]:
train_data

Unnamed: 0,user_id,basket_name,binary_rating,user_idx,basket_idx
0,1001,a123456789b123456789c123456789,0,0,0
1,1001,Air related companies world,0,0,1
2,1001,Aluminium,0,0,2
3,1001,Aussi Fin,0,0,3
4,1001,Aussi Fin extra,0,0,4
...,...,...,...,...,...
157595,1800,World Software companies,0,799,192
157596,1800,World Software companies I,0,799,193
157597,1800,Worst investment,0,799,194
157598,1800,WTF…,0,799,195


In [9]:
def recommend_for_users(user_ids, top_n=5):
    model.eval()
    all_baskets = list(basket2idx.keys())
    
    for user_id in user_ids:
        user_idx = user2idx[user_id]
        
        already_invested = investments[investments['user_id'] == user_id]['basket_name'].unique()
        unseen_baskets = [b for b in all_baskets if b not in already_invested]
        basket_indices = torch.tensor([basket2idx[b] for b in unseen_baskets])
        user_tensor = torch.tensor([user_idx] * len(basket_indices))

        with torch.no_grad():
            preds = model(user_tensor, basket_indices)
        
        top_indices = torch.topk(preds, top_n).indices
        top_baskets = [(unseen_baskets[i], preds[i].item()) for i in top_indices]

        print(f"\n🔍 Top-{top_n} Recommendations for User {user_id}:")
        for basket, score in top_baskets:
            print(f"→ {basket} (score: {score:.2f})")


In [10]:
random_user_ids = np.random.choice(list(user2idx.keys()), size=10, replace=False)
recommend_for_users([1001, 1002, 1003, 1004, 1005])


🔍 Top-5 Recommendations for User 1001:
→ Healtcare Europe top valuers (score: 0.05)
→ Nordic Green Energy (score: 0.03)
→ Growth Rockets (score: 0.03)
→ Global electric utilities (score: 0.03)
→ Casino fun (score: 0.03)

🔍 Top-5 Recommendations for User 1002:
→ Growth Rockets (score: 0.14)
→ Food producents Finland (score: 0.14)
→ Technology stars of value (score: 0.13)
→ Casino fun (score: 0.13)
→ Nordic Green Energy (score: 0.12)

🔍 Top-5 Recommendations for User 1003:
→ Growth Rockets (score: 0.12)
→ Well performed companies (score: 0.08)
→ Healtcare Europe top valuers (score: 0.08)
→ Financial East (score: 0.08)
→ Swedish climbers (score: 0.07)

🔍 Top-5 Recommendations for User 1004:
→ Basic materials World (score: 0.19)
→ Global utilities (score: 0.16)
→ Oceania valuecreators (score: 0.12)
→ Healtcare Europe top valuers (score: 0.12)
→ Techs going upward (score: 0.11)

🔍 Top-5 Recommendations for User 1005:
→ Global utilities (score: 0.22)
→ Growth Rockets (score: 0.22)
→ Basic m

In [84]:
def evaluate_recommendations(user_ids=None, top_n_values=[2, 5, 10]):
    """
    Evaluate the model using precision, recall, and F1 score
    for multiple values of top_n recommendations
    """
    model.eval()
    all_baskets = list(basket2idx.keys())
    
    # Use all test users if no specific users are provided
    if user_ids is None:
        user_ids = test_data['user_id'].unique()
    
    # Dictionaries to store metrics
    precision_at_k = collections.defaultdict(list)
    recall_at_k = collections.defaultdict(list)
    f1_at_k = collections.defaultdict(list)
    
    for user_id in user_ids:
        # Skip if user is not in test set
        if user_id not in test_data['user_id'].unique():
            continue
            
        # Get user index
        user_idx = user2idx[user_id]
        
        # Get baskets from training set (to exclude)
        if user_id in train_data['user_id'].unique():
            train_baskets = train_data[train_data['user_id'] == user_id]['basket_name'].unique()
        else:
            train_baskets = []
            
        # Get ground truth baskets from test set
        test_baskets = test_data[(test_data['user_id'] == user_id) & (test_data['binary_rating'] == 1)]['basket_name'].unique() 
        
        # Skip if no test baskets
        if len(test_baskets) == 0:
            continue
            
        # Get unseen baskets (all baskets minus training baskets)
        basket_indices = torch.tensor([basket2idx[b]for b in all_baskets])
        user_tensor = torch.tensor([user_idx] * len(basket_indices))
        
        # Get predictions
        with torch.no_grad():
            preds = model(user_tensor, basket_indices)
        
        # Evaluate for different k values
        for k in top_n_values:
                
            # Get top-k recommendations
            top_indices = torch.topk(preds, k).indices.numpy()
            recommended_baskets = [all_baskets[i] for i in top_indices]
            
            # Calculate true positives (recommendations that are in test set)
            true_positives = len(set(recommended_baskets) & set(test_baskets))
            
            # Calculate metrics
            precision = true_positives / k
            recall = true_positives / len(test_baskets)
            f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0
            
            # Store metrics
            precision_at_k[k].append(precision)
            recall_at_k[k].append(recall)
            f1_at_k[k].append(f1)
    
    # Calculate average metrics
    print("\n----- EVALUATION RESULTS -----")
    for k in top_n_values:
        if not precision_at_k[k]:
            print(f"No data for k={k}")
            continue
            
        avg_precision = np.mean(precision_at_k[k])
        avg_recall = np.mean(recall_at_k[k])
        avg_f1 = np.mean(f1_at_k[k])
        
        print(f"\nMetrics at k={k}:")
        print(f"Precision@{k}: {avg_precision:.4f}")
        print(f"Recall@{k}: {avg_recall:.4f}")
        print(f"F1@{k}: {avg_f1:.4f}")
    
    return {
        'precision': precision_at_k,
        'recall': recall_at_k,
        'f1': f1_at_k
    }

# You can call this function to evaluate all test users
metrics = evaluate_recommendations(top_n_values=[2, 5, 10])


----- EVALUATION RESULTS -----

Metrics at k=2:
Precision@2: 0.3850
Recall@2: 0.1485
F1@2: 0.2123

Metrics at k=5:
Precision@5: 0.2930
Recall@5: 0.2796
F1@5: 0.2820

Metrics at k=10:
Precision@10: 0.2320
Recall@10: 0.4388
F1@10: 0.2996


# Grid Search

In [80]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator, ClassifierMixin
from torch.utils.data import TensorDataset, DataLoader
import warnings
import time
from datetime import datetime
warnings.filterwarnings('ignore')

# First, create a flexible neural network that can accept different architectures
class FlexibleRecommenderNet(nn.Module):
    def __init__(self, num_users, num_items, embedding_size=32, hidden_layers=[64, 32, 16], dropout_rate=0.0):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_size)
        self.item_embedding = nn.Embedding(num_items, embedding_size)
        
        # Build flexible architecture based on hidden_layers parameter
        layers = []
        input_size = embedding_size * 2
        
        for hidden_size in hidden_layers:
            layers.append(nn.Linear(input_size, hidden_size))
            layers.append(nn.ReLU())
            if dropout_rate > 0:
                layers.append(nn.Dropout(dropout_rate))
            input_size = hidden_size
        
        layers.append(nn.Linear(input_size, 1))
        layers.append(nn.Sigmoid())
        
        self.fc = nn.Sequential(*layers)
    
    def forward(self, user_ids, item_ids):
        user_vecs = self.user_embedding(user_ids)
        item_vecs = self.item_embedding(item_ids)
        combined = torch.cat([user_vecs, item_vecs], dim=1)
        return self.fc(combined).squeeze()

# Create a scikit-learn compatible wrapper with progress output
class PyTorchRecommender(BaseEstimator, ClassifierMixin):
    def __init__(self, num_users, num_items, embedding_size=32, hidden_layers=[64, 32, 16], 
                 learning_rate=0.01, batch_size=1024, epochs=20, dropout_rate=0.0,
                 optimizer_name='AdamW', weight_decay=0.01, verbose=True):
        self.num_users = num_users
        self.num_items = num_items
        self.embedding_size = embedding_size
        self.hidden_layers = hidden_layers
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.epochs = epochs
        self.dropout_rate = dropout_rate
        self.optimizer_name = optimizer_name
        self.weight_decay = weight_decay
        self.verbose = verbose
        
    def fit(self, X, y):
        if self.verbose:
            print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Starting training with parameters:")
            print(f"  - Embedding size: {self.embedding_size}")
            print(f"  - Hidden layers: {self.hidden_layers}")
            print(f"  - Learning rate: {self.learning_rate}")
            print(f"  - Batch size: {self.batch_size}")
            print(f"  - Epochs: {self.epochs}")
            print(f"  - Dropout rate: {self.dropout_rate}")
            print(f"  - Optimizer: {self.optimizer_name}")
            print(f"  - Weight decay: {self.weight_decay}")
            print(f"  - Training samples: {len(X)}")
        
        # Convert to PyTorch tensors
        X_tensor = torch.tensor(X, dtype=torch.long)
        y_tensor = torch.tensor(y, dtype=torch.float32)
        
        # Create data loader
        dataset = TensorDataset(X_tensor[:, 0], X_tensor[:, 1], y_tensor)
        train_loader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True)
        
        # Initialize model
        self.model_ = FlexibleRecommenderNet(
            self.num_users, 
            self.num_items, 
            self.embedding_size, 
            self.hidden_layers,
            self.dropout_rate
        )
        
        # Choose optimizer
        if self.optimizer_name == 'AdamW':
            optimizer = optim.AdamW(self.model_.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay)
        elif self.optimizer_name == 'Adam':
            optimizer = optim.Adam(self.model_.parameters(), lr=self.learning_rate)
        elif self.optimizer_name == 'SGD':
            optimizer = optim.SGD(self.model_.parameters(), lr=self.learning_rate, momentum=0.9)
        
        criterion = nn.MSELoss()
        
        # Training loop with progress output
        self.model_.train()
        for epoch in range(self.epochs):
            epoch_start_time = time.time()
            total_loss = 0
            num_batches = len(train_loader)
            
            for batch_idx, (user_batch, item_batch, rating_batch) in enumerate(train_loader):
                optimizer.zero_grad()
                outputs = self.model_(user_batch, item_batch)
                loss = criterion(outputs, rating_batch)
                loss.backward()
                optimizer.step()
                total_loss += loss.item()
                
                # Progress output every 10 batches or last batch
                if self.verbose and (batch_idx % 10 == 0 or batch_idx == num_batches - 1):
                    print(f"  Epoch {epoch+1}/{self.epochs}, Batch {batch_idx+1}/{num_batches}, "
                          f"Loss: {loss.item():.6f}", end='\r')
            
            epoch_time = time.time() - epoch_start_time
            avg_loss = total_loss / num_batches
            
            if self.verbose:
                print(f"  Epoch {epoch+1}/{self.epochs} completed in {epoch_time:.2f}s, "
                      f"Avg Loss: {avg_loss:.6f}")
        
        if self.verbose:
            print(f"[{datetime.now().strftime('%H:%M:%S')}] Training completed!\n")
        
        return self
    
    def predict(self, X):
        self.model_.eval()
        with torch.no_grad():
            X_tensor = torch.tensor(X, dtype=torch.long)
            outputs = self.model_(X_tensor[:, 0], X_tensor[:, 1])
            return (outputs > 0.5).numpy().astype(int)
    
    def predict_proba(self, X):
        self.model_.eval()
        with torch.no_grad():
            X_tensor = torch.tensor(X, dtype=torch.long)
            outputs = self.model_(X_tensor[:, 0], X_tensor[:, 1]).numpy()
            # Return probabilities for both classes
            return np.vstack([1 - outputs, outputs]).T
    
    def score(self, X, y):
        # Return negative MSE as score (sklearn expects higher is better)
        self.model_.eval()
        with torch.no_grad():
            X_tensor = torch.tensor(X, dtype=torch.long)
            y_tensor = torch.tensor(y, dtype=torch.float32)
            outputs = self.model_(X_tensor[:, 0], X_tensor[:, 1])
            mse = nn.MSELoss()(outputs, y_tensor).item()
            return -mse  # Negative MSE so higher is better

# Calculate total number of parameter combinations
def calculate_total_combinations(param_grid):
    total = 1
    for values in param_grid.values():
        total *= len(values)
    return total

# Define parameter grid for hyperparameter search
param_grid = {
    'embedding_size': [32, 64],
    'hidden_layers': [
        [64, 32],           # 2 layers
        [128, 64, 32],      # 3 layers  
    ],
    'learning_rate': [0.0001],
    'batch_size': [512, 1024],
    'epochs': [10, 15],
    'dropout_rate': [0.0, 0.1],
    'optimizer_name': ['AdamW'],
}

total_combinations = calculate_total_combinations(param_grid)
print(f"Total parameter combinations to test: {total_combinations}")
print(f"With 3-fold CV, total models to train: {total_combinations * 3}")
print(f"Estimated time: {total_combinations * 3 * 0.5:.0f}-{total_combinations * 3 * 2:.0f} minutes")
print("\nStarting grid search...")
print("=" * 50)

# Initialize the model wrapper
base_model = PyTorchRecommender(
    num_users=len(user2idx),
    num_items=len(basket2idx),
    verbose=True  # Set to False to reduce output clutter during grid search
)

# Create GridSearchCV object with more verbose output
grid_search = GridSearchCV(
    estimator=base_model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',  # Use negative MSE
    cv=3,  # 3-fold cross-validation
    n_jobs=1,  # Use all available cores
    verbose=3,  # Increased verbosity
    refit=True,
    return_train_score=True
)

# Track grid search progress
start_time = time.time()
print(f"Grid search started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

# Fit grid search
grid_search.fit(X_train, y_train)

# Calculate total time
total_time = time.time() - start_time
print(f"\nGrid search completed in {total_time/60:.2f} minutes")
print("=" * 50)

# Print best parameters
print("\nBest parameters found:")
for param, value in grid_search.best_params_.items():
    print(f"  {param}: {value}")
print(f"\nBest cross-validation score: {-grid_search.best_score_:.6f}")

# Show top 5 parameter combinations
print("\nTop 5 parameter combinations:")
results_df = pd.DataFrame(grid_search.cv_results_)
top_5_indices = results_df['mean_test_score'].nlargest(5).index
for i, idx in enumerate(top_5_indices):
    print(f"\n{i+1}. Score: {-results_df.loc[idx, 'mean_test_score']:.6f}")
    params = results_df.loc[idx, 'params']
    for param, value in params.items():
        print(f"   {param}: {value}")

# Evaluate on test set
print("\nEvaluating best model on test set...")
test_score = grid_search.score(X_test, y_test)
print(f"Test MSE: {-test_score:.6f}")

# Train final model with verbose output
print("\nRetraining best model with verbose output...")
best_params = grid_search.best_params_.copy()
best_params['verbose'] = True  # Enable verbose output for final training

final_model = PyTorchRecommender(
    num_users=len(user2idx),
    num_items=len(basket2idx),
    **best_params
)

final_model.fit(X_train, y_train)

# Make predictions with progress
print("\nMaking predictions on test set...")
predictions = final_model.predict(X_test)
probabilities = final_model.predict_proba(X_test)

print(f"\nAll tasks completed at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Total runtime: {(time.time() - start_time)/60:.2f} minutes")

Total parameter combinations to test: 32
With 3-fold CV, total models to train: 96
Estimated time: 48-192 minutes

Starting grid search...
Grid search started at: 2025-05-04 17:15:26
Fitting 3 folds for each of 32 candidates, totalling 96 fits

[17:15:26] Starting training with parameters:
  - Embedding size: 32
  - Hidden layers: [64, 32]
  - Learning rate: 0.0001
  - Batch size: 512
  - Epochs: 10
  - Dropout rate: 0.0
  - Optimizer: AdamW
  - Weight decay: 0.01
  - Training samples: 105066
  Epoch 1/10 completed in 0.81s, Avg Loss: 0.190380
  Epoch 2/10 completed in 0.96s, Avg Loss: 0.072547
  Epoch 3/10 completed in 0.78s, Avg Loss: 0.036061
  Epoch 4/10 completed in 0.77s, Avg Loss: 0.028889
  Epoch 5/10 completed in 0.79s, Avg Loss: 0.026887
  Epoch 6/10 completed in 0.87s, Avg Loss: 0.026122
  Epoch 7/10 completed in 0.77s, Avg Loss: 0.025627
  Epoch 8/10 completed in 0.87s, Avg Loss: 0.025291
  Epoch 9/10 completed in 0.87s, Avg Loss: 0.025063
  Epoch 10/10 completed in 0.77s, 

In [83]:
def evaluate_recommendations(model, user_ids=None, top_n_values=[2, 5, 10]):
    """
    Evaluate the model using precision, recall, and F1 score
    for multiple values of top_n recommendations
    """
    model.eval()
    all_baskets = list(basket2idx.keys())
    
    # Use all test users if no specific users are provided
    if user_ids is None:
        user_ids = test_data['user_id'].unique()
    
    # Dictionaries to store metrics
    precision_at_k = collections.defaultdict(list)
    recall_at_k = collections.defaultdict(list)
    f1_at_k = collections.defaultdict(list)
    
    for user_id in user_ids:
        # Skip if user is not in test set
        if user_id not in test_data['user_id'].unique():
            continue
            
        # Get user index
        user_idx = user2idx[user_id]
        
        # Get baskets from training set (to exclude)
        if user_id in train_data['user_id'].unique():
            train_baskets = train_data[train_data['user_id'] == user_id]['basket_name'].unique()
        else:
            train_baskets = []
            
        # Get ground truth baskets from test set
        test_baskets = test_data[(test_data['user_id'] == user_id) & (test_data['binary_rating'] == 1)]['basket_name'].unique() 
        
        # Skip if no test baskets
        if len(test_baskets) == 0:
            continue
            
        # Get unseen baskets (all baskets minus training baskets)
        basket_indices = torch.tensor([basket2idx[b] for b in all_baskets])
        user_tensor = torch.tensor([user_idx] * len(basket_indices))
        
        # Get predictions
        with torch.no_grad():
            preds = model(user_tensor, basket_indices)
        
        # Evaluate for different k values
        for k in top_n_values:
            # Get top-k recommendations
            top_indices = torch.topk(preds, k).indices.numpy()
            recommended_baskets = [all_baskets[i] for i in top_indices]
            
            # Calculate true positives (recommendations that are in test set)
            true_positives = len(set(recommended_baskets) & set(test_baskets))
            
            # Calculate metrics
            precision = true_positives / k
            recall = true_positives / len(test_baskets)
            f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0
            
            # Store metrics
            precision_at_k[k].append(precision)
            recall_at_k[k].append(recall)
            f1_at_k[k].append(f1)
    
    # Calculate average metrics
    print("\n----- EVALUATION RESULTS -----")
    for k in top_n_values:
        if not precision_at_k[k]:
            print(f"No data for k={k}")
            continue
            
        avg_precision = np.mean(precision_at_k[k])
        avg_recall = np.mean(recall_at_k[k])
        avg_f1 = np.mean(f1_at_k[k])
        
        print(f"\nMetrics at k={k}:")
        print(f"Precision@{k}: {avg_precision:.4f}")
        print(f"Recall@{k}: {avg_recall:.4f}")
        print(f"F1@{k}: {avg_f1:.4f}")
    
    return {
        'precision': precision_at_k,
        'recall': recall_at_k,
        'f1': f1_at_k
    }

# After grid search is complete, get the best model
best_estimator = grid_search.best_estimator_
best_pytorch_model = best_estimator.model_  # Access the PyTorch model from the sklearn wrapper

# Evaluate using the best model
print("\nEvaluating best model from grid search...")
metrics = evaluate_recommendations(best_pytorch_model, top_n_values=[2, 5, 10])

# Or alternatively, you can create a wrapper function
def evaluate_best_model(grid_search_results, user_ids=None, top_n_values=[2, 5, 10]):
    """
    Evaluate the best model from grid search
    """
    # Get the best PyTorch model from the grid search
    best_estimator = grid_search_results.best_estimator_
    best_pytorch_model = best_estimator.model_
    
    # Evaluate
    return evaluate_recommendations(best_pytorch_model, user_ids, top_n_values)

# Use it like this:
metrics = evaluate_best_model(grid_search, top_n_values=[2, 5, 10])


Evaluating best model from grid search...

----- EVALUATION RESULTS -----

Metrics at k=2:
Precision@2: 0.3875
Recall@2: 0.1502
F1@2: 0.2143

Metrics at k=5:
Precision@5: 0.2900
Recall@5: 0.2789
F1@5: 0.2803

Metrics at k=10:
Precision@10: 0.2360
Recall@10: 0.4451
F1@10: 0.3044

----- EVALUATION RESULTS -----

Metrics at k=2:
Precision@2: 0.3875
Recall@2: 0.1502
F1@2: 0.2143

Metrics at k=5:
Precision@5: 0.2900
Recall@5: 0.2789
F1@5: 0.2803

Metrics at k=10:
Precision@10: 0.2360
Recall@10: 0.4451
F1@10: 0.3044


In [89]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

class HybridRecommenderNet(nn.Module):
    def __init__(self, num_users, num_items, num_content_features, 
                 embedding_size=32, hidden_layers=[128, 64, 32], dropout_rate=0.2):
        super().__init__()
        
        # User and item embeddings
        self.user_embedding = nn.Embedding(num_users, embedding_size)
        self.item_embedding = nn.Embedding(num_items, embedding_size)
        
        # Content feature processing - reduce from 148 features to embedding_size
        self.content_fc = nn.Sequential(
            nn.Linear(num_content_features, 64),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(64, embedding_size)
        )
        
        # Combined feature size: user + item + content embeddings
        input_size = embedding_size * 3
        
        # Build the main network
        layers = []
        for hidden_size in hidden_layers:
            layers.append(nn.Linear(input_size, hidden_size))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout_rate))
            input_size = hidden_size
        
        layers.append(nn.Linear(input_size, 1))
        layers.append(nn.Sigmoid())
        
        self.fc = nn.Sequential(*layers)
        
        # Initialize weights
        self._init_weights()
    
    def _init_weights(self):
        for module in self.modules():
            if isinstance(module, nn.Linear):
                nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.zeros_(module.bias)
            elif isinstance(module, nn.Embedding):
                nn.init.normal_(module.weight, std=0.1)
    
    def forward(self, user_ids, item_ids, content_features):
        user_vecs = self.user_embedding(user_ids)
        item_vecs = self.item_embedding(item_ids)
        content_vecs = self.content_fc(content_features)
        
        combined = torch.cat([user_vecs, item_vecs, content_vecs], dim=1)
        return self.fc(combined).squeeze()

class HybridDataset(Dataset):
    def __init__(self, user_ids, item_ids, content_features, labels):
        self.user_ids = torch.tensor(user_ids, dtype=torch.long)
        self.item_ids = torch.tensor(item_ids, dtype=torch.long)
        self.content_features = torch.tensor(content_features, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.float32)
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return (self.user_ids[idx], 
                self.item_ids[idx], 
                self.content_features[idx], 
                self.labels[idx])

def prepare_hybrid_data(investments_df, basket_features_df):
    """
    Prepare data for the hybrid model
    """
    # Create user and basket index mappings
    user2idx = {user: idx for idx, user in enumerate(investments_df['user_id'].unique())}
    basket2idx = {basket: idx for idx, basket in enumerate(basket_features_df['basket_name'].unique())}
    
    # Create feature matrix for baskets (excluding basket_name column)
    feature_cols = basket_features_df.columns[1:]  # Skip basket_name
    basket_feature_matrix = basket_features_df[feature_cols].values
    
    # Standardize features
    scaler = StandardScaler()
    basket_feature_matrix = scaler.fit_transform(basket_feature_matrix)
    
    # Create basket name to feature vector mapping
    basket_to_features = {}
    for idx, basket_name in enumerate(basket_features_df['basket_name']):
        basket_to_features[basket_name] = basket_feature_matrix[idx]
    
    # Prepare training data
    X_users = []
    X_items = []
    X_content = []
    y = []
    
    # Add positive examples (user-basket pairs from investments)
    for _, row in investments_df.iterrows():
        if row['basket_name'] in basket2idx:  # Only if basket exists in features
            user_idx = user2idx[row['user_id']]
            basket_idx = basket2idx[row['basket_name']]
            content_features = basket_to_features[row['basket_name']]
            
            X_users.append(user_idx)
            X_items.append(basket_idx)
            X_content.append(content_features)
            y.append(1)  # Positive example
    
    # Add negative examples
    all_baskets = list(basket2idx.keys())
    for _, row in investments_df.iterrows():
        if row['basket_name'] in basket2idx:
            user_idx = user2idx[row['user_id']]
            
            # Sample negative baskets for this user
            invested_baskets = investments_df[investments_df['user_id'] == row['user_id']]['basket_name'].tolist()
            eligible_baskets = [b for b in all_baskets if b not in invested_baskets]
            
            if len(eligible_baskets) > 0:
                # Sample 3 negative examples for each positive
                neg_samples = np.random.choice(eligible_baskets, size=min(3, len(eligible_baskets)), replace=False)
                
                for neg_basket in neg_samples:
                    neg_idx = basket2idx[neg_basket]
                    content_features = basket_to_features[neg_basket]
                    
                    X_users.append(user_idx)
                    X_items.append(neg_idx)
                    X_content.append(content_features)
                    y.append(0)  # Negative example
    
    return (np.array(X_users), np.array(X_items), np.array(X_content), np.array(y), 
            user2idx, basket2idx, basket_to_features, scaler)

def train_hybrid_model(model, train_loader, val_loader, epochs=10, lr=0.0005):
    """
    Train the hybrid model
    """
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    
    train_losses = []
    val_losses = []
    
    for epoch in range(epochs):
        # Training phase
        model.train()
        train_loss = 0.0
        for batch in train_loader:
            user_ids, item_ids, content_features, labels = batch
            
            optimizer.zero_grad()
            outputs = model(user_ids, item_ids, content_features)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item()
        
        train_loss /= len(train_loader)
        train_losses.append(train_loss)
        
        # Validation phase
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for batch in val_loader:
                user_ids, item_ids, content_features, labels = batch
                outputs = model(user_ids, item_ids, content_features)
                loss = criterion(outputs, labels)
                val_loss += loss.item()
        
        val_loss /= len(val_loader)
        val_losses.append(val_loss)
        
        if (epoch + 1) % 5 == 0:
            print(f'Epoch [{epoch+1}/{epochs}], Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')
    
    return train_losses, val_losses

class HybridRecommender:
    def __init__(self, model, user2idx, basket2idx, basket_to_features, scaler):
        self.model = model
        self.user2idx = user2idx
        self.basket2idx = basket2idx
        self.idx2basket = {v: k for k, v in basket2idx.items()}
        self.basket_to_features = basket_to_features
        self.scaler = scaler
    
    def recommend(self, user_id, investments_df, n_recommendations=10):
        """
        Generate recommendations for a user
        """
        self.model.eval()
        
        # Handle new users
        if user_id not in self.user2idx:
            return self.recommend_for_new_user(n_recommendations)
        
        user_idx = self.user2idx[user_id]
        
        # Get user's existing investments
        user_investments = investments_df[investments_df['user_id'] == user_id]['basket_name'].values
        
        # Get all baskets that user hasn't invested in
        all_baskets = list(self.basket2idx.keys())
        candidate_baskets = [b for b in all_baskets if b not in user_investments]
        
        if not candidate_baskets:
            return []
        
        # Prepare data for prediction
        user_ids = np.array([user_idx] * len(candidate_baskets))
        item_ids = np.array([self.basket2idx[b] for b in candidate_baskets])
        content_features = np.array([self.basket_to_features[b] for b in candidate_baskets])
        
        # Convert to tensors
        user_ids_tensor = torch.tensor(user_ids, dtype=torch.long)
        item_ids_tensor = torch.tensor(item_ids, dtype=torch.long)
        content_features_tensor = torch.tensor(content_features, dtype=torch.float32)
        
        # Get predictions
        with torch.no_grad():
            scores = self.model(user_ids_tensor, item_ids_tensor, content_features_tensor).numpy()
        
        # Create recommendations with scores
        recommendations = []
        for i, basket in enumerate(candidate_baskets):
            recommendations.append({
                'basket': basket,
                'score': float(scores[i]),
                'features': self._get_top_features(basket)
            })
        
        # Sort by score
        recommendations.sort(key=lambda x: x['score'], reverse=True)
        
        return recommendations[:n_recommendations]
    
    def recommend_for_new_user(self, n_recommendations=10):
        """
        Recommend baskets for new users based on content popularity
        """
        # Simple popularity-based approach for cold start
        popular_baskets = []
        
        # You could implement a more sophisticated approach here
        # For example, diversify based on different feature categories
        
        # Get baskets with diverse characteristics
        baskets_by_volatility = {}
        for basket, features in self.basket_to_features.items():
            # Get volatility features (last 3 columns in your case)
            volatility_features = features[-3:]
            volatility_type = np.argmax(volatility_features)
            
            if volatility_type not in baskets_by_volatility:
                baskets_by_volatility[volatility_type] = []
            baskets_by_volatility[volatility_type].append(basket)
        
        # Sample from each volatility type
        recommendations = []
        for volatility_type, baskets in baskets_by_volatility.items():
            sample_size = min(n_recommendations // 3, len(baskets))
            sampled = np.random.choice(baskets, size=sample_size, replace=False)
            for basket in sampled:
                recommendations.append({
                    'basket': basket,
                    'score': 0.5,  # Default score for new users
                    'features': self._get_top_features(basket)
                })
        
        return recommendations[:n_recommendations]
    
    def _get_top_features(self, basket_name):
        """
        Get the top features for a basket for interpretability
        """
        features = self.basket_to_features[basket_name]
        
        # Get the original feature names from basket_features columns
        feature_names = ['industry_name_Advanced Medical Equipment & Technology', 
                        'industry_name_Advertising & Marketing', ...]  # Use actual column names
        
        # Find top 5 features
        top_indices = np.argsort(np.abs(features))[-5:]
        
        top_features = []
        for idx in top_indices:
            if idx < len(feature_names):
                top_features.append({
                    'feature': feature_names[idx],
                    'value': float(features[idx])
                })
        
        return top_features

# Main execution
if __name__ == "__main__":
    # Load data
    investments_df = pd.read_csv('syntheticDataGenerators/investment/invest_data.csv', sep=';')    
    basket_features_df = pd.read_csv('basket_features.csv')
    
    # Prepare data
    (X_users, X_items, X_content, y, 
     user2idx, basket2idx, basket_to_features, scaler) = prepare_hybrid_data(investments_df, basket_features_df)
    
    # Split data
    train_indices, val_indices = train_test_split(range(len(y)), test_size=0.2, random_state=42)
    
    # Create datasets
    train_dataset = HybridDataset(X_users[train_indices], X_items[train_indices], 
                                  X_content[train_indices], y[train_indices])
    val_dataset = HybridDataset(X_users[val_indices], X_items[val_indices], 
                                X_content[val_indices], y[val_indices])
    
    # Create dataloaders
    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)
    
    # Initialize model
    num_users = len(user2idx)
    num_items = len(basket2idx)
    num_content_features = basket_features_df.shape[1] - 1  # Exclude basket_name column
    
    model = HybridRecommenderNet(
        num_users=num_users,
        num_items=num_items,
        num_content_features=num_content_features,
        embedding_size=32,
        hidden_layers=[128, 64, 32],
        dropout_rate=0.2
    )
    
    # Train model
    train_losses, val_losses = train_hybrid_model(model, train_loader, val_loader, epochs=10)
    
    # Create recommender
    recommender = HybridRecommender(model, user2idx, basket2idx, basket_to_features, scaler)
    
    # Example usage
    user_id = 'user_123'
    recommendations = recommender.recommend(user_id, investments_df, n_recommendations=10)
    
    print(f"\nRecommendations for user {user_id}:")
    for i, rec in enumerate(recommendations):
        print(f"{i+1}. {rec['basket']} (Score: {rec['score']:.3f})")
        print("   Top features:")
        for feat in rec['features']:
            print(f"   - {feat['feature']}: {feat['value']:.3f}")

Epoch [5/10], Train Loss: 0.3181, Val Loss: 0.3044
Epoch [10/10], Train Loss: 0.2721, Val Loss: 0.3509

Recommendations for user user_123:
1. German and French companies (Score: 0.500)
   Top features:
2. Techs seeking value upgrade (Score: 0.500)
   Top features:
3. Most traded stocks (Score: 0.500)
   Top features:
4. Spanish companies (Score: 0.500)
   Top features:
5. medical equipment medium (Score: 0.500)
   Top features:
6. Software in Germany (Score: 0.500)
   Top features:
7. Financial World  nu funds (Score: 0.500)
   Top features:
8. Pers Choice Europé (Score: 0.500)
   Top features:
9. Swedish climbers (Score: 0.500)
   Top features:
