In [None]:
# --- Setup: Load saved model and rebuild components ---
import torch
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import collections
import torch.nn as nn

# Load the saved model checkpoint.
# SECURITY NOTE: weights_only=False performs a full pickle load, which can execute
# arbitrary code stored in the file -- only load checkpoints from a trusted source.
# The safe_globals allow-list is only consulted by weights_only=True loads; it is kept
# here to document that LabelEncoder objects are stored inside the checkpoint.
# map_location='cpu' lets a checkpoint that was saved on a GPU machine load on a
# CPU-only one; all inference in this notebook runs on CPU tensors.
with torch.serialization.safe_globals([LabelEncoder]):
    checkpoint = torch.load(
        'best_recommendation_model.pth',
        map_location='cpu',
        weights_only=False,
    )

# Extract the objects stored next to the weights: the basket label encoder,
# the per-column feature encoders, and the selected hyper-parameters.
basket_encoder = checkpoint['basket_encoder']
encoders = checkpoint['encoders']
best_params = checkpoint['best_params']

# Rebuild the MLP model with saved architecture
class MLP(nn.Module):
    """Fully connected network: a Linear + ReLU pair per hidden layer, then a Linear output.

    The layers live in ``self.model`` (an ``nn.Sequential``) in the order
    Linear, ReLU, Linear, ReLU, ..., Linear, so state_dict keys are
    ``model.0.*``, ``model.2.*``, ... and stay compatible with saved checkpoints.

    Args:
        input_size: Number of input features.
        hidden_sizes: Sequence with the width of each hidden layer. May be empty,
            in which case the network is a single Linear layer.
        output_size: Number of output units (raw logits, no activation applied).
    """

    def __init__(self, input_size, hidden_sizes, output_size):
        super().__init__()
        # Consecutive pairs of this list are the (in, out) sizes of the hidden layers.
        widths = [input_size, *hidden_sizes]
        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU())
        # The output layer reads from the last hidden width, or directly from the
        # input when no hidden layers were requested.
        layers.append(nn.Linear(widths[-1], output_size))
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the raw output of the final Linear layer for input ``x``."""
        return self.model(x)

# Instantiate and load model weights
# One input feature per saved encoder: predict_user_baskets builds its feature
# vector with exactly one encoded value per key of `encoders`.
input_size = len(encoders)
hidden_sizes = best_params['hidden_sizes']
# One output unit per basket label known to the basket encoder.
output_size = len(basket_encoder.classes_)
final_model = MLP(input_size, hidden_sizes, output_size)
final_model.load_state_dict(checkpoint['model_state_dict'])
# Switch the module to evaluation mode before running inference.
final_model.eval()

# --- Load investment and basket feature data ---
# y_train.csv / y_test.csv are read for their 'user_id' and 'basket_name' columns
# by the profile-building and evaluation code in this notebook.
train_data = pd.read_csv('y_train.csv')
test_data = pd.read_csv('y_test.csv')
basket_features = pd.read_csv('../basket_features.csv')
# Indexed copy so a basket's feature row can be looked up by name with .loc.
basket_features_indexed = basket_features.set_index('basket_name')
# Every basket name present in the feature table; passed as `all_baskets` to the recommender calls.
unique_baskets = basket_features['basket_name'].unique()

# --- Build user content profiles by averaging invested basket features ---
def build_user_profile(user_id, user_baskets, basket_features_df):
    """Average the feature rows of every basket a user appears with.

    Args:
        user_id: Value matched against the 'user_id' column of ``user_baskets``.
        user_baskets: DataFrame with 'user_id' and 'basket_name' columns.
        basket_features_df: DataFrame of basket features indexed by basket name.

    Returns:
        A Series indexed by the feature columns holding the mean over the user's
        baskets that exist in ``basket_features_df``. If none exist, a Series of
        zeros with the same index.
    """
    owned_names = user_baskets.loc[user_baskets['user_id'] == user_id, 'basket_name']

    # Collect one feature row per owned basket, skipping names without features.
    feature_rows = []
    for name in owned_names.tolist():
        if name in basket_features_df.index:
            feature_rows.append(basket_features_df.loc[name])

    if len(feature_rows) == 0:
        return pd.Series(0, index=basket_features_df.columns)

    # Side by side the rows form a (features x baskets) table; average across baskets.
    stacked = pd.concat(feature_rows, axis=1)
    return stacked.mean(axis=1)

# Precompute one content profile per user that appears in the training data
train_user_ids = train_data['user_id'].unique()
user_profiles = dict(
    (uid, build_user_profile(uid, train_data, basket_features_indexed))
    for uid in train_user_ids
)
# Transpose so that rows are users and columns are basket features
user_profiles_df = pd.DataFrame(user_profiles).T

# --- Utility Functions ---
def normalize_scores(score_dict):
    """Min-max rescale the values of a score dictionary.

    Args:
        score_dict: Mapping from item key to numeric score.

    Returns:
        A new dict with the same keys in the same order, whose values are the
        scores after ``MinMaxScaler`` fitting on this dictionary alone. An empty
        input yields an empty dict.
    """
    if len(score_dict) == 0:
        return {}
    labels = list(score_dict.keys())
    # MinMaxScaler works column-wise, so present the scores as a single column.
    column = np.array(list(score_dict.values())).reshape(-1, 1)
    rescaled = MinMaxScaler().fit_transform(column).flatten()
    return dict(zip(labels, rescaled))

def predict_rating(user_id, basket_name, user_profiles_df, basket_features_df):
    """Cosine similarity between a user's profile row and a basket's feature row.

    Args:
        user_id: Index label into ``user_profiles_df``.
        basket_name: Index label into ``basket_features_df``.
        user_profiles_df: DataFrame of user profiles indexed by user id.
        basket_features_df: DataFrame of basket features indexed by basket name.

    Returns:
        The similarity value, or 0 when either label is missing from its index.
    """
    has_profile = user_id in user_profiles_df.index
    has_features = basket_name in basket_features_df.index
    if not (has_profile and has_features):
        return 0

    profile_vec = user_profiles_df.loc[user_id]
    basket_vec = basket_features_df.loc[basket_name]
    numerator = np.dot(profile_vec, basket_vec)
    # The small epsilon keeps the division defined when either vector is all zeros.
    denominator = np.linalg.norm(profile_vec) * np.linalg.norm(basket_vec) + 1e-8
    return numerator / denominator

# Parsed feature tables keyed by CSV path, so the file is read once instead of on
# every prediction. Re-run this cell (or restart the kernel) to pick up file changes.
_X_TRAIN_CACHE = {}


def _load_user_features(path='X_train.csv'):
    """Return the parsed user-feature table for ``path``, reading the CSV only on first use."""
    if path not in _X_TRAIN_CACHE:
        _X_TRAIN_CACHE[path] = pd.read_csv(path)
    return _X_TRAIN_CACHE[path]


def predict_user_baskets(user_id, model, exclude_baskets=None):
    """Score every basket class for one user with the deep learning model.

    The user's raw features come from the first 'X_train.csv' row whose 'user_id'
    matches. Each column named in the module-level ``encoders`` is encoded with
    its encoder and the resulting vector is passed through ``model``.

    Args:
        user_id: Value matched against the 'user_id' column of 'X_train.csv'.
        model: Callable module mapping a (1, n_features) float tensor to logits.
        exclude_baskets: Optional container of basket names to drop from the
            result. Names are decoded with the module-level ``basket_encoder``.

    Returns:
        A list of ``(class_index, probability)`` pairs sorted by probability,
        highest first. Empty when the user has no row in 'X_train.csv'.
    """
    x_train = _load_user_features()
    user_row = x_train[x_train['user_id'] == user_id]
    if user_row.empty:
        return []

    feature_vector = [
        encoders[col].transform([user_row[col].values[0]])[0]
        for col in encoders
    ]
    x_tensor = torch.tensor(feature_vector, dtype=torch.float32).unsqueeze(0)
    with torch.no_grad():
        logits = model(x_tensor)
        probs = torch.softmax(logits, dim=1).squeeze(0).numpy()

    basket_probs = list(enumerate(probs))
    if exclude_baskets is not None:
        # Decode all class indices in one call instead of one call per class.
        names = basket_encoder.inverse_transform(list(range(len(probs))))
        basket_probs = [
            pair for pair, name in zip(basket_probs, names)
            if name not in exclude_baskets
        ]
    return sorted(basket_probs, key=lambda pair: pair[1], reverse=True)

# --- Hybrid Recommendation Function ---
def get_hybrid_recommendations(
    user_id,
    top_k=5,
    weights=(0.5, 0.5),  # (CB, DL)
    basket_encoder=None,
    basket_features_indexed=None,
    user_profiles_df=None,
    final_model=None,
    encoders=None,
    all_baskets=None,
    train_data=None
):
    """Rank baskets a user has not invested in by a weighted mix of two scores.

    Content-based (CB) scores come from ``predict_rating`` and deep learning (DL)
    scores from ``predict_user_baskets``. Each set is min-max normalized over the
    candidates with ``normalize_scores`` before being combined.

    Args:
        user_id: User to recommend for.
        top_k: Maximum number of recommendations returned.
        weights: ``(cb_weight, dl_weight)`` applied to the normalized scores.
        basket_encoder: Encoder used to turn DL class indices into basket names.
        basket_features_indexed: Basket feature table indexed by basket name.
        user_profiles_df: User profile table indexed by user id.
        final_model: Model passed to ``predict_user_baskets``.
        encoders: Accepted for interface compatibility; not read in this function.
        all_baskets: Iterable of every basket name that may be recommended.
        train_data: DataFrame with 'user_id' and 'basket_name' columns listing
            the baskets each user already holds.

    Returns:
        A list of ``(basket_name, combined_score)`` pairs, best first, of length
        at most ``top_k``.
    """
    # Identify candidate baskets not already purchased
    invested_baskets = set(train_data[train_data['user_id'] == user_id]['basket_name'])
    candidate_baskets = [b for b in all_baskets if b not in invested_baskets]
    # Set copy for constant-time membership tests; the list keeps the original order.
    candidate_set = set(candidate_baskets)

    # Predict content-based scores
    cb_scores = normalize_scores({
        basket: predict_rating(user_id, basket, user_profiles_df, basket_features_indexed)
        for basket in candidate_baskets
    })

    # Predict deep learning scores
    dl_predictions = predict_user_baskets(user_id, final_model, exclude_baskets=invested_baskets)
    dl_scores = {}
    if dl_predictions:
        # Decode every predicted class index in a single call.
        predicted_names = basket_encoder.inverse_transform([idx for idx, _ in dl_predictions])
        dl_scores = {
            name: score
            for name, (_, score) in zip(predicted_names, dl_predictions)
            if name in candidate_set
        }
    dl_scores = normalize_scores(dl_scores)

    # Weighted score fusion; a basket missing from either source contributes 0 there.
    combined_scores = {
        basket: weights[0] * cb_scores.get(basket, 0) + weights[1] * dl_scores.get(basket, 0)
        for basket in candidate_baskets
    }

    return sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)[:top_k]

# --- Evaluation Function ---
def evaluate_hybrid_model(
    test_user_ids,
    test_data,
    train_data,
    basket_encoder,
    basket_features_indexed,
    user_profiles_df,
    final_model,
    encoders,
    all_baskets,
    top_k_values=(1, 2, 3),
    weights=(0.5, 0.5)
):
    """Compute mean precision, recall and F1 at several cut-offs for the hybrid recommender.

    For each user in ``test_user_ids`` with at least one basket in ``test_data``,
    the top ``max(top_k_values)`` recommendations are requested once and sliced
    for every ``k``. Per-user metrics are averaged and printed.

    Args:
        test_user_ids: Iterable of user ids to evaluate.
        test_data: DataFrame with 'user_id' and 'basket_name' columns holding the
            ground-truth baskets.
        train_data, basket_encoder, basket_features_indexed, user_profiles_df,
        final_model, encoders, all_baskets: Forwarded unchanged to
            ``get_hybrid_recommendations``.
        top_k_values: Cut-offs ``k`` to report.
        weights: ``(cb_weight, dl_weight)`` forwarded to the recommender.

    Returns:
        Dict mapping each ``k`` to ``{"precision": ..., "recall": ..., "f1": ...}``.
        A metric is 0 when no user could be evaluated.
    """
    precision_at_k = collections.defaultdict(list)
    recall_at_k = collections.defaultdict(list)
    f1_at_k = collections.defaultdict(list)

    # Group the ground truth once instead of scanning test_data for every user.
    true_baskets_by_user = {
        uid: set(names)
        for uid, names in test_data.groupby('user_id', sort=False)['basket_name']
    }

    for user_id in test_user_ids:
        true_baskets = true_baskets_by_user.get(user_id)
        if not true_baskets:
            continue
        top_k_recs = get_hybrid_recommendations(
            user_id=user_id,
            top_k=max(top_k_values),
            weights=weights,
            basket_encoder=basket_encoder,
            basket_features_indexed=basket_features_indexed,
            user_profiles_df=user_profiles_df,
            final_model=final_model,
            encoders=encoders,
            all_baskets=all_baskets,
            train_data=train_data
        )
        rec_baskets = [basket for basket, _ in top_k_recs]
        for k in top_k_values:
            top_k = rec_baskets[:k]
            tp = len(set(top_k) & true_baskets)
            # Precision divides by k even when fewer than k recommendations exist.
            precision = tp / k
            recall = tp / len(true_baskets)
            f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0
            precision_at_k[k].append(precision)
            recall_at_k[k].append(recall)
            f1_at_k[k].append(f1)

    print("\n Hybrid Recommender Evaluation Metrics (CB + DL):")
    results = {}
    for k in top_k_values:
        avg_p = np.mean(precision_at_k[k]) if precision_at_k[k] else 0
        avg_r = np.mean(recall_at_k[k]) if recall_at_k[k] else 0
        avg_f1 = np.mean(f1_at_k[k]) if f1_at_k[k] else 0
        print(f"\n➡️ Top-{k} metrics:")
        print(f"Precision@{k}: {avg_p:.4f}")
        print(f"Recall@{k}:    {avg_r:.4f}")
        print(f"F1@{k}:        {avg_f1:.4f}")
        print(f"Users evaluated: {len(precision_at_k[k])}")
        results[k] = {"precision": avg_p, "recall": avg_r, "f1": avg_f1}
    return results

# --- Shared inputs for the recommender and its evaluation ---
# Both calls below consume the same fitted objects, so they are gathered once.
hybrid_inputs = dict(
    basket_encoder=basket_encoder,
    basket_features_indexed=basket_features_indexed,
    user_profiles_df=user_profiles_df,
    final_model=final_model,
    encoders=encoders,
    all_baskets=unique_baskets,
    train_data=train_data,
)

# --- Run Hybrid Recommender for a single user ---
top_recs = get_hybrid_recommendations(
    user_id=1001,
    top_k=5,
    weights=(0.3, 0.7),  # More weight to DL
    **hybrid_inputs,
)

for basket, score in top_recs:
    print(f"{basket}: {score:.4f}")

# --- Run Evaluation on all users ---
results = evaluate_hybrid_model(
    test_user_ids=test_data['user_id'].unique(),
    test_data=test_data,
    top_k_values=[1, 2, 3],
    weights=(0.3, 0.7),
    **hybrid_inputs,
)


Global healthcare I: 0.8822
Pharmaceuticals EU: 0.7212
Well traded stocks: 0.6855
Healthcare southern Europé: 0.6595
Healthcare Facilities: 0.6401

📊 Hybrid Recommender Evaluation Metrics (CB + DL):

➡️ Top-1 metrics:
Precision@1: 0.1358
Recall@1:    0.0986
F1@1:        0.1110
Users evaluated: 994

➡️ Top-2 metrics:
Precision@2: 0.1162
Recall@2:    0.1670
F1@2:        0.1331
Users evaluated: 994

➡️ Top-3 metrics:
Precision@3: 0.1026
Recall@3:    0.2183
F1@3:        0.1360
Users evaluated: 994


In [None]:
def grid_search_weights(
    test_user_ids,
    test_data,
    train_data,
    basket_encoder,
    basket_features_indexed,
    user_profiles_df,
    final_model,
    encoders,
    all_baskets,
    top_k=3,  # Evaluate at k=3 by default
    step=0.1
):
    """Sweep the content-based weight from 0 to 1 and keep the pair with the best F1@k.

    The deep learning weight is always ``1 - cb_weight``. Every pair is scored
    with ``evaluate_hybrid_model`` on the users and data passed in.

    NOTE(review): the weights are chosen on whatever split is passed as
    ``test_data``; reporting final metrics on that same split makes them
    optimistic. Consider passing a separate validation split.

    Args:
        test_user_ids, test_data, train_data, basket_encoder,
        basket_features_indexed, user_profiles_df, final_model, encoders,
        all_baskets: Forwarded unchanged to ``evaluate_hybrid_model``.
        top_k: Cut-off at which F1 is compared.
        step: Positive spacing between consecutive content-based weights.

    Returns:
        ``(best_weights, results_log)`` where ``best_weights`` is the
        ``(cb_weight, dl_weight)`` pair with the highest F1 (the first one on
        ties) and ``results_log`` is a list of ``(cb_weight, dl_weight, f1)``.

    Raises:
        ValueError: If ``step`` is not positive.
    """
    if not step > 0:
        raise ValueError(f"step must be positive, got {step!r}")

    # Start below any reachable F1 so the first evaluated pair is always recorded;
    # otherwise an all-zero sweep would leave best_weights as None.
    best_f1 = float('-inf')
    best_weights = None
    results_log = []

    weight_range = np.arange(0.0, 1.01, step)  # [0.0, 0.1, ..., 1.0]

    print("Starting grid search over hybrid weights...\n")

    for cb_weight in weight_range:
        dl_weight = 1.0 - cb_weight
        weights = (cb_weight, dl_weight)

        print(f"Evaluating weights: CB={cb_weight:.1f}, DL={dl_weight:.1f}")

        result = evaluate_hybrid_model(
            test_user_ids=test_user_ids,
            test_data=test_data,
            train_data=train_data,
            basket_encoder=basket_encoder,
            basket_features_indexed=basket_features_indexed,
            user_profiles_df=user_profiles_df,
            final_model=final_model,
            encoders=encoders,
            all_baskets=all_baskets,
            top_k_values=[top_k],
            weights=weights
        )

        f1 = result[top_k]['f1']
        results_log.append((cb_weight, dl_weight, f1))

        # Strict comparison keeps the earliest pair when several share the best F1.
        if f1 > best_f1:
            best_f1 = f1
            best_weights = weights

    print("\nGrid Search Complete")
    print(f"Best Weights: CB={best_weights[0]:.2f}, DL={best_weights[1]:.2f} → F1@{top_k} = {best_f1:.4f}")

    return best_weights, results_log


In [None]:
# Sweep the CB/DL mixing weight in steps of 0.1 and compare the pairs by F1@1.
# NOTE(review): the sweep is scored on test_data, the same split used by the
# evaluation run earlier in this notebook, so the selected weights are tuned on
# the test set -- confirm this is intended before reporting the numbers.
best_weights, logs = grid_search_weights(
    test_user_ids=test_data['user_id'].unique(),
    test_data=test_data,
    train_data=train_data,
    basket_encoder=basket_encoder,
    basket_features_indexed=basket_features_indexed,
    user_profiles_df=user_profiles_df,
    final_model=final_model,
    encoders=encoders,
    all_baskets=unique_baskets,
    top_k=1,     # test k=1 ... or k=5
    step=0.1 
)


🔍 Starting grid search over hybrid weights...

Evaluating weights: CB=0.0, DL=1.0

📊 Hybrid Recommender Evaluation Metrics (CB + DL):

➡️ Top-1 metrics:
Precision@1: 0.2243
Recall@1:    0.1665
F1@1:        0.1858
Users evaluated: 994
Evaluating weights: CB=0.1, DL=0.9
