In [1]:
import pandas as pd

In [18]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import collections

# Step 1: Read the three input tables.
# The basket feature file is read from the parent directory; the train/test
# interaction files are read from the current working directory.
basket_features = pd.read_csv("../basket_features.csv")
train_data, test_data = (
    pd.read_csv(split_file) for split_file in ("y_train.csv", "y_test.csv")
)

# Step 2: Build the feature matrix.
# Re-index by basket_name so a basket's feature row can be fetched with .loc;
# the un-indexed basket_features frame is kept as-is for later use.
basket_features_indexed = basket_features.set_index("basket_name")

# Step 3: Derive a user's profile from the baskets they have purchased
def build_user_profile(user_id, user_baskets, basket_features_df):
    """
    Average the feature vectors of every basket a user purchased.

    Rows of ``user_baskets`` whose 'user_id' equals ``user_id`` are selected
    and their 'basket_name' values looked up in ``basket_features_df`` (which
    must be indexed by basket name). Baskets without a feature row are
    ignored. The result is a Series indexed by the feature columns; when no
    purchased basket has features, an all-zero Series is returned instead.
    """
    # Names of the baskets recorded for this user
    belongs_to_user = user_baskets['user_id'] == user_id
    purchased = user_baskets.loc[belongs_to_user, 'basket_name'].tolist()

    # Collect one feature row per purchase, dropping unknown baskets
    known_baskets = basket_features_df.index
    feature_rows = []
    for name in purchased:
        if name in known_baskets:
            feature_rows.append(basket_features_df.loc[name])

    # Nothing usable for this user: fall back to a zero vector
    if len(feature_rows) == 0:
        return pd.Series(0, index=basket_features_df.columns)

    # Place the rows side by side (one column per purchase) and average
    # across them, giving one mean value per feature
    side_by_side = pd.concat(feature_rows, axis=1)
    return side_by_side.mean(axis=1)

# Step 4: Assemble one profile per training user into a user-by-feature table
unique_users = train_data['user_id'].unique()

# Map every training user id to its aggregated profile vector
user_profiles = {
    uid: build_user_profile(uid, train_data, basket_features_indexed)
    for uid in unique_users
}

# The dict of Series yields one column per user; transpose so that each row
# is a user and each column a feature
user_profiles_df = pd.DataFrame(user_profiles).T

# Score a user-basket pair by the cosine similarity of their feature vectors
def predict_rating(user_id, basket_name, user_profiles_df, basket_features_df):
    """
    Return the cosine similarity between a user's profile and a basket.

    ``user_profiles_df`` is looked up by ``user_id`` and
    ``basket_features_df`` by ``basket_name`` (both via their index).
    If either key is absent the function returns 0 without computing
    anything.
    """
    # Unknown user or unknown basket: no score can be computed
    if user_id not in user_profiles_df.index or basket_name not in basket_features_df.index:
        return 0

    profile_vector = user_profiles_df.loc[user_id].values
    basket_vector = basket_features_df.loc[basket_name].values

    # cosine_similarity works on 2-D inputs, so each vector is wrapped as a
    # single-row matrix and the lone entry of the 1x1 result is extracted
    score_matrix = cosine_similarity([profile_vector], [basket_vector])
    return score_matrix[0][0]

# Create a prediction object similar to what model_SVD.predict would return
class Prediction:
    """Container for one scored user-item pair.

    The three constructor arguments are stored unchanged as attributes:
    ``uid`` (the user the score belongs to), ``iid`` (the item being
    scored) and ``est`` (the estimated score).
    """

    def __init__(self, uid, iid, est):
        self.uid = uid
        self.iid = iid
        self.est = est

    def __repr__(self):
        # Readable form for notebook inspection instead of the default
        # "<Prediction object at 0x...>"
        return (
            f"{type(self).__name__}"
            f"(uid={self.uid!r}, iid={self.iid!r}, est={self.est!r})"
        )

# Candidate pool: every distinct basket name in the feature table
unique_baskets = basket_features.basket_name.unique()

# Per-user scores collected under each cut-off k
precision_at_k = collections.defaultdict(list)
recall_at_k = collections.defaultdict(list)
f1_at_k = collections.defaultdict(list)

# Cut-offs used both while scoring and when reporting
K_VALUES = [1, 2, 3]

# Every distinct user appearing in the test set
test_user_ids = test_data['user_id'].unique()

for user_id in test_user_ids:
    # Ground truth: baskets this user holds in the test data
    user_test_rows = test_data[test_data['user_id'] == user_id]
    user_positive_test_baskets = set(user_test_rows['basket_name'])

    # Nothing to score against for this user
    if not user_positive_test_baskets:
        continue

    # Baskets already held in the training data are not recommended again
    user_train_rows = train_data[train_data['user_id'] == user_id]
    user_invested_train_baskets = set(user_train_rows['basket_name'])
    baskets_to_predict = [
        candidate
        for candidate in unique_baskets
        if candidate not in user_invested_train_baskets
    ]

    # Score every remaining candidate, then rank by score, highest first
    user_predictions = []
    for candidate in baskets_to_predict:
        score = predict_rating(user_id, candidate, user_profiles_df, basket_features_indexed)
        user_predictions.append(Prediction(user_id, candidate, score))
    sorted_predictions = sorted(user_predictions, key=lambda x: x.est, reverse=True)

    for k in K_VALUES:
        # Slicing caps the list at the number of available predictions
        top_k_recs = [pred.iid for pred in sorted_predictions[:k]]
        effective_k = len(top_k_recs)

        # No candidates at all for this user
        if effective_k == 0:
            continue

        # Hits: recommended baskets that are also in the test ground truth
        true_positives = len(user_positive_test_baskets.intersection(top_k_recs))

        # Precision = hits / recommended, recall = hits / relevant
        precision = true_positives / effective_k
        recall = true_positives / len(user_positive_test_baskets)

        # Harmonic mean of the two, defined as 0 when both are 0
        if (precision + recall) > 0:
            f1 = 2 * (precision * recall) / (precision + recall)
        else:
            f1 = 0

        precision_at_k[k].append(precision)
        recall_at_k[k].append(recall)
        f1_at_k[k].append(f1)

# Report the mean of each metric over the evaluated users, per cut-off
print("\nEvaluation Metrics for Top-K Recommendations:")
for k in K_VALUES:
    # An empty list (no user scored at this k) is reported as 0
    avg_precision = np.mean(precision_at_k[k]) if precision_at_k[k] else 0
    avg_recall = np.mean(recall_at_k[k]) if recall_at_k[k] else 0
    avg_f1 = np.mean(f1_at_k[k]) if f1_at_k[k] else 0

    print(f"\nMetrics for k={k}:")
    print(f"Precision@{k}: {avg_precision:.4f}")
    print(f"Recall@{k}: {avg_recall:.4f}")
    print(f"F1@{k}: {avg_f1:.4f}")

    print(f"Number of users evaluated: {len(precision_at_k[k])}")


Evaluation Metrics for Top-K Recommendations:

Metrics for k=1:
Precision@1: 0.0201
Recall@1: 0.0131
F1@1: 0.0154
Number of users evaluated: 994

Metrics for k=2:
Precision@2: 0.0201
Recall@2: 0.0272
F1@2: 0.0225
Number of users evaluated: 994

Metrics for k=3:
Precision@3: 0.0218
Recall@3: 0.0463
F1@3: 0.0289
Number of users evaluated: 994
