# Collaborative Filtering

In [4]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader, SVD  # Import SVD instead of KNNBasic
from surprise.model_selection import KFold as SurpriseKFold
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score
import time

print("--- Base Model 1: CF (SVD - Model-Based) + Evaluation ---")

# --- Step 0: Load Preprocessed Data ---
# Every later step reads `df_master`, so a missing input file must stop the
# run here. Printing and carrying on would only fail further down with a
# NameError that hides the real cause.
filename = 'df_master_preprocessed.csv'
try:
    df_master = pd.read_csv(filename)
except FileNotFoundError:
    print(f"Error: '{filename}' not found. Please run the preprocessing notebook first.")
    raise
else:
    print(f"'{filename}' loaded successfully. Shape: {df_master.shape}")

# --- Step 1: Load data into Surprise format ---
print("1. Loading data into Surprise format (this may take a moment)...")
# The three columns handed to Surprise: user id, item id, rating.
rating_columns = ['username', 'beer_id', 'score']
# Ratings are declared to lie on a 0-5 scale.
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(df_master[rating_columns], reader)

# --- Step 2: Define CF Model (SVD - Model-Based) ---
# This is the 'model-based' CF from the proposal. Memory efficient.
print("2. Defining CF Model (SVD - Model-Based)...")
svd_params = {
    'n_factors': 50,      # Using 50 factors
    'n_epochs': 20,
    'random_state': 42,   # fixed seed
    'verbose': False,
}
cf_model = SVD(**svd_params)

# --- Step 3: K-Fold Stacking and Performance Evaluation ---
# For every split the model is fitted on the training part and asked to
# predict the hold-out part. Those hold-out predictions are collected in
# `all_predictions` (the stacking feature) and scored per fold into
# `fold_metrics`.
N_SPLITS = 5          # number of cross-validation folds
LIKE_THRESHOLD = 4.0  # a score at or above this value counts as a "like"

print(f"3. Starting K-Fold Stacking and evaluation (n_splits={N_SPLITS})...")
start_time = time.time()
all_predictions = []  # To store stacking predictions
fold_metrics = []     # To store performance metrics per fold

kf = SurpriseKFold(n_splits=N_SPLITS, random_state=42)
for fold_no, (trainset, testset) in enumerate(kf.split(data), start=1):
    print(f"Processing Fold {fold_no}/{N_SPLITS}...")

    # 3a. Train on this split's training part.
    cf_model.fit(trainset)

    # 3b. Predict on the 'testset' (hold-out fold) and keep the predictions.
    fold_predictions = cf_model.test(testset)
    all_predictions.extend(fold_predictions)

    # 3c. Performance evaluation.
    # Pull actual and estimated ratings into arrays once, then derive the
    # binary labels with vectorised comparisons instead of per-row appends.
    n_preds = len(fold_predictions)
    actual_scores = np.fromiter(
        (pred.r_ui for pred in fold_predictions), dtype=float, count=n_preds
    )
    y_pred_score = np.fromiter(
        (pred.est for pred in fold_predictions), dtype=float, count=n_preds
    )  # For AUC (used like a probability)
    y_true = (actual_scores >= LIKE_THRESHOLD).astype(int)        # Actual 'is_like'
    y_pred_label = (y_pred_score >= LIKE_THRESHOLD).astype(int)   # Predicted 'is_like'

    # Calculate performance metrics as per the proposal.
    auc = roc_auc_score(y_true, y_pred_score)
    f1 = f1_score(y_true, y_pred_label)
    precision = precision_score(y_true, y_pred_label)
    recall = recall_score(y_true, y_pred_label)

    fold_metrics.append(
        {'Fold': fold_no, 'AUC': auc, 'F1': f1, 'Precision': precision, 'Recall': recall}
    )
    print(f"Fold {fold_no} - AUC: {auc:.4f}, F1: {f1:.4f}")

print(f"K-Fold CF complete. Time taken: {(time.time() - start_time):.2f} sec")

# --- Step 3.5: Print average performance ---
print("\n--- Base Model 1 (CF - SVD) Performance ---")
metrics_df = pd.DataFrame(fold_metrics)
# 'Fold' is only a row label. Averaging it would print a meaningless number
# alongside the real metrics, so it is left out of the summary.
# errors='ignore' keeps this safe if no fold metrics were recorded.
print(metrics_df.drop(columns=['Fold'], errors='ignore').mean(numeric_only=True))

# --- Step 4: Convert predictions to DataFrame (for Stacking) ---
print("\n4. Converting predictions to DataFrame...")
# Walk the predictions once, filling one column list per output field.
pred_data = {'username': [], 'beer_id': [], 'cf_predicted_score': []}
user_col = pred_data['username']
item_col = pred_data['beer_id']
score_col = pred_data['cf_predicted_score']
for prediction in all_predictions:
    user_col.append(prediction.uid)
    item_col.append(prediction.iid)
    score_col.append(prediction.est)
cf_predictions_df = pd.DataFrame(pred_data)

# --- Step 5: Merge CF predictions back into df_master ---
print("5. Merging CF predictions into df_master...")
merge_keys = ['username', 'beer_id']

# Drop a stale feature column from an earlier run so the merge does not
# produce suffixed duplicates (_x / _y).
df_master = df_master.drop(columns=['cf_predicted_score'], errors='ignore')

# Both frames hold one row per rating. If a (username, beer_id) pair occurs
# k times, a plain left merge would emit k*k rows for it and silently inflate
# df_master. Collapse the predictions to one row per pair first (mean of the
# estimates) so every df_master row receives exactly one value.
cf_features = cf_predictions_df
if cf_features.duplicated(subset=merge_keys).any():
    cf_features = cf_features.groupby(
        merge_keys, as_index=False, sort=False, dropna=False
    )['cf_predicted_score'].mean()

# validate='many_to_one' makes pandas raise if the right side still has
# repeated keys, instead of quietly multiplying rows.
df_master = df_master.merge(
    cf_features, on=merge_keys, how='left', validate='many_to_one'
)

# Write the table, now carrying the CF feature column, to disk.
output_path = 'df_master_with_cf.csv'
df_master.to_csv(output_path, index=False)

print("\n--- CF Feature Generation Complete ---")
# Show the first rows of the key columns next to the new feature.
preview_columns = ['username', 'beer_id', 'score', 'cf_predicted_score']
print(df_master[preview_columns].head())

--- Base Model 1: CF (SVD - Model-Based) + Evaluation ---
'df_master_preprocessed.csv' loaded successfully. Shape: (8417033, 35)
1. Loading data into Surprise format (this may take a moment)...
2. Defining CF Model (SVD - Model-Based)...
3. Starting K-Fold Stacking and evaluation (n_splits=5)...
Processing Fold 1/5...
Fold 1 - AUC: 0.8573, F1: 0.7530
Processing Fold 2/5...
Fold 2 - AUC: 0.8574, F1: 0.7528
Processing Fold 3/5...
Fold 3 - AUC: 0.8570, F1: 0.7529
Processing Fold 4/5...
Fold 4 - AUC: 0.8573, F1: 0.7536
Processing Fold 5/5...
Fold 5 - AUC: 0.8572, F1: 0.7531
K-Fold CF complete. Time taken: 488.73 sec

--- Base Model 1 (CF - SVD) Performance ---
Fold         3.000000
AUC          0.857248
F1           0.753091
Precision    0.843583
Recall       0.680133
dtype: float64

4. Converting predictions to DataFrame...
5. Merging CF predictions into df_master...

--- CF Feature Generation Complete ---
          username  beer_id  score  cf_predicted_score
0     bluejacket74   271781 