# In [None]:
# ============================================================
# Baseline & Correlation-based Collaborative Filtering
# ============================================================

import pandas as pd
import numpy as np
from tqdm import tqdm

# ============================================================
# 1. Load data
# ============================================================
# Read the training ratings first, then the (user, item) pairs to score.
train, test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

print("Train shape:", train.shape)
print("Test shape:", test.shape)
print(train.head())

# ============================================================
# 2. Baseline Methods
# ============================================================

# All three baselines are plain means of the training ratings.

# (a) Global mean predictor: one value for every (user, item) pair
global_mean = train['rating'].mean()
print(f"\nGlobal mean rating: {global_mean:.4f}")

# (b) User mean predictor: user_id -> mean of that user's training ratings
user_mean = train['rating'].groupby(train['user_id']).mean().to_dict()

# (c) Item mean predictor: item_id -> mean of that item's training ratings
item_mean = train['rating'].groupby(train['item_id']).mean().to_dict()

# Predict for test set with fallback priority: user mean → item mean → global mean
def baseline_predict(row):
    """Return the most specific mean rating available for one test row.

    ``row`` must support ``row['user_id']`` and ``row['item_id']``.
    The lookup order is: the user's mean rating, then the item's mean
    rating, and finally the global mean when neither id appears in the
    corresponding lookup table.
    """
    lookups = (
        (row['user_id'], user_mean),
        (row['item_id'], item_mean),
    )
    for key, means in lookups:
        if key in means:
            return means[key]
    return global_mean

# Score every test row with the user -> item -> global mean fallback chain
test['baseline_pred'] = test.apply(baseline_predict, axis='columns')

# ============================================================
# 3. Correlation-based Collaborative Filtering (User-based)
# ============================================================

# Users (rows) x items (columns) table of training ratings;
# (user, item) pairs without a rating are left as NaN
rating_matrix = pd.pivot_table(
    train,
    values='rating',
    index='user_id',
    columns='item_id',
)

# User-user Pearson correlation; corr() works column-wise, hence the
# transpose. Pairs with fewer than 5 overlapping items are left as NaN.
user_corr = rating_matrix.transpose().corr(method='pearson', min_periods=5)

def user_based_cf_predict(user, item, k=10):
    """Predict ``user``'s rating of ``item`` from similar users' ratings.

    The prediction is the similarity-weighted average of the ratings that
    the (at most) ``k`` most similar users gave to ``item``. Similarities
    are read from the module-level ``user_corr`` table and ratings from
    the module-level ``rating_matrix``.

    Only neighbours with a strictly positive correlation are used. The
    ratings are not mean-centred, so a negative weight would subtract a
    raw rating from the numerator while still enlarging the denominator,
    pushing the result outside the rating scale.

    Args:
        user: Row label of the target user in ``rating_matrix``.
        item: Column label of the target item in ``rating_matrix``.
        k: Maximum number of neighbours to average over.

    Returns:
        The predicted rating, or ``np.nan`` when the user or item is
        unknown, nobody rated the item, or no rater has a defined,
        positive correlation with the user.
    """
    if item not in rating_matrix.columns or user not in rating_matrix.index:
        return np.nan

    # Candidate neighbours: users with an observed rating for this item
    item_ratings = rating_matrix[item].dropna()
    if item_ratings.empty:
        return np.nan

    # Correlations with the target user; NaN means too little overlap
    sim_scores = user_corr.loc[user, item_ratings.index].dropna()
    # Drop zero and negative correlations so the result is a true weighted
    # average that stays within the range of the neighbours' ratings
    sim_scores = sim_scores[sim_scores > 0]
    if sim_scores.empty:
        return np.nan

    # Take top-k similar users
    top_k_users = sim_scores.nlargest(k)
    neighbour_ratings = item_ratings.loc[top_k_users.index]
    # All weights are > 0 here, so the denominator cannot be zero
    return np.dot(top_k_users, neighbour_ratings) / top_k_users.sum()


# Predict for test set
print("\nPredicting using user-based CF...")
# Walk the two id columns directly. iterrows() builds a Series per row and
# upcasts the ids to a common dtype (float once a float column exists).
user_cf_preds = [
    user_based_cf_predict(user_id, item_id)
    for user_id, item_id in tqdm(
        zip(test['user_id'], test['item_id']), total=len(test)
    )
]

test['user_cf_pred'] = user_cf_preds
# Fall back to the baseline where CF produced no prediction. Assign the
# result back: fillna(inplace=True) on a column selection is chained
# assignment and does not update `test` under pandas Copy-on-Write.
test['user_cf_pred'] = test['user_cf_pred'].fillna(test['baseline_pred'])


# ============================================================
# 4. Correlation-based Collaborative Filtering (Item-based)
# ============================================================

# Item-item Pearson correlation; pairs with fewer than 5 common raters
# are left as NaN
item_corr = rating_matrix.corr(min_periods=5)

def item_based_cf_predict(user, item, k=10):
    """Predict ``user``'s rating of ``item`` from the user's own ratings.

    The prediction is the similarity-weighted average of the user's
    ratings for the (at most) ``k`` items most similar to ``item``.
    Similarities are read from the module-level ``item_corr`` table and
    ratings from the module-level ``rating_matrix``.

    Only items with a strictly positive correlation are used. The ratings
    are not mean-centred, so a negative weight would subtract a raw rating
    from the numerator while still enlarging the denominator, pushing the
    result outside the rating scale.

    Args:
        user: Row label of the target user in ``rating_matrix``.
        item: Column label of the target item in ``rating_matrix``.
        k: Maximum number of similar items to average over.

    Returns:
        The predicted rating, or ``np.nan`` when the user or item is
        unknown, the user has no ratings, or none of the rated items has
        a defined, positive correlation with ``item``.
    """
    if item not in rating_matrix.columns or user not in rating_matrix.index:
        return np.nan

    # Get items this user has rated
    user_ratings = rating_matrix.loc[user].dropna()
    if user_ratings.empty:
        return np.nan

    # Correlations with the target item; NaN means too few common raters
    sim_scores = item_corr.loc[item, user_ratings.index].dropna()
    # Drop zero and negative correlations so the result is a true weighted
    # average that stays within the range of the user's own ratings
    sim_scores = sim_scores[sim_scores > 0]
    if sim_scores.empty:
        return np.nan

    # Top-k similar items
    top_k_items = sim_scores.nlargest(k)
    weighted_sum = np.dot(top_k_items, user_ratings[top_k_items.index])
    # All weights are > 0 here, so the denominator cannot be zero
    return weighted_sum / top_k_items.sum()


print("\nPredicting using item-based CF...")
# Walk the two id columns directly. iterrows() builds a Series per row and
# upcasts the ids to a common dtype (float once a float column exists).
item_cf_preds = [
    item_based_cf_predict(user_id, item_id)
    for user_id, item_id in tqdm(
        zip(test['user_id'], test['item_id']), total=len(test)
    )
]

test['item_cf_pred'] = item_cf_preds
# Fall back to the user-based prediction where item-based CF produced
# none. Assign the result back: fillna(inplace=True) on a column selection
# is chained assignment and does not update `test` under Copy-on-Write.
test['item_cf_pred'] = test['item_cf_pred'].fillna(test['user_cf_pred'])

# ============================================================
# 5. Final Ensemble & Submission
# ============================================================

# Weighted blend of the three predictors (not a plain average):
# 30% baseline, 35% user-based CF, 35% item-based CF
blend_weights = {
    'baseline_pred': 0.3,
    'user_cf_pred': 0.35,
    'item_cf_pred': 0.35,
}
test['final_pred'] = sum(
    weight * test[column] for column, weight in blend_weights.items()
)

# Submission layout: user_id, item_id, rating (the blended prediction)
submission = test[['user_id', 'item_id', 'final_pred']].rename(
    columns={'final_pred': 'rating'}
)

submission.to_csv("submission.csv", index=False)
print("\n✅ submission.csv saved successfully!")
print(submission.head())
