In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Load and preprocess data
ratings = pd.read_csv('../data/ratings_clean.csv')

# Split data into train/test sets
train_data, test_data = train_test_split(
    ratings,
    test_size=0.2,
    stratify=ratings['user_id'],
    random_state=42
)

# Create user-item matrix from training data
user_item_matrix = train_data.pivot_table(
    index='user_id',
    columns='movie_id',
    values='rating'
).fillna(0)

# Calculate user-user similarity matrix
user_similarity = cosine_similarity(user_item_matrix)

# Define prediction function with fallback strategies
def predict_user_rating(user_id, movie_id, k=10):
    try:
        # Check if user exists in training data
        if user_id not in user_item_matrix.index:
            return np.nan
        
        # Get top-k similar users (excluding self)
        similar_users = np.argsort(user_similarity[user_id-1])[::-1][1:k+1]
        similar_ratings = user_item_matrix.iloc[similar_users][movie_id]
        
        # Filter out zero ratings
        valid_ratings = similar_ratings[similar_ratings != 0]
        
        # Fallback 1: Average of similar users' ratings
        if len(valid_ratings) > 0:
            return valid_ratings.mean()
        
        # Fallback 2: User's average rating
        user_avg = user_item_matrix.loc[user_id].mean()
        if not np.isnan(user_avg):
            return user_avg
        
        # Fallback 3: Global average rating
        return train_data['rating'].mean()
    
    except KeyError:
        return np.nan

# Filter test data to only include known users/movies
valid_test_data = test_data[
    (test_data['user_id'].isin(user_item_matrix.index)) & 
    (test_data['movie_id'].isin(user_item_matrix.columns))
]

print(f"Testing on {len(valid_test_data)} valid samples")

# Generate predictions
actual = []
predicted = []

for _, row in valid_test_data.iterrows():
    user_id = row['user_id']
    movie_id = row['movie_id']
    true_rating = row['rating']
    
    pred_rating = predict_user_rating(user_id, movie_id)
    
    if not np.isnan(pred_rating):
        actual.append(true_rating)
        predicted.append(pred_rating)
    else:
        print(f"Skipped prediction for user {user_id}, movie {movie_id}")

# Calculate evaluation metrics
if len(actual) > 0:
    rmse = np.sqrt(mean_squared_error(actual, predicted))
    mae = mean_absolute_error(actual, predicted)
    
    print("\nEvaluation Results:")
    print(f"User-Based Collaborative Filtering RMSE: {rmse:.2f}")
    print(f"User-Based Collaborative Filtering MAE: {mae:.2f}")
else:
    print("Error: No valid predictions generated")

# Optional: Show sample predictions
print("\nSample Predictions:")
for i in range(5):
    print(f"User {valid_test_data.iloc[i]['user_id']} -> Movie {valid_test_data.iloc[i]['movie_id']}: "
          f"Predicted {predicted[i]:.1f} vs Actual {actual[i]}")

Testing on 19972 valid samples

Evaluation Results:
User-Based Collaborative Filtering RMSE: 1.44
User-Based Collaborative Filtering MAE: 1.06

Sample Predictions:
User 101 -> Movie 1057: Predicted 0.1 vs Actual 2
User 608 -> Movie 448: Predicted 4.0 vs Actual 5
User 230 -> Movie 228: Predicted 3.8 vs Actual 2
User 822 -> Movie 539: Predicted 0.0 vs Actual 2
User 290 -> Movie 651: Predicted 4.7 vs Actual 3
