In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Load and preprocess data
ratings = pd.read_csv('../data/ratings_clean.csv')

# Split data into train/test sets
train_data, test_data = train_test_split(
    ratings,
    test_size=0.2,
    stratify=ratings['user_id'],
    random_state=42
)

# Create item-user matrix from training data
user_item_matrix = train_data.pivot_table(
    index='user_id',
    columns='movie_id',
    values='rating'
).fillna(0)
item_user_matrix = user_item_matrix.T  # Items as rows, users as columns

# Calculate item-item similarity matrix
item_similarity = cosine_similarity(item_user_matrix)

# Define prediction function with fallbacks (updated)
def predict_item_rating(user_id, movie_id, k=10):
    try:
        # Check if movie exists in training data
        if movie_id not in item_user_matrix.index:
            return np.nan
        
        # Get the matrix index of the movie
        movie_idx = item_user_matrix.index.get_loc(movie_id)
        
        # Get top-k similar items (excluding self)
        similar_items = np.argsort(item_similarity[movie_idx])[::-1][1:k+1]
        similar_movie_ids = item_user_matrix.index[similar_items]  # Actual movie IDs
        
        # Get the user's ratings for similar movies
        similar_ratings = user_item_matrix.loc[user_id, similar_movie_ids]
        
        # Filter out zero ratings
        valid_ratings = similar_ratings[similar_ratings != 0]
        
        # Fallback 1: Average of similar items' ratings
        if len(valid_ratings) > 0:
            return valid_ratings.mean()
        
        # Fallback 2: Item's average rating
        item_avg = item_user_matrix.loc[movie_id].mean()
        if not np.isnan(item_avg):
            return item_avg
        
        # Fallback 3: User's average rating
        user_avg = user_item_matrix.loc[user_id].mean()
        if not np.isnan(user_avg):
            return user_avg
        
        # Fallback 4: Global average rating
        return train_data['rating'].mean()
    
    except KeyError:
        return np.nan

# Filter test data to include only known users/items
valid_test_data = test_data[
    (test_data['user_id'].isin(user_item_matrix.index)) &
    (test_data['movie_id'].isin(item_user_matrix.index))
]

print(f"Testing on {len(valid_test_data)} valid samples")

# Generate predictions
actual = []
predicted = []

for _, row in valid_test_data.iterrows():
    user_id = row['user_id']
    movie_id = row['movie_id']
    true_rating = row['rating']
    
    pred_rating = predict_item_rating(user_id, movie_id)
    
    if not np.isnan(pred_rating):
        actual.append(true_rating)
        predicted.append(pred_rating)
    else:
        print(f"Skipped prediction for user {user_id}, movie {movie_id}")

# Calculate metrics
if len(actual) > 0:
    rmse = np.sqrt(mean_squared_error(actual, predicted))
    mae = mean_absolute_error(actual, predicted)
    print(f"\nItem-Based Collaborative Filtering RMSE: {rmse:.2f}")
    print(f"Item-Based Collaborative Filtering MAE: {mae:.2f}")
else:
    print("Error: No valid predictions generated")

# Sample predictions
print("\nSample Predictions:")
for i in range(min(5, len(actual))):
    print(f"User {valid_test_data.iloc[i]['user_id']} -> Movie {valid_test_data.iloc[i]['movie_id']}: "
          f"Predicted {predicted[i]:.1f} vs Actual {actual[i]}")

Testing on 19972 valid samples

Item-Based Collaborative Filtering RMSE: 1.33
Item-Based Collaborative Filtering MAE: 0.94

Sample Predictions:
User 101 -> Movie 1057: Predicted 3.0 vs Actual 2
User 608 -> Movie 448: Predicted 5.0 vs Actual 5
User 230 -> Movie 228: Predicted 3.5 vs Actual 2
User 822 -> Movie 539: Predicted 4.0 vs Actual 2
User 290 -> Movie 651: Predicted 4.6 vs Actual 3
