In [1]:
pip install recommenders

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from surprise import SVD, Reader, Dataset
from surprise.model_selection import train_test_split
from recommenders.evaluation.python_evaluation import (
    rmse, mae, rsquared, exp_var,
    map_at_k, ndcg_at_k, precision_at_k, recall_at_k
)
from recommenders.models.surprise.surprise_utils import compute_ranking_predictions
import random
import gc

# Read the raw ratings table from the Kaggle input directory.
ratings = pd.read_csv('/kaggle/input/the-movies-dataset/ratings.csv')

# Fail fast if any column the rest of the notebook relies on is absent.
required_columns = ['userId', 'movieId', 'rating']
if not set(required_columns).issubset(ratings.columns):
    raise ValueError("CSV must contain columns: userId, movieId, rating")

# Switch to the userID / itemID column names that every later cell uses.
column_aliases = {'userId': 'userID', 'movieId': 'itemID'}
ratings = ratings.rename(columns=column_aliases)

In [3]:
# Data Exploration: headline counts for the ratings table.
summary_lines = [
    "\n Dataset Summary:",
    f"Total ratings: {len(ratings):,}",
    f"Unique users: {ratings['userID'].nunique():,}",
    f"Unique movies: {ratings['itemID'].nunique():,}",
]
print("\n".join(summary_lines))

# Rating-distribution plot (currently disabled):
# plt.figure(figsize=(10, 5))
# sns.countplot(x='rating', data=ratings, palette='viridis')
# plt.title('Rating Distribution')
# plt.xlabel('Rating Score')
# plt.ylabel('Count')
# plt.show()


 Dataset Summary:
Total ratings: 26,024,289
Unique users: 270,896
Unique movies: 45,115


In [4]:
# Prepare Surprise Dataset
# Ratings are declared to Surprise as lying on a 0.5 to 5 scale.
reader = Reader(rating_scale=(0.5, 5))

# Wrap the (user, item, rating) triples in a Surprise Dataset.
surprise_columns = ['userID', 'itemID', 'rating']
data = Dataset.load_from_df(ratings[surprise_columns], reader)

# Hold out 20% of the ratings for evaluation; the fixed seed keeps the split stable.
train, test = train_test_split(data, random_state=42, test_size=0.2)

In [5]:
# Model Training
# Hyperparameters for the Surprise SVD model, gathered in one place.
svd_params = dict(
    n_factors=100,
    n_epochs=20,
    lr_all=0.005,
    reg_all=0.02,
    random_state=42,
)

print("\n Training SVD model...")
model = SVD(**svd_params)
model.fit(train)

# Score every held-out rating and tabulate the resulting prediction records
# (the later cells read the uid / iid / r_ui / est columns of this frame).
print("\n Generating predictions...")
test_pred = model.test(test)
test_df = pd.DataFrame(test_pred)



 Training SVD model...

 Generating predictions...


In [6]:
TOP_K = 10
SAMPLE_SIZE = 1000
SAMPLE_SEED = 42  # fixed seed so the sampled users (and hence the ranking metrics) are reproducible

# Rebuild the training ratings as a DataFrame keyed by raw IDs.
# Inner -> raw IDs are resolved once per distinct user/item and then mapped onto
# the rows, rather than calling back into the trainset for every single rating.
train_df = pd.DataFrame(train.all_ratings(), columns=['uid_inner', 'iid_inner', 'rating'])
raw_uid_by_inner = {inner: train.to_raw_uid(inner) for inner in train.all_users()}
raw_iid_by_inner = {inner: train.to_raw_iid(inner) for inner in train.all_items()}
train_df['userID'] = train_df['uid_inner'].map(raw_uid_by_inner)
train_df['itemID'] = train_df['iid_inner'].map(raw_iid_by_inner)

# Sample users for ranking metrics.
# A dedicated Random instance keeps the draw reproducible without touching the
# global random state; the sample size is capped so small datasets do not raise.
all_users = train_df['userID'].unique()
sampled_users = random.Random(SAMPLE_SEED).sample(
    list(all_users), min(SAMPLE_SIZE, len(all_users))
)
train_sample = train_df[train_df['userID'].isin(sampled_users)]

# Held-out predictions restricted to the sampled users.
test_sample_df = test_df[test_df['uid'].isin(sampled_users)]

# Ground-truth ratings in the column layout the evaluation functions are called with.
rating_true = test_sample_df[['uid', 'iid', 'r_ui']].rename(columns={
    'uid': 'userID',
    'iid': 'itemID',
    'r_ui': 'rating'
})

# Model estimates for the same (user, item) pairs.
rating_pred = test_sample_df[['uid', 'iid', 'est']].rename(columns={
    'uid': 'userID',
    'iid': 'itemID',
    'est': 'prediction'
})

# Generate top-K ranking predictions (remove_seen=True is requested so pairs
# already present in the training sample are excluded).
print("\nGenerating top-K ranking predictions for sampled users...")
ranking_pred = compute_ranking_predictions(
    model,
    train_sample,
    usercol="userID",
    itemcol="itemID",
    remove_seen=True
)

# Evaluate metrics. Ranking-metric labels are derived from TOP_K so they can
# never disagree with the k that is actually passed to the metric functions.
print("\nEvaluating performance...")
metrics = {
    'RMSE': rmse(rating_true, rating_pred,
                 col_user='userID', col_item='itemID',
                 col_rating='rating', col_prediction='prediction'),

    'R²': rsquared(rating_true, rating_pred,
                   col_user='userID', col_item='itemID',
                   col_rating='rating', col_prediction='prediction'),

    f'MAP@{TOP_K}': map_at_k(rating_true, ranking_pred, k=TOP_K,
                             col_user='userID', col_item='itemID',
                             col_prediction='prediction'),

    f'NDCG@{TOP_K}': ndcg_at_k(rating_true, ranking_pred, k=TOP_K,
                               col_user='userID', col_item='itemID',
                               col_rating='rating', col_prediction='prediction'),

    f'Precision@{TOP_K}': precision_at_k(rating_true, ranking_pred, k=TOP_K,
                                         col_user='userID', col_item='itemID',
                                         col_prediction='prediction'),

    f'Recall@{TOP_K}': recall_at_k(rating_true, ranking_pred, k=TOP_K,
                                   col_user='userID', col_item='itemID',
                                   col_prediction='prediction')
}

for metric, value in metrics.items():
    print(f"{metric}: {value:.4f}")


Generating top-K ranking predictions for sampled users...

Evaluating performance...


  result = getattr(ufunc, method)(*inputs, **kwargs)


RMSE: 0.7971
R²: 0.4295
MAP@10: 0.0313
NDCG@10: 0.0654
Precision@10: 0.0526
Recall@10: 0.0335


In [7]:
def generate_topk_recommendations(model, trainset, all_items, user_list, k=10):
    """
    Generate top-k unseen item predictions for given users.

    Parameters
    ----------
    model : object
        Fitted model exposing ``predict(uid, iid)``; the returned object must
        carry ``uid``, ``iid`` and ``est`` attributes.
    trainset : object
        Training set the model was fitted on. Only ``to_inner_uid``, ``ur`` and
        ``to_raw_iid`` are used, to find the items each user has already rated.
    all_items : iterable
        Raw item IDs that are candidates for recommendation.
    user_list : iterable
        Raw user IDs to produce recommendations for.
    k : int, default 10
        Maximum number of recommendations per user.

    Returns
    -------
    pandas.DataFrame
        Columns ``uid``, ``iid``, ``est``; up to ``k`` rows per user, ordered by
        descending estimate within each user. The columns are present even
        when no rows are produced.
    """
    from heapq import nlargest

    columns = ['uid', 'iid', 'est']
    top_k_rows = []
    for uid in user_list:
        # Look up only this user's training ratings instead of materialising
        # the whole training set. Users absent from the trainset have nothing
        # "seen", so every candidate item stays eligible for them.
        try:
            inner_uid = trainset.to_inner_uid(uid)
        except ValueError:
            seen = set()
        else:
            seen = {
                trainset.to_raw_iid(inner_iid)
                for inner_iid, _ in trainset.ur[inner_uid]
            }

        candidate_preds = (
            model.predict(uid, iid) for iid in all_items if iid not in seen
        )
        # nlargest keeps only the k best estimates rather than sorting them all.
        for pred in nlargest(k, candidate_preds, key=lambda p: p.est):
            top_k_rows.append({
                'uid': pred.uid,
                'iid': pred.iid,
                'est': pred.est
            })

    # Explicit columns keep the frame's schema stable for empty results.
    return pd.DataFrame(top_k_rows, columns=columns)
    
# Pick one user to showcase; the fixed random_state makes the same user (and
# therefore the same printed recommendations) come out on every run.
sample_user = ratings['userID'].sample(1, random_state=42).values[0]

# Candidate pool: every distinct item that appears in the ratings table.
all_items = ratings['itemID'].unique()

top_k_df = generate_topk_recommendations(model, train, all_items, [sample_user], k=5)

print(f"\nSample Recommendations for User {sample_user}:")
print(top_k_df[['iid', 'est']].rename(columns={
    'iid': 'MovieID', 
    'est': 'Predicted Rating'
}).to_string(index=False))



Sample Recommendations for User 72209:
 MovieID  Predicted Rating
     296          5.000000
     858          4.882338
     111          4.753289
    2858          4.714357
     593          4.690136
