In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
import lightgbm as lgb
from sklearn.metrics import ndcg_score

In [15]:
# === Step 1: Load Data ===
ratings = pd.read_csv('ratings.csv',)
movies = pd.read_csv('movies.csv',)

In [16]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [13]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [17]:
# === Step 2: Preprocess genres ===
movies['genres_list'] = movies['genres'].str.split('|')
mlb = MultiLabelBinarizer()
genres_encoded = mlb.fit_transform(movies['genres_list'])
genres_df = pd.DataFrame(genres_encoded, columns=mlb.classes_)
movies = pd.concat([movies, genres_df], axis=1)
movies.head()

Unnamed: 0,movieId,title,genres,genres_list,(no genres listed),Action,Adventure,Animation,Children,Comedy,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[Adventure, Animation, Children, Comedy, Fantasy]",0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),Adventure|Children|Fantasy,"[Adventure, Children, Fantasy]",0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),Comedy|Romance,"[Comedy, Romance]",0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,"[Comedy, Drama, Romance]",0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),Comedy,[Comedy],0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# === Step 3: Merge datasets ===
df = pd.merge(ratings, movies, on='movieId')
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,genres_list,(no genres listed),Action,Adventure,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[Adventure, Animation, Children, Comedy, Fantasy]",0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance,"[Comedy, Romance]",0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller,"[Action, Crime, Thriller]",0,1,0,...,0,0,0,0,0,0,0,1,0,0
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,"[Mystery, Thriller]",0,0,0,...,0,0,0,0,1,0,0,1,0,0
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,"[Crime, Mystery, Thriller]",0,0,0,...,0,0,0,0,1,0,0,1,0,0


In [20]:
# === Step 4: Feature Engineering ===
# Create basic user/movie stats (optional)
user_stats = df.groupby('userId')['rating'].agg(['mean', 'count']).reset_index()
user_stats.columns = ['userId', 'user_mean_rating', 'user_rating_count']
df = df.merge(user_stats, on='userId')

In [21]:
movie_stats = df.groupby('movieId')['rating'].agg(['mean', 'count']).reset_index()
movie_stats.columns = ['movieId', 'movie_mean_rating', 'movie_rating_count']
df = df.merge(movie_stats, on='movieId')

In [22]:
# === Step 5: Prepare features and labels ===
features = ['user_mean_rating', 'user_rating_count', 
            'movie_mean_rating', 'movie_rating_count'] + list(mlb.classes_)

X = df[features]
y = df['rating']
group = df.groupby('userId').size().tolist()

In [28]:
# === Step 6: Group-Based Train/Test Split ===
unique_users = df['userId'].unique()
train_users, test_users = train_test_split(unique_users, test_size=0.2, random_state=42)

train_df = df[df['userId'].isin(train_users)]
test_df = df[df['userId'].isin(test_users)]

X_train = train_df[features]
y_train = train_df['rating'].apply(lambda x: int(x))
X_test = test_df[features]
y_test = test_df['rating'].apply(lambda x: int(x))

In [29]:
# Group info (number of items per user)
group_train = train_df.groupby('userId').size().tolist()
group_test = test_df.groupby('userId').size().tolist()

In [30]:
# === Step 7: Train LightGBM Ranker ===
lgb_train = lgb.Dataset(X_train, label=y_train, group=group_train)
params = {
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'ndcg_eval_at': [5, 10],
    'learning_rate': 0.05,
    'verbose': -1
}
model = lgb.train(params, lgb_train, num_boost_round=100)

In [31]:
# === Step 8: Evaluate ===
y_pred = model.predict(X_test)

# NDCG (simplified global computation; per-user is more accurate)
ndcg = ndcg_score([y_test], [y_pred])
print(f"NDCG Score: {ndcg:.4f}")

NDCG Score: 0.9864


In [33]:
# === Step 9: Predict Top-N Movies for a User ===
def recommend_movies(user_id, top_n=5):
    # Movies the user has rated
    rated_movie_ids = df[df['userId'] == user_id]['movieId'].unique()
    
    # Get all unseen movies
    unseen_movies = df[~df['movieId'].isin(rated_movie_ids)][['movieId'] + features].drop_duplicates('movieId')

    # Add user-specific stats to these movies
    user_row = df[df['userId'] == user_id].iloc[0]
    unseen_movies['userId'] = user_id
    unseen_movies['user_mean_rating'] = user_row['user_mean_rating']
    unseen_movies['user_rating_count'] = user_row['user_rating_count']

    # Predict
    X_unseen = unseen_movies[features]
    preds = model.predict(X_unseen)
    unseen_movies['predicted_rating'] = preds

    # Join with movie titles
    movie_titles = movies[['movieId', 'title']].drop_duplicates()
    unseen_movies = unseen_movies.merge(movie_titles, on='movieId', how='left')

    # Return top-N recommendations
    top_movies = unseen_movies.sort_values(by='predicted_rating', ascending=False)[['title', 'predicted_rating']].head(top_n)
    return top_movies


# Example:
print("Top recommendations for user 1:")
print(recommend_movies(1))

Top recommendations for user 1:
                                                  title  predicted_rating
9389             Love Exposure (Ai No Mukidashi) (2008)          4.814837
4212           Priklyucheniya Kapitana Vrungelya (1979)          4.769957
9344          Last Hurrah for Chivalry (Hao xia) (1979)          4.769957
7438  Battle Royale 2: Requiem (Batoru rowaiaru II: ...          4.769957
6954                                Wonder Woman (2009)          4.769957
