In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import warnings
from scipy.sparse import csr_matrix


In [2]:
df_movies = pd.read_csv("movies.csv")
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
df_ratings = pd.read_csv("ratings.csv")
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1247,192,3.0,842870972
1,2006,1198,1.0,1503224680
2,1301,1653,3.0,1171786660
3,572,2802,4.0,985384769
4,3212,427,1.0,847669137


In [4]:
df_tags = pd.read_csv("tags.csv")
df_tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,3,260,classic,1439472355
1,3,260,sci-fi,1439472256
2,4,1732,dark comedy,1573943598
3,4,1732,great dialogue,1573943604
4,4,7569,so bad it's good,1573943455


In [5]:
df_links = pd.read_csv("links.csv")
df_links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [6]:
#Merge ratings and movie
df_movie_ratings = pd.merge(df_ratings, df_movies, how="left", on="movieId")
df_movie_ratings.head()


Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1247,192,3.0,842870972,The Show (1995),Documentary
1,2006,1198,1.0,1503224680,Raiders of the Lost Ark (Indiana Jones and the...,Action|Adventure
2,1301,1653,3.0,1171786660,Gattaca (1997),Drama|Sci-Fi|Thriller
3,572,2802,4.0,985384769,Tequila Sunrise (1988),Action|Drama|Romance|Thriller
4,3212,427,1.0,847669137,Boxing Helena (1993),Drama|Mystery|Romance|Thriller


In [7]:
df_tags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1030396 entries, 0 to 1030395
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   userId     1030396 non-null  int64 
 1   movieId    1030396 non-null  int64 
 2   tag        1030380 non-null  object
 3   timestamp  1030396 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 31.4+ MB


In [8]:
df_tags.isnull().sum()

userId        0
movieId       0
tag          16
timestamp     0
dtype: int64

In [9]:
#clean to Tags and grouping
df_tags['tag'] = df_tags['tag'].astype(str).replace('nan', np.nan)

In [10]:
df_tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,3,260,classic,1439472355
1,3,260,sci-fi,1439472256
2,4,1732,dark comedy,1573943598
3,4,1732,great dialogue,1573943604
4,4,7569,so bad it's good,1573943455


In [11]:
df_movie_tags = (df_tags.dropna(subset=['tag']).groupby('movieId')['tag'].apply(lambda x: ' '.join(x))
    .reset_index()
)

In [12]:
df_movie_tags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28825 entries, 0 to 28824
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  28825 non-null  int64 
 1   tag      28825 non-null  object
dtypes: int64(1), object(1)
memory usage: 450.5+ KB


In [13]:
#create main dataframe
df_final = pd.merge(df_movie_ratings, df_movie_tags, how="left", on="movieId")
df_final = df_final.rename(columns={"tag": "all_tags"})

In [14]:
df_final.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,all_tags
0,1247,192,3.0,842870972,The Show (1995),Documentary,
1,2006,1198,1.0,1503224680,Raiders of the Lost Ark (Indiana Jones and the...,Action|Adventure,own imdb top 250 adventure comedy good versus ...
2,1301,1653,3.0,1171786660,Gattaca (1997),Drama|Sci-Fi|Thriller,dystopia dystopic future future futuristic gen...
3,572,2802,4.0,985384769,Tequila Sunrise (1988),Action|Drama|Romance|Thriller,Mel Gibson best friend california cop drug dea...
4,3212,427,1.0,847669137,Boxing Helena (1993),Drama|Mystery|Romance|Thriller,captivity car accident doctor dream kidnapping...


In [15]:
df_final.isnull().sum()

userId          0
movieId         0
rating          0
timestamp       0
title           0
genres          0
all_tags     9856
dtype: int64

In [16]:
#add tmdbID for nan tags
df_final = pd.merge(df_final, df_links[['movieId', 'tmdbId']], on='movieId', how='left')

In [17]:
df_final.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,all_tags,tmdbId
0,1247,192,3.0,842870972,The Show (1995),Documentary,,56088.0
1,2006,1198,1.0,1503224680,Raiders of the Lost Ark (Indiana Jones and the...,Action|Adventure,own imdb top 250 adventure comedy good versus ...,85.0
2,1301,1653,3.0,1171786660,Gattaca (1997),Drama|Sci-Fi|Thriller,dystopia dystopic future future futuristic gen...,782.0
3,572,2802,4.0,985384769,Tequila Sunrise (1988),Action|Drama|Romance|Thriller,Mel Gibson best friend california cop drug dea...,10396.0
4,3212,427,1.0,847669137,Boxing Helena (1993),Drama|Mystery|Romance|Thriller,captivity car accident doctor dream kidnapping...,18215.0


In [18]:
# finding nan tags
df_missing_info = df_final[df_final['all_tags'].isnull()].copy()
df_missing_info = df_missing_info[['movieId', 'title', 'tmdbId']].drop_duplicates(subset=['movieId'])
df_missing_info['tmdbId'] = df_missing_info['tmdbId'].fillna(-1).astype(int)
df_missing_for_api = df_missing_info[df_missing_info['tmdbId'] != -1]

In [19]:
"""This script fetches missing movie tag data from the TMDB API and fills empty `all_tags`
fields in the dataset to enrich content-based information for the hybrid recommender system.
"""
import time
import requests
import gc



# TMDB API 
API_KEY = "API_KEY"
BASE_URL = "https://api.themoviedb.org/3/movie/"

def fetch_tmdb_tags(tmdb_id, max_retries=3):
    
    if pd.isna(tmdb_id) or tmdb_id <= 0:
        return ""
    
    url = f"{BASE_URL}{int(tmdb_id)}/keywords?api_key={API_KEY}"
    
    for attempt in range(max_retries):
        try:
            response = requests.get(url, timeout=5)
            response.raise_for_status()
            data = response.json()
            keywords = [k['name'].lower().replace(' ', '_') for k in data.get('keywords', [])]
            return ' '.join(keywords)
        except:
            if attempt < max_retries - 1:
                time.sleep(1)
            else:
                return ""

# find movies with missing tags
df_missing_info = df_final[df_final['all_tags'].isnull() | (df_final['all_tags'] == '')].copy()
df_missing_for_api = df_missing_info[['movieId', 'tmdbId']].drop_duplicates(subset=['movieId']).reset_index(drop=True)


df_missing_for_api['tmdbId'] = pd.to_numeric(df_missing_for_api['tmdbId'], errors='coerce').fillna(-1).astype(int)
df_missing_for_api = df_missing_for_api[df_missing_for_api['tmdbId'] != -1]

print(f"Movies with missing tags (TMDB available): {len(df_missing_for_api)}")

fetched_tags_list = []
BATCH_SIZE = 100
start_time = time.time()

for idx, row in df_missing_for_api.iterrows():
    tags = fetch_tmdb_tags(row['tmdbId'])
    fetched_tags_list.append(tags)
    
    if (idx + 1) % BATCH_SIZE == 0:
        elapsed = time.time() - start_time
        print(f"-> {idx + 1}/{len(df_missing_for_api)} movies processed from TMDB ({elapsed:.1f} s)")


df_missing_for_api['fetched_tags'] = fetched_tags_list


df_final['all_tags'] = df_final['all_tags'].fillna('')


tag_map = df_missing_for_api.set_index('movieId')['fetched_tags'].to_dict()


mask = (df_final['all_tags'] == '') | (df_final['all_tags'].isnull())


new_tags = df_final.loc[mask, 'movieId'].map(tag_map).fillna('')


df_final.loc[mask, 'all_tags'] = new_tags

del df_missing_info, df_missing_for_api, tag_map, fetched_tags_list, new_tags
gc.collect()

Movies with missing tags (TMDB available): 5367
-> 100/5367 movies processed from TMDB (16.6 s)
-> 200/5367 movies processed from TMDB (35.9 s)
-> 300/5367 movies processed from TMDB (57.2 s)
-> 400/5367 movies processed from TMDB (77.3 s)
-> 500/5367 movies processed from TMDB (91.7 s)
-> 600/5367 movies processed from TMDB (108.6 s)
-> 700/5367 movies processed from TMDB (125.7 s)
-> 800/5367 movies processed from TMDB (147.2 s)
-> 900/5367 movies processed from TMDB (162.4 s)
-> 1000/5367 movies processed from TMDB (178.8 s)
-> 1100/5367 movies processed from TMDB (197.5 s)
-> 1200/5367 movies processed from TMDB (223.4 s)
-> 1300/5367 movies processed from TMDB (236.4 s)
-> 1400/5367 movies processed from TMDB (257.8 s)
-> 1500/5367 movies processed from TMDB (284.5 s)
-> 1600/5367 movies processed from TMDB (305.4 s)
-> 1700/5367 movies processed from TMDB (338.6 s)
-> 1800/5367 movies processed from TMDB (370.2 s)
-> 1900/5367 movies processed from TMDB (401.3 s)
-> 2000/5367 mov

0

In [20]:
import nltk
try:
    from nltk.corpus import stopwords
    stop_words = set(stopwords.words('english'))
except:
    nltk.download('stopwords')
    nltk.download('punkt')
    from nltk.corpus import stopwords
    stop_words = set(stopwords.words('english'))

stop_words.update(['movie', 'film', 'based', 'story'])

def clean_text(text):
  
    if pd.isna(text) or text == '':
        return ""
    
    
    text = re.sub(r'[^a-z\s]', ' ', text.lower())
    
    # Tokenize ve filtring
    words = [w for w in text.split() if w not in stop_words and len(w) > 2]
    
    return " ".join(words)

# 
df_final["all_tags_cleaned"] = df_final["all_tags"].apply(clean_text)
df_final["genres_processed"] = df_final["genres"].str.replace("|", " ", regex=False)

# Content features (Title + Genres + Tags)
df_final["content_features"] = (
    df_final["title"] + " " +
    df_final["genres_processed"] + " " +
    df_final["all_tags_cleaned"]
)

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

#TF-IDF vectors
df_movies_content = (df_final[["movieId", "content_features"]].drop_duplicates(subset=["movieId"])
    .reset_index(drop=True)
)

In [22]:
tfidf = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
    ngram_range=(1, 2),
    min_df=2,  # Min 2 word in movie 
    max_df=0.8  # Remove words that appear in more than 80% of the movies
)

In [23]:
tfidf_matrix = tfidf.fit_transform(df_movies_content["content_features"])
tfidf_matrix.shape

(34216, 5000)

In [24]:
#cosine sim item-item for cb item -item 
from sklearn.metrics.pairwise import cosine_similarity


cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [25]:
# MovieId -> Index mapping
movie_to_index = pd.Series(df_movies_content.index, index=df_movies_content["movieId"])
df_final["tfidf_index"] = df_final["movieId"].map(movie_to_index)


In [26]:
#Train Test Split for data leage
from sklearn.model_selection import train_test_split

user_counts = df_final['userId'].value_counts()
valid_users = user_counts[user_counts >= 5].index
df_filtered = df_final[df_final['userId'].isin(valid_users)].copy()

print(f"Original data: {len(df_final)}")
print(f"Filtered data: {len(df_filtered)}")
print(f"Eliminated data: {len(df_final) - len(df_filtered)}")

# Stratified split
train_df, test_df = train_test_split(
    df_filtered,
    test_size=0.2,
    random_state=42,
    stratify=df_filtered['userId']
)
print(f"Train: {train_df.shape}, Test: {test_df.shape}")

Original data: 3000000
Filtered data: 2901799
Eliminated data: 98201
Train: (2321439, 12), Test: (580360, 12)


In [27]:
#Statistic for user and movie

user_stats = train_df.groupby('userId')['rating'].agg(['mean', 'count']).reset_index()
user_stats.columns = ['userId', 'user_avg_rating', 'user_rating_count']

movie_stats = train_df.groupby('movieId')['rating'].agg(['mean', 'count']).reset_index()
movie_stats.columns = ['movieId', 'movie_avg_rating', 'movie_rating_count']

# Global mean
global_mean = train_df['rating'].mean()

# 2. Train ve Test 
train_df = train_df.merge(user_stats, on='userId', how='left')
train_df = train_df.merge(movie_stats, on='movieId', how='left')

test_df = test_df.merge(user_stats, on='userId', how='left')
test_df = test_df.merge(movie_stats, on='movieId', how='left')


# User stat
train_df['user_avg_rating'] = train_df['user_avg_rating'].fillna(global_mean)
test_df['user_avg_rating'] = test_df['user_avg_rating'].fillna(global_mean)


#movie stat
train_df['movie_avg_rating'] = train_df['movie_avg_rating'].fillna(global_mean)
test_df['movie_avg_rating'] = test_df['movie_avg_rating'].fillna(global_mean)



In [28]:
#SCB (Semantic Content-Based) 
"""
This section builds personalized user profiles for the Semantic Content-Based (SCB) model
by aggregating TF-IDF movie vectors weighted by users' normalized ratings.
"""

train_user_means = train_df.groupby('userId')['rating'].mean()
user_profiles = {}

for user_id, grp in train_df.groupby('userId'):
    idx = grp['tfidf_index'].dropna().astype(int).values
    
    if len(idx) == 0:
        continue
    
    u_mean = train_user_means.loc[user_id]
    
    # Normalize
    ratings_norm = (grp.loc[grp['tfidf_index'].notna(), 'rating'].values - u_mean) / 4.5
    ratings_norm = ratings_norm.reshape(-1, 1)
    
    movie_vecs = tfidf_matrix[idx]
    user_vec = movie_vecs.multiply(ratings_norm).sum(axis=0)
    
    # Normalize
    vec_norm = np.linalg.norm(user_vec)
    if vec_norm > 0:
        user_vec = user_vec / vec_norm
    
    user_profiles[user_id] = np.asarray(user_vec).flatten()

print(f"{len(user_profiles)} user profile created for training data")

102699 user profile created for training data


In [29]:
def compute_scb_scores(df_part, profiles, tfidf_mat, batch_size=5000):
    """Content-Based"""
    scb_scores = []
    total_len = len(df_part)
    feature_dim = tfidf_mat.shape[1]
    
    for start in range(0, total_len, batch_size):
        end = min(start + batch_size, total_len)
        batch = df_part.iloc[start:end]
        
        user_ids = batch['userId'].values
        movie_indices = batch['tfidf_index'].fillna(-1).astype(int).values
        
        #Create user vectors
        u_batch = []
        for u in user_ids:
            if u in profiles:
                u_batch.append(profiles[u])
            else:
                u_batch.append(np.zeros(feature_dim))
        u_batch = np.array(u_batch)
        
        # create movie vector
        valid_mask = movie_indices >= 0
        m_batch = np.zeros((len(movie_indices), feature_dim))
        if valid_mask.any():
            m_batch[valid_mask] = tfidf_mat[movie_indices[valid_mask]].toarray()
        
        # Cosine sim
        u_norms = np.linalg.norm(u_batch, axis=1, keepdims=True)
        u_norms[u_norms == 0] = 1.0
        m_norms = np.linalg.norm(m_batch, axis=1, keepdims=True)
        m_norms[m_norms == 0] = 1.0
        
        u_batch_norm = u_batch / u_norms
        m_batch_norm = m_batch / m_norms
        scores = np.sum(u_batch_norm * m_batch_norm, axis=1)
        scb_scores.extend(scores)
        
        if (end % (batch_size * 5)) == 0:
            print(f"-> {end}/{total_len}")
    
    return scb_scores


train_df['SCB_score'] = compute_scb_scores(train_df, user_profiles, tfidf_matrix)


test_df['SCB_score'] = compute_scb_scores(test_df, user_profiles, tfidf_matrix)


-> 25000/2321439
-> 50000/2321439
-> 75000/2321439
-> 100000/2321439
-> 125000/2321439
-> 150000/2321439
-> 175000/2321439
-> 200000/2321439
-> 225000/2321439
-> 250000/2321439
-> 275000/2321439
-> 300000/2321439
-> 325000/2321439
-> 350000/2321439
-> 375000/2321439
-> 400000/2321439
-> 425000/2321439
-> 450000/2321439
-> 475000/2321439
-> 500000/2321439
-> 525000/2321439
-> 550000/2321439
-> 575000/2321439
-> 600000/2321439
-> 625000/2321439
-> 650000/2321439
-> 675000/2321439
-> 700000/2321439
-> 725000/2321439
-> 750000/2321439
-> 775000/2321439
-> 800000/2321439
-> 825000/2321439
-> 850000/2321439
-> 875000/2321439
-> 900000/2321439
-> 925000/2321439
-> 950000/2321439
-> 975000/2321439
-> 1000000/2321439
-> 1025000/2321439
-> 1050000/2321439
-> 1075000/2321439
-> 1100000/2321439
-> 1125000/2321439
-> 1150000/2321439
-> 1175000/2321439
-> 1200000/2321439
-> 1225000/2321439
-> 1250000/2321439
-> 1275000/2321439
-> 1300000/2321439
-> 1325000/2321439
-> 1350000/2321439
-> 1375000/23214

In [31]:
#Collaborative Filtering
"""
This section applies Collaborative Filtering using an SVD-based model to learn latent
user–item interactions and generate predicted ratings for both training and test data.
"""

!pip install scikit-surprise
!conda install -c conda-forge scikit-surprise -y
from surprise import SVD, Dataset, Reader

reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(train_df[['userId', 'movieId', 'rating']], reader)
trainset = data.build_full_trainset()

svd_model = SVD(
    n_factors=150,  # Latent fak
    n_epochs=30,
    lr_all=0.005,
    reg_all=0.02,
    random_state=42
)
svd_model.fit(trainset)



# SVD predicts
def predict_cf(row):
    return svd_model.predict(row['userId'], row['movieId']).est

train_df['CF_score'] = train_df.apply(predict_cf, axis=1)

test_df['CF_score'] = test_df.apply(predict_cf, axis=1)

zsh:1: command not found: conda


In [37]:
from xgboost import XGBRegressor

In [38]:
#Features for XGboost 
features = [
    'SCB_score', 
    'CF_score', 
    'user_avg_rating', 
    'movie_avg_rating',
]

In [39]:
available_features = [f for f in features if f in train_df.columns]
X_train = train_df[available_features].fillna(0)
y_train = train_df['rating']
X_test = test_df[available_features].fillna(0)
y_test = test_df['rating']


In [40]:
xgb_normal = XGBRegressor(
    n_estimators=3000, 
    learning_rate=0.01, 
    max_depth=4, 
    min_child_weight=6, 
    subsample=0.6, 
    colsample_bytree=0.6,
    reg_alpha=0.1, 
    reg_lambda=1.5, 
    random_state=42,
    n_jobs=-1,
    tree_method='hist',
    early_stopping_rounds=50
)
xgb_normal.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=0)

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.6
,device,
,early_stopping_rounds,50
,enable_categorical,False


In [41]:
from sklearn.model_selection import RandomizedSearchCV


param_grid = {
    'n_estimators': [2000, 3000],
    'learning_rate': [0.008, 0.01, 0.012],
    'max_depth': [3, 4],             
    'min_child_weight': [5, 6, 7],
    'subsample': [0.6, 0.7],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'reg_alpha': [0.1, 0.5],
    'reg_lambda': [1.5, 2.0]
}

xgb_base = XGBRegressor(random_state=42, tree_method='hist', n_jobs=-1)

random_search = RandomizedSearchCV(
    estimator=xgb_base, 
    param_distributions=param_grid,
    n_iter=15, 
    scoring='neg_root_mean_squared_error',
    cv=3, 
    verbose=1, 
    random_state=42, 
    n_jobs=-1
)
random_search.fit(X_train, y_train)

print(f"Best Params: {random_search.best_params_}")

# Final Tuned Modele
best_params = random_search.best_params_

best_params['n_estimators'] = 5000 

xgb_tuned = XGBRegressor(
    **best_params, 
    
    random_state=42,
    n_jobs=-1, 
    tree_method='hist', 
    early_stopping_rounds=50
)


xgb_tuned.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=0)

Fitting 3 folds for each of 15 candidates, totalling 45 fits
Best Params: {'subsample': 0.7, 'reg_lambda': 2.0, 'reg_alpha': 0.1, 'n_estimators': 3000, 'min_child_weight': 5, 'max_depth': 4, 'learning_rate': 0.008, 'colsample_bytree': 1.0}


0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,1.0
,device,
,early_stopping_rounds,50
,enable_categorical,False


In [42]:
# Make predictions for each model
y_train_pred_tuned = xgb_tuned.predict(X_train)
y_test_pred_tuned = xgb_tuned.predict(X_test)

y_train_pred_normal = xgb_normal.predict(X_train)
y_test_pred_normal = xgb_normal.predict(X_test)

y_train_pred_svd = X_train['CF_score']
y_test_pred_svd = X_test['CF_score']


In [43]:
#metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def get_metrics(y_true, y_pred):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return rmse, mae, r2


In [44]:
models_data = [
    ("Hybrid (Standard)", y_train_pred_normal, y_test_pred_normal),
    ("Hybrid (Tuned)", y_train_pred_tuned, y_test_pred_tuned),
    ("SVD (Baseline)", y_train_pred_svd, y_test_pred_svd),
]

results = []

for name, y_tr, y_te in models_data:
    tr_rmse, _, _ = get_metrics(y_train, y_tr)
    te_rmse, te_mae, te_r2 = get_metrics(y_test, y_te)
    
    results.append({
        "Model": name,
        "Train RMSE": tr_rmse,
        "Test RMSE": te_rmse,
        "Test MAE": te_mae,
        "Test R2": te_r2
    })

df_metrics = pd.DataFrame(results).sort_values(by="Test RMSE")
df_metrics


Unnamed: 0,Model,Train RMSE,Test RMSE,Test MAE,Test R2
0,Hybrid (Standard),0.366097,0.868945,0.665801,0.325409
1,Hybrid (Tuned),0.364038,0.877916,0.672544,0.311409
2,SVD (Baseline),0.452673,0.885549,0.677185,0.299383


In [45]:
print("Final model comparison:")
print(df_metrics.round(4))

Final model comparison:
               Model  Train RMSE  Test RMSE  Test MAE  Test R2
0  Hybrid (Standard)      0.3661     0.8689    0.6658   0.3254
1     Hybrid (Tuned)      0.3640     0.8779    0.6725   0.3114
2     SVD (Baseline)      0.4527     0.8855    0.6772   0.2994


In [46]:
from sklearn.metrics import ndcg_score

# Average ratings from training data
user_avg_map  = train_df.groupby("userId")["rating"].mean().to_dict()
movie_avg_map = train_df.groupby("movieId")["rating"].mean().to_dict()
global_mean   = float(train_df["rating"].mean())

# Movies already seen by each user
user_interacted = df_final.groupby("userId")["movieId"].apply(set).to_dict()

# All available movies
all_movie_ids = set(df_movies["movieId"].unique())

In [47]:
def scb_score_single(user_id, movie_id):
    # Computes content-based similarity score for one user–movie pair
    uvec = user_profiles.get(user_id)
    idx = movie_to_index.get(movie_id)

    if uvec is None or idx is None or pd.isna(idx):
        return 0.0

    mvec = tfidf_matrix[int(idx)]
    return float(mvec.dot(uvec).sum())


In [48]:
def build_hybrid_features(user_id, movie_id):
    cf_score = svd_model.predict(user_id, movie_id).est
    scb_score = scb_score_single(user_id, movie_id)

    user_avg = user_avg_map.get(user_id, global_mean)
    movie_avg = movie_avg_map.get(movie_id, global_mean)

    return np.array([scb_score, cf_score, user_avg, movie_avg], dtype=np.float32)


In [49]:
def score_svd(user_id, movie_id):
    return float(svd_model.predict(user_id, movie_id).est)

def score_hybrid(user_id, movie_id):
    x = build_hybrid_features(user_id, movie_id).reshape(1, -1)
    return float(xgb_normal.predict(x)[0])


In [53]:
import random 
def evaluate_sampled_topk(test_df, k=10, n_negatives=99, threshold=4.0, scorer=None, seed=42):
    rng = random.Random(seed)

    # Select one positive item per user
    pos_df = (
        test_df[test_df["rating"] >= threshold]
        .groupby("userId", group_keys=False)
        .apply(lambda x: x.sample(1, random_state=seed))
    )

    hits = []
    ndcgs = []
    used = 0

    for _, row in pos_df.iterrows():
        u = int(row["userId"])
        pos_item = int(row["movieId"])

        seen = user_interacted.get(u, set())
        candidates = list(all_movie_ids - seen)

        if len(candidates) < n_negatives:
            continue

        neg_items = rng.sample(candidates, n_negatives)
        items = [pos_item] + neg_items

        scores = [scorer(u, it) for it in items]

        # Top-K ranking
        top_k_idx = np.argsort(scores)[::-1][:k]
        top_k_items = [items[i] for i in top_k_idx]

        hit = 1 if pos_item in top_k_items else 0
        hits.append(hit)

        y_true = np.array([[1] + [0]*n_negatives])
        y_pred = np.array([scores])
        ndcgs.append(ndcg_score(y_true, y_pred, k=k))

        used += 1

    return {
        "evaluated_users": used,
        f"HitRatio@{k}": float(np.mean(hits)) if hits else 0.0,
        f"NDCG@{k}": float(np.mean(ndcgs)) if ndcgs else 0.0
    }


In [54]:
K = 10

svd_res = evaluate_sampled_topk(test_df, k=K, scorer=score_svd)
hyb_res = evaluate_sampled_topk(test_df, k=K, scorer=score_hybrid)

print("Sampled Top-K Evaluation Results")
print("SVD   :", svd_res)
print("Hybrid:", hyb_res)

summary = pd.DataFrame([
    {"Model": "SVD", **svd_res},
    {"Model": "Hybrid", **hyb_res}
])

summary[["Model", "evaluated_users", f"HitRatio@{K}", f"NDCG@{K}"]].round(4)


  .apply(lambda x: x.sample(1, random_state=seed))
  .apply(lambda x: x.sample(1, random_state=seed))


Sampled Top-K Evaluation Results
SVD   : {'evaluated_users': 85055, 'HitRatio@10': 0.4934571747692669, 'NDCG@10': 0.2893482263461343}
Hybrid: {'evaluated_users': 85055, 'HitRatio@10': 0.5054259008876609, 'NDCG@10': 0.3155932295685726}


Unnamed: 0,Model,evaluated_users,HitRatio@10,NDCG@10
0,SVD,85055,0.4935,0.2893
1,Hybrid,85055,0.5054,0.3156
