In [59]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy
from sklearn.metrics import precision_score, recall_score, f1_score

import sys
from pathlib import Path

In [60]:
ratings = pd.read_csv('../data/ratings.csv') 
movies = pd.read_csv('../data/movies.csv')   

In [61]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [63]:
sys.path.append(str(Path("../utils/preprocess_title.py").resolve().parent.parent / "utils"))
from preprocess_title import preprocess_title

#### Data Preprocessing

In [64]:
print("Unique ratings:", ratings['rating'].unique())

Unique ratings: [4.  5.  3.  2.  1.  4.5 3.5 2.5 0.5 1.5]


In [65]:
def split_title_year(title):
    match = re.search(r'\((\d{4})\)', title)
    year = match.group(1) if match else None
    name = re.sub(r'\s*\(\d{4}\)', '', title)
    return name.strip(), year

##### Split title into name , year (NOT-USED)

In [66]:
movies[['name', 'year']] = movies['title'].apply(lambda x: pd.Series(split_title_year(x)))

##### Handel title

##### Handel genres + tfidf

In [67]:
movies['clean_name'] = movies['name'].apply(preprocess_title)

movies['genres'] = movies['genres'].fillna('')
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['genres'])

##### Cosin_sim

In [68]:
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

#### content_based Model

In [69]:
def content_based_recommendations(movie_title, top_n=10):
    movie_title_clean = preprocess_title(movie_title)

    
    if movie_title_clean not in movies['clean_name'].values:
        print(f"Movie '{movie_title}' not found in dataset.")
        return pd.DataFrame()


    idx = movies[movies['clean_name'] == movie_title_clean].index[0] # Movie_ID
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:top_n+1] # first Movie is the same movie
    movie_indices = [i[0] for i in sim_scores] # Get Movie_ID


    return movies.iloc[movie_indices][['title', 'movieId']]

#### collaborative Model

In [70]:
from surprise import SVD, Dataset, Reader
import pandas as pd
import joblib

# Load ratings
ratings = pd.read_csv("../data/ratings.csv")
reader = Reader(rating_scale=(ratings['rating'].min(), ratings['rating'].max()))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# Train SVD model
svd = SVD()
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)
svd.fit(trainset)

# Save model
joblib.dump(svd, "../models/svd_model.pkl")

['../models/svd_model.pkl']

In [71]:
def collaborative_recommendations(user_id, top_n=10):
    user_rated_movies = ratings[ratings['userId'] == user_id]['movieId'].tolist() # get User rated_Movies
    movie_ids = movies['movieId'].unique()
    movies_to_predict = [mid for mid in movie_ids if mid not in user_rated_movies]
    predictions = []

    for movie_id in movies_to_predict:
        pred = svd.predict(user_id, movie_id)
        predictions.append((movie_id, pred.est))

    predictions.sort(key=lambda x: x[1], reverse=True)
    top_preds = predictions[:top_n]
    movie_ids = [x[0] for x in top_preds]

    return movies[movies['movieId'].isin(movie_ids)][['title', 'movieId']]

#### hybrid Model

In [72]:

def hybrid_recommendations(user_id, movie_title, top_n=10, content_weight=0.5, collab_weight=0.5):
    content_recs = content_based_recommendations(movie_title, top_n=50)

    if content_recs.empty:
        return pd.DataFrame()

    movie_ids = content_recs['movieId'].tolist()
    collab_scores = []

    for movie_id in movie_ids:
        pred = svd.predict(user_id, movie_id)
        collab_scores.append(pred.est)
    
    collab_scores_norm = (np.array(collab_scores) - np.min(collab_scores)) / (np.ptp(collab_scores) + 1e-8)
    content_scores_norm = (np.array([cosine_sim[movies[movies['clean_name']==preprocess_title(movie_title)].index[0], movies[movies['movieId']==mid].index[0]] for mid in movie_ids]) - 0) / 1
    hybrid_scores = content_weight*content_scores_norm + collab_weight*collab_scores_norm
    
    rec_df = content_recs.copy()
    rec_df['score'] = hybrid_scores
    rec_df = rec_df.sort_values(by='score', ascending=False).head(top_n)
    
    return rec_df[['title', 'movieId']]

#### Error Calc

In [73]:
predictions = svd.test(testset)
rmse = accuracy.rmse(predictions, verbose=False)
mae = accuracy.mae(predictions, verbose=False)

In [74]:
def precision_recall_f1(predictions, threshold=3.5):
    y_true = [pred.r_ui >= threshold for pred in predictions]
    y_pred = [pred.est >= threshold for pred in predictions]
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    return precision, recall, f1

precision, recall, f1 = precision_recall_f1(predictions)

In [75]:
user_id = 15
movie_title = "Toy Story"

print("Content-Based Recommendations:")
print(content_based_recommendations(movie_title))

print("\nCollaborative Filtering Recommendations:")
print(collaborative_recommendations(user_id))

print("\nHybrid Recommendations:")
print(hybrid_recommendations(user_id, movie_title))

print(f"\nEvaluation Metrics:\nRMSE: {rmse:.4f}\nMAE: {mae:.4f}\nPrecision: {precision:.4f}\nRecall: {recall:.4f}\nF1 Score: {f1:.4f}")

Content-Based Recommendations:
                                                  title  movieId
1706                                        Antz (1998)     2294
2355                                 Toy Story 2 (1999)     3114
2809     Adventures of Rocky and Bullwinkle, The (2000)     3754
3000                   Emperor's New Groove, The (2000)     4016
3568                              Monsters, Inc. (2001)     4886
6194                                   Wild, The (2006)    45074
6486                             Shrek the Third (2007)    53121
6948                     Tale of Despereaux, The (2008)    65577
7760  Asterix and the Vikings (Astérix et les Viking...    91355
8219                                       Turbo (2013)   103755

Collaborative Filtering Recommendations:
                                                  title  movieId
602   Dr. Strangelove or: How I Learned to Stop Worr...      750
742                           African Queen, The (1951)      969
878      Cinema P

In [76]:
np.save("../models/cosine_sim.npy", cosine_sim)
print("✔️ Cosine similarity matrix saved as cosine_sim.npy")


✔️ Cosine similarity matrix saved as cosine_sim.npy
