In [66]:
# Imports 

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from surprise import SVD, Dataset, Reader
from surprise.model_selection import GridSearchCV
from math import sqrt
import os
import re

# Movie Recommendation for User Pair

In this notebook, we will recommend movies for a pair of users using a pre-trained SVD model. The recommended movies will be sorted by their couple score in descending order.


In [118]:
folder = '/kaggle/input/dataset2/destination_folder/'
os.environ['OMP_NUM_THREADS'] = '4'

# Load Datasets

 Load the datasets ratings.dat, movies.dat, and title.basics.tsv into pandas DataFrames using pd.read_csv(). This step ensures the data is in a format suitable for preprocessing and analysis.

In [133]:
users = pd.read_csv(folder + "users.dat", sep="::", header=None, names=["UserID","Gender","Age","Occupation","Zip-code"], engine='python', encoding='latin1')
ratings = pd.read_csv(folder + "ratings.dat", sep="::", header=None, names=["UserID","MovieID","Rating","Timestamp"], engine='python', encoding='latin1') 
movies = pd.read_csv(folder + 'movies.dat', sep='::', header=None, names=["MovieID","Title","Genres"], engine='python', encoding='latin1')
title_basics = pd.read_csv(folder + 'title.basics.tsv', sep='\t', dtype=dtype_dict)


# Data Preprocessing




In [123]:
# Ingénierie des caractéristiques
movies['Year'] = movies['Title'].str.extract(r'.*\((\d{4})\).*', expand=True)
movies['Year'] = movies['Year'].fillna(0).astype(int)
movies['Title'] = movies['Title'].apply(lambda x: x.split('(')[0].strip()).str.lower()
movies['Title'] = movies['Title'].apply(lambda x: x.replace("&", "and"))
movies['Title'] = movies['Title'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')


This cell preprocesses the loaded data by extracting and normalizing relevant features. It includes steps like extracting the year from the movie titles, normalizing the movie titles to lowercase, and handling missing values in the title.basics.tsv data.

In [138]:
# Normaliser les colonnes de title_basics
title_basics = title_basics[['tconst', 'primaryTitle', 'startYear', 'genres']]
title_basics.rename(columns={'tconst': 'MovieID_IMDB', 'primaryTitle': 'Title', 'startYear': 'Year_IMDB', 'genres': 'Genres_IMDB'}, inplace=True)

# Gérer les valeurs manquantes avant d'appliquer des transformations de chaîne de caractères
title_basics['Title'] = title_basics['Title'].fillna('').str.lower()
title_basics['Title'] = title_basics['Title'].apply(lambda x: x.replace("&", "and"))
title_basics['Title'] = title_basics['Title'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  title_basics.rename(columns={'tconst': 'MovieID_IMDB', 'primaryTitle': 'Title', 'startYear': 'Year_IMDB', 'genres': 'Genres_IMDB'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  title_basics['Title'] = title_basics['Title'].fillna('').str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  title_basics['Title'] = title_basics['Title'].a

In [137]:
title_basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,5,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
...,...,...,...,...,...,...,...,...,...
10906498,tt9916848,tvEpisode,Episode #3.17,Episode #3.17,0,2009,\N,\N,"Action,Drama,Family"
10906499,tt9916850,tvEpisode,Episode #3.19,Episode #3.19,0,2010,\N,\N,"Action,Drama,Family"
10906500,tt9916852,tvEpisode,Episode #3.20,Episode #3.20,0,2010,\N,\N,"Action,Drama,Family"
10906501,tt9916856,short,The Wind,The Wind,0,2015,\N,27,Short


# Data Merging

Merges the movies.dat data with the title.basics.tsv data to create a comprehensive dataset that includes additional metadata like genres and start year from IMDb.

In [139]:
# Fusionner les datasets
merged_movies = pd.merge(movies, title_basics, how='left', on='Title')

# Assurer que les MovieID sont cohérents et garder une seule colonne MovieID
if 'MovieID_x' in merged_movies.columns:
    merged_movies = merged_movies.rename(columns={'MovieID_x': 'MovieID'})
    merged_movies.drop(columns=['MovieID_y'], inplace=True)

# Fusionner les données de notation avec les informations de films enrichies
merged_data = pd.merge(ratings, merged_movies, on='MovieID')


# Model Training with GridSearchCV
1. -creates a user-item interaction matrix from the merged dataset, which will be used for collaborative filtering.
1. -splits the data into training and testing sets to evaluate the performance of the recommendation model
1. -uses the Surprise library to train an SVD (Singular Value Decomposition) model with hyperparameter tuning using GridSearchCV. It finds the best hyperparameters to optimize the model's performance.

In [140]:
# Utiliser un échantillon plus petit
sampled_data = merged_data.sample(frac=0.5, random_state=42)
train, test = train_test_split(sampled_data, test_size=0.2)

# Créer la matrice utilisateur-élément
user_item_matrix = sampled_data.pivot(index='UserID', columns='MovieID', values='Rating').fillna(0)

# Utiliser la bibliothèque Surprise pour la factorisation de matrice
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(train[['UserID', 'MovieID', 'Rating']], reader)

# Réduire l'espace de recherche des paramètres et utiliser moins de plis de validation croisée
param_grid = {
    'n_factors': [50, 100],
    'n_epochs': [20, 30],
    'lr_all': [0.005, 0.01],
    'reg_all': [0.1, 0.2]
}
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=2)
gs.fit(data)

# Best SVD model

In [141]:
# Meilleur modèle SVD
best_svd = gs.best_estimator['rmse']
trainset = data.build_full_trainset()
best_svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7c75c7770070>

# Evaluation function


In [142]:
def evaluate_model(train, test, model):
    testset = list(zip(test['UserID'], test['MovieID'], test['Rating']))
    predictions = model.test(testset)
    y_true = [pred.r_ui for pred in predictions]
    y_pred = [pred.est for pred in predictions]
    rmse = sqrt(mean_squared_error(y_true, y_pred))
    print(f"RMSE: {rmse}")


#  Evaluate the model
Evaluate the trained SVD model using the RMSE metric on the test data. This step measures how well the model's predictions match the actual ratings.


In [143]:
evaluate_model(train, test, best_svd)


RMSE: 0.8913807274549974


# Recommendation function


defines a function recommend_movie_for_user_pair that takes two user IDs and the trained SVD model to recommend movies based on the average predicted ratings for the two users.

In [164]:
def recommend_movie_for_user_pair(user1, user2, model, user_item_matrix, movies_df, top_n=100):
    all_movie_ids = user_item_matrix.columns
    
    # Prédictions pour tous les films pour les deux utilisateurs
    user1_predictions = [model.predict(user1, movie_id).est for movie_id in all_movie_ids]
    user2_predictions = [model.predict(user2, movie_id).est for movie_id in all_movie_ids]
    
    # Moyenne des prédictions des deux utilisateurs
    average_ratings = np.mean([user1_predictions, user2_predictions], axis=0)
    
    # Obtenir les indices des meilleurs films
    top_movie_indices = np.argsort(-average_ratings)[:top_n]
    
    recommended_movies = [(all_movie_ids[i], average_ratings[i]) for i in top_movie_indices]
    return recommended_movies


# Testing recommending a movie for a user pair


generates and prints the top 10 recommended movies for a pair of users (e.g., user1 and user2). It sorts the recommendations by the predicted couple score in descending order.

In [173]:
user1 = 2
user2 = 2
recommended_movies = recommend_movie_for_user_pair(user1, user2, best_svd, user_item_matrix, merged_movies)

# Trier les films recommandés par le score du couple du plus grand au plus petit
recommended_movies = sorted(recommended_movies, key=lambda x: x[1], reverse=True)

print(f"Top 10 des films recommandés pour les utilisateurs {user1} et {user2}:")
for movie in recommended_movies[:10]:
    movie_id = movie[0]
    movie_title = merged_movies[merged_movies['MovieID'] == movie_id]['Title'].values[0]
    print(f'Film: {movie_id}, Titre: {movie_title}, Score du couple: {movie[1]}')


Top 10 des films recommandés pour les utilisateurs 2 et 2:
Film: 318, Titre: Shawshank Redemption, The (1994), Score du couple: 4.478165735578333
Film: 2905, Titre: Sanjuro (1962), Score du couple: 4.411687274462539
Film: 3338, Titre: For All Mankind (1989), Score du couple: 4.411553399343139
Film: 1423, Titre: Hearts and Minds (1996), Score du couple: 4.362003175491189
Film: 527, Titre: Schindler's List (1993), Score du couple: 4.361438592825964
Film: 1148, Titre: Wrong Trousers, The (1993), Score du couple: 4.301715056522557
Film: 2503, Titre: Apple, The (Sib) (1998), Score du couple: 4.297458736044591
Film: 720, Titre: Wallace & Gromit: The Best of Aardman Animation (1996), Score du couple: 4.28429876554798
Film: 2762, Titre: Sixth Sense, The (1999), Score du couple: 4.272060840955527
Film: 953, Titre: It's a Wonderful Life (1946), Score du couple: 4.2674022056251415
