In [46]:
# Imports 

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from surprise import SVD, Dataset, Reader
from surprise.model_selection import GridSearchCV
from math import sqrt
import os
import re


# Movie Recommendation for User Pair

In this notebook, we will recommend movies for a pair of users using a pre-trained SVD model. The recommended movies will be sorted by their couple score in descending order.


In [47]:
folder = '/kaggle/input/dataset2/destination_folder/'
os.environ['OMP_NUM_THREADS'] = '4'


# Load Datasets

 Load the datasets ratings.dat, movies.dat, and title.basics.tsv into pandas DataFrames using pd.read_csv(). This step ensures the data is in a format suitable for preprocessing and analysis.

In [48]:
users = pd.read_csv(folder + "users.dat", sep="::", header=None, names=["UserID","Gender","Age","Occupation","Zip-code"], engine='python', encoding='latin1')
ratings = pd.read_csv(folder + "ratings.dat", sep="::", header=None, names=["UserID","MovieID","Rating","Timestamp"], engine='python', encoding='latin1') 
movies = pd.read_csv(folder + 'movies.dat', sep='::', header=None, names=["MovieID","Title","Genres"], engine='python', encoding='latin1')
title_basics = pd.read_csv(folder + 'title.basics.tsv', sep='\t', dtype={
    'tconst': str,
    'titleType': str,
    'primaryTitle': str,
    'originalTitle': str,
    'isAdult': str,
    'startYear': str,
    'endYear': str,
    'runtimeMinutes': str,
    'genres': str
})


# Data Preprocessing




In [49]:
# Ingénierie des caractéristiques
movies['Year'] = movies['Title'].str.extract(r'.*\((\d{4})\).*', expand=True)
movies['Year'] = movies['Year'].fillna(0).astype(int)
movies['Title'] = movies['Title'].apply(lambda x: x.split('(')[0].strip()).str.lower()
movies['Title'] = movies['Title'].apply(lambda x: x.replace("&", "and"))
movies['Title'] = movies['Title'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')


This cell preprocesses the loaded data by extracting and normalizing relevant features. It includes steps like extracting the year from the movie titles, normalizing the movie titles to lowercase, and handling missing values in the title.basics.tsv data.

In [50]:
# Normaliser les colonnes de title_basics
title_basics = title_basics[['tconst', 'primaryTitle', 'startYear', 'genres']]
title_basics.rename(columns={'tconst': 'MovieID_IMDB', 'primaryTitle': 'Title', 'startYear': 'Year_IMDB', 'genres': 'Genres_IMDB'}, inplace=True)

# Gérer les valeurs manquantes avant d'appliquer des transformations de chaîne de caractères
title_basics['Title'] = title_basics['Title'].fillna('').str.lower()
title_basics['Title'] = title_basics['Title'].apply(lambda x: x.replace("&", "and"))
title_basics['Title'] = title_basics['Title'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')


In [51]:
title_basics


Unnamed: 0,MovieID_IMDB,Title,Year_IMDB,Genres_IMDB
0,tt0000001,carmencita,1894,"Documentary,Short"
1,tt0000002,le clown et ses chiens,1892,"Animation,Short"
2,tt0000003,pauvre pierrot,1892,"Animation,Comedy,Romance"
3,tt0000004,un bon bock,1892,"Animation,Short"
4,tt0000005,blacksmith scene,1893,"Comedy,Short"
...,...,...,...,...
10906498,tt9916848,episode #3.17,2009,"Action,Drama,Family"
10906499,tt9916850,episode #3.19,2010,"Action,Drama,Family"
10906500,tt9916852,episode #3.20,2010,"Action,Drama,Family"
10906501,tt9916856,the wind,2015,Short


# Data Merging

Merges the movies.dat data with the title.basics.tsv data to create a comprehensive dataset that includes additional metadata like genres and start year from IMDb.

In [72]:
# Fusionner les datasets
merged_movies = pd.merge(movies, title_basics, how='left', on='Title')

# Assurer que les MovieID sont cohérents et garder une seule colonne MovieID
if 'MovieID_x' in merged_movies.columns:
    merged_movies = merged_movies.rename(columns={'MovieID_x': 'MovieID'})
    merged_movies.drop(columns=['MovieID_y'], inplace=True)

# Fusionner les données de notation avec les informations de films enrichies
merged_data = pd.merge(ratings, merged_movies, on='MovieID')
merged_data.drop(columns=['MovieID_IMDB'], inplace=True)
merged_data.drop(columns=['Year_IMDB'], inplace=True)
merged_data.drop(columns=['Genres_IMDB'], inplace=True)



In [61]:
merged_data

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Title,Genres,Year
0,1,1193,5,978300760,one flew over the cuckoo's nest,Drama,1975
1,1,1193,5,978300760,one flew over the cuckoo's nest,Drama,1975
2,1,1193,5,978300760,one flew over the cuckoo's nest,Drama,1975
3,1,1193,5,978300760,one flew over the cuckoo's nest,Drama,1975
4,1,1193,5,978300760,one flew over the cuckoo's nest,Drama,1975
...,...,...,...,...,...,...,...
16621137,6040,1097,4,956715569,e.t. the extra-terrestrial,Children's|Drama|Fantasy|Sci-Fi,1982
16621138,6040,1097,4,956715569,e.t. the extra-terrestrial,Children's|Drama|Fantasy|Sci-Fi,1982
16621139,6040,1097,4,956715569,e.t. the extra-terrestrial,Children's|Drama|Fantasy|Sci-Fi,1982
16621140,6040,1097,4,956715569,e.t. the extra-terrestrial,Children's|Drama|Fantasy|Sci-Fi,1982


# Model Training with GridSearchCV
1. -creates a user-item interaction matrix from the merged dataset, which will be used for collaborative filtering.
1. -splits the data into training and testing sets to evaluate the performance of the recommendation model
1. -uses the Surprise library to train an SVD (Singular Value Decomposition) model with hyperparameter tuning using GridSearchCV. It finds the best hyperparameters to optimize the model's performance.

In [63]:
# Utiliser un échantillon plus petit (2% des données)
sampled_data = merged_data.sample(frac=0.02, random_state=42)
train, test = train_test_split(sampled_data, test_size=0.2)

# Supprimer les doublons
sampled_data = sampled_data.drop_duplicates(subset=['UserID', 'MovieID'])

# Créer la matrice utilisateur-élément
user_item_matrix = sampled_data.pivot(index='UserID', columns='MovieID', values='Rating').fillna(0)

# Utiliser la bibliothèque Surprise pour la factorisation de matrice
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(train[['UserID', 'MovieID', 'Rating']], reader)

# Paramètres très réduits pour accélérer l'exécution
param_grid = {
    'n_factors': [50],
    'n_epochs': [10],
    'lr_all': [0.005],
    'reg_all': [0.1]
}

# Utiliser n_jobs=-1 pour utiliser tous les CPU disponibles
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=2, n_jobs=-1)
gs.fit(data)

# Best SVD model

In [64]:
# Meilleur modèle SVD
best_svd = gs.best_estimator['rmse']
trainset = data.build_full_trainset()
best_svd.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7855b0ef1e40>

# Evaluation function


In [65]:
def evaluate_model(train, test, model):
    testset = list(zip(test['UserID'], test['MovieID'], test['Rating']))
    predictions = model.test(testset)
    y_true = [pred.r_ui for pred in predictions]
    y_pred = [pred.est for pred in predictions]
    rmse = sqrt(mean_squared_error(y_true, y_pred))
    print(f"RMSE: {rmse}")


#  Evaluate the model
Evaluate the trained SVD model using the RMSE metric on the test data. This step measures how well the model's predictions match the actual ratings.


In [66]:
evaluate_model(train, test, best_svd)


RMSE: 0.8942458528062195


# Recommendation function


defines a function recommend_movie_for_user_pair that takes two user IDs and the trained SVD model to recommend movies based on the average predicted ratings for the two users.

In [None]:
def recommend_movie_for_user_pair(user1, user2, model, user_item_matrix, movies_df, top_n=100):
    all_movie_ids = user_item_matrix.columns
    
    # Prédictions pour tous les films pour les deux utilisateurs
    user1_predictions = [model.predict(user1, movie_id).est for movie_id in all_movie_ids]
    user2_predictions = [model.predict(user2, movie_id).est for movie_id in all_movie_ids]
    
    # Moyenne des prédictions des deux utilisateurs
    average_ratings = np.mean([user1_predictions, user2_predictions], axis=0)
    
    # Obtenir les indices des meilleurs films
    top_movie_indices = np.argsort(-average_ratings)[:top_n]
    
    recommended_movies = [(all_movie_ids[i], average_ratings[i]) for i in top_movie_indices]
    return recommended_movies


In [73]:
def recommend_movies_for_user_pair(user1, user2, model, user_item_matrix, movies_df, top_n=10):
    # Obtenir les notes des utilisateurs
    user1_ratings = user_item_matrix.loc[user1]
    user2_ratings = user_item_matrix.loc[user2]
    
    # Obtenir les genres et les années des films aimés par chaque utilisateur
    liked_genres_user1 = movies_df[movies_df['MovieID'].isin(user1_ratings[user1_ratings > 3].index)]['Genres']
    liked_genres_user2 = movies_df[movies_df['MovieID'].isin(user2_ratings[user2_ratings > 3].index)]['Genres']
    liked_years_user1 = movies_df[movies_df['MovieID'].isin(user1_ratings[user1_ratings > 3].index)]['Year']
    liked_years_user2 = movies_df[movies_df['MovieID'].isin(user2_ratings[user2_ratings > 3].index)]['Year']
    
    # Trouver les genres et les années préférés communs
    preferred_genres = set(liked_genres_user1).intersection(set(liked_genres_user2))
    preferred_years = set(liked_years_user1).intersection(set(liked_years_user2))
    
    # Prédictions pour tous les films pour les deux utilisateurs
    all_movie_ids = user_item_matrix.columns
    movie_predictions_user1 = {movie_id: model.predict(user1, movie_id).est for movie_id in all_movie_ids}
    movie_predictions_user2 = {movie_id: model.predict(user2, movie_id).est for movie_id in all_movie_ids}
    
    # Moyenne des prédictions des deux utilisateurs
    average_ratings = {movie_id: np.mean([movie_predictions_user1[movie_id], movie_predictions_user2[movie_id]]) for movie_id in all_movie_ids}
    
    # Filtrer les films par genres et années préférés
    recommended_movies = [(movie_id, rating) for movie_id, rating in average_ratings.items() 
                          if any(genre in movies_df[movies_df['MovieID'] == movie_id]['Genres'].values[0] for genre in preferred_genres) 
                          and movies_df[movies_df['MovieID'] == movie_id]['Year'].values[0] in preferred_years]
    
    # Trier les films recommandés par score moyen décroissant
    recommended_movies = sorted(recommended_movies, key=lambda x: x[1], reverse=True)
    return recommended_movies[:top_n]


# Testing recommending a movie for a user pair


generates and prints the top 10 recommended movies for a pair of users (e.g., user1 and user2). It sorts the recommendations by the predicted couple score in descending order.

In [76]:
user1 = 3
user2 = 4
recommended_movies = recommend_movie_for_user_pair(user1, user2, best_svd, user_item_matrix, merged_movies)

# Trier les films recommandés par le score du couple du plus grand au plus petit
recommended_movies = sorted(recommended_movies, key=lambda x: x[1], reverse=True)

print(f"Top 10 des films recommandés pour les utilisateurs {user1} et {user2}:")
for movie in recommended_movies[:10]:
    movie_id = movie[0]
    movie_title = merged_movies[merged_movies['MovieID'] == movie_id]['Title'].values[0]
    print(f'Film: {movie_id}, Titre: {movie_title}, Score du couple: {movie[1]}')


Top 10 des films recommandés pour les utilisateurs 3 et 4:
Film: 750, Titre: dr. strangelove or: how i learned to stop worrying and love the bomb, Score du couple: 4.519142344408221
Film: 1204, Titre: lawrence of arabia, Score du couple: 4.509153857787846
Film: 904, Titre: rear window, Score du couple: 4.494725766041729
Film: 527, Titre: schindler's list, Score du couple: 4.493356642022634
Film: 2019, Titre: seven samurai, Score du couple: 4.477420870444577
Film: 1272, Titre: patton, Score du couple: 4.471507113039236
Film: 1260, Titre: m, Score du couple: 4.464729041474941
Film: 260, Titre: star wars: episode iv - a new hope, Score du couple: 4.462769987795189
Film: 908, Titre: north by northwest, Score du couple: 4.451690128652227
Film: 2762, Titre: sixth sense, the, Score du couple: 4.451643421545319


In [75]:
# Recommander des films pour deux utilisateurs donnés
user1 = 1
user2 = 2
recommended_movies = recommend_movies_for_user_pair(user1, user2, best_svd, user_item_matrix, movies)
recommended_movies = sorted(recommended_movies, key=lambda x: x[1], reverse=True)

print(f"Top {len(recommended_movies)} des films recommandés pour les utilisateurs {user1} et {user2}:")
for movie in recommended_movies:
    movie_id = movie[0]
    movie_title = movies[movies['MovieID'] == movie_id]['Title'].values[0]
    print(f'Film: {movie_id}, Titre: {movie_title}, Score: {movie[1]}')

Top 10 des films recommandés pour les utilisateurs 1 et 2:
Film: 1224, Titre: henry v, Score: 4.381034754841094
Film: 1246, Titre: dead poets society, Score: 4.226534539353187
Film: 1242, Titre: glory, Score: 4.220056622750699
Film: 1302, Titre: field of dreams, Score: 4.071498696662916
Film: 3424, Titre: do the right thing, Score: 4.0299862410333525
Film: 1962, Titre: driving miss daisy, Score: 3.9403387221488915
Film: 1185, Titre: my left foot, Score: 3.901927756225144
Film: 3019, Titre: drugstore cowboy, Score: 3.8866240957898963
Film: 1173, Titre: cook the thief his wife and her lover, the, Score: 3.869776437111356
Film: 3521, Titre: mystery train, Score: 3.8317309688401218
