In [1]:
#Création matrice modèle
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from tqdm import tqdm

ratings=pd.read_csv('../../../dataset/ml-20m/ratings.csv')
df_ref=pd.read_csv('../../new_dataset/correspondances_Id_movie')

list_movieId=pd.read_csv('../reduced_list_movieId.txt', header=None, names=["movieId_ref"])
list_userId=pd.read_csv('../reduced_list_userId.txt', header=None, names=["userId"])

In [2]:
# Ajout de movieId_ref dans fichier ratings, et suppression infos inutiles
ratings.index=ratings.movieId
df_ref.index=df_ref.movieId
ratings=ratings.join(df_ref.movieId_ref)
ratings=ratings.drop(["timestamp", "movieId"], axis=1)
ratings=ratings.dropna(subset=["movieId_ref"])
print(len(ratings.movieId_ref.unique()))
ratings.movieId_ref=ratings.movieId_ref.astype(int)
ratings.head()

23608


Unnamed: 0_level_0,userId,rating,movieId_ref
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,3,4.0,1
1,6,5.0,1
1,8,4.0,1
1,10,4.0,1
1,11,4.5,1


In [3]:
liste1=list(list_movieId.movieId_ref)
liste2=list(list_userId.userId)

#Réduction des données avec les movies set Id sélectionnés dans fichier reduction_Dataset
ratings=ratings[(ratings.movieId_ref.isin(liste1))]
ratings=ratings[(ratings.userId.isin(liste2))]
print(ratings.shape)
print(len(ratings.userId.unique()))
print(len(ratings.movieId_ref.unique()))

#Renumérotation des movieId et user Id de manière continue
ratings_user=ratings.groupby("userId").count()
ratings_user["userId_red"]=np.arange(1,22813)
ratings=ratings.join(ratings_user.userId_red, on="userId")
ratings.head(40)

ratings_movie=ratings.groupby("movieId_ref").count()
ratings_movie["movieId_red"]=np.arange(1,3221)
ratings=ratings.join(ratings_movie.movieId_red, on="movieId_ref")
ratings.head(40)

ratings=ratings.sort_values(by='userId_red')
ratings=ratings.drop(["movieId_ref", "userId"], axis=1)

(567692, 3)
22812
3220


In [4]:
ratings.head()
ratings.describe()

Unnamed: 0,rating,userId_red,movieId_red
count,567692.0,567692.0,567692.0
mean,3.499027,11405.873098,660.028549
std,1.054596,6590.331316,672.0728
min,0.5,1.0,1.0
25%,3.0,5694.0,185.0
50%,3.5,11329.0,412.0
75%,4.0,17167.0,900.0
max,5.0,22812.0,3220.0


In [5]:
#séparer jeu de test jeu d entrainement
train_data, test_data=train_test_split(ratings, test_size=0.2)

In [6]:
#Définition du nombre de users et du nombre de films

n_users = ratings.userId_red.unique().shape[0]
n_movies = ratings.movieId_red.unique().shape[0]
print ("Nombre d'utilisateurs : " + str(n_users) + " | Nombre de films : " + str(n_movies))

Nombre d'utilisateurs : 22812 | Nombre de films : 3220


In [7]:
# Création de 2 matrices modèles train et test

matrice_train=np.zeros((n_users, n_movies))
for line in train_data.itertuples():
    matrice_train[line[2]-1, line[3]-1]=line[1]

matrice_test=np.zeros((n_users, n_movies))
for line in test_data.itertuples():
    matrice_test[line[2]-1, line[3]-1]=line[1]

In [8]:
matrice_train.shape

(22812, 3220)

# Filtrage collaboratif :
Ici movie-based filtering
Problème de mémoire! Je ne peux pas le faire

In [9]:
#Création des matrices de similarité
from sklearn.metrics.pairwise import cosine_similarity
movie_similarity=cosine_similarity(matrice_train.T)
movie_similarity.shape

(3220, 3220)

In [10]:
#Calcul des prédictions
def pred_movie(ratings, movie_similarity, k):    
    pred = np.zeros(ratings.shape)
    print(pred.shape)
    for i in tqdm(range(ratings.shape[1])):     
        top_k_movies = np.argsort(movie_similarity[:,i])[-1:-k-1:-1]  
        for u in tqdm(range(ratings.shape[0])):
            pred[u,i]=movie_similarity[i,:][top_k_movies].dot((ratings[u,:][top_k_movies]).T)
            pred[u,i]/=np.sum(np.abs(movie_similarity[i,:][top_k_movies]))+0.000001     
    return pred 


def get_rmse(pred, actual):
    pred = pred[actual.nonzero()].flatten() # Ignore nonzero terms.
    actual = actual[actual.nonzero()].flatten()
    return np.sqrt(mean_squared_error(pred, actual))

In [13]:
#Evaluation du modèle. 

k_array = [5, 15, 30, 50]
movie_test_rmse = []
movie_train_rmse = []
for k in tqdm(k_array):
    movie_pred = pred_movie(matrice_train, movie_similarity, k=k)      
    movie_train_rmse += [get_rmse(movie_pred, matrice_train) ]       
    movie_test_rmse +=  [get_rmse(movie_pred, matrice_test) ]


sns.set()
pal = sns.color_palette("Set1", 4)
plt.figure(figsize=(8, 8))
plt.plot(k_array, movie_train_rmse, c=pal[2], label='Item-based train', alpha=0.5, linewidth=5)
plt.plot(k_array, movie_test_rmse, c=pal[3], label='Item-based test', linewidth=5)
plt.legend(loc='best', fontsize=20)
plt.xlabel('k', fontsize=30);
plt.ylabel('RMSE', fontsize=30);
plt.legend(loc='center left', bbox_to_anchor=(1, 0.8))

#Dure 1heure pour s'executer. 15min par k 




  0%|          | 0/4 [00:00<?, ?it/s][A[A[A



  0%|          | 0/3220 [00:00<?, ?it/s][A[A[A[A




  0%|          | 0/22812 [00:00<?, ?it/s][A[A[A[A[A




 27%|██▋       | 6061/22812 [00:00<00:00, 60605.87it/s][A[A[A[A[A






(22812, 3220)


 55%|█████▍    | 12516/22812 [00:00<00:00, 61735.59it/s][A[A[A[A[A




 89%|████████▉ | 20309/22812 [00:00<00:00, 65839.19it/s][A[A[A[A[A




100%|██████████| 22812/22812 [00:00<00:00, 67868.01it/s][A[A[A[A[A



  0%|          | 1/3220 [00:00<18:10,  2.95it/s][A[A[A[A




  0%|          | 0/22812 [00:00<?, ?it/s][A[A[A[A[A




 42%|████▏     | 9561/22812 [00:00<00:00, 95604.16it/s][A[A[A[A[A




 86%|████████▌ | 19609/22812 [00:00<00:00, 97005.54it/s][A[A[A[A[A




100%|██████████| 22812/22812 [00:00<00:00, 95569.70it/s][A[A[A[A[A



  0%|          | 2/3220 [00:00<16:38,  3.22it/s][A[A[A[A




  0%|          | 0/22812 [00:00<?, ?it/s][A[A[A[A[A




 39%|███▉      | 8911/22812 [00:00<00:00, 89102.01it/s][A[A[A[A[A




 78%|███████▊  | 17876/22812 [00:00<00:00, 89254.22it/s][A[A[A[A[A




100%|██████████| 22812/22812 [00:00<00:00, 88884.78it/s][A[A[A[A[A



  0%|          | 3/3220 [00:00<15:52,  3.38it/s][A[A[A[A




  0%

 44%|████▎     | 9937/22812 [00:00<00:00, 99365.83it/s][A[A[A[A[A




 87%|████████▋ | 19860/22812 [00:00<00:00, 99322.37it/s][A[A[A[A[A




100%|██████████| 22812/22812 [00:00<00:00, 98194.13it/s][A[A[A[A[A



  1%|          | 24/3220 [00:05<12:48,  4.16it/s][A[A[A[A




  0%|          | 0/22812 [00:00<?, ?it/s][A[A[A[A[A




 40%|████      | 9159/22812 [00:00<00:00, 91580.70it/s][A[A[A[A[A




 83%|████████▎ | 19004/22812 [00:00<00:00, 93538.14it/s][A[A[A[A[A




100%|██████████| 22812/22812 [00:00<00:00, 95647.85it/s][A[A[A[A[A



  1%|          | 25/3220 [00:06<12:52,  4.14it/s][A[A[A[A




  0%|          | 0/22812 [00:00<?, ?it/s][A[A[A[A[A




 43%|████▎     | 9705/22812 [00:00<00:00, 97041.30it/s][A[A[A[A[A




 89%|████████▊ | 20237/22812 [00:00<00:00, 99384.11it/s][A[A[A[A[A




100%|██████████| 22812/22812 [00:00<00:00, 100573.15it/s][A[A[A[A[A



  1%|          | 26/3220 [00:06<12:42,  4.19it/s][A[A[A[A




 

 39%|███▉      | 8991/22812 [00:00<00:00, 89908.16it/s][A[A[A[A[A




 81%|████████▏ | 18575/22812 [00:00<00:00, 91608.05it/s][A[A[A[A[A




100%|██████████| 22812/22812 [00:00<00:00, 93541.63it/s][A[A[A[A[A



  1%|▏         | 47/3220 [00:11<12:51,  4.11it/s][A[A[A[A




  0%|          | 0/22812 [00:00<?, ?it/s][A[A[A[A[A




 44%|████▍     | 10087/22812 [00:00<00:00, 100863.84it/s][A[A[A[A[A




 86%|████████▌ | 19567/22812 [00:00<00:00, 98964.73it/s] [A[A[A[A[A




100%|██████████| 22812/22812 [00:00<00:00, 97238.61it/s][A[A[A[A[A



  1%|▏         | 48/3220 [00:11<12:47,  4.13it/s][A[A[A[A




  0%|          | 0/22812 [00:00<?, ?it/s][A[A[A[A[A




 39%|███▊      | 8812/22812 [00:00<00:00, 88114.83it/s][A[A[A[A[A




 81%|████████  | 18424/22812 [00:00<00:00, 90370.93it/s][A[A[A[A[A




100%|██████████| 22812/22812 [00:00<00:00, 91804.30it/s][A[A[A[A[A



  2%|▏         | 49/3220 [00:11<12:59,  4.07it/s][A[A[A[A





KeyboardInterrupt: 