In [37]:
import numpy as np 
import pandas as pd
from scipy.sparse.linalg import svds



In [7]:
ratings = pd.read_csv('./input/ratings_small.csv')
ratings.head()
# Lecture du fichier CSV

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [8]:
user_item = ratings.groupby(['userId', 'movieId'])['rating'].first().unstack(fill_value=0.0)
# Liaison de userId, movieId depuis le rating

In [9]:
user_item.shape

(671, 9066)

In [35]:
user_item.describe

<bound method NDFrame.describe of movieId  1       2       3       4       5       6       7       8       \
userId                                                                    
1           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
2           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
3           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
4           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
5           0.0     0.0     4.0     0.0     0.0     0.0     0.0     0.0   
...         ...     ...     ...     ...     ...     ...     ...     ...   
667         0.0     0.0     0.0     0.0     0.0     4.0     0.0     0.0   
668         0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
669         0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
670         4.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
671         5.0     0.0     0.0     0.0     0.0     0.0     0.0   

In [10]:
user_item.loc[42].sort_values(ascending=False).head()
# Trie des valeur pour l'user_item 42

movieId
457     5.0
1196    5.0
318     5.0
1036    5.0
1097    5.0
Name: 42, dtype: float64

In [40]:
U, sigma, Vt = svds(user_item.to_numpy(), k=50)
#U, sigma et VT sont des matrices
#svds décompose partiellement en valeurs singulières d'une matrice creuse.



In [41]:
U.shape

(671, 50)

In [42]:
Vt.shape

(50, 9066)

In [43]:
sigma_diag_matrix=np.diag(sigma)
# La méthode diag sert a créer / extraire une diagonale

In [44]:
all_user_predicted_ratings = np.dot(np.dot(U, sigma_diag_matrix), Vt)
# Ici la variable all_user est un produit scalaire des matrices / diagonale, la methode dot est la pour
preds_df = pd.DataFrame(all_user_predicted_ratings, columns = user_item.columns, index=user_item.index)
# On crée un Dataframe

In [45]:
preds_df.shape

(671, 9066)

In [46]:
preds_df.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.057419,0.044156,-0.003525,-0.014026,-0.011693,0.037647,-0.011436,-0.012794,0.017412,0.037456,...,-0.001519,0.000171,0.016497,0.00013,0.000195,-0.013686,0.024746,-0.0007,-0.00042,-0.003038
2,0.397099,1.422453,-0.175735,0.166606,0.269432,0.385791,0.068086,0.043673,-0.004759,2.24194,...,-0.002005,0.000998,-0.007596,0.000552,0.000828,-0.059196,-0.011395,0.005804,0.003482,-0.00401
3,1.275403,0.296807,0.033442,-0.010834,0.100854,0.062972,-0.088911,0.009645,-0.002955,0.227495,...,-0.00261,-0.004553,-0.011604,0.000279,0.000419,-0.008031,-0.017406,0.008379,0.005027,-0.005219
4,1.183095,1.016435,0.105598,0.074721,-0.337196,-1.433923,-0.337587,-0.154619,-0.056599,1.551431,...,0.023616,0.008438,0.058714,0.003375,0.005062,0.150868,0.088071,-0.023062,-0.013837,0.047233
5,1.338518,1.510306,0.655975,-0.030616,0.764755,-0.159291,0.009004,0.054437,-0.134904,0.396755,...,0.0017,-0.000225,0.016028,0.006308,0.009463,-0.005235,0.024042,0.013163,0.007898,0.0034


In [47]:
user_item.loc[42].sort_values(ascending=False).head(10)

movieId
457     5.0
1196    5.0
318     5.0
1036    5.0
1097    5.0
1200    5.0
480     5.0
296     5.0
589     5.0
4993    5.0
Name: 42, dtype: float64

In [48]:
movies_user_42 = user_item.loc[42]

In [49]:
high_rated_movies_42 = movies_user_42[movies_user_42 > 3].index

In [50]:
high_rated_movies_42

Int64Index([   110,    165,    260,    296,    318,    349,    356,    380,
               457,    480,    508,    527,    588,    589,    648,    733,
               780,   1036,   1097,   1196,   1198,   1200,   1210,   1291,
              1370,   1527,   1704,   1721,   2028,   2571,   2916,   2985,
              3793,   4886,   4993,   5349,   5952,   7153,   7502,   8636,
              8961,  33794,  40815,  44191,  48394,  48516,  58559,  59315,
             68358,  69844,  70286,  73017,  74458,  76093,  79132,  87232,
             89745,  91529, 106782, 109487, 112852, 122886],
           dtype='int64', name='movieId')

In [51]:
movies_recommended_for_42 = preds_df.loc[42]

In [52]:
movies_high_recommend_for_42 = movies_recommended_for_42[movies_recommended_for_42 > 3].index

In [53]:
movies_high_recommend_for_42

Int64Index([110, 260, 318, 356, 480, 589, 1196, 1198, 1210, 2571, 4993, 5952,
            7153, 58559],
           dtype='int64', name='movieId')

In [54]:
set(movies_high_recommend_for_42) - set(high_rated_movies_42)

set()

In [55]:
def get_high_recommended_movies(userId):
    movies_rated_by_user = user_item.loc[userId]
    movies_high_rated_by_user =  movies_rated_by_user[movies_rated_by_user > 3].index
    movies_recommended_for_user = preds_df.loc[userId]
    movies_high_recommend_for_user = movies_recommended_for_user[movies_recommended_for_user > 3].index
    return set(movies_high_recommend_for_user) - set(movies_high_rated_by_user)

In [56]:
get_high_recommended_movies(42)

set()

In [57]:
get_high_recommended_movies(314)

set()

In [58]:
get_high_recommended_movies(217)

{1198}

In [59]:
preds_df.loc[217, 1198]

4.206618332064976