# <center> Collaborative Filtering for Movie Recommendations.

In [145]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import NMF


# The MovieLens Dataset

The dataset that we are working with is one of MovieLens datasets, one of the most common and interesting datasets that is available on the internet for building a Recommender System for movie. This dataset (ml-latest-small) describes 5-star rating and free-text tagging activity from MovieLens, a movie recommendation service. It contains 100836 ratings and 3683 tag applications across 9742 movies. These data were created by 610 users between March 29, 1996 and September 24, 2018. This dataset was generated on September 26, 2018.

Users were selected at random for inclusion. All selected users had rated at least 20 movies. No demographic information is included. Each user is represented by an id, and no other information is provided.

The data are contained in the files links.csv, movies.csv, ratings.csv and tags.csv. More details about the contents and use of all these files follows.

This is a development dataset. As such, it may change over time and is not an appropriate dataset for shared research results. See available benchmark datasets if that is your intent.

This and other GroupLens data sets are publicly available for download at http://grouplens.org/datasets/.


In this project we will work on the two files : movies.csv and ratings.csv. Let’s have a look at this two movies dataset :


-   movies.csv dataset has three columns : MovieId, Title and Genre of evry movie. Each line of this file after the header row represents one movie.

In [146]:
# Import movies.csv dataset
data_movies = pd.read_csv('movies.csv')
print(data_movies.shape)
data_movies.head()



(9742, 3)


Unnamed: 0,movieId,movieIditle,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy



-   ratings.csv dataset has three columns : userId, movieId and rating. Each line of this file after the header row represents one rating of one movie by one user. Ratings are made on a 5-star scale, with half-star increments (0.5 stars - 5.0 stars).

In [147]:
# Import ratings.csv dataset
data_ratings = pd.read_csv('ratings.csv')
print(data_ratings.shape)
data_ratings.head()



(100836, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [148]:
# Merging two dataframes "movies.csv" and "ratings.csv"
data = data_ratings.merge(data_movies, how='left', on='movieId',validate='m:1')
print(data.shape)
data



(100836, 6)


Unnamed: 0,userId,movieId,rating,timestamp,movieIditle,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
...,...,...,...,...,...,...
100831,610,166534,4.0,1493848402,Split (2017),Drama|Horror|Thriller
100832,610,168248,5.0,1493850091,John Wick: Chapter Two (2017),Action|Crime|Thriller
100833,610,168250,5.0,1494273047,Get Out (2017),Horror
100834,610,168252,5.0,1493846352,Logan (2017),Action|Sci-Fi


In [197]:
min_movie = 100   # movie has to have been rated over 100 times
min_user = 20 # user has to have rated at least 20 times

users = data.groupby('userId')['rating'].count()
users = users.loc[users > min_user].index.values
movies = data.groupby('movieId')['rating'].count()
movies = movies.loc[movies > min_movie].index.values
filtered = data.loc[data.userId.isin(users) & data.movieId.isin(movies)]

print('Unfiltered: ', data.shape[0])
print('Filtered: ', filtered.shape[0])
print('Kept {}% of data'.format(round(filtered.shape[0]/data.shape[0], 2)*100))
filtered


Unfiltered:  100836
Filtered:  19739
Kept 20.0% of data


Unnamed: 0,userId,movieId,rating,timestamp,movieIditle,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
7,1,110,4.0,964982176,Braveheart (1995),Action|Drama|War
...,...,...,...,...,...,...
100217,610,48516,5.0,1479542152,"Departed, The (2006)",Crime|Drama|Thriller
100310,610,58559,4.5,1493844688,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX
100326,610,60069,4.5,1493844866,WALL·E (2008),Adventure|Animation|Children|Romance|Sci-Fi
100380,610,68954,3.5,1493844881,Up (2009),Adventure|Animation|Children|Drama


In [337]:
users_movies = filtered.pivot_table(index='userId', columns='movieId', values='rating')
print(users_movies.shape)
users_movies



(590, 134)


movieId,1,2,6,10,32,34,39,47,50,110,...,7153,7361,7438,8961,33794,48516,58559,60069,68954,79132
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,,,5.0,5.0,4.0,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,4.0,4.5,,,4.0
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,2.0,,,2.0,,,...,,,,,,,,,,
5,4.0,,,,,4.0,3.0,,4.0,4.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,4.0,,,3.0,4.5,3.5,...,4.5,4.0,4.5,4.0,,3.5,,4.0,,
607,4.0,,,,,3.0,,,,5.0,...,,,,,,,,,,
608,2.5,2.0,,4.0,3.5,3.5,3.0,4.5,4.5,4.0,...,4.0,4.5,4.0,4.0,,,,,,
609,3.0,,,4.0,,,,,,3.0,...,,,,,,,,,,


To have our final matrix (users * movies), we replace the missing values (NAN) with zeros.

In [339]:
# We replace the missing values (NAN) with zeros
mat_users_movies = users_movies.replace(np.nan, 0)

# Convert matrix coefficients to integer 
mat_users_movies = mat_users_movies.astype(int)

print(mat_users_movies.shape)

X = mat_users_movies
X

(590, 134)


movieId,1,2,6,10,32,34,39,47,50,110,...,7153,7361,7438,8961,33794,48516,58559,60069,68954,79132
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4,0,4,0,0,0,0,5,5,4,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,4,4,0,0,4
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,2,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
5,4,0,0,0,0,4,3,0,4,4,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2,0,0,0,4,0,0,3,4,3,...,4,4,4,4,0,3,0,4,0,0
607,4,0,0,0,0,3,0,0,0,5,...,0,0,0,0,0,0,0,0,0,0
608,2,2,0,4,3,3,3,4,4,4,...,4,4,4,4,0,0,0,0,0,0
609,3,0,0,4,0,0,0,0,0,3,...,0,0,0,0,0,0,0,0,0,0


## Collaborative Filtering using Non-Negative Matrix Factorization (NMF, scikit-learn package)¶ 


### 1. choice of parameters 

In [340]:
def Frobeniusnorm(X):
    
    return np.sqrt(np.sum(np.sum((abs(X)**2))))

import random

def losevalues(X,prop=1/2):
    enleve=round(np.count_nonzero(X)*prop)
    Y=np.ravel(X)
    i=np.random.choice(np.where(Y!=0)[0],replace=False,size=enleve)
    Y[i]=0
    return Y.reshape(X.shape[0],X.shape[1]),i

def difference(X,Y,i):
    
    count=0
    for j in (i):
        if abs(np.ravel(X)[j]-np.ravel(Y)[j])<1:
            count+=1
    return(count/i.size)

def evaluation(X,prop,**args):
    Xl,i=losevalues(X,prop)
    from sklearn.decomposition import NMF
    model = NMF(args)
    W = model.fit_transform(Xl)
    H = model.components_
    Y=W@H
    difference(X,Y,i)

In [341]:
param = {
        'n_components' : [15, 10, 25, 30],
        'init' : ['random'],
        'solver' : ['cd', 'mu'], 
        #'beta_loss' : ['frobenius', 'kullback-leibler'],
        'alpha' : [0.001, 0.01, 0.1],
        'l1_ratio' : [0, 1], 
        'max_iter' : [15, 10, 25]
        }

estimator = NMF(**param)

In [342]:
# Keep track of RMSE and parameters
parameters_search = pd.DataFrame([[0, 0, 0, 0, 0, 0]])
parameters_search.columns = ['n_components', 'alpha', 'solver', 'l1_ratio', 'max_iter'] + ['ressemblance']

In [343]:

i = 0

for n_components in param['n_components']:
    for alpha in param['alpha']:
        for solver in param['solver']:
            #for beta_loss in param['beta_loss'] :
                for l1_ratio in param['l1_ratio']:
                    for max_iter in param['max_iter']:

                        err = 0

                        # updating the parameters
                        parametersNMF = {
                        'n_components' : n_components,
                        'init' : 'random',
                        'solver' : solver,
                        #'beta_loss' : beta_loss,
                        'random_state' : 0, 
                        'alpha' : alpha,
                        'l1_ratio' : l1_ratio,
                        'max_iter' : max_iter
                        }
                        
                        Xl,k=losevalues(X,0.1)
                        from sklearn.decomposition import NMF
                        model = NMF(**parametersNMF)
                        W = model.fit_transform(Xl)
                        H = model.components_
                        Y=W@H
                            
    
                        # Computing the error 
                        XX = np.asarray(X)
                        err = difference(XX,Y,k)
    
                        parameters_search.loc[i] = [n_components, alpha, solver, l1_ratio, max_iter, err ]
                        i += 1




In [311]:

best_params = parameters_search.sort_values('ressemblance', ascending=False )[:1]
print ('*** best params ***')
print (best_params)
parameters_search.sort_values('ressemblance' , ascending=False)

*** best params ***
    n_components  alpha solver  l1_ratio  max_iter  ressemblance
41            10  0.001     cd         1        25      0.231005


Unnamed: 0,n_components,alpha,solver,l1_ratio,max_iter,ressemblance
41,10,0.001,cd,1,25,0.231005
14,15,0.010,cd,0,25,0.228965
53,10,0.010,cd,1,25,0.225905
37,10,0.001,cd,0,10,0.224375
25,15,0.100,cd,0,10,0.223355
...,...,...,...,...,...,...
137,30,0.100,cd,1,25,0.152983
116,30,0.001,mu,0,25,0.152473
130,30,0.010,mu,1,10,0.151963
22,15,0.010,mu,1,10,0.150943


### 2. Users x movies matrix factorization

In [312]:
parametersNMF = {
        'n_components' : 10,
        'init' : 'random',
        'solver' : 'cd',
        'alpha' : 0.001,
        'l1_ratio' : 0,
        'max_iter' : 10
}

In [358]:
nmf_model = NMF(**parametersNMF)     

# Matrix factorization            # X ~ W.H  
nmf_model.fit(X)                     
W = nmf_model.transform(X)        #  W
H = nmf_model.components_         #  H

# Making the predictions
X_pred = W @ H             
                    

print('movie features - H:', H.shape)
print ('User features - W:', W.shape)
print ('R ~ X * H:')
print (X_pred.shape)


movie features - H: (30, 134)
User features - W: (590, 30)
R ~ X * H:
(590, 134)




In [359]:

X_pred = pd.DataFrame(X_pred.round(0))

X_pred.columns = X.columns
X_pred.index = X.index

X_pred[X_pred > 5] = 5.                      
X_pred[X_pred < 1] = 1.           

X_pred

movieId,1,2,6,10,32,34,39,47,50,110,...,7153,7361,7438,8961,33794,48516,58559,60069,68954,79132
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.0,1.0,2.0,2.0,2.0,2.0,1.0,4.0,3.0,2.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
5,2.0,1.0,1.0,1.0,3.0,2.0,2.0,1.0,3.0,3.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.0,1.0,2.0,1.0,3.0,1.0,1.0,4.0,4.0,3.0,...,4.0,3.0,4.0,3.0,3.0,3.0,2.0,1.0,1.0,2.0
607,2.0,1.0,1.0,2.0,1.0,1.0,1.0,2.0,1.0,4.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
608,3.0,2.0,2.0,1.0,4.0,2.0,2.0,4.0,5.0,4.0,...,3.0,3.0,4.0,4.0,3.0,2.0,2.0,1.0,1.0,1.0
609,1.0,1.0,1.0,2.0,1.0,1.0,1.0,2.0,1.0,3.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### 3. Making movies recommendations

In [369]:
def make_recommendation_activeuser(X, prediction, userId, k=3):
    
    # get the list of actual ratings of userId (seen movies)
    rated_items_df_user = pd.DataFrame(X).iloc[userId, :]            

    # get the list of predicted ratings of userId (unseen movies)     
    user_prediction_df_user = pd.DataFrame(prediction).iloc[userId,:]

    # merge both lists with the movie's title
    reco_df = pd.concat([rated_items_df_user, user_prediction_df_user, data_movies.movieIditle] , axis=1)
    reco_df.columns = ['rating','prediction','title']

    reco_df = reco_df[ reco_df['rating'] == 0 ]

    return reco_df
    
    



U19 = make_recommendation_activeuser(X, X_pred.round(0), userId=19, k=1)
U137 = make_recommendation_activeuser(X, X_pred.round(0), userId=137, k=1)
U587 = make_recommendation_activeuser(X, X_pred.round(0), userId=587, k=1)

# returns the 2 unseen movies with the best predicted ratings

print ('Recommended movies for the user id', 19)
print (U19.sort_values(by='prediction', ascending=False)[:2])  

print ('Recommended movies for the user id', 137)
print (U137.sort_values(by='prediction', ascending=False)[:2])  

print ('Recommended movies for the user id', 587)
print (U587.sort_values(by='prediction', ascending=False)[:2])  

Recommended movies for the user id 19
      rating  prediction              title
7153     0.0         3.0  Surrogates (2009)
597      0.0         3.0     Thinner (1996)
Recommended movies for the user id 137
     rating  prediction              title
589     0.0         4.0  Last Dance (1996)
592     0.0         3.0   Rock, The (1996)
Recommended movies for the user id 587
      rating  prediction                         title
1197     0.0         3.0  In the Company of Men (1997)
1198     0.0         3.0           Career Girls (1997)


In [379]:
Movie_for_user = pd.DataFrame([[0,0,0]])
Movie_for_user.columns = ['userId','movieId','title']
  
for k in list(range(1,X.index.shape[0])):
    
  A = make_recommendation_activeuser(X, X_pred.round(0), userId=k )

  B = A.sort_values(by='prediction', ascending=False)[:1]
  Movie_for_user.loc[k-1] = [k, B.index.values[0], B['title'].values[0]]

In [380]:
Movie_for_user


Unnamed: 0,userId,movieId,title
0,1.0,60069.0,
1,2.0,1.0,Jumanji (1995)
2,3.0,1097.0,Zeus and Roxanne (1997)
3,4.0,32.0,Babe (1995)
4,5.0,648.0,Chain Reaction (1996)
...,...,...,...
584,585.0,608.0,Heavy (1995)
585,586.0,1198.0,Career Girls (1997)
586,587.0,1197.0,In the Company of Men (1997)
587,588.0,165.0,Something to Talk About (1995)
