In [6]:
#importing the required libraries
import numpy as np
import pandas as pd
import pickle
from models import matrix_factorization_utilities
import scipy.sparse as sp
from scipy.sparse.linalg import svds
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, ground_truth):
    #select prediction values that are non-zero and flatten into 1 array
    prediction = prediction[ground_truth.nonzero()].flatten() 
    #select test values that are non-zero and flatten into 1 array
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    #return RMSE between values
    return sqrt(mean_squared_error(prediction, ground_truth))

In [11]:
# Reading the ratings data
ratings = pd.read_csv('Dataset/ratings.csv')

In [12]:
#Just taking the required columns
ratings = ratings[['userId', 'movieId','rating']]

In [32]:
#reading the movies dataset
movie_list = pd.read_csv('Dataset/movies.csv')

In [13]:
#get ordered list of movieIds
item_indices = pd.DataFrame(sorted(list(set(ratings['movieId']))),columns=['movieId'])
#add in data frame index value to data frame
item_indices['movie_index']=item_indices.index
#inspect data frame
item_indices.head()

Unnamed: 0,movieId,movie_index
0,1,0
1,2,1
2,3,2
3,4,3
4,5,4


In [14]:
#get ordered list of movieIds
user_indices = pd.DataFrame(sorted(list(set(ratings['userId']))),columns=['userId'])
#add in data frame index value to data frame
user_indices['user_index']=user_indices.index
#inspect data frame
user_indices.head()

Unnamed: 0,userId,user_index
0,1,0
1,2,1
2,3,2
3,4,3
4,5,4


In [15]:
#join the movie indices
df_with_index = pd.merge(ratings,item_indices,on='movieId')
#join the user indices
df_with_index=pd.merge(df_with_index,user_indices,on='userId')
#inspec the data frame
df_with_index.head()

Unnamed: 0,userId,movieId,rating,movie_index,user_index
0,1,2,3.5,1,0
1,1,29,3.5,28,0
2,1,32,3.5,31,0
3,1,47,3.5,46,0
4,1,50,3.5,49,0


In [16]:
#import train_test_split module
from sklearn.model_selection import train_test_split
#take 80% as the training set and 20% as the test set
df_train, df_test= train_test_split(df_with_index,test_size=0.2)
print(len(df_train))
print(len(df_test))

838860
209715


In [17]:
df_train.head()

Unnamed: 0,userId,movieId,rating,movie_index,user_index
444011,4010,2140,2.0,2000,4009
985620,2074,1126,3.0,1078,2073
849242,6538,2160,3.0,2020,6537
112688,2051,2011,4.0,1871,2050
247632,4295,158,3.0,153,4294


In [18]:
df_test.head()

Unnamed: 0,userId,movieId,rating,movie_index,user_index
872542,5844,1222,5.0,1167,5843
394267,7051,2565,4.0,2416,7050
960074,5378,318,3.0,312,5377
833786,4745,6296,4.5,5929,4744
447184,4222,60161,4.0,10459,4221


In [20]:
n_users = ratings.userId.unique().shape[0]
n_items = ratings.movieId.unique().shape[0]
n_users, n_items

(7120, 14026)

In [21]:
#Create two user-item matrices, one for training and another for testing
train_data_matrix = np.zeros((n_users, n_items))
    #for every line in the data
for line in df_train.itertuples():
    #set the value in the column and row to 
    #line[1] is userId, line[2] is movieId and line[3] is rating, line[4] is movie_index and line[5] is user_index
    train_data_matrix[line[5], line[4]] = line[3]
train_data_matrix.shape

(7120, 14026)

In [22]:
#Create two user-item matrices, one for training and another for testing
test_data_matrix = np.zeros((n_users, n_items))
    #for every line in the data
for line in df_test[:1].itertuples():
    #set the value in the column and row to 
    #line[1] is userId, line[2] is movieId and line[3] is rating, line[4] is movie_index and line[5] is user_index
    #print(line[2])
    test_data_matrix[line[5], line[4]] = line[3]
    #train_data_matrix[line['movieId'], line['userId']] = line['rating']
test_data_matrix.shape

(7120, 14026)

In [23]:
pd.DataFrame(train_data_matrix).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14016,14017,14018,14019,14020,14021,14022,14023,14024,14025
0,0.0,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
df_train['rating'].max()

5.0

In [30]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, ground_truth):
    #select prediction values that are non-zero and flatten into 1 array
    prediction = prediction[ground_truth.nonzero()].flatten() 
    #select test values that are non-zero and flatten into 1 array
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    #return RMSE between values
    return sqrt(mean_squared_error(prediction, ground_truth))

In [31]:
#Calculate the rmse sscore of SVD using different values of k (latent features)
rmse_list = []
for i in [1,2,5,20,40,60,100,200]:
    #apply svd to the test data
    u,s,vt = svds(train_data_matrix,k=i)
    #get diagonal matrix
    s_diag_matrix=np.diag(s)
    #predict x with dot product of u s_diag and vt
    X_pred = np.dot(np.dot(u,s_diag_matrix),vt)
    #calculate rmse score of matrix factorisation predictions
    rmse_score = rmse(X_pred,test_data_matrix)
    rmse_list.append(rmse_score)
    print("Matrix Factorisation with " + str(i) +" latent features has a RMSE of " + str(rmse_score))

Matrix Factorisation with 1 latent features has a RMSE of 4.669539459205359
Matrix Factorisation with 2 latent features has a RMSE of 4.6653512254462415
Matrix Factorisation with 5 latent features has a RMSE of 4.493305172874121
Matrix Factorisation with 20 latent features has a RMSE of 4.570271840083156
Matrix Factorisation with 40 latent features has a RMSE of 4.288177669136882
Matrix Factorisation with 60 latent features has a RMSE of 3.791420300303799
Matrix Factorisation with 100 latent features has a RMSE of 4.2092413549728125
Matrix Factorisation with 200 latent features has a RMSE of 3.951941269445904


In [33]:
#Convert predictions to a DataFrame
mf_pred = pd.DataFrame(X_pred)
mf_pred.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14016,14017,14018,14019,14020,14021,14022,14023,14024,14025
0,-0.601059,2.63117,-0.537755,-0.088776,-0.076192,0.055855,-0.028535,-0.08162,-0.117304,-0.429255,...,-0.007231,-0.001994,0.004626,-0.000287,-0.013336,0.002568,-0.038203,0.00131,0.018335,-0.040008
1,0.083206,0.077521,0.475671,0.12485,-0.057103,0.207314,0.126763,0.063721,0.061729,0.129535,...,0.006368,0.00649,-0.007992,0.001102,-0.003908,-0.000977,0.004338,0.007194,-0.017782,-0.011723
2,-0.24074,0.109592,0.209537,-0.126255,-0.232914,0.276836,0.42093,-0.071511,-0.025971,-0.776823,...,0.010084,-0.005037,0.017133,-0.000319,-0.003982,0.004882,0.019185,-0.003618,0.027944,-0.011945
3,-0.175118,0.181085,0.335878,-0.038408,0.231401,1.673522,-0.235199,0.022577,0.087149,2.221013,...,0.001793,-0.002718,-0.001312,0.000194,-0.002539,0.003536,-0.004002,0.002441,0.007841,-0.007616
4,0.409409,1.289695,0.636893,-0.133651,0.524818,0.880327,1.038515,0.07829,0.289993,0.384463,...,-0.013554,0.012307,-0.007743,0.000298,-0.002213,-0.00089,0.013485,-0.009639,0.013161,-0.006639


In [34]:
df_names = pd.merge(ratings,movie_list,on='movieId')
df_names.head()

Unnamed: 0,userId,movieId,rating,title,genres
0,1,2,3.5,Jumanji (1995),Adventure|Children|Fantasy
1,5,2,3.0,Jumanji (1995),Adventure|Children|Fantasy
2,13,2,3.0,Jumanji (1995),Adventure|Children|Fantasy
3,29,2,3.0,Jumanji (1995),Adventure|Children|Fantasy
4,34,2,3.0,Jumanji (1995),Adventure|Children|Fantasy


In [35]:
#choose a user ID
user_id = 1
#get movies rated by this user id
users_movies = df_names.loc[df_names["userId"]==user_id]
#print how many ratings user has made 
print("User ID : " + str(user_id) + " has already rated " + str(len(users_movies)) + " movies")
#list movies that have been rated
users_movies

User ID : 1 has already rated 175 movies


Unnamed: 0,userId,movieId,rating,title,genres
0,1,2,3.5,Jumanji (1995),Adventure|Children|Fantasy
1155,1,29,3.5,"City of Lost Children, The (Cité des enfants p...",Adventure|Drama|Fantasy|Mystery|Sci-Fi
1603,1,32,3.5,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
3915,1,47,3.5,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
6156,1,50,3.5,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
...,...,...,...,...,...
130066,1,8507,5.0,Freaks (1932),Crime|Drama|Horror
130115,1,8636,4.5,Spider-Man 2 (2004),Action|Adventure|Sci-Fi|IMAX
130973,1,8690,3.5,Slaughterhouse-Five (1972),Comedy|Drama|Sci-Fi|War
131002,1,8961,4.0,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy


In [36]:
user_index = df_train.loc[df_train["userId"]==user_id]['user_index'][:1].values[0]
#get movie ratings predicted for this user and sort by highest rating prediction
sorted_user_predictions = pd.DataFrame(mf_pred.iloc[user_index].sort_values(ascending=False))
#rename the columns
sorted_user_predictions.columns=['ratings']
#save the index values as movie id
sorted_user_predictions['movieId']=sorted_user_predictions.index
print("Top 10 predictions for User " + str(user_id))
#display the top 10 predictions for this user
pd.merge(sorted_user_predictions,movie_list, on = 'movieId')[:10]

Top 10 predictions for User 1


Unnamed: 0,ratings,movieId,title,genres
0,5.812224,5622,Charly (2002),Comedy|Drama|Romance
1,5.25839,6699,Once Upon a Time in the Midlands (2002),Drama
2,5.201336,1052,"Proprietor, The (1996)",Drama
3,4.617228,4738,Happy Accidents (2000),Romance|Sci-Fi
4,4.324602,254,Jefferson in Paris (1995),Drama
5,4.222204,1144,"Line King: The Al Hirschfeld Story, The (1996)",Documentary
6,4.188636,312,Stuart Saves His Family (1995),Comedy
7,4.182615,1201,"Good, the Bad and the Ugly, The (Buono, il bru...",Action|Adventure|Western
8,4.13567,3773,House Party (1990),Comedy
9,4.122634,1146,Curtis's Charm (1995),Comedy|Drama


In [37]:
#count number of unique users
numUsers = df_train.userId.unique().shape[0]
#count number of unitque movies
numMovies = df_train.movieId.unique().shape[0]
print(len(df_train))
print(numUsers) 
print(numMovies) 

838860
7120
13436


In [38]:
#Separate out the values of the df_train data set into separate variables
Users = df_train['userId'].values
Movies = df_train['movieId'].values
Ratings = df_train['rating'].values
print(Users),print(len(Users))
print(Movies),print(len(Movies))
print(Ratings),print(len(Ratings))

[4010 2074 6538 ... 5312 6446 3768]
838860
[2140 1126 2160 ...  921  203  955]
838860
[2. 3. 3. ... 3. 3. 5.]
838860


(None, None)

In [40]:
#import libraries
import keras
from keras.layers import Embedding, Reshape, concatenate
from keras.models import Sequential
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [41]:
# Couting no of unique users and movies
len(ratings.userId.unique()), len(ratings.movieId.unique())

(7120, 14026)

In [42]:
# Assigning a unique value to each user and movie in range 0,no_of_users and 0,no_of_movies respectively.
ratings.userId = ratings.userId.astype('category').cat.codes.values
ratings.movieId = ratings.movieId.astype('category').cat.codes.values

In [43]:
# Splitting the data into train and test.
train, test = train_test_split(ratings, test_size=0.2)

In [45]:
train.head

<bound method NDFrame.head of          userId  movieId  rating
176221     1190     1879     4.0
1016384    6880     6385     3.5
430586     2927    11411     3.0
712568     4746      373     5.0
342531     2319     5307     2.5
...         ...      ...     ...
716818     4770      580     3.0
1038673    7039     6940     5.0
117804      786     2444     4.0
323413     2208     2913     2.5
440064     2989      338     4.0

[838860 rows x 3 columns]>

In [46]:
test.head

<bound method NDFrame.head of         userId  movieId  rating
370522    2511      337     4.0
755659    5035     8642     2.5
32080      250      962     4.0
516534    3491     6147     4.5
59511      421     7111     4.0
...        ...      ...     ...
647074    4312     1318     4.0
547760    3672     1780     3.0
517004    3497    12029     2.5
108602     740     8546     4.5
539133    3628     6145     2.5

[209715 rows x 3 columns]>

In [47]:
n_users, n_movies = len(ratings.userId.unique()), len(ratings.movieId.unique())

In [111]:
# Returns a neural network model which does recommendation
#def neural_network_model(n_latent_factors_user, n_latent_factors_movie):
    
movie_input = keras.layers.Input(shape=[1],name='Item')
movie_embedding = keras.layers.Embedding(n_movies + 1, 13, name='Movie-Embedding')(movie_input)
# 13 yerinen_latent_factors_movie
movie_vec = keras.layers.Flatten(name='FlattenMovies')(movie_embedding)
movie_vec = keras.layers.Dropout(0.2)(movie_vec)


user_input = keras.layers.Input(shape=[1],name='User')
user_vec = keras.layers.Flatten(name='FlattenUsers')(keras.layers.Embedding(n_users + 1, 10,name='User-Embedding')(user_input))
#10 yerine  n_latent_factors_user
user_vec = keras.layers.Dropout(0.2)(user_vec)


concat = keras.layers.concatenate([movie_vec, user_vec],name='Concat')
concat_dropout = keras.layers.Dropout(0.2)(concat)
dense = keras.layers.Dense(100,name='FullyConnected')(concat)
dropout_1 = keras.layers.Dropout(0.2,name='Dropout')(dense)
dense_2 = keras.layers.Dense(50,name='FullyConnected-1')(concat)
dropout_2 = keras.layers.Dropout(0.2,name='Dropout')(dense_2)
dense_3 = keras.layers.Dense(20,name='FullyConnected-2')(dense_2)
dropout_3 = keras.layers.Dropout(0.2,name='Dropout')(dense_3)
dense_4 = keras.layers.Dense(10,name='FullyConnected-3', activation='relu')(dense_3)


result = keras.layers.Dense(1, activation='relu',name='Activation')(dense_4)
adam = Adam(lr=0.005)
model = keras.Model([user_input, movie_input], result)
model.compile(optimizer=adam,loss= 'mean_absolute_error')
    
history_neural_network = model.fit([train.userId, train.movieId], train.rating, epochs=5, verbose=0)    
    
    
    
    #return model

In [112]:
model.summary()

Model: "model_6"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Item (InputLayer)               [(None, 1)]          0                                            
__________________________________________________________________________________________________
User (InputLayer)               [(None, 1)]          0                                            
__________________________________________________________________________________________________
Movie-Embedding (Embedding)     (None, 1, 13)        182351      Item[0][0]                       
__________________________________________________________________________________________________
User-Embedding (Embedding)      (None, 1, 10)        71210       User[0][0]                       
____________________________________________________________________________________________

In [115]:
y_hat = np.round(model.predict([test.userId, test.movieId]),0)
y_true = test.rating

In [116]:
mean_absolute_error(y_true, y_hat)

0.6584602913477815