In [1]:
import pandas as pd
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import turicreate as tc


  from ._conv import register_converters as _register_converters


In [2]:
# Users
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('u.user', sep="|", names=u_cols, encoding="latin-1")

# Ratings
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('u.data', sep='\t', names=r_cols, encoding='latin-1')

# Items
i_cols = ['movie id', 'movie title', 'release date', 'video release date', 'IMDb URL','unknown','Action',
          'Adventure','Animation','Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy','Film-Noir',
          'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']


items = pd.read_csv('u.item', sep='|', names=i_cols, encoding='latin-1')

In [3]:
print(users.shape)
users.tail()

(943, 5)


Unnamed: 0,user_id,age,sex,occupation,zip_code
938,939,26,F,student,33319
939,940,32,M,administrator,2215
940,941,20,M,student,97229
941,942,48,F,librarian,78209
942,943,22,M,student,77841


In [4]:
print(ratings.shape)
ratings.tail()

(100000, 4)


Unnamed: 0,user_id,movie_id,rating,unix_timestamp
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156
99999,12,203,3,879959583


In [5]:
print(items.shape)
items.tail()

(1682, 24)


Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
1677,1678,Mat' i syn (1997),06-Feb-1998,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1678,1679,B. Monkey (1998),06-Feb-1998,,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1679,1680,Sliding Doors (1998),01-Jan-1998,,http://us.imdb.com/Title?Sliding+Doors+(1998),0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1680,1681,You So Crazy (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?You%20So%20Cr...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1681,1682,Scream of Stone (Schrei aus Stein) (1991),08-Mar-1996,,http://us.imdb.com/M/title-exact?Schrei%20aus%...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# use pregenerated train test file by GroupLens

r_cols =['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings_train = pd.read_csv('ua.base', sep='\t', names=r_cols, encoding='latin-1')
ratings_test = pd.read_csv('ua.test', sep='\t', names=r_cols, encoding='latin-1')
ratings_train.shape,ratings_test.shape

((90570, 4), (9430, 4))

In [7]:
ratings_train.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712


In [8]:
#Building collaborative filtering


n_users = ratings.user_id.unique().shape[0]
n_items = ratings.movie_id.unique().shape[0]
n_users,n_items

(943, 1682)

In [10]:
# matrix of m*n
# m = user id
# n = movie id
# values are line[3] ie ratins
data_matrix = np.zeros((n_users,n_items))
for line in ratings.itertuples():
    #     line[1] (row)- user id
    #     line[2] (column)- movie id
    data_matrix[line[1]-1, line[2]-1] = line[3]

In [11]:
data_matrix

array([[5., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]])

In [12]:
from sklearn.metrics.pairwise import pairwise_distances
user_similarity = pairwise_distances(data_matrix, metric='cosine')
item_similarity = pairwise_distances(data_matrix.T, metric='cosine') #Transpose it to get the itemwise similarity

In [17]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        # user - user similarity
        mean_user_rating = ratings.mean(axis=1)
        # Using np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:,np.newaxis] + similarity.dot(ratings_diff)/np.array([np.abs(similarity).sum(axis=1)]).T
        
    elif type == 'item':
        # item -item similarity
        pred = ratings.dot(similarity)/np.array([np.abs(similarity).sum(axis=1)])
        
    return pred
        

In [18]:
user_prediction = predict(data_matrix, user_similarity, type='user')

In [19]:
user_prediction

array([[ 2.06532606,  0.73430275,  0.62992381, ...,  0.39359041,
         0.39304874,  0.3927712 ],
       [ 1.76308836,  0.38404019,  0.19617889, ..., -0.08837789,
        -0.0869183 , -0.08671183],
       [ 1.79590398,  0.32904733,  0.15882885, ..., -0.13699223,
        -0.13496852, -0.13476488],
       ...,
       [ 1.59151513,  0.27526889,  0.10219534, ..., -0.16735162,
        -0.16657451, -0.16641377],
       [ 1.81036267,  0.40479877,  0.27545013, ..., -0.00907358,
        -0.00846587, -0.00804858],
       [ 1.8384313 ,  0.47964837,  0.38496292, ...,  0.14686675,
         0.14629808,  0.14641455]])

In [20]:
# using turicreate instead of pandas

In [21]:
train_data = tc.SFrame(ratings_train)


In [22]:
train_data

user_id,movie_id,rating,unix_timestamp
1,1,5,874965758
1,2,3,876893171
1,3,4,878542960
1,4,3,876893119
1,5,3,889751712
1,6,5,887431973
1,7,4,875071561
1,8,1,875072484
1,9,5,878543541
1,10,3,875693118


In [23]:
test_data = tc.SFrame(ratings_test)
test_data

user_id,movie_id,rating,unix_timestamp
1,20,4,887431883
1,33,4,878542699
1,61,4,878542420
1,117,3,874965739
1,155,2,878542201
1,160,4,875072547
1,171,5,889751711
1,189,3,888732928
1,202,5,875072442
1,265,4,878542441


In [24]:
popularity_model = tc.popularity_recommender.create(train_data, user_id='user_id', item_id='movie_id', target='rating')

In [25]:
popularity_recomm = popularity_model.recommend(users=[1,2,3,4,5], k=5)
popularity_recomm.print_rows(num_rows=25)

+---------+----------+-------+------+
| user_id | movie_id | score | rank |
+---------+----------+-------+------+
|    1    |   1599   |  5.0  |  1   |
|    1    |   1201   |  5.0  |  2   |
|    1    |   1189   |  5.0  |  3   |
|    1    |   1122   |  5.0  |  4   |
|    1    |   814    |  5.0  |  5   |
|    2    |   1599   |  5.0  |  1   |
|    2    |   1201   |  5.0  |  2   |
|    2    |   1189   |  5.0  |  3   |
|    2    |   1122   |  5.0  |  4   |
|    2    |   814    |  5.0  |  5   |
|    3    |   1599   |  5.0  |  1   |
|    3    |   1201   |  5.0  |  2   |
|    3    |   1189   |  5.0  |  3   |
|    3    |   1122   |  5.0  |  4   |
|    3    |   814    |  5.0  |  5   |
|    4    |   1599   |  5.0  |  1   |
|    4    |   1201   |  5.0  |  2   |
|    4    |   1189   |  5.0  |  3   |
|    4    |   1122   |  5.0  |  4   |
|    4    |   814    |  5.0  |  5   |
|    5    |   1599   |  5.0  |  1   |
|    5    |   1201   |  5.0  |  2   |
|    5    |   1189   |  5.0  |  3   |
|    5    | 

In [26]:
# Recommend items for first 5 users


#train model
item_sim_model = tc.item_similarity_recommender.create(train_data, user_id='user_id', item_id='movie_id', target='rating', similarity_type='cosine')

In [27]:
#Making recommendations 
item_similarity_recommender = item_sim_model.recommend(users=[1,2,3,4,5], k=5)

In [28]:
item_similarity_recommender.print_rows(num_rows=25)

+---------+----------+--------------------+------+
| user_id | movie_id |       score        | rank |
+---------+----------+--------------------+------+
|    1    |   423    | 0.9834008066708805 |  1   |
|    1    |   202    | 0.9495907992352056 |  2   |
|    1    |   655    | 0.7962183331260244 |  3   |
|    1    |   403    | 0.765623665037956  |  4   |
|    1    |   568    | 0.7511795292828829 |  5   |
|    2    |    50    | 1.1256258487701416 |  1   |
|    2    |   181    | 1.0651773168490484 |  2   |
|    2    |    7     | 0.9998190838557023 |  3   |
|    2    |   121    |  0.94162796323116  |  4   |
|    2    |    9     | 0.831989913032605  |  5   |
|    3    |   313    | 0.6353766620159149 |  1   |
|    3    |   328    | 0.6032880300825293 |  2   |
|    3    |   315    | 0.5422587123784152 |  3   |
|    3    |   331    | 0.5355071858926252 |  4   |
|    3    |   332    | 0.5316696112806146 |  5   |
|    4    |    50    | 1.1311477082116264 |  1   |
|    4    |   288    | 1.048715

In [43]:
# We need the matrix factorization to observe how the user rates a movie(ie. latent features) -- Recommendation Engine

In [47]:
'''
Here,
Rating matrix = R(MxN) = PxQT

Where..
P(MxK)
Q(MxK)
QT = Transpose(Q)

'''



'\nHere,\nRating matrix = R(MxN) = PxQT\n\nWhere..\nP(MxK)\nQ(MxK)\nQT = Transpose(Q)\n\n'

In [48]:
# Class for Matrix Factorization

class MF():
    def __init__(self, R, K, alpha, beta, iterations):
        # initialize the user move rating matrix
        self.R = R
        self.num_users, self.num_items = R.shape
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations
        
    
    def train(self):
        
        #Initialize user-feature and movie feature
        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K) )
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))
        
        # Initialize the bias
        self.b_u = np.zeros(self.num_users)
        self.b_i = np.zeros(self.num_items)
        self.b = np.mean(self.R[np.where(self.R != 0)])
        
        
        # Training samples
        self.samples = [
            (i,j,self.R[i,j])
            for i in range(self.num_users)
            for j in range(self.num_items)
            if self.R[i,j] > 0
        ]
        
        
        # Gradient descent for given iterations 
        training_process = []
        for i in range(self.iterations):
            np.random.shuffle(self.samples)
        
            self.sgd() # stochastic gradient decent
            mse = self.mse() # mean squared error
            training_process.append((i, mse))
            if(i+1) % 20 == 0:
                print("\n Iteration:")
                print(i+1)
                
                print("\n mse")
                print(mse)
                #print("Iteration: %d; error= %.4f%" % (i+1, mse))
                
        
        
        return training_process
    
    # Get mean squared error
    def mse(self):
        xs, ys = self.R.nonzero()
        predicted = self.full_matrix()
        error = 0
        
        for x, y in zip(xs, ys):
            error += pow(self.R[x, y] - predicted[x,y], 2)
            
        return np.sqrt(error)
    
    
    # Stochastic gradient decent to get optimized P and Q
    def sgd(self):
        for i, j, r in self.samples:
            prediction = self.get_rating(i, j)
            e = (r - prediction)
            
            self.b_u[i] += self.alpha * (e - self.beta * self.b_u[i])
            self.b_i[j] += self.alpha * (e - self.beta * self.b_i[j])
            
            
            self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * self.P[i, :])
            self.Q[j, :] += self.alpha * (e * self.P[i, :] - self.beta * self.Q[j, :])
            
    
    # Ratings for user i and movie j
    def get_rating(self, i, j):
        prediction = self.b + self.b_u[i] + self.b_i[j] +self.P[i, :].dot(self.Q[j, :].T)
        return prediction
    
    
    # Full user-movie rating matrix
    def full_matrix(self):
        return mf.b + mf.b_u[:, np.newaxis] + mf.b_i[np.newaxis:, ] + mf.P.dot(mf.Q.T)
    
    
        


In [49]:
ratings.head() #check how the data looks like

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [50]:
# using pivot table to make it look like a matrix
R = np.array(ratings.pivot(index='user_id', columns='movie_id', values='rating').fillna(0))


In [51]:
R # see the matrix

array([[5., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]])

In [52]:
mf = MF(R, K=20, alpha=0.001, beta=0.01, iterations=100)


In [None]:
train_process = mf.train()


 Iteration:
20

 mse
296.1147909414822

 Iteration:
40

 mse
291.0482608658103

 Iteration:
60

 mse
287.6250073682274

 Iteration:
80

 mse
282.09242548989397


In [44]:
# PxQ -- Recommendation Engine
print(mf.full_matrix()) 

[[3.89642814 3.36381112 3.27616509 ... 3.50912249 3.53220987 3.51347355]
 [3.91985197 3.41509531 3.33266871 ... 3.5553829  3.56718258 3.55602933]
 [3.61983878 3.07924453 2.99101932 ... 3.2108813  3.24623585 3.23826523]
 ...
 [3.97947014 3.44801497 3.37388757 ... 3.58815886 3.60390499 3.6224218 ]
 [4.18332615 3.6563274  3.58974809 ... 3.8167104  3.83465766 3.82622831]
 [3.76301483 3.23503254 3.15399298 ... 3.37971476 3.3990444  3.39266078]]
