In [2]:
import pandas as pd
import numpy as np

In [3]:
def convert_matrix(data,user,item,rating):
    """Pivot a long-format ratings table into a dense 2-D NumPy array.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with one row per (user, item) pair.
    user : column label
        Column whose distinct values become the rows of the result.
    item : column label
        Column whose distinct values become the columns of the result.
    rating : column label
        Column supplying the cell values.

    Returns
    -------
    numpy.ndarray
        The pivoted values; (user, item) pairs absent from ``data`` are NaN.

    Raises
    ------
    ValueError
        Raised by ``DataFrame.pivot`` when a (user, item) pair occurs more
        than once.
    """
    pivoted = data.pivot(index=user, columns=item, values=rating)
    # DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
    # to_numpy() is the supported way to obtain the underlying ndarray.
    return pivoted.to_numpy()

In [4]:
def get_user_mean(matrix):
    """Return the mean of each row of ``matrix``, ignoring NaN entries.

    The result has one value per row (the reduction runs along axis 1).
    """
    row_means = np.nanmean(matrix, axis=1)
    return row_means

In [5]:
def centralize_matrix(matrix):
    """Subtract from every row of ``matrix`` that row's NaN-ignoring mean.

    NaN entries stay NaN in the returned array; ``matrix`` itself is not
    modified.
    """
    row_means = get_user_mean(matrix)
    # Indexing with None inserts a new axis, turning the means into a column
    # that broadcasts across the columns of ``matrix``.
    return matrix - row_means[:, None]

In [6]:
def initialize(matrix,dimension = 1):
    """Create random starting factors for ``matrix``.

    Returns a pair ``(U, V)`` of arrays filled by ``np.random.rand``:
    ``U`` has ``matrix.shape[0]`` rows, ``V`` has ``matrix.shape[1]`` rows,
    and both have ``dimension`` columns.
    """
    # The generator is consumed in order, so U (axis 0) is drawn before
    # V (axis 1) from the global NumPy random state.
    U, V = (np.random.rand(matrix.shape[axis], dimension) for axis in (0, 1))
    return U, V

In [7]:
def get_position_of_nan(matrix):
    """Return a boolean mask, shaped like ``matrix``, that is True at NaN entries."""
    nan_mask = np.isnan(matrix)
    return nan_mask

In [21]:
def fill_nan(matrix):
    """Return a copy of ``matrix`` with its NaN entries set to 0, plus the NaN mask.

    The input is left untouched.  The second element of the returned pair is
    the boolean mask marking where the NaNs were.
    """
    pos = get_position_of_nan(matrix)
    # Work on an independent copy so the caller's array keeps its NaNs.
    new_matrix = np.array(matrix, copy=True)
    new_matrix[pos] = 0
    return new_matrix, pos

In [9]:
def error_function(matrix,U,V):
    """Return the residual ``matrix - U.dot(V.T)`` as a new array."""
    reconstruction = np.dot(U, V.T)
    return matrix - reconstruction
    

In [105]:
def gradient_descent(matrix, no_of_iterations = 500, error_tolerance = 0.005, dimension = 2, alpha = 0.001, verbose = True):
    """Factorise ``matrix`` into ``U.dot(V.T)`` with batch gradient descent.

    The objective is half the sum of squared residuals over the *observed*
    entries.  Entries that are NaN in ``matrix`` are treated as missing: they
    contribute neither to the reported error nor to the gradients.  A matrix
    without NaNs is fitted on all of its entries.

    Parameters
    ----------
    matrix : numpy.ndarray
        2-D array to factorise; NaN marks a missing entry.
    no_of_iterations : int
        Upper bound on the number of update steps.
    error_tolerance : float
        Stop once the objective changes by less than this amount between two
        consecutive iterations (the first iteration is compared with 0).
    dimension : int
        Number of latent factors, i.e. columns of ``U`` and ``V``.
    alpha : float
        Step size of each update.
    verbose : bool
        When true (the default) print the iteration index and the absolute
        change of the objective before each update.

    Returns
    -------
    (U, V)
        ``U`` has one row per row of ``matrix`` and ``V`` one row per column.
        If the objective becomes NaN the loop stops early and the current
        factors, which may then contain non-finite values, are returned.
    """
    U,V = initialize(matrix,dimension)
    # Mask of missing entries; stays None when every entry is observed.
    pos = None
    if np.isnan(matrix).any():
        matrix, pos = fill_nan(matrix)
    error = 0
    for i in range(no_of_iterations):
        error_matrix = error_function(matrix, U, V)
        if pos is not None:
            # The zeros written by fill_nan are placeholders, not ratings:
            # zero their residuals so they do not pull U.dot(V.T) towards 0.
            error_matrix[pos] = 0
        new_error = 0.5 * np.sum(error_matrix * error_matrix)
        if np.isnan(new_error):
            break
        if np.abs(new_error - error) < error_tolerance:
            break
        if verbose:
            print(i,np.abs(new_error - error))
        # Simultaneous update: both right-hand sides use the factors from the
        # start of this iteration.
        U, V = U + alpha * error_matrix.dot(V), V + alpha * error_matrix.T.dot(U)
        error = new_error
    return U,V
        

In [11]:
# Directory and file name of the ratings CSV; the directory is given relative
# to the notebook's working directory.
path = '../ml-latest-small/'
filename = 'ratings.csv'

In [12]:
# Load the ratings file; path and filename are joined by plain string concatenation.
data = pd.read_csv(path+filename)

In [13]:
# Column labels of `data` used as the user, item and rating fields when the
# table is pivoted.
user_column_name = 'userId'
item_column_name = 'movieId'
rating_column_name = 'rating'

In [106]:
# Indices of the first 6000 columns.
col_idx = np.arange(6000)
# Pivot the ratings into a user x item array and keep only those columns.
matrix = convert_matrix(data, user_column_name, item_column_name, rating_column_name)[:, col_idx]
matrix

  


array([[4. , nan, 4. , ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       ...,
       [2.5, 2. , 2. , ..., nan, nan, nan],
       [3. , nan, nan, ..., nan, nan, nan],
       [5. , nan, nan, ..., nan, 4. , 5. ]])

In [107]:
# Subtract each row's NaN-ignoring mean from that row; missing entries stay NaN.
centered_matrix = centralize_matrix(matrix)

In [108]:
# Check the dimensions of the array after the column subset.
matrix.shape

(610, 6000)

In [109]:
# Small 5 x 6 example with missing entries (NaN).
rank = np.array([[7,6,7,4,5,4],[6,7,np.nan,4,3,4],[np.nan,3,3,1,1,np.nan]
                        ,[1,2,2,3,3,4],[1,np.nan,1,2,3,3]])
# The same example with each row's NaN-ignoring mean subtracted
# (row means 5.5, 4.8, 2, 2.5 and 2), written out by hand.
test_matrix = np.array([[1.5,0.5,1.5,-1.5,-0.5,-1.5],[1.2,2.2,np.nan,-0.8,-1.8,-0.8],[np.nan,1,1,-1,-1,np.nan]
                        ,[-1.5,-0.5,-0.5,0.5,0.5,1.5],[-1,np.nan,-1,0,1,1]])

In [110]:
# Replace the NaNs of the centred matrix with 0 and keep the boolean mask of
# where they were.
whole_matrix,position = fill_nan(centered_matrix) 

In [111]:
# Factorise the zero-filled matrix with 2 latent dimensions.
# NOTE(review): whole_matrix no longer contains NaN, so the solver fits the
# missing ratings as if they were observed zeros — confirm this is intended.
U,V = gradient_descent(whole_matrix, dimension = 2)

0 640546.9921885999
1 1020830.2226733523
2 1046096.3008536304
3 504684.01199803094
4 65655.5810625898
5 4855.33748036476
6 1991.8905292921772
7 1028.0822524528194
8 598.374100330635
9 375.21910159986146
10 247.7888831421369
11 170.38549159980903
12 121.43077131180326
13 89.6566141573494
14 68.70857198008162
15 54.787032869098766
16 45.51835731457686
17 39.37162133305537
18 35.33675985892478
19 32.73675008011196
20 31.112860604800517
21 30.15195938116085
22 29.639194442708686
23 29.42658150925854
24 29.411866836737317
25 29.524175970342185
26 29.714214551233454
27 29.94755510456889
28 30.200030240688648
29 30.454569630623155
30 30.699028851151525
31 30.924700314841175
32 31.12529328276287
33 31.296236245441833
34 31.434200542069448
35 31.53677549056738
36 31.602246960173943
37 31.629446267041203
38 31.617646589234937
39 31.56649122176168
40 31.475942910554295
41 31.34624690477358
42 31.177902718642144
43 30.971641220952733
44 30.728404802226578
45 30.44932914985111
46 30.13572571268014


354 0.03320354011520976
355 0.03337443435884779
356 0.03354646937805228
357 0.033719647297402844
358 0.033893970685312524
359 0.03406944219022989
360 0.03424606478802161
361 0.034423841432726476
362 0.03460277539852541
363 0.03478287019242998
364 0.034964129255968146
365 0.03514655640901765
366 0.03533015567518305
367 0.03551493093254976
368 0.035700886488484684
369 0.035888026766770054
370 0.03607635618391214
371 0.036265879491111264
372 0.036456601294048596
373 0.036648526634962764
374 0.036841660614300054
375 0.03703600819426356
376 0.03723157471540617
377 0.03742836579476716
378 0.03762638662738027
379 0.03782564295397606
380 0.03802614066080423
381 0.038227885343076196
382 0.03843088311259635
383 0.0386351399065461
384 0.038840662033180706
385 0.0390474555460969
386 0.039255526855413336
387 0.039464882444008254
388 0.03967552878020797
389 0.03988747255061753
390 0.04010072027449496
391 0.04031527904589893
392 0.04053115547139896
393 0.04074835663777776
394 0.04096688952995464
395 