In [2]:
import pandas as pd
import numpy as np

In [3]:
def convert_matrix(data,user,item,rating):
    """Pivot a long-format ratings table into a 2-D NumPy array.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with one row per (user, item, rating) observation.
    user, item, rating : str
        Column names used as the pivot index, the pivot columns and the
        cell values, respectively.

    Returns
    -------
    numpy.ndarray
        Array with one row per distinct ``user`` value and one column per
        distinct ``item`` value; combinations absent from ``data`` are NaN.

    Raises
    ------
    ValueError
        Raised by ``DataFrame.pivot`` if a (user, item) pair occurs more
        than once.
    """
    pivoted = data.pivot(index=user, columns=item, values=rating)
    # DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is the
    # supported way to obtain the underlying array.
    return pivoted.to_numpy()

In [4]:
def get_user_mean(matrix):
    """Return the mean of every row of ``matrix``, skipping NaN entries."""
    row_means = np.nanmean(matrix, axis=1)
    return row_means

In [5]:
def centralize_matrix(matrix):
    """Subtract each row's NaN-ignoring mean from that row.

    NaN entries stay NaN in the result because NaN minus a number is NaN.
    """
    row_means = get_user_mean(matrix)
    # Turn the 1-D vector of means into a column so it broadcasts across rows.
    return matrix - np.expand_dims(row_means, axis=1)

In [6]:
def initialize(matrix,dimension = 1):
    """Create random starting factors for ``matrix``.

    Returns a pair ``(U, V)`` of arrays drawn uniformly from [0, 1) with
    shapes ``(matrix.shape[0], dimension)`` and
    ``(matrix.shape[1], dimension)``. ``U`` is drawn before ``V`` from
    NumPy's global random state.
    """
    shape = matrix.shape
    row_factors = np.random.rand(shape[0], dimension)
    col_factors = np.random.rand(shape[1], dimension)
    return row_factors, col_factors

In [7]:
def get_position_of_nan(matrix):
    """Return a boolean mask that is True wherever ``matrix`` holds NaN."""
    nan_mask = np.isnan(matrix)
    return nan_mask

In [8]:
def fill_nan(matrix):
    """Return a copy of ``matrix`` with NaN replaced by 0, plus the NaN mask.

    The input array is left unmodified. The second element of the returned
    tuple is the boolean mask of positions that were NaN in ``matrix``.
    """
    nan_mask = get_position_of_nan(matrix)
    filled = np.copy(matrix)
    filled[nan_mask] = 0
    return filled, nan_mask

In [9]:
def error_function(matrix,U,V):
    """Return the residual ``matrix - U.dot(V.T)`` of the factorization."""
    reconstruction = U.dot(V.T)
    return matrix - reconstruction
    

In [10]:
def gradient_descent(matrix, no_of_iterations = 500, error_tolerance = 0.005, dimension = 2, alpha = 0.001, verbose = True):
    """Factorize ``matrix`` as ``U.dot(V.T)`` by batch gradient descent.

    The objective is half the sum of squared residuals. Entries that are NaN
    in ``matrix`` are treated as missing: they contribute neither to the
    error nor to the gradient. A matrix whose missing entries were already
    replaced by a number (for example by ``fill_nan``) contains no NaN, so
    every entry, including the filled ones, is fitted as an observed value.

    Parameters
    ----------
    matrix : numpy.ndarray
        2-D array to factorize; may contain NaN.
    no_of_iterations : int
        Maximum number of gradient steps.
    error_tolerance : float
        Stop when the absolute change in the objective between two
        consecutive iterations falls below this value.
    dimension : int
        Number of latent factors (columns of ``U`` and ``V``).
    alpha : float
        Learning rate.
    verbose : bool
        When True (the default), print the iteration index and the absolute
        change in the objective on every iteration that performs an update.

    Returns
    -------
    (U, V) : tuple of numpy.ndarray
        Factors with shapes ``(matrix.shape[0], dimension)`` and
        ``(matrix.shape[1], dimension)``.
    """
    U, V = initialize(matrix, dimension)

    missing = np.isnan(matrix)
    has_missing = bool(missing.any())
    if has_missing:
        # Work on a NaN-free copy so the arithmetic stays finite; `missing`
        # remembers which entries must be ignored.
        matrix, missing = fill_nan(matrix)

    error = 0
    for i in range(no_of_iterations):
        error_matrix = error_function(matrix, U, V)
        if has_missing:
            # Without this, the zeros written by fill_nan would be fitted as
            # if they were real observations.
            error_matrix[missing] = 0
        new_error = 0.5 * np.sum(error_matrix * error_matrix)
        if np.isnan(new_error):
            # The iteration diverged (typically alpha too large); stop.
            break
        change = np.abs(new_error - error)
        if change < error_tolerance:
            break
        if verbose:
            print(i, change)
        # Simultaneous update: both right-hand sides use the factors from the
        # start of this iteration.
        U, V = U + alpha * error_matrix.dot(V), V + alpha * error_matrix.T.dot(U)
        error = new_error
    return U, V
        

In [22]:
# Directory and file name of the ratings CSV, relative to the notebook.
filename = 'ratings.csv'
path = '../ml-latest-small/'

In [23]:
# Load the ratings CSV into a DataFrame.
data = pd.read_csv(filepath_or_buffer=path + filename)

In [17]:
# Read the same CSV as a plain NumPy array: skip the header row and keep
# only the first three columns.
ratings_small = np.genfromtxt(
    path + filename,
    delimiter=',',
    skip_header=1,
)[:, :3]

In [18]:
# NOTE(review): `matrix_factorization` is not defined in any cell visible in
# this notebook, and this cell's execution count (18) is lower than that of
# the cells above it (22, 23). Unless the function is defined elsewhere,
# this call raises NameError on a fresh "Restart & Run All" -- TODO confirm
# where it comes from or remove this cell. The meaning of the three numeric
# arguments cannot be determined from here.
matrix_factorization(ratings_small,0.0002,0.02,500)

382196.7036946356
213453.0749401013
143950.94015734253
112294.3542730754
94226.89693537382
82436.78467256931
74156.34734419479
68043.95842266476
63357.64021560966
59654.73114948764
56656.695915547214
54180.59450664646
52101.76949190013
50332.489364226
48809.22690062378
47484.81491680126
46323.4566772614
45297.460218440814
44385.04374757406
43568.82423267725
42834.75263196566
42171.347972988675
41569.13582190768
41020.22944623658
40518.01252878984
40056.8954476324
39632.12572397335
39239.63895092322
38875.94038380659
38538.0100381811
38223.226009280756
37929.30205569014
37654.23644981689
37396.269800223476
37153.850071526016
36925.60341787067
36710.30974167878
36506.88211550189
36314.349379319756
36131.84136133025
35958.57627661939
35793.84994198714
35637.02651179203
35487.530492850026
35344.83983910535
35208.479961235586
35078.01851427836
34953.06084911194
34833.24603223111
34718.24335354143
34607.74925449896
34501.484619358875
34399.19238096129
34300.63539971449
34205.59458047818
3411

array([[4.74435472, 4.18095175, 3.81881624, ..., 3.02181078, 3.66662267,
        4.15704176],
       [3.88482786, 3.44483387, 3.170089  , ..., 2.45712482, 3.02321996,
        3.34147828],
       [2.43440078, 2.36560516, 2.53055132, ..., 1.4511603 , 1.54349878,
        1.74090108],
       ...,
       [3.61366922, 3.26284859, 3.1179503 , ..., 2.27021216, 2.64522344,
        3.03944805],
       [3.51982684, 3.15274765, 2.93461836, ..., 2.19987647, 2.77635226,
        2.93226309],
       [4.21829192, 3.75984954, 3.49136489, ..., 2.65875071, 3.25712757,
        3.59207122]])

In [24]:
# Names of the DataFrame columns holding the user id, the item id and the rating.
user_column_name, item_column_name, rating_column_name = 'userId', 'movieId', 'rating'

In [31]:
# Pivot the long-format ratings table into a 2-D array with one row per
# user id and one column per item id.
matrix = convert_matrix(
    data,
    user=user_column_name,
    item=item_column_name,
    rating=rating_column_name,
)
# Disabled experiment, kept for reference: restrict the matrix to its first
# 6000 columns.
# col_idx = np.array(range(0,6000))
# matrix = matrix[:, col_idx]

  


In [28]:
# Small hand-written 5 x 6 example; NaN marks a missing entry.
rank = np.array([
    [7, 6, 7, 4, 5, 4],
    [6, 7, np.nan, 4, 3, 4],
    [np.nan, 3, 3, 1, 1, np.nan],
    [1, 2, 2, 3, 3, 4],
    [1, np.nan, 1, 2, 3, 3],
])
# The same example with each row's mean over its non-missing entries
# subtracted (e.g. the first row has mean 5.5).
test_matrix = np.array([
    [1.5, 0.5, 1.5, -1.5, -0.5, -1.5],
    [1.2, 2.2, np.nan, -0.8, -1.8, -0.8],
    [np.nan, 1, 1, -1, -1, np.nan],
    [-1.5, -0.5, -0.5, 0.5, 0.5, 1.5],
    [-1, np.nan, -1, 0, 1, 1],
])

In [32]:
# Replace missing ratings with 0 and keep the mask of where they were.
(whole_matrix, position) = fill_nan(matrix)

In [33]:
# Fit a rank-3 factorization of the zero-filled matrix.
# NOTE(review): whole_matrix has every missing rating replaced by 0, so those
# zeros are fitted as if they were real ratings.
U, V = gradient_descent(
    whole_matrix,
    alpha=0.0002,
    dimension=3,
)

0 2493057.842828706
1 1261114.2478540754
2 519602.26625965885
3 56148.13388360466
4 7273.553914629272
5 3330.5647757168626
6 2382.7805141981225
7 1817.8500866006361
8 1426.837505072821
9 1151.5381342398468
10 956.98636499641
11 819.3946918332949
12 722.2262046667747
13 653.8681086736033
14 606.1086533465423
15 573.1116052950965
16 550.7137433799217
17 535.9373457846232
18 526.6485519150738
19 521.3161888065515
20 518.8405899845529
21 518.4316049368354
22 519.5214068056084
23 521.7020372669213
24 524.6805997352349
25 528.2470770088257
26 532.2511970849009
27 536.585793294129
28 541.1748305446235
29 545.9647866815794
30 550.9184475889197
31 556.0104394569062
32 561.2240115645109
33 566.5487192751607
34 571.9787551686168
35 577.5117465275107
36 583.1478883614764
37 588.8893175587291
38 594.7396600494394
39 600.7037019014824
40 606.7871487817029
41 612.9964481127681
42 619.3386552759912
43 625.8213303561788
44 632.4524554831441
45 639.240365580772
46 646.1936871118378
47 653.3212808450917


376 150.31246741878567
377 148.64639068528777
378 146.96859370160382
379 145.2804950933205
380 143.58348849252798
381 141.87894081755076
382 140.16819067089818
383 138.45254687499255
384 136.7332871358958
385 135.01165683369618
386 133.28886793827405
387 131.56609805719927
388 129.8444895982393
389 128.12514904822456
390 126.40914637898095
391 124.69751455564983
392 122.99124915356515
393 121.29130808927584
394 119.59861144213937
395 117.91404136328492
396 116.23844210570678
397 114.57262010721024
398 112.91734417102998
399 111.27334572758991
400 109.6413191530155
401 108.02192217559787
402 106.41577631863765
403 104.8234674326377
404 103.24554624874145
405 101.68252900842344
406 100.13489811384352
407 98.60310284071602
408 97.08756006404292
409 95.58865503483685
410 94.10674217762426
411 92.64214589743642
412 91.19516144285444
413 89.76605573995039
414 88.35506827937206
415 86.9624119871878
416 85.58827412582468
417 84.23281717271311
418 82.89617972809356
419 81.57847739866702
420 80.