In [2]:
import pandas as pd
import numpy as np

In [3]:
def convert_matrix(data,user,item,rating):
    """Pivot a long-format ratings table into a 2-D NumPy array.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding one rating per row.
    user : column label
        Column whose distinct values become the rows of the result.
    item : column label
        Column whose distinct values become the columns of the result.
    rating : column label
        Column providing the cell values.

    Returns
    -------
    numpy.ndarray
        Array with one row per distinct ``user`` value and one column per
        distinct ``item`` value, ordered like the pivoted frame's index and
        columns. (user, item) pairs absent from ``data`` are NaN.

    Raises
    ------
    ValueError
        Propagated from ``DataFrame.pivot`` when a (user, item) pair occurs
        more than once in ``data``.
    """
    pivoted = data.pivot(index=user, columns=item, values=rating)
    # DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
    # .values returns the same ndarray and is available in every release.
    return pivoted.values

In [4]:
def get_user_mean(matrix):
    """Return the mean of each row of ``matrix``, ignoring NaN entries.

    The reduction runs along axis 1, so the result has one value per row.
    """
    row_means = np.nanmean(matrix, axis=1)
    return row_means

In [5]:
def centralize_matrix(matrix):
    """Subtract from every row of ``matrix`` that row's NaN-ignoring mean.

    NaN entries stay NaN in the result because NaN minus a number is NaN.
    """
    # Insert a length-1 axis after the row axis so each row's mean broadcasts
    # across that row's columns.
    row_offsets = np.expand_dims(get_user_mean(matrix), axis=1)
    return matrix - row_offsets

In [6]:
def initialize(matrix,dimension = 1):
    """Create random starting factor matrices for ``matrix``.

    Returns a pair ``(U, V)`` where ``U`` has shape
    ``(matrix.shape[0], dimension)`` and ``V`` has shape
    ``(matrix.shape[1], dimension)``. Both are drawn with ``np.random.rand``,
    i.e. uniformly from [0, 1), using NumPy's global random state.
    """
    n_rows = matrix.shape[0]
    n_cols = matrix.shape[1]
    # U is drawn before V; the order fixes which random numbers each one gets.
    row_factors = np.random.rand(n_rows, dimension)
    col_factors = np.random.rand(n_cols, dimension)
    return row_factors, col_factors

In [7]:
def get_position_of_nan(matrix):
    """Return a boolean mask that is True wherever ``matrix`` holds NaN."""
    nan_mask = np.isnan(matrix)
    return nan_mask

In [8]:
def fill_nan(matrix):
    """Return a copy of ``matrix`` with NaN replaced by 0, plus the NaN mask.

    The input array is left unmodified. The second element of the returned
    pair is the boolean mask of positions that were NaN in the input.
    """
    nan_mask = get_position_of_nan(matrix)
    filled = np.copy(matrix)
    filled[nan_mask] = 0
    return filled, nan_mask

In [9]:
def error_function(matrix,U,V):
    """Return the residual ``matrix - U V^T`` of the factorization.

    ``U.dot(V.T)`` is the reconstruction built from the two factor matrices;
    the result is the element-wise difference between ``matrix`` and it.
    """
    reconstruction = U.dot(V.T)
    residual = matrix - reconstruction
    return residual
    

In [10]:
def gradient_descent(matrix, no_of_iterations = 500, error_tolerance = 0.005, dimension = 2, alpha = 0.001):
    """Factorize ``matrix`` as ``U V^T`` by batch gradient descent.

    Entries that are NaN in ``matrix`` are treated as unobserved: they are
    replaced by 0 so the arithmetic stays finite, and their residuals are set
    to 0 on every iteration so they contribute neither to the loss nor to the
    gradient. A matrix without NaN is fitted on all of its entries.

    Parameters
    ----------
    matrix : numpy.ndarray
        2-D array to factorize; may contain NaN for missing entries.
    no_of_iterations : int
        Maximum number of update steps.
    error_tolerance : float
        Stop once the absolute change of the loss between two consecutive
        iterations falls below this value.
    dimension : int
        Number of columns of ``U`` and ``V``.
    alpha : float
        Step size of each update.

    Returns
    -------
    (U, V) : tuple of numpy.ndarray
        ``U`` has shape ``(matrix.shape[0], dimension)`` and ``V`` has shape
        ``(matrix.shape[1], dimension)``.

    Notes
    -----
    The loss is ``0.5 * sum(residual ** 2)``. The iteration index and the
    absolute loss change are printed on every step that performs an update.
    If the loss becomes NaN the loop stops and the current factors are
    returned as they are.
    """
    U, V = initialize(matrix, dimension)

    # Remember where ratings are missing before they are replaced by 0;
    # without this mask the filled-in zeros would be fitted as real ratings.
    missing = np.isnan(matrix)
    has_missing = missing.any()
    if has_missing:
        matrix, missing = fill_nan(matrix)

    error = 0
    for i in range(no_of_iterations):
        error_matrix = error_function(matrix, U, V)
        if has_missing:
            # error_function returns a fresh array, so masking in place is safe.
            error_matrix[missing] = 0
        new_error = 0.5 * np.sum(error_matrix * error_matrix)
        if np.isnan(new_error):
            break
        if np.abs(new_error - error) < error_tolerance:
            break
        print(i,np.abs(new_error - error))
        # Simultaneous update: both right-hand sides use the factors from
        # before this step.
        U, V = U + alpha * error_matrix.dot(V), V + alpha * error_matrix.T.dot(U)
        error = new_error
    return U, V
        

In [22]:
# Location of the ratings file, relative to the notebook's working directory.
# The trailing slash is required: the file name is appended to this string by
# plain concatenation when the file is read.
path = '../ml-latest-small/'
filename = 'ratings.csv'

In [23]:
# Read the ratings CSV into a DataFrame. `path` and `filename` are joined by
# string concatenation, so `path` must already end with a path separator.
data = pd.read_csv(path+filename)

In [24]:
# Column labels of `data` that are passed to convert_matrix as the row key,
# the column key and the cell value of the pivoted matrix.
user_column_name = 'userId'
item_column_name = 'movieId'
rating_column_name = 'rating'

In [34]:
# Pivot the long ratings table into a user-by-item array (NaN where a user has
# no rating for an item).
matrix = convert_matrix(data, user_column_name, item_column_name, rating_column_name)
# Disabled experiment: restrict the matrix to its first 6000 columns.
# col_idx = np.array(range(0,6000))
# matrix = matrix[:, col_idx]
# matrix

  


In [28]:
# Small 5 x 6 example of raw ratings; np.nan marks a missing rating.
rank = np.array([
    [7, 6, 7, 4, 5, 4],
    [6, 7, np.nan, 4, 3, 4],
    [np.nan, 3, 3, 1, 1, np.nan],
    [1, 2, 2, 3, 3, 4],
    [1, np.nan, 1, 2, 3, 3],
])
# The same example with each row's mean (taken over its non-missing entries)
# subtracted, i.e. the row-centered version of `rank`.
test_matrix = np.array([
    [1.5, 0.5, 1.5, -1.5, -0.5, -1.5],
    [1.2, 2.2, np.nan, -0.8, -1.8, -0.8],
    [np.nan, 1, 1, -1, -1, np.nan],
    [-1.5, -0.5, -0.5, 0.5, 0.5, 1.5],
    [-1, np.nan, -1, 0, 1, 1],
])

In [32]:
# Replace the missing ratings in `matrix` by 0 and keep the boolean mask of
# where they were.
# NOTE(review): this cell's execution count (32) is lower than that of the cell
# that defines `matrix` (34) - confirm the notebook survives Restart & Run All.
whole_matrix,position = fill_nan(matrix) 

In [35]:
# Factorize the ratings with 3 latent dimensions and a step size of 0.0002.
# NOTE(review): whole_matrix already has its NaNs replaced by 0, so
# gradient_descent finds no missing entries and fits those zeros as if they
# were ratings; `position` is not used by this call, and the ratings are not
# passed through centralize_matrix first - confirm this is intended.
U,V = gradient_descent(whole_matrix, dimension = 3, alpha = 0.0002)

0 2418869.4077299503
1 1185406.6780339263
2 512897.11248410225
3 63052.005996969296
4 8504.62634463422
5 3576.796196376672
6 2499.2404849259183
7 1895.8321501445025
8 1481.6998240273679
9 1189.9669443005696
10 983.354086500709
11 836.7804958826164
12 732.8380711859791
13 659.3025411176495
14 607.5247713928111
15 571.3514670381555
16 546.3872156022117
17 529.4828836260131
18 518.3772207413567
19 511.44380616757553
20 507.51132120087277
21 505.7353413836099
22 505.5065839908784
23 506.3850880993996
24 508.0529146546032
25 510.28011137561407
26 512.9001990171382
27 515.7925028948812
28 518.8694114994723
29 522.0671846762998
30 525.3393206108594
31 528.6517683181446
32 531.9794717039913
33 535.30387458927
34 538.6111194379628
35 541.8907469528494
36 545.1347572784871
37 548.3369324374944
38 551.4923473303206
39 554.5970170287183
40 557.6476424504071
41 560.6414271263639
42 563.5759453213541
43 566.4490472604521
44 569.2587911244482
45 572.0033944342285
46 574.6811993820593
47 577.290648279

371 162.9753035704489
372 163.62010592030128
373 164.26039884140482
374 164.89562287315493
375 165.52521164051723
376 166.14859220123617
377 166.76518545375438
378 167.37440654292004
379 167.97566534072394
380 168.56836692255456
381 169.1519120822195
382 169.72569789900444
383 170.28911830863217
384 170.84156472142786
385 171.38242665689904
386 171.9110924229608
387 172.42694980371743
388 172.9293867899105
389 173.41779232019326
390 173.89155706210295
391 174.35007419419708
392 174.79274023335893
393 175.2189558550017
394 175.6281267547165
395 176.0196645083488
396 176.39298744586995
397 176.74752155080205
398 177.08270134759368
399 177.39797080872813
400 177.6927842721343
401 177.96660732617602
402 178.21891774371034
403 178.44920637347968
404 178.65697803121293
405 178.84175240376499
406 179.00306491181254
407 179.14046757557662
408 179.2535298653529
409 179.3418395142071
410 179.40500333282398
411 179.44264797429787
412 179.45442069106502
413 179.43999004759826
414 179.3990466086543