In [1]:
import numpy as np
import pandas as pd
import os.path
from random import randint

ratings_file = './data/ratings.csv'
predictions_file = './data/predictions.csv'

ratings_description = pd.read_csv(ratings_file, delimiter=';',
                                  dtype={'userID': 'int', 'movieID': 'int', 'rating': 'int'},
                                  names=['userID', 'movieID', 'rating'])

num_movies = max(ratings_description["movieID"])
num_users = max(ratings_description["userID"])
R = np.zeros(( num_movies, num_users))

for user, movie, rating in ratings_description.values:
    R[movie-1, user-1] = rating

R[R==0] = np.nan
print(f"Shape of Utility matrix is (movies, users): {R.shape}")
R

Shape of Utility matrix is (movies, users): (3706, 6040)


array([[ 5., nan, nan, ..., nan, nan,  3.],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       ...,
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan]])

In [None]:
from tqdm.auto import tqdm

EPOCHS = 3000
LEARNING_RATE = 0.0001
LAMBDA = 0.1


K = 2 # number of factors to work with.

np.random.seed(42)

ratings = ratings_description.values.copy()
ratings[:, 0:2] = ratings[:, 0:2] - 1
Q = np.random.uniform(-1, 1, (R.shape[0], K)) # movies
P = np.random.uniform(-1, 1, (K, R.shape[1])) # users
div = (R.shape[0] * R.shape[1]) - np.isnan(R).sum()
RMSE = np.sqrt(((np.nan_to_num(R - np.matmul(Q, P), 0)**2).sum())/div)
print(f"Starting RMSE: {RMSE}")

sgd_learning_curve = []
for epoch in tqdm(range(EPOCHS)):
    np.random.shuffle(ratings) # inplace shuffle of matrix
    R_pred = np.matmul(Q, P)
    curr_error = 2*np.nan_to_num(R - R_pred, 0)
    for userID, movieID, rating in ratings:
        q_update = LEARNING_RATE * (curr_error[movieID, userID]*P[:, userID] - LAMBDA*Q[movieID, :])
        Q[movieID, :] = Q[movieID, :] + q_update

        p_update = LEARNING_RATE * (curr_error[movieID, userID]*Q[movieID, :] - LAMBDA*P[:, userID])
        P[:, userID] = P[:, userID] + p_update

    RMSE_i = np.sqrt(((np.nan_to_num(R - np.matmul(Q, P), 0)**2).sum())/div)
    print(f"RMSE {epoch}: {RMSE_i}")
    sgd_learning_curve.append([epoch, RMSE_i])

RMSE = np.sqrt(((np.nan_to_num(R - np.matmul(Q, P), 0)**2).sum())/div)
print(f"Final RMSE: {RMSE}")

Starting RMSE: 3.781830053581588


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3000.0), HTML(value='')))

RMSE 0: 3.7760681841922086
RMSE 1: 3.7717238810448235
RMSE 2: 3.7682908832146165
RMSE 3: 3.765442377327008
RMSE 4: 3.7629325572599654
RMSE 5: 3.7605208308055973
RMSE 6: 3.7579073275790793
RMSE 7: 3.7546406907342527
RMSE 8: 3.7499445707048884
RMSE 9: 3.7424623462825113
RMSE 10: 3.7297521165962038
RMSE 11: 3.707522186153162
RMSE 12: 3.6684981665579923
RMSE 13: 3.601067506749254
RMSE 14: 3.489643638591599
RMSE 15: 3.3196461686844456
RMSE 16: 3.091654534406349
RMSE 17: 2.8346880492174136
RMSE 18: 2.591261032792766
RMSE 19: 2.382767114992828
RMSE 20: 2.2077689841122425
RMSE 21: 2.06015495907341
RMSE 22: 1.9349892384458687
RMSE 23: 1.8283433335357147
RMSE 24: 1.736977169617252
RMSE 25: 1.658231536587939
RMSE 26: 1.5899426599317372
RMSE 27: 1.5303553862808947
RMSE 28: 1.4780633656331663
RMSE 29: 1.4319162812223387
RMSE 30: 1.390980450149969
RMSE 31: 1.3544949322181143
RMSE 32: 1.3218345683712083
RMSE 33: 1.2924798526406462
RMSE 34: 1.2659957585472057
RMSE 35: 1.2420166897908906
RMSE 36: 1.220

RMSE 288: 0.9031435645139856
RMSE 289: 0.9031080258550852
RMSE 290: 0.9030722825877826
RMSE 291: 0.9030364077334693
RMSE 292: 0.9030008005231888
RMSE 293: 0.9029650417573997
RMSE 294: 0.9029304077592468
RMSE 295: 0.9028943517232482
RMSE 296: 0.9028598707387193
RMSE 297: 0.9028240729119279
RMSE 298: 0.9027895787976926
RMSE 299: 0.9027543561319621
RMSE 300: 0.9027204613311136
RMSE 301: 0.9026848792481122
RMSE 302: 0.9026510673780909
RMSE 303: 0.9026156709102439
RMSE 304: 0.9025816767772041
RMSE 305: 0.902547257378369
RMSE 306: 0.9025135690510103
RMSE 307: 0.9024781875114444
RMSE 308: 0.9024453663214629
RMSE 309: 0.9024098172760305
RMSE 310: 0.9023775025594226
RMSE 311: 0.9023415179191997
RMSE 312: 0.9023098565531784
RMSE 313: 0.9022736339520641
RMSE 314: 0.9022421509351438
RMSE 315: 0.9022063330139445
RMSE 316: 0.9021747924212336
RMSE 317: 0.9021391899491149
RMSE 318: 0.9021078109915547
RMSE 319: 0.9020719783008758
RMSE 320: 0.9020410217627606
RMSE 321: 0.9020049811335861
RMSE 322: 0.901

  q_update = LEARNING_RATE * (curr_error[movieID, userID]*P[:, userID] - LAMBDA*Q[movieID, :])
  p_update = LEARNING_RATE * (curr_error[movieID, userID]*Q[movieID, :] - LAMBDA*P[:, userID])
  Q[movieID, :] = Q[movieID, :] + q_update
  p_update = LEARNING_RATE * (curr_error[movieID, userID]*Q[movieID, :] - LAMBDA*P[:, userID])


RMSE 429: 0.0
RMSE 430: 0.0
RMSE 431: 0.0
RMSE 432: 0.0
RMSE 433: 0.0
RMSE 434: 0.0
RMSE 435: 0.0
RMSE 436: 0.0
RMSE 437: 0.0
RMSE 438: 0.0
RMSE 439: 0.0
RMSE 440: 0.0
RMSE 441: 0.0
RMSE 442: 0.0
RMSE 443: 0.0
RMSE 444: 0.0
RMSE 445: 0.0
RMSE 446: 0.0
RMSE 447: 0.0
RMSE 448: 0.0
RMSE 449: 0.0
RMSE 450: 0.0
RMSE 451: 0.0
RMSE 452: 0.0
RMSE 453: 0.0
RMSE 454: 0.0
RMSE 455: 0.0
RMSE 456: 0.0
RMSE 457: 0.0
RMSE 458: 0.0
RMSE 459: 0.0
RMSE 460: 0.0
RMSE 461: 0.0
RMSE 462: 0.0
RMSE 463: 0.0
RMSE 464: 0.0
RMSE 465: 0.0
RMSE 466: 0.0
RMSE 467: 0.0
RMSE 468: 0.0
RMSE 469: 0.0
RMSE 470: 0.0
RMSE 471: 0.0
RMSE 472: 0.0
RMSE 473: 0.0
RMSE 474: 0.0
RMSE 475: 0.0
RMSE 476: 0.0
RMSE 477: 0.0
RMSE 478: 0.0
RMSE 479: 0.0
RMSE 480: 0.0
RMSE 481: 0.0
RMSE 482: 0.0
RMSE 483: 0.0
RMSE 484: 0.0
RMSE 485: 0.0
RMSE 486: 0.0
RMSE 487: 0.0
RMSE 488: 0.0
RMSE 489: 0.0
RMSE 490: 0.0
RMSE 491: 0.0
RMSE 492: 0.0
RMSE 493: 0.0
RMSE 494: 0.0
RMSE 495: 0.0
RMSE 496: 0.0
RMSE 497: 0.0
RMSE 498: 0.0
RMSE 499: 0.0
RMSE 5