In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [53]:
df = pd.read_csv("data/final_data.csv")
data_matrix = df.pivot(index='u_id', columns='a_id', values='score').fillna(0)
data_matrix_values = data_matrix.values

In [54]:
# example given in the notebook
'''
Original matrix = 0.5 ? 4
                   1  3 5
'''
U = np.array([[0.7461],
              [1.7966]])
P = np.array([[0.758, 2.5431, 4.7999]])
U@P

array([[0.5655438 , 1.89740691, 3.58120539],
       [1.3618228 , 4.56893346, 8.62350034]])

In [55]:

def als(matrix, rank, iterations, regularization=0.1):
    num_users, num_items = matrix.shape
    U = np.ones((num_users, rank))
    V = np.ones((num_items, rank))
    mask = matrix > 0

    for _ in range(iterations):
        for i in range(num_users):
            V_i = V[mask[i]]
            if V_i.size == 0:
                continue
            A = V_i.T @ V_i + regularization * np.eye(rank)
            b = V_i.T @ matrix[i, mask[i]]
            U[i] = np.linalg.lstsq(A, b, rcond=None)[0]
        
        # Update item factors
        for j in range(num_items):
            U_j = U[mask[:, j]]
            if U_j.size == 0:
                continue
            A = U_j.T @ U_j + regularization * np.eye(rank)
            b = U_j.T @ matrix[mask[:, j], j]
            V[j] = np.linalg.lstsq(A, b, rcond=None)[0]
    
    return U, V.T

In [None]:
# to minimize overfitting, we kept the rank 10 for all tests

In [60]:
U, VT = als(data_matrix_values, rank=10, iterations=10)
predicted_ratings_10_iters = U @ VT

predicted_ratings_10_iters_df = pd.DataFrame(predicted_ratings_10_iters, index=data_matrix.index, columns=data_matrix.columns)
predicted_ratings_10_iters_df

a_id,1,5,6,7,15,16,19,20,21,22,...,39417,39456,39468,39491,39523,39533,39539,39597,39799,40004
u_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,6.990699,7.526884,7.956186,5.702688,7.327281,9.993356,7.295911,8.586788,5.792930,7.735881,...,7.418772,6.309680,6.659972,7.358424,3.000078,8.098090,6.835845,6.005101,3.234678,8.105149
1,9.194580,9.325545,9.565927,8.454279,10.933969,7.926474,10.359003,8.365121,10.046568,7.470284,...,4.944679,6.079224,11.169156,5.873439,4.914455,10.591659,9.436165,8.668546,4.061763,8.874951
2,3.544799,3.186989,3.671386,3.488300,6.082634,2.215627,6.513509,2.851057,7.580717,2.624385,...,4.606369,3.895819,6.353455,1.706768,1.868226,6.341154,5.174367,4.042594,0.362664,3.839642
3,12.777242,8.359745,9.868027,6.114076,6.870583,7.815904,10.429933,6.989832,4.139393,9.585628,...,2.132926,4.669968,2.946289,6.885831,4.802034,7.692794,6.941358,3.227970,5.494610,7.936726
6,11.880909,6.009618,6.698956,6.212885,8.996147,10.610504,5.894985,3.320816,9.889852,6.375262,...,6.471232,-0.755051,3.549214,1.140652,3.162073,5.341452,7.514824,11.310094,-3.119099,4.178701
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42845,7.717743,8.429404,8.392085,2.874044,7.326086,8.412269,7.114729,11.153077,0.559217,10.817388,...,2.146919,2.619836,2.584566,6.997557,3.602790,8.213952,4.209383,3.094553,4.023200,5.867582
42896,4.871975,0.575599,4.858513,2.238784,6.351292,1.212430,5.249751,-2.241591,6.553439,0.153930,...,3.611832,0.855669,3.858239,3.257869,7.196361,0.935397,3.919390,1.867977,2.483781,2.837664
42901,0.169237,2.351436,2.921553,2.938832,4.900104,5.423391,2.185828,4.760073,3.938236,2.589047,...,4.795073,3.188724,5.953824,0.962040,0.872670,4.518979,4.115152,3.162258,0.735841,5.010252
44074,4.283396,5.476583,5.981831,1.727246,8.561744,5.899410,4.937027,7.041262,4.565962,6.429383,...,4.058570,0.400157,4.944776,4.636495,4.365617,5.884808,4.018491,5.292863,0.564928,3.209114


In [57]:
U, VT = als(data_matrix_values, rank=10, iterations=100)
predicted_ratings_100_iters = U @ VT

predicted_ratings_100_iters_df = pd.DataFrame(predicted_ratings_100_iters, index=data_matrix.index, columns=data_matrix.columns)
predicted_ratings_100_iters_df

a_id,1,5,6,7,15,16,19,20,21,22,...,39417,39456,39468,39491,39523,39533,39539,39597,39799,40004
u_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,8.740349,7.060895,7.437728,5.852979,6.442969,11.124381,7.716737,9.459633,3.221958,6.868602,...,9.641647,6.444279,8.102519,7.028709,3.695367,9.223617,5.209386,10.230696,2.952917,8.255700
1,10.358594,9.610798,9.460903,8.305446,8.470412,10.382625,9.803796,9.608073,7.448907,7.581832,...,7.967820,6.943012,8.953687,8.528474,5.393138,9.304280,8.695611,10.654826,4.787289,7.964686
2,6.949002,4.481722,3.478635,1.180444,3.179646,4.688468,5.678563,3.237319,10.739329,4.544956,...,1.149612,-0.465375,6.916067,-0.614088,0.496833,3.412200,10.253184,5.167423,-0.296874,4.747418
3,11.215499,8.680216,9.174967,5.334028,8.558085,9.245398,9.167118,7.998683,4.376514,8.179650,...,8.319976,4.766996,8.236526,5.917020,4.788911,7.879284,7.372401,10.035298,4.150753,8.222857
6,11.106172,0.632047,5.303914,3.875627,8.279392,8.965724,2.863416,-1.821553,11.172346,13.948766,...,0.485007,-1.938723,8.893664,-2.075327,6.631371,4.318307,8.806051,11.146426,4.018656,3.729811
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42845,10.008188,9.204126,10.548027,5.953751,9.926984,8.172123,7.861950,8.118583,3.070977,9.626656,...,9.430847,4.801181,7.368579,8.119814,7.027391,7.998678,4.533220,9.953182,4.834942,8.398569
42896,4.027637,4.158584,5.236026,3.905183,6.197947,2.330493,4.562467,0.736619,7.393636,4.740152,...,2.416369,2.052996,4.644606,4.186331,5.273606,3.648129,3.322230,3.042539,2.452293,3.924915
42901,4.710006,3.033580,4.310236,4.880173,4.229723,6.575305,4.412831,3.521343,1.303896,5.075446,...,5.486141,4.520320,4.593132,3.261428,3.264622,6.677512,1.790184,5.544947,2.925535,4.873788
44074,5.457444,6.422723,5.640341,4.759963,6.662231,6.927931,6.227021,7.452327,4.869887,3.927987,...,6.129622,4.922433,6.246826,7.899752,4.012128,5.469066,4.907200,7.047221,2.248831,5.162342


In [None]:
U, VT = als(data_matrix_values, rank=10, iterations=1000)
predicted_ratings_1000_iters = U @ VT

predicted_ratings_1000_iters_df = pd.DataFrame(predicted_ratings_1000_iters, index=data_matrix.index, columns=data_matrix.columns)
predicted_ratings_1000_iters_df

In [64]:
print("min values for 10, 100, and 1000 iterations")
print(predicted_ratings_10_iters.min())
print(predicted_ratings_100_iters.min())
print(predicted_ratings_1000_iters.min())
print("<------------------>")
print("max values for 10, 100, and 1000 iterations")
print(predicted_ratings_10_iters.max())
print(predicted_ratings_100_iters.max())
print(predicted_ratings_1000_iters.max())

min values for 10, 100, and 1000 iterations
-40.79822759036206
-29.303566773132456
-19.12654763695706
<------------------>
max values for 10, 100, and 1000 iterations
47.99847511685177
39.06330999231461
32.08637080018278
