In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [36]:
# Read the long-format table; the pivot below uses its u_id, a_id and score columns.
df = pd.read_csv("data/100x100.csv")

# Wide form: one row per u_id, one column per a_id, each cell holding `score`.
# (u_id, a_id) pairs that are absent from the CSV are filled with 0.
data_matrix = (
    df
    .pivot(index='u_id', columns='a_id', values='score')
    .fillna(0)
)

# Plain NumPy array of the pivoted scores, as consumed by als().
data_matrix_values = data_matrix.values

In [37]:
# Worked example from the notebook: a 2x3 matrix with one unknown entry ("?").
'''
Original matrix = 0.5 ? 4
                   1  3 5
'''
# U is a (2, 1) column and P is a (1, 3) row, so their product is a full
# (2, 3) matrix that also carries a value for the unknown cell.
U = np.array([0.7461, 1.7966]).reshape(2, 1)
P = np.array([0.758, 2.5431, 4.7999]).reshape(1, 3)
prediction = np.matmul(U, P)

In [38]:
def als(matrix, rank, iterations, regularization=0.1, seed=0):
    """Factorise a partially observed score matrix with alternating least squares.

    Finds ``X`` (one row per matrix row) and ``Y`` (one row per matrix column)
    such that ``X @ Y.T`` approximates ``matrix`` on its observed cells. Only
    entries strictly greater than zero count as observed; zeros, negatives and
    NaNs are treated as missing and do not influence the fit.

    Parameters
    ----------
    matrix : array-like, shape (num_users, num_items)
        Score matrix. Anything ``np.asarray`` can turn into a 2-D float array
        is accepted (ndarray, nested lists, DataFrame).
    rank : int
        Number of latent factors, i.e. the number of columns of ``X`` and ``Y``.
    iterations : int
        Number of full sweeps; each sweep updates every row of ``X`` and then
        every row of ``Y``.
    regularization : float, default 0.1
        Ridge term added to the diagonal of every per-row normal-equation
        system.
    seed : int or None, default 0
        Seed for the random initial factors. The fixed default keeps repeated
        calls reproducible; pass ``None`` for a fresh random start.

    Returns
    -------
    X : ndarray, shape (num_users, rank)
    Y : ndarray, shape (num_items, rank)
        Rows or columns of ``matrix`` without any observed entry get an
        all-zero factor vector, so their predicted scores are 0.
    """
    matrix = np.asarray(matrix, dtype=float)
    num_users, num_items = matrix.shape
    mask = matrix > 0

    # Random start. With an all-ones start every latent column is identical,
    # each least-squares update preserves that symmetry, and X @ Y.T stays
    # effectively rank-1 no matter how large `rank` is.
    rng = np.random.default_rng(seed)
    X = rng.random((num_users, rank))
    Y = rng.random((num_items, rank))

    # Loop-invariant ridge term, built once instead of once per row.
    ridge = regularization * np.eye(rank)

    for _ in range(iterations):
        # Users: with Y fixed, each row of X is an independent ridge regression
        # over the items that user has scored.
        for i in range(num_users):
            observed = mask[i]
            if not observed.any():
                # No data for this row: the ridge solution is the zero vector.
                X[i] = 0.0
                continue
            Y_i = Y[observed]
            A = Y_i.T @ Y_i + ridge
            b = Y_i.T @ matrix[i, observed]
            # lstsq rather than solve so that regularization=0 with a singular
            # A still yields a (minimum-norm) solution.
            X[i] = np.linalg.lstsq(A, b, rcond=None)[0]

        # Items: same update with the roles of X and Y swapped.
        for j in range(num_items):
            observed = mask[:, j]
            if not observed.any():
                Y[j] = 0.0
                continue
            X_j = X[observed]
            A = X_j.T @ X_j + ridge
            b = X_j.T @ matrix[observed, j]
            Y[j] = np.linalg.lstsq(A, b, rcond=None)[0]

    return X, Y


In [39]:
# Example call: factorise the score matrix with 20 latent factors and 10 ALS sweeps.
X, Y = als(matrix=data_matrix_values, rank=20, iterations=10)

In [40]:
# Number of latent factors passed to als() in the runs below. We tried rank
# values between 10 and 100 and found that 20 worked best at avoiding overfitting.
rank = 20

In [41]:
# Fit with 10 ALS iterations, then rebuild the dense prediction matrix.
X, Y = als(matrix=data_matrix_values, rank=rank, iterations=10)
predicted_ratings_10_iters = np.matmul(X, Y.T)

# Label the predictions with the same row/column labels as the pivoted input.
predicted_ratings_10_iters_df = pd.DataFrame(
    data=predicted_ratings_10_iters,
    index=data_matrix.index,
    columns=data_matrix.columns,
)

In [42]:
# Fit with 100 ALS iterations, then rebuild the dense prediction matrix.
X, Y = als(matrix=data_matrix_values, rank=rank, iterations=100)
predicted_ratings_100_iters = np.matmul(X, Y.T)

# Label the predictions with the same row/column labels as the pivoted input.
predicted_ratings_100_iters_df = pd.DataFrame(
    data=predicted_ratings_100_iters,
    index=data_matrix.index,
    columns=data_matrix.columns,
)

In [43]:
# Fit with 1000 ALS iterations, then rebuild the dense prediction matrix.
X, Y = als(matrix=data_matrix_values, rank=rank, iterations=1000)
predicted_ratings_1000_iters = np.matmul(X, Y.T)

# Label the predictions with the same row/column labels as the pivoted input.
predicted_ratings_1000_iters_df = pd.DataFrame(
    data=predicted_ratings_1000_iters,
    index=data_matrix.index,
    columns=data_matrix.columns,
)

In [44]:
# Disabled diagnostic: the min/max prints for the 10-, 100- and 1000-iteration
# predictions are wrapped in a string literal, so none of them run. Because the
# literal is the cell's last expression, the notebook echoes its repr as the
# cell output. Remove the surrounding triple quotes to re-enable the prints.
'''
print("min values for 10, 100, and 1000 iterations")
print(predicted_ratings_10_iters.min())
print(predicted_ratings_100_iters.min())
print(predicted_ratings_1000_iters.min())
print("<------------------>")
print("max values for 10, 100, and 1000 iterations")
print(predicted_ratings_10_iters.max())
print(predicted_ratings_100_iters.max())
print(predicted_ratings_1000_iters.max())
'''

'\nprint("min values for 10, 100, and 1000 iterations")\nprint(predicted_ratings_10_iters.min())\nprint(predicted_ratings_100_iters.min())\nprint(predicted_ratings_1000_iters.min())\nprint("<------------------>")\nprint("max values for 10, 100, and 1000 iterations")\nprint(predicted_ratings_10_iters.max())\nprint(predicted_ratings_100_iters.max())\nprint(predicted_ratings_1000_iters.max())\n'