In [1]:
import pandas as pd
import numpy as np
import numpy.ma as ma
from datetime import datetime
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score

There are 6040(max user id) users and 3952(max movie id) Movies in our database

Strategy
<ol>
    <li>Create a Utility Matrix(M) with the size of Users and Movies</li>
    <li>Update M with the ratings from training dataset</li>
    <li>Decide for d dimension (for U & V)</li>
    <li>Implement the decomposition algorithm</li>
    <li>Initialize U & V matrices</li>
    <li>Train and find optimum U & V matrices</li>
    <li>Evaluate on test set using RMSE values</li>
</ol>

<img src="./images/IMG_1387.jpeg" style="width: 75%; height: auto;">

In [2]:
# Reading the data
user_col_names = ["UserID", "Gender", "Age", "Occupation", "Zipcode"]
udf = pd.read_csv("ml-1m/users.dat", sep="::", header=None, names=user_col_names, engine="python")

movies_col_names = ["MovieID", "Title", "Genres"]
mdf = pd.read_csv("ml-1m/movies.dat", sep="::", header=None, names=movies_col_names, engine="python")

ratings_col_names = ["UserID", "MovieID", "Rating", "Timestamp"]
rdf = pd.read_csv("ml-1m/ratings.dat", sep="::", header=None, names=ratings_col_names, engine="python")

In [3]:
print(f"UserID: min = {udf.UserID.min()}, max = {udf.UserID.max()}")
print(f"MovieID: min = {mdf.MovieID.min()}, max = {mdf.MovieID.max()}")

UserID: min = 1, max = 6040
MovieID: min = 1, max = 3952


In [4]:
# Dividing the data into 5 folds
RandomState = 42

kf = KFold(n_splits=5, shuffle=True, random_state=RandomState)
Folds = []
for train_index, test_index in kf.split(rdf):
    Folds.append((rdf.iloc[train_index, :], rdf.iloc[test_index, :]))

In [5]:
def update_M(m, df):
    m[df["UserID"]-1, df["MovieID"]-1] = df["Rating"]
    return m

def normalize_M(M):
    m = M.copy()
    m2 = ma.masked_array(m, mask=m==0)
    users_mean = m2.mean(axis=1)
    ma.set_fill_value(users_mean, 0)
    movies_mean = m2.mean(axis=0)
    ma.set_fill_value(movies_mean, 0)
    
    i_list, j_list = np.nonzero(m)
    
    m[i_list, j_list] = m[i_list, j_list] - (users_mean[i_list] + movies_mean[j_list])/2
    
    return m, users_mean.filled(), movies_mean.filled()

# RMSE calculation
def calc_rmse(M, U, V, root=True):
    masked = ma.masked_array(M, mask=M==0)
    P = np.dot(U, V)     
    s = np.sum((masked-P)**2)
    if not root:
        return s
    return np.sqrt(s/masked.count())

def decompose_U(M, U, V):
    n, d = U.shape
    for r in range(n):
        for s in range(d):
            U[r,s]= Urs(M, U, V, r, s)
    return U

def Urs(M, U, V, r, s):
    M_slice = ma.masked_array(M, mask=M==0)[r, :]
    ma.set_fill_value(M_slice, 0)
    V_slice = ma.masked_array(V[s, :], M_slice.mask)
    ma.set_fill_value(V_slice, 0)
    sum_array=np.matmul(U[r,:],V[:])-(U[r,s]*V[s,:])
    numerator = np.sum(V[s,:]*(M_slice-sum_array))
    denominator = np.sum(np.square(V_slice))
    if denominator == 0 or ma.is_masked(numerator/denominator):
        return 0
    return numerator/denominator


def decompose_V(M, U, V):
    d, m = V.shape
    for s in range(m):
        for r in range(d):
            V[r,s] = Vrs(M, U, V, r, s)
    return V

def Vrs(M, U, V, r, s):
    M_slice = ma.masked_array(M, mask=M==0)[:, s]
    ma.set_fill_value(M_slice, 0)
    U_slice = ma.masked_array(U[:, r], M_slice.mask)
    ma.set_fill_value(U_slice, 0)
    
    sum_array=np.matmul(U[:],V[:, s])-(V[r,s]*U[:, r])
    numerator = np.sum(U[:, r]*(M_slice-sum_array))
    denominator = np.sum(np.square(U_slice))
    if denominator == 0 or ma.is_masked(numerator/denominator):
        return 0
    return numerator/denominator

In [8]:
d = 10 # TODO: Decide this later 
n =udf.UserID.max()
m = mdf.MovieID.max()
threshold = 0.0001 # TODO: Decide this later
results = []

for _, (train, test) in enumerate(Folds):
    
    k = 4 # TODO: Decide this later
    
    t0 = datetime.now()
    # Step 1 : Create Utility Matrix (M)
    M = np.zeros((n, m))
    # Step 2: Fill Utility Matrix with Ratings from train set 
    M = update_M(M, train)
    #i_listtep 3: Normalize Utility Matrix [Preprocessing]
    M_norm, users_mean, movies_mean = normalize_M(M)
    t1 = datetime.now()
    print(f"Utility Matrix is ready: {str(t1-t0)[:-3]}")
    
    
    # Step 4: Initialize U,V matrices with normal distribution because we normalized M
    U = np.random.normal(size=(n, d))
    V = np.random.normal(size=(d, m))
    
    # Step 5: Performing the Optimization
    optim_rmse_history = []
    optim_rmse = 9999
    print("Optimization has started.")
    t2 = datetime.now()
    while (optim_rmse >= threshold) and (k > 0):
        t4 = datetime.now()
        U = decompose_U(M_norm, U, V)
        t5 = datetime.now()
        # print(f"Decomposition of U: {str(t5-t4)[:-3]}")
        
        V = decompose_V(M_norm, U, V)
        t6 = datetime.now()
        # print(f"Decomposition of V: {str(t6-t5)[:-3]}")
        
        optim_rmse = calc_rmse(M_norm, U, V)
        optim_rmse_history.append(optim_rmse)
        k = k - 1
    t3 = datetime.now()
    print(f"Optimization has finished in {str(t3-t2)[:-3]}" )
    
    P = np.dot(U, V)
    users = test["UserID"] - 1
    movies = test["MovieID"] - 1
    y_pred = P[users, movies]
    
    # Undo normalization
    y_pred = y_pred + (users_mean[users] + movies_mean[movies])/2
    
    y_test = test["Rating"].values
    
    test_rmse = mean_squared_error(y_test, y_pred, squared=False)
    
    # The mean squared error
    print('Root mean squared error: %.2f' % test_rmse)
    
    r2 = r2_score(y_test, y_pred)
    # The coefficient of determination: 1 is perfect prediction
    print('R_2 score: %.2f'% r2_score(y_test, y_pred))
    print("-----------------------------------")
    res = {
        "fold": _,
        "Decomposition_time_U": str(t5-t4)[:-3],
        "Decomposition_time_V": str(t6-t5)[:-3],
        "Optimization_time": str(t3-t2)[:-3],
        "U": U,
        "V": V,
        "optim_rmse_history": optim_rmse_history,
        "test_rmse": test_rmse,
        "r2_score":r2
    }
    
    results.append(res)

Utility Matrix is ready: 0:00:00.740
Optimization has started.
Optimization has finished in 1:47:59.609
Root mean squared error: 1.00
R_2 score: 0.21
-----------------------------------
Utility Matrix is ready: 0:00:00.777
Optimization has started.
Optimization has finished in 1:43:42.094
Root mean squared error: 1.17
R_2 score: -0.10
-----------------------------------
Utility Matrix is ready: 0:00:00.758
Optimization has started.
Optimization has finished in 1:43:39.203
Root mean squared error: 0.98
R_2 score: 0.22
-----------------------------------
Utility Matrix is ready: 0:00:00.746
Optimization has started.
Optimization has finished in 1:43:34.404
Root mean squared error: 0.97
R_2 score: 0.24
-----------------------------------
Utility Matrix is ready: 0:00:00.755
Optimization has started.
Optimization has finished in 0:43:30.874
Root mean squared error: 1.15
R_2 score: -0.05
-----------------------------------


In [9]:
results

[{'fold': 0,
  'Decomposition_time_U': '0:15:36.373',
  'Decomposition_time_V': '0:10:21.555',
  'Optimization_time': '1:47:59.609',
  'U': array([[ 0.28685554, -0.06680916, -1.1316119 , ...,  0.04447826,
           0.04133351,  0.27562102],
         [ 0.03921542, -0.37942463, -0.10458978, ..., -0.01071121,
           0.06523745,  0.17706775],
         [ 0.27535947,  0.59951451, -0.14259722, ..., -0.42911013,
          -0.32776236,  0.28886069],
         ...,
         [ 0.60561523, -0.02153178, -1.14688659, ..., -0.03020611,
           0.35861242, -0.49571626],
         [ 0.37846943, -0.18274371, -0.04636418, ..., -0.26817386,
           0.02830415, -0.03203915],
         [ 0.94038756, -0.40904429,  0.09129197, ..., -0.13307284,
           0.18247142, -0.2276412 ]]),
  'V': array([[ 0.18116063, -0.99401685, -0.97071425, ...,  0.73838602,
           0.74559147,  0.35676166],
         [-0.44432861,  0.16298205,  0.26831466, ..., -1.05650945,
          -1.0327643 ,  0.09806185],
         

In [10]:
import pickle

with open("./decomp_results_d10_k4.pkl", "wb") as write_file:
    pickle.dump(results, write_file)