In [1]:
import pandas as pd
import numpy as np
import numpy.ma as ma
from datetime import datetime
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Reading the data
user_col_names = ["UserID", "Gender", "Age", "Occupation", "Zipcode"]
udf = pd.read_csv("ml-1m/users.dat", sep="::", header=None, names=user_col_names, engine="python")

movies_col_names = ["MovieID", "Title", "Genres"]
mdf = pd.read_csv("ml-1m/movies.dat", sep="::", header=None, names=movies_col_names, engine="python")

ratings_col_names = ["UserID", "MovieID", "Rating", "Timestamp"]
rdf = pd.read_csv("ml-1m/ratings.dat", sep="::", header=None, names=ratings_col_names, engine="python")

In [3]:
print(f"UserID: min = {udf.UserID.min()}, max = {udf.UserID.max()}")
print(f"MovieID: min = {mdf.MovieID.min()}, max = {mdf.MovieID.max()}")

UserID: min = 1, max = 6040
MovieID: min = 1, max = 3952


In [4]:
# Dividing the data into 5 folds
RandomState = 42

kf = KFold(n_splits=5, shuffle=True, random_state=RandomState)
Folds = []
for train_index, test_index in kf.split(rdf):
    Folds.append((rdf.iloc[train_index, :], rdf.iloc[test_index, :]))

In [5]:
def update_M(m, df):
    m[df["UserID"]-1, df["MovieID"]-1] = df["Rating"]
    return m

# RMSE calculation
def calc_rmse(M, U, V, root=True):
    masked = ma.masked_array(M, mask=M==0)
    P = np.dot(U, V)     
    s = np.sum((masked-P)**2)
    if not root:
        return s
    return np.sqrt(s/masked.count())

In [37]:
K = 10 # TODO: Decide this later 
I = udf.UserID.max()
J = mdf.MovieID.max()
lr = 0.0045 # learning rate
rf = 0.01 # regularization factor
max_iter = 500
results = []

for _, (train, test) in enumerate(Folds):
    
    t0 = datetime.now()
    # Step 1 : Create Utility Matrix (M)
    M = np.zeros((I, J))
    # Step 2: Fill Utility Matrix with Ratings from train set 
    M = update_M(M, train)
    t1 = datetime.now()
    print(f"Utility Matrix is ready: {str(t1-t0)[:-3]}")
    
    # Step 4: Initialize U,V matrices with normal distribution because we normalized M
    U = np.random.normal(size=(I, K))
    V = np.random.normal(size=(K, J))
    
    i_list, j_list = np.nonzero(M)
    
    rmse_prev = 99999
    rmse_cycle = 0
    
    t2 = datetime.now()

    rmse_history = []
    
    iter_num = 0
    while (rmse_cycle < 2) and (iter_num < max_iter):
        for i, j in zip(i_list, j_list):
            e_ij = M[i,j] - np.dot(U[i,:], V[:,j])
            U[i, :] = U[i, :] + lr*(e_ij*V[:,j] - rf*U[i, :])
            V[:, j] = V[:, j] + lr*(e_ij*U[i, :] - rf*V[:,j])
            
        rmse = calc_rmse(M, U, V)
        rmse_history.append(rmse)
        
        if rmse < rmse_prev:
            rmse_cycle = 0
            rmse_prev = rmse
        else:
            rmse_cycle += 1
            
        iter_num += 1
        
    t3 = datetime.now()
    print(f"Training time: {t3-t2}")
    
    P = np.dot(U, V)
    users = test["UserID"] - 1
    movies = test["MovieID"] - 1
    y_pred = P[users, movies]
    y_test = test["Rating"]
    
    test_rmse = mean_squared_error(y_test, y_pred, squared=False)
    
    # The mean squared error
    print('Root mean squared error: %.2f' % test_rmse)
    
    r2 = r2_score(y_test, y_pred)
    # The coefficient of determination: 1 is perfect prediction
    print('Coefficient of determination: %.2f'% r2_score(y_test, y_pred))
    
    print("--------------------------------------------")
    
    res = {
        "fold": _,
        "K": K,
        "lr": lr,
        "rf": rf,
        "max_iter": max_iter,
        "Training time": str(t3-t2)[:-3],
        "U": U,
        "V": V,
        "rmse_history": rmse_history,
        "test_rmse": test_rmse,
        "r2_score":r2
    }
    
    results.append(res)

Utility Matrix is ready: 0:00:00.097
Training time: 2:01:31.459952
Root mean squared error: 0.89
Coefficient of determination: 0.36
--------------------------------------------
Utility Matrix is ready: 0:00:00.149
Training time: 2:19:40.795504
Root mean squared error: 0.89
Coefficient of determination: 0.36
--------------------------------------------
Utility Matrix is ready: 0:00:00.110
Training time: 1:54:19.398843
Root mean squared error: 0.89
Coefficient of determination: 0.36
--------------------------------------------
Utility Matrix is ready: 0:00:00.102
Training time: 1:54:38.892146
Root mean squared error: 0.89
Coefficient of determination: 0.36
--------------------------------------------
Utility Matrix is ready: 0:00:00.102
Training time: 1:48:17.004304
Root mean squared error: 0.89
Coefficient of determination: 0.36
--------------------------------------------


In [38]:
results[0]["test_rmse"]

0.893120519367731

In [39]:
import pickle

with open("./mf_results_3.pkl", "wb") as write_file:
    pickle.dump(results, write_file)