In [62]:
import pandas as pd
import numpy as np

#Loading data
# movies = pd.read_csv("../data/ml-1m/movies.dat", sep="::", header = None)
ratings = pd.read_csv("../data/ml-1m/ratings.dat", sep="::", header = None)
# users = pd.read_csv("../data/ml-1m/users.dat", sep="::", header = None)

#col names
# movies.columns = ["MovieID","MovieTitle","Genre"]
ratings.columns = ["UserID","MovieID","Rating","TimeStamp"]
# users.columns = ["UserID","Sex","AgeGroup","OccupationGroup","ZipCode"]

#setseed
np.random.seed(348)



  


In [280]:
def train_MF(ratings, N):
    
    num_factors=10 #
    #num_iter= 75
    Lambda = 0.05 #regularization
    eta =0.005 #learning rate

    # initialize U and M
    U_rows = ratings["UserID"].unique() 
    M_cols = ratings["MovieID"].unique()
    U = np.random.rand(len(U_rows), num_factors)
    M = np.random.rand(num_factors, len(M_cols) )


    num_iter = N
    num_elem = ratings.shape[0]

    RMSE = np.zeros(num_iter)
    MAE = np.zeros(num_iter)

    for i in range(num_iter):
        SE = 0; AE = 0
        print("\nIteration", i+1)
        for j in range(num_elem):

            #time consuming? initalize proper index matrix (j*2)?
            ind_i = np.where(U_rows == ratings.UserID.iloc[j])[0]
            ind_j = np.where(M_cols == ratings.MovieID.iloc[j])[0]

            x_hat = np.dot(U[ind_i,:],M[:,ind_j])
            eij = ratings.Rating.iloc[j] - x_hat
            SE = SE + (eij)**2
            AE = AE + abs(eij)

            #Update with gradients:
            U[ind_i,:] = U[ind_i,:] + eta * (2 * eij * np.transpose(M[:,ind_j]) - Lambda * U[ind_i,:])
            M[:,ind_j] = M[:,ind_j] + eta * (2 * eij * np.transpose(U[ind_i,:]) - Lambda * M[:,ind_j])

#             if j % 200000 == 0:
#                 print("element number:",j)

        RMSE[i] = np.sqrt(SE/num_elem)
        MAE[i] = AE / num_elem
        print("RMSE =", RMSE[i])
    
    return(U, M, RMSE, MAE)

#train_MF(ratings,10)



In [282]:
#setseed
np.random.seed(59)
import time
Tstart = time.time()

#Cross validation:
Nfolds = 5
No_train_iter = 75

#Create folds grouping vector
Nrep = ratings.shape[0] // Nfolds
a = np.repeat(np.arange(Nfolds),Nrep)
b = np.arange(ratings.shape[0] % Nfolds)
folds_vec = np.concatenate([a,b])
np.random.shuffle(folds_vec)

RMSE_train = np.zeros(Nfolds)
RMSE_test = np.zeros(Nfolds)
MAE_train = np.zeros(Nfolds)
MAE_test = np.zeros(Nfolds)

for k in range(Nfold):
    print("\nFOLD:",k+1)
    
    ind_test = folds_vec == k
    ind_train = ~ ind_test
    
    rat_train = ratings[ind_train]
    rat_test = ratings[ind_test]
    
    print("Training on trainset....")
    U_train, M_train, RMSE, MAE = train_MF(rat_train, No_train_iter)
    U_rows = rat_train["UserID"].unique() 
    M_cols = rat_train["MovieID"].unique()
    
    RMSE_train[k] = RMSE[-1]
    MAE_train[k] = MAE[-1]
    
    print("Predicting on testset....")
    #fallback
    u_mean = U_train.mean(0)
    m_mean = M_train.mean(1)
    
    no_elem = len(rat_test.Rating)
    SE = 0
    AE = 0
    for j in range(no_elem):
        m_col = np.where(M_cols == rat_test.MovieID.iloc[j])[0]
        if np.any(m_col):
            m = M_train[:,m_col]
        else:
            m = m_mean
        
        u_row = np.where(U_rows == rat_test.UserID.iloc[j])[0]
        if np.any(m_col):
            u = U_train[u_row,:]
        else:
            u = u_mean
            
        x_hat = np.dot(u,m)
        eij = rat_test.Rating.iloc[j] - x_hat
        SE = SE + eij**2
        AE = AE + abs(eij)
        
    RMSE_test[k]  = np.sqrt(SE/no_elem)
    MAE_test[k] = AE / no_elem
    print("RMSE_test =", RMSE_test[k])
    
Tend = time.time()



FOLD: 1
Training on trainset....

Iteration 1
RMSE = 0.9933779664366086

Iteration 2
RMSE = 0.9276793652267045

Iteration 3
RMSE = 0.9173878768697004

Iteration 4
RMSE = 0.9126829223696495

Iteration 5
RMSE = 0.9077787324582453

Iteration 6
RMSE = 0.9009707702667302

Iteration 7
RMSE = 0.8918169725267558

Iteration 8
RMSE = 0.8818346725164279

Iteration 9
RMSE = 0.8725026243226548

Iteration 10
RMSE = 0.8641802170471644

Iteration 11
RMSE = 0.856820869028465

Iteration 12
RMSE = 0.8503216558765048

Iteration 13
RMSE = 0.8445687130053252

Iteration 14
RMSE = 0.8394534655007496

Iteration 15
RMSE = 0.8348822910031151

Iteration 16
RMSE = 0.8307780256474002

Iteration 17
RMSE = 0.8270778172259651

Iteration 18
RMSE = 0.823730378186643

Iteration 19
RMSE = 0.8206935299156083

Iteration 20
RMSE = 0.8179321426771825

Iteration 21
RMSE = 0.8154164679953727

Iteration 22
RMSE = 0.813120862427226

Iteration 23
RMSE = 0.8110228713888072

Iteration 24
RMSE = 0.8091026035397862

Iteration 25
RMSE

RMSE = 0.7869725650844311

Iteration 53
RMSE = 0.7866633466023556

Iteration 54
RMSE = 0.7863680114241463

Iteration 55
RMSE = 0.7860857055840996

Iteration 56
RMSE = 0.7858156384662003

Iteration 57
RMSE = 0.7855570775672808

Iteration 58
RMSE = 0.7853093437252446

Iteration 59
RMSE = 0.7850718067665249

Iteration 60
RMSE = 0.7848438815329876

Iteration 61
RMSE = 0.7846250242527834

Iteration 62
RMSE = 0.7844147292238234

Iteration 63
RMSE = 0.7842125257816912

Iteration 64
RMSE = 0.7840179755266417

Iteration 65
RMSE = 0.783830669786587

Iteration 66
RMSE = 0.7836502272952617

Iteration 67
RMSE = 0.7834762920669934

Iteration 68
RMSE = 0.7833085314502521

Iteration 69
RMSE = 0.7831466343447994

Iteration 70
RMSE = 0.7829903095679858

Iteration 71
RMSE = 0.7828392843567852

Iteration 72
RMSE = 0.7826933029944029

Iteration 73
RMSE = 0.7825521255499495

Iteration 74
RMSE = 0.7824155267215366

Iteration 75
RMSE = 0.782283294773947
Predicting on testset....
RMSE_test = 0.8783780942644716

In [283]:
RMSE_train


array([0.78285579, 0.78286786, 0.78228329, 0.78318068, 0.78362326])

In [284]:
RMSE_test

array([0.87450032, 0.87789722, 0.87837809, 0.87688674, 0.87567991])

In [285]:
MAE_train

array([0.61569152, 0.61595894, 0.61565918, 0.61615892, 0.61624128])

In [286]:
MAE_test

array([0.68480832, 0.68693823, 0.6871088 , 0.68628395, 0.68543317])

In [289]:
(Tend - Tstart) /3600

11.05890316499604