In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold 
import math
from sklearn.metrics import mean_absolute_error as mae
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
import time

In [2]:
# Loading the dataset
# Using pandas
import pandas as pd
# Reading dataset
dataset = pd.read_table('ratings.dat', header = None, sep = '::', engine = 'python', usecols = [0,1,2], names = ('UserID','MovieID', 'Ratings'))

dataset.head()

# Are there missing values?
dataset.isna().sum()

# Keep in mind that there are “gaps” in numbering of users and items. (Dictionaries? Renumber everything? …)
# What is meant with this??
dataset_task3 = dataset.pivot(index='UserID', columns='MovieID', values='Ratings').fillna(0)

In [17]:
def MatrixFactorization(data, num_factors, num_iter, regularization, learn_rate, num_folds):
    # Cross validation
    start = time.time()
    # percentage train data (5 fold = 80% train data)
    num_folds = num_folds
    traindata_percentage = 0.8
    
    # Create RMSE for train and test for every fold
    RMSE_fold_train = [0] * num_folds
    RMSE_fold_test = [0] * num_folds

    for fold in range(num_folds):
        
        # Shuffle the indexes for the cross validation 
        indexes = np.arange(len(data.index))
        np.random.shuffle(indexes)

        # train = 1: till 80% of the indexes and test = the last 20%
        train_index = indexes[1:round(traindata_percentage*len(indexes))]
        test_index = indexes[round(traindata_percentage*len(indexes)):]
        train_ = data.iloc[train_index]
        test_ = data.iloc[test_index]
        train = train_.sort_index()
        test = test_.sort_index()
        
        print(train)
        
        # make an array of the data
        train_array = np.array(train)
        test_array = np.array(test)
        
        # Specifiy the I and J for the matrices 
        I = data.shape[0]
        J = data.shape[1]
        K = num_factors
        
        # Initialize random weights
        U = np.random.rand(I, K)
        M = np.random.rand(K, J)
                
        # Create empty list for train and test
        RMSE_list_train = []
        RMSE_list_test = []
        
        for iter in range(num_iter):
            e_ij2 = 0
            e_ij2_test = 0

            # if the value in the matrix is higher than 0, for every i and j
            for i in range(len(train_array)):
                for j in range(train_array.shape[1]):
                    if train_array[i][j]>0:
                      
                      # calculate the prediction with the intiliazed weights
                        pred = np.dot(U[i,:], M[:,j])

                        # Keep ratings between 1 and 5
                        if pred < 1:
                            pred = 1
                        elif pred > 5:
                            pred = 5

                        # Calculate the error 
                        e_ij = np.subtract(train_array[i][j], pred)
                        e_ij2 += np.square(e_ij)

                        #Update the weights
                        for k in range(num_factors):
                            grad_eij_u = -2 * e_ij * M[k][j]
                            grad_eij_m = -2 * e_ij * U[i][k]

                            U[i][k] = U[i][k] + learn_rate * ( - grad_eij_u - regularization * U[i][k] )
                            M[k][j] = M[k][j] + learn_rate * ( - grad_eij_m - regularization * M[k][j] )


            # N: the number of known values in the Matrix
            N = np.count_nonzero(train_array)
            RMSE_iter_train = np.sqrt(np.divide(e_ij2, N))
            print(RMSE_iter_train)

            RMSE_list_train.append(RMSE_iter_train)
            
            # if the value in the matrix is higher than 0, for every i and j Test
            for i in range(len(test_array)):
                for j in range(test_array.shape[1]):
                    if test_array[i][j]>0:
                        # calculate the prediction with the weights Test
                        pred = np.dot(U[i,:], M[:,j])

                        # Keep ratings between 1 and 5
                        if pred < 1:
                            pred = 1
                        elif pred > 5:
                            pred = 5

                        # Calculate the error Test
                        e_ij_test = np.subtract(test_array[i][j], pred)
                        e_ij2_test += np.square(e_ij_test)
                            
            # N: the number of known values in the Matrix
            N_test = np.count_nonzero(test_array)
            RMSE_iter_test = np.sqrt(np.divide(e_ij2_test, N_test))
            print(RMSE_iter_test)

            RMSE_list_test.append(RMSE_iter_test)

            if len(RMSE_list_train) >= 2 or len(RMSE_list_train) >= 2:
                if RMSE_list_train[-2] <= RMSE_list_train[-1]:
                    break
                elif RMSE_list_test[-2] <= RMSE_list_test[-1]:
                    break
        
        print(RMSE_list_train)
        print(RMSE_list_test)
        RMSE_fold_train[fold] = RMSE_list_train[-1]
        RMSE_fold_test[fold] = RMSE_list_test[-1]

    print(RMSE_fold_train)
    print(RMSE_fold_test)
    
    end = time.time()
    print("Total runtime: ", (end-start))
    return U, M

In [18]:
nU, nM = MatrixFactorization(dataset_task3, num_factors=10, num_iter=32, regularization=0.05, learn_rate=0.005, num_folds = 5)
nU, nM

MovieID  1     2     3     4     5     6     7     8     9     10    ...  \
UserID                                                               ...   
1         5.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
2         0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
4         0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
6         4.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
7         0.0   0.0   0.0   0.0   0.0   4.0   0.0   0.0   0.0   0.0  ...   
...       ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   
6035      4.0   0.0   1.0   2.0   1.0   0.0   3.0   0.0   0.0   0.0  ...   
6037      0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
6038      0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
6039      0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
6040      3.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   

MovieID  39

0.979981387191881
1.1482476250111697
0.9209872547353999
1.0534171864808384
0.9118275833150951
1.0533231034833257
0.9061954225199389
1.0541007243143183
[0.979981387191881, 0.9209872547353999, 0.9118275833150951, 0.9061954225199389]
[1.1482476250111697, 1.0534171864808384, 1.0533231034833257, 1.0541007243143183]
MovieID  1     2     3     4     5     6     7     8     9     10    ...  \
UserID                                                               ...   
2         0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
3         0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
6         4.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
7         0.0   0.0   0.0   0.0   0.0   4.0   0.0   0.0   0.0   0.0  ...   
9         5.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
...       ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   
6036      0.0   0.0   0.0   2.0   0.0   3.0   0.0   0.0   0.0   0.0  ...   
6037

(array([[0.31089843, 0.79932989, 1.02267227, ..., 0.67440731, 0.40059354,
         0.26857073],
        [0.4133615 , 0.33101844, 0.51609359, ..., 0.41584213, 0.64061597,
         0.91856984],
        [0.18000914, 0.57480554, 0.65702614, ..., 0.66755765, 0.7839816 ,
         0.93635248],
        ...,
        [0.78119742, 0.40096722, 0.1083675 , ..., 0.33629391, 0.58336889,
         0.44689472],
        [0.84050566, 0.78667883, 0.91515889, ..., 0.53832043, 0.69582681,
         0.97287657],
        [0.29998458, 0.34572802, 0.10229498, ..., 0.94244078, 0.98211777,
         0.21058375]]),
 array([[0.70950314, 0.51216697, 0.4653426 , ..., 0.56523393, 0.77479732,
         0.74206542],
        [0.87550893, 0.53409805, 0.70280214, ..., 0.865281  , 1.03222931,
         0.5835764 ],
        [0.56912713, 0.50498631, 0.31613828, ..., 0.55542874, 0.75702564,
         0.6727539 ],
        ...,
        [0.75689229, 0.64331651, 0.35178683, ..., 0.37098353, 0.65075066,
         0.78205405],
        [0.5