In [72]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold 
import math
from sklearn.metrics import mean_absolute_error as mae
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
import time

In [2]:
# Load the ratings file into a pandas DataFrame.
# Fields are separated by '::'; a multi-character separator is handled by the
# Python parsing engine, which is why engine='python' is requested explicitly.
# Only the first three fields are read; they are named UserID, MovieID, Ratings.
dataset = pd.read_csv(
    'ratings.dat',
    sep='::',
    header=None,
    engine='python',
    usecols=[0, 1, 2],
    names=['UserID', 'MovieID', 'Ratings'],
)

# Preview of the first rows (a notebook renders only the cell's last expression).
dataset.head()

# Number of missing values per column.
dataset.isna().sum()

# NOTE: there are "gaps" in the numbering of users and items, i.e. the IDs are
# not guaranteed to be consecutive. Code that needs one slot per user or movie
# should therefore look IDs up by value (dictionary / index lookup) or renumber
# them first, rather than use the raw IDs as positions.

UserID     0
MovieID    0
Ratings    0
dtype: int64

In [69]:
# Cross-validation setup: shuffled K-fold with a fixed seed so that the folds
# are the same on every run.

n_splits = 5
KF = KFold(n_splits=n_splits, shuffle=True, random_state=123)

# Empty score lists, one per model and metric. The count of 5 below is the
# number of models (global average, user average, movie average, linear
# regression, linear regression with interaction), not the number of folds.
# The "Avergage" spelling is kept as-is because these are module-level names.
(GlobalAvergage_RMSE, UserAverage_RMSE, MovieAverage_RMSE,
 LinReg_RMSE, LinRegInter_RMSE) = [[] for _ in range(5)]
(GlobalAvergage_MAE, UserAverage_MAE, MovieAverage_MAE,
 LinReg_MAE, LinRegInter_MAE) = [[] for _ in range(5)]

(GlobalAvergage_RMSE_test, UserAverage_RMSE_test, MovieAverage_RMSE_test,
 LinReg_RMSE_test, LinRegInter_RMSE_test) = [[] for _ in range(5)]
(GlobalAvergage_MAE_test, UserAverage_MAE_test, MovieAverage_MAE_test,
 LinReg_MAE_test, LinRegInter_MAE_test) = [[] for _ in range(5)]

In [120]:
def User(Train, Test):
    """Predict every test rating as the training-set mean rating of its user.

    Parameters
    ----------
    Train : pandas.DataFrame
        Training ratings; must contain the columns ``'UserID'`` and
        ``'Ratings'``. Any other columns are ignored.
    Test : pandas.DataFrame or array-like
        Rows to predict. A DataFrame must contain a ``'UserID'`` column (extra
        columns are ignored); any other 2-D array-like is read positionally,
        with the user id in the first column.

    Returns
    -------
    list of float
        One prediction per row of ``Test``, in row order. Users without a
        usable training mean (unseen in ``Train``) receive the global mean
        rating of ``Train``.
    """
    Global_mean = Train['Ratings'].mean()

    # Select the column *before* aggregating: only 'Ratings' is averaged, so
    # non-numeric columns in Train can neither slow this down nor raise.
    avg_user = Train.groupby("UserID")['Ratings'].mean()

    if isinstance(Test, pd.DataFrame):
        test_users = Test['UserID']
    else:
        # Positional fallback for array-like input: user id is column 0.
        test_users = pd.Series(np.asarray(Test).reshape(-1, 3)[:, 0]) if np.asarray(Test).ndim != 2 else pd.Series(np.asarray(Test)[:, 0])

    # Vectorised lookup: ids missing from avg_user map to NaN, which is then
    # replaced by the global mean.
    predicted = test_users.map(avg_user).fillna(Global_mean)

    return predicted.tolist()

def Movie(Train, Test):
    """Predict every test rating as the training-set mean rating of its movie.

    Parameters
    ----------
    Train : pandas.DataFrame
        Training ratings; must contain the columns ``'MovieID'`` and
        ``'Ratings'``. Any other columns are ignored.
    Test : pandas.DataFrame or array-like
        Rows to predict. A DataFrame must contain a ``'MovieID'`` column (extra
        columns are ignored); any other 2-D array-like is read positionally,
        with the movie id in the second column.

    Returns
    -------
    list of float
        One prediction per row of ``Test``, in row order. Movies without a
        usable training mean (unseen in ``Train``) receive the global mean
        rating of ``Train``.
    """
    Global_mean = Train['Ratings'].mean()

    # Select the column *before* aggregating: only 'Ratings' is averaged, so
    # non-numeric columns in Train can neither slow this down nor raise.
    avg_movie = Train.groupby("MovieID")['Ratings'].mean()

    if isinstance(Test, pd.DataFrame):
        test_movies = Test['MovieID']
    else:
        # Positional fallback for array-like input: movie id is column 1.
        test_movies = pd.Series(np.asarray(Test)[:, 1])

    # Vectorised lookup: ids missing from avg_movie map to NaN, which is then
    # replaced by the global mean.
    predicted = test_movies.map(avg_movie).fillna(Global_mean)

    return predicted.tolist()

In [121]:
# Naive approaches - User Average, then Movie Average.
# Both baselines are scored with the same K-fold procedure, so the evaluation
# is written once and run for each predictor in turn.


def _rmse_mae(actual, predicted):
    """Return ``(RMSE, MAE)`` of ``predicted`` measured against ``actual``.

    ``actual`` and ``predicted`` must have the same length; the residuals are
    computed once and shared by both metrics.
    """
    residuals = np.subtract(actual, predicted)
    count = len(actual)
    rmse_value = np.sqrt(np.divide(np.sum(np.square(residuals)), count))
    mae_value = np.divide(np.sum(np.abs(residuals)), count)
    return rmse_value, mae_value


start = time.time()
for position, predictor in enumerate((User, Movie)):
    if position > 0:
        # Visual separator between the reports of consecutive baselines.
        print('------------------------------------------------------------------------------------------')

    # Per-fold scores of the current baseline (reset for every predictor).
    RMSE_Train = list()
    RMSE_Test = list()
    MAE_Train = list()
    MAE_Test = list()
    for train_indexes, test_indexes in KF.split(dataset):
        # Define train and test for this fold
        Train_set = dataset.iloc[train_indexes]
        Test_set = dataset.iloc[test_indexes]

        # The predictor is always fitted on the training fold only; it is
        # scored on that same fold (train error) and on the held-out fold.
        predicted_train = predictor(Train_set, Train_set)
        predicted_test = predictor(Train_set, Test_set)

        fold_rmse_train, fold_mae_train = _rmse_mae(Train_set['Ratings'], predicted_train)
        fold_rmse_test, fold_mae_test = _rmse_mae(Test_set['Ratings'], predicted_test)
        RMSE_Train.append(fold_rmse_train)
        RMSE_Test.append(fold_rmse_test)
        MAE_Train.append(fold_mae_train)
        MAE_Test.append(fold_mae_test)

    # Average the per-fold scores.
    RMSE_Train_mean = np.mean(RMSE_Train)
    RMSE_Test_mean = np.mean(RMSE_Test)
    MAE_Train_mean = np.mean(MAE_Train)
    MAE_Test_mean = np.mean(MAE_Test)
    print(f'For the Train set the mean RMSE = {RMSE_Train_mean}; the mean MAE = {MAE_Train_mean}')
    print(f'For the Test set the mean RMSE = {RMSE_Test_mean}; the mean MAE = {MAE_Test_mean}')
end = time.time()
# Wall-clock time for both baselines together.
tot_time = np.subtract(end, start)
print(f'Total runtime: {tot_time} s')

For the Train set the mean RMSE = 1.02767191530039; the mean MAE = 0.8227582760976807
For the Test set the mean RMSE = 1.0354915195860825; the mean MAE = 0.8289960153439428
------------------------------------------------------------------------------------------
For the Train set the mean RMSE = 0.9742239953249301; the mean MAE = 0.7783409654104727
For the Test set the mean RMSE = 0.979393117729509; the mean MAE = 0.782305984013413
Total runtime: 9.441281080245972 s
