In [20]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

from Modules.dataEngineering import dataEngineering
dataEngineering = dataEngineering()

In [21]:
df_ratings        = dataEngineering.loadRatings()['data']

In [22]:
count_rating = df_ratings.shape[0]
print("Rating Count:", count_rating)

Rating Count: 1000209


In [23]:
df_ratings.head(5)

Unnamed: 0,UserID,MovieID,userRating,Title,Genres,meanRating
0,1,1193,5,One Flew Over the Cuckoo's Nest (1975),Drama,4.39
1,2,1193,5,One Flew Over the Cuckoo's Nest (1975),Drama,4.39
2,12,1193,4,One Flew Over the Cuckoo's Nest (1975),Drama,4.39
3,15,1193,4,One Flew Over the Cuckoo's Nest (1975),Drama,4.39
4,17,1193,5,One Flew Over the Cuckoo's Nest (1975),Drama,4.39


In [24]:
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate
from surprise import SVD, KNNWithMeans, KNNBasic, KNNWithZScore

options_KNNwithMeans = {
    'k': 40,
    'sim_options': {'name': 'pearson', 'user_based': True},
    'bsl_options': {'method': 'sgd', 'learning_rate': 0.0005}
}

option_SVD = {
    'n_epochs': 30, 'lr_all': 0.005, 'reg_all': 0.4
}

option_KNNBasic = {
    'k': 100,
    'sim_options': {'name': 'pearson', 'user_based': False},
    'bsl_options': {'method': 'sgd', 'learning_rate': 0.0005}
}

option_KNNWithZScore = {
    'k': 40,
    'sim_options': {'name': 'pearson', 'user_based': True},
    'bsl_options': {'method': 'sgd', 'learning_rate': 0.0005}
}

# Define the metrics you want to use for evaluation
metrics = ['rmse', 'mae']

# จำนวน Fold
fold = 5

In [25]:
# Create a Reader object
reader = Reader(rating_scale=(1, 5))

# Load the dataset using the Reader
data = Dataset.load_from_df(df_ratings[['UserID', 'MovieID', 'userRating']], reader)

In [26]:
# Define the models
model_SVD = SVD(
    n_epochs=option_SVD['n_epochs'], lr_all=option_SVD['lr_all'], reg_all=option_SVD['reg_all'],
)

model_KNNwithMeans = KNNWithMeans( k=options_KNNwithMeans['k'], 
    sim_options=options_KNNwithMeans['sim_options'], bsl_options=options_KNNwithMeans['bsl_options']
)

model_KNNBasic = KNNBasic(k=option_KNNBasic['k'], 
    sim_options=option_KNNBasic['sim_options'], bsl_options=option_KNNBasic['bsl_options'],
)

model_KNNWithZScore = KNNWithZScore(k=option_KNNWithZScore['k'], 
    sim_options=option_KNNWithZScore['sim_options'], bsl_options=option_KNNWithZScore['bsl_options'],
)



In [27]:
# Perform cross-validation for SVD
try:
    results_SVD = cross_validate(model_SVD, data, measures=metrics, cv=fold, verbose=True)
except (MemoryError):
    results_SVD = {'test_rmse': [0]*fold, 'test_mae': [0]*fold,
                            'fit_time': [0]*fold, 'test_time': [0]*fold}
    print("ใช้ Memory เกิน")


Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9299  0.9295  0.9270  0.9277  0.9260  0.9280  0.0015  
MAE (testset)     0.7444  0.7428  0.7417  0.7431  0.7422  0.7429  0.0009  
Fit time          19.80   18.53   20.71   23.18   21.51   20.75   1.57    
Test time         2.08    2.28    2.08    2.39    2.19    2.21    0.12    


In [28]:
# Perform cross-validation for KNN with Means
try:
    results_KNNwithMeans = cross_validate(model_KNNwithMeans, data, measures=metrics, cv=fold, verbose=True)
except (MemoryError):
    results_KNNwithMeans = {'test_rmse': [0]*fold, 'test_mae': [0]*fold,
                            'fit_time': [0]*fold, 'test_time': [0]*fold}
    print("ใช้ Memory เกิน")
    

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9217  0.9175  0.9168  0.9189  0.9169  0.9184  0.0018  
MAE (testset)     0.7256  0.7215  0.7231  0.7240  0.7225  0.7234  0.0014  
Fit time          104.29  101.38  106.10  102.58  104.08  103.69  1.61    
Test time         119.86  128.17  134.53  126.43  116.31  125.06  6.40    


In [29]:
# Perform cross-validation for KNN Basic
try:
    results_KNNBasic = cross_validate(model_KNNBasic, data, measures=metrics, cv=fold, verbose=True)
except (MemoryError):
    results_KNNBasic = {'test_rmse': [0]*fold, 'test_mae': [0]*fold,
                            'fit_time': [0]*fold, 'test_time': [0]*fold}
    print("ใช้ Memory เกิน")


Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9843  0.9822  0.9856  0.9850  0.9863  0.9847  0.0014  
MAE (testset)     0.7896  0.7874  0.7897  0.7900  0.7906  0.7895  0.0011  
Fit time          52.87   46.41   51.85   51.49   43.63   49.25   3.59    
Test time         87.77   90.33   93.68   79.49   75.87   85.43   6.70    


In [30]:
# Perform cross-validation for KNN with Z-Score
try:
    results_KNNWithZScore = cross_validate(model_KNNWithZScore, data, measures=metrics, cv=fold, verbose=True)
except (MemoryError):
    results_KNNWithZScore = {'test_rmse': [0]*fold, 'test_mae': [0]*fold,
                            'fit_time': [0]*fold, 'test_time': [0]*fold}
    print("ใช้ Memory เกิน")


Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithZScore on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9161  0.9178  0.9143  0.9139  0.9159  0.9156  0.0014  
MAE (testset)     0.7194  0.7215  0.7194  0.7181  0.7189  0.7194  0.0011  
Fit time          94.16   129.65  101.28  88.41   112.00  105.10  14.58   
Test time         126.10  130.98  107.65  121.57  133.45  123.95  9.12    


In [31]:
import numpy as np

def Avg(lis):
    return np.average(lis)

def Std(lis):
    return np.std(lis)

In [32]:
def change2DataFrame(results, name):
    print(name+':')
    dic = { 
        '_' :['Fold %d'%x for x in range(1,fold+1)],
        'RMSE' :results['test_rmse'],
        'MAE' :results['test_mae'],
        'Fit time':(results['fit_time']),
        'Test time':(results['test_time'])
    }
        
    df1 = pd.DataFrame(dic,index=range(1,fold+1), columns=['_','RMSE', 'MAE', 'Fit time', 'Test time'])
    new_dict = {
        '_' :['Mean', 'Std'],
        'RMSE' :[df1['RMSE'].mean(), df1['RMSE'].std()],
        'MAE' :[df1['MAE'].mean(), df1['MAE'].std()],
        'Fit time':[Avg(df1['Fit time']), Std(df1['Fit time'])],
        'Test time':[Avg(df1['Test time']), Std(df1['Test time'])]
    }
    df2 = pd.DataFrame(new_dict, index=[fold, fold+1], columns=['_','RMSE', 'MAE', 'Fit time', 'Test time'])
    df_results = pd.concat([df1, df2], axis=0)
    return df_results


In [33]:
df_Basic = change2DataFrame(results_KNNBasic, 'KNNBasic Results')
df_Basic

KNNBasic Results:


Unnamed: 0,_,RMSE,MAE,Fit time,Test time
1,Fold 1,0.984328,0.789638,52.872709,87.770506
2,Fold 2,0.982237,0.787369,46.410453,90.325169
3,Fold 3,0.985621,0.789686,51.854888,93.683055
4,Fold 4,0.984973,0.790038,51.486389,79.493599
5,Fold 5,0.986278,0.790635,43.633074,75.869641
5,Mean,0.984687,0.789473,49.251502,85.428394
6,Std,0.001551,0.001242,3.592319,6.696191


In [34]:
df_ZScore = change2DataFrame(results_KNNWithZScore, 'KNNWithZScore Results')
df_ZScore

KNNWithZScore Results:


Unnamed: 0,_,RMSE,MAE,Fit time,Test time
1,Fold 1,0.916058,0.719377,94.163697,126.098214
2,Fold 2,0.917818,0.721491,129.64993,130.980985
3,Fold 3,0.914274,0.71941,101.278562,107.651684
4,Fold 4,0.913916,0.718061,88.407785,121.5738
5,Fold 5,0.915869,0.71891,111.996676,133.449894
5,Mean,0.915587,0.71945,105.09933,123.950916
6,Std,0.001565,0.001264,14.581491,9.116787


In [35]:
df_Mean = change2DataFrame(results_KNNwithMeans, 'KNNwithMeans Results')
df_Mean

KNNwithMeans Results:


Unnamed: 0,_,RMSE,MAE,Fit time,Test time
1,Fold 1,0.921702,0.725616,104.289695,119.86244
2,Fold 2,0.917484,0.72151,101.384485,128.166592
3,Fold 3,0.916792,0.723117,106.103628,134.53184
4,Fold 4,0.918905,0.724044,102.581015,126.434387
5,Fold 5,0.91691,0.722494,104.082739,116.313753
5,Mean,0.918359,0.723356,103.688312,125.061802
6,Std,0.002049,0.001564,1.605286,6.399485


In [36]:
df_SVD = change2DataFrame(results_SVD, 'SVD Results')
df_SVD

SVD Results:


Unnamed: 0,_,RMSE,MAE,Fit time,Test time
1,Fold 1,0.929901,0.744415,19.802861,2.076439
2,Fold 2,0.929509,0.742836,18.526754,2.283187
3,Fold 3,0.926971,0.741708,20.707201,2.08317
4,Fold 4,0.927699,0.743142,23.183814,2.391488
5,Fold 5,0.92601,0.742162,21.512458,2.191016
5,Mean,0.928018,0.742853,20.746618,2.20506
6,Std,0.001658,0.001038,1.571398,0.12038


Save Result to xlsx

In [37]:
def human_format(num):
    num = float('{:.3g}'.format(num))
    magnitude = 0
    while abs(num) >= 1000:
        magnitude += 1
        num /= 1000.0
    return '{}{}'.format('{:f}'.format(num).rstrip('0').rstrip('.'), ['', 'K', 'M', 'B', 'T'][magnitude])

In [38]:
import pandas as pd

path = 'D:\\Coding\\Machine_Learning\\Recommendation_System\\Results_Comparing\\'
file_name = path + f'Count_Rating_{human_format(count_rating)}' + '.xlsx'

# Export DataFrames to Excel using ExcelWriter 
with pd.ExcelWriter(file_name) as excel_writer:
    df_Basic.to_excel(excel_writer, sheet_name='KNN Basic', index=False)
    df_ZScore.to_excel(excel_writer, sheet_name='KNN With Z-Score', index=False)
    df_Mean.to_excel(excel_writer, sheet_name='KNN with Means', index=False)
    df_SVD.to_excel(excel_writer, sheet_name='SVD', index=False)