In [7]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

from Modules.dataEngineering import dataEngineering
dataEngineering = dataEngineering()

In [8]:
df_ratings        = dataEngineering.loadRatings()['data']
df_ratings.shape

(2247560, 3)

In [9]:
from surprise import Dataset, KNNWithZScore, Reader
from surprise.model_selection import GridSearchCV

# Use movielens-100K
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df_ratings[['UserID', 'MovieID', 'userRating']], reader)

In [4]:
sim_options = {
    'name': ['cosine', 'pearson'],
    'user_based': [False, True],
}
bsl_options = {
    'method': ['sgd'],
    'learning_rate': [0.0005, 0.005]
}
param_grid = {
    'k' : [20, 60],
    'sim_options': sim_options,
    'bsl_options': bsl_options
}

# สร้างตัว GridSearchCV สำหรับการค้นหาพารามิเตอร์
gs = GridSearchCV(KNNWithZScore, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing t

681m 19.9s

In [5]:
display(gs.best_score, gs.best_params)

{'rmse': 0.7741590459509089, 'mae': 0.5728705028015949}

{'rmse': {'k': 60,
  'sim_options': {'name': 'pearson', 'user_based': False},
  'bsl_options': {'method': 'sgd', 'learning_rate': 0.0005}},
 'mae': {'k': 20,
  'sim_options': {'name': 'pearson', 'user_based': False},
  'bsl_options': {'method': 'sgd', 'learning_rate': 0.0005}}}

In [6]:
option_KNNWithZScore = gs.best_params['rmse']
option_KNNWithZScore

{'k': 60,
 'sim_options': {'name': 'pearson', 'user_based': False},
 'bsl_options': {'method': 'sgd', 'learning_rate': 0.0005}}

In [11]:
from surprise.model_selection import train_test_split
from surprise import accuracy

In [12]:
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)

model_KNNWithZScore = KNNWithZScore(k=40, sim_options=option_KNNWithZScore['sim_options'], bsl_options=option_KNNWithZScore['bsl_options'],)

model_KNNWithZScore.fit(train_set)

predictions = model_KNNWithZScore.test(test_set)
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)

Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.7635
MAE:  0.5646


In [13]:
def predict_model(pivot):
    matrix = pivot.copy()
    u_col = 0
    ratings = []
    for u_id, row in matrix.iterrows():
        for m_col, item in enumerate(row):
            m_id = matrix.columns[m_col]
            if pd.isnull(item):
                prep_rating = round(model_KNNWithZScore.predict(u_id, m_id).est, 4)
                ratings.append([u_id, m_id, prep_rating])
                matrix.iloc[u_col][m_id] = prep_rating
        u_col += 1
    
    recomment_unwatched = pd.DataFrame(ratings, columns=['UserID', 'MovieID', 'prep_Rating'])
    return matrix, recomment_unwatched.sort_values(by=['prep_Rating'], ascending=False)

In [14]:
df_pivot = df_ratings.pivot(index='UserID', columns='MovieID' , values='userRating')

In [15]:
mat, prep = predict_model(df_pivot.iloc[1:2])

In [16]:
mat

MovieID,1,2,3,4,5,6,7,8,9,10,...,188301,189203,189333,189713,192385,192389,192803,194448,195159,201773
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,3.0,3.1325,2.8048,2.0316,2.6317,4.0836,2.9712,2.5843,2.3478,3.0486,...,2.8745,3.7632,4.0,3.8286,3.4638,2.6322,3.0643,3.6144,5.0,3.2295


In [17]:
prep.head(10)

Unnamed: 0,UserID,MovieID,prep_Rating
1517,4,2959,4.496
1532,4,3000,4.4712
2255,4,5690,4.4496
2625,4,27773,4.4155
3533,4,170705,4.3958
650,4,1203,4.3836
3518,4,166024,4.3786
41,4,47,4.3652
2491,4,7361,4.3481
1181,4,2288,4.3466


Save Model

In [18]:
import joblib

joblib.dump(model_KNNWithZScore, 'models/model_KNNWithZScore.pkl')

['models/model_KNNWithZScore.pkl']

Save Parameter

In [19]:
def human_format(num):
    num = float('{:.3g}'.format(num))
    magnitude = 0
    while abs(num) >= 1000:
        magnitude += 1
        num /= 1000.0
    return '{}{}'.format('{:f}'.format(num).rstrip('0').rstrip('.'), ['', 'K', 'M', 'B', 'T'][magnitude])

In [20]:
import json
import os

folder = human_format(df_ratings.shape[0])

# Create directory if directory exists.
os.makedirs(f'./Parameter/{folder}/', exist_ok=True) 

with open(f'./Parameter/option_KNNWithZScore.json', 'w') as f:
    json.dump(option_KNNWithZScore, f, indent = 6)

with open(f'./Parameter/{folder}/option_KNNWithZScore.json', 'w') as f:
    json.dump(option_KNNWithZScore, f, indent = 6)