In [1]:
import pandas as pd
import warnings
# Suppresses every warning for the rest of the session, including the
# pandas / library warnings that would otherwise flag risky operations.
warnings.filterwarnings('ignore')

from Modules.dataEngineering import dataEngineering
# Rebinds the name `dataEngineering` from the imported object (presumably a
# class) to the instance it returns; later cells call methods on this instance.
dataEngineering = dataEngineering()

In [2]:
# loadRatings() returns a mapping; its 'data' entry is the ratings table used
# throughout the notebook (columns UserID, MovieID, userRating are read later).
ratings_payload = dataEngineering.loadRatings()
df_ratings = ratings_payload['data']
df_ratings.shape

(2247560, 3)

In [7]:
from surprise import Dataset, KNNBasic, Reader
from surprise.model_selection import GridSearchCV

# Wrap the MovieLens ratings frame in a Surprise Dataset.
# The three columns are passed in (user, item, rating) order.
# NOTE(review): rating_scale=(1, 5) assumes no rating lies outside 1..5 --
# confirm against df_ratings['userRating'] (some MovieLens releases start at 0.5).
rating_columns = ['UserID', 'MovieID', 'userRating']
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df_ratings[rating_columns], reader)

In [4]:
# Hyper-parameter grid for KNNBasic: neighbourhood size `k`, similarity
# measure, and whether neighbours are users (True) or items (False).
sim_options = {
    'name': ['cosine', 'pearson'],
    'user_based': [False, True],
}
# KNNBasic does not compute baseline estimates with the 'cosine' / 'pearson'
# similarities, so bsl_options cannot change any score. Searching over two
# learning rates only doubled the number of (very slow) fits, so a single value
# is kept. The key itself stays in the grid because later cells read
# option_KNNBasic['bsl_options'] from the best parameters.
bsl_options = {
    'method': ['sgd'],
    'learning_rate': [0.0005]
}
param_grid = {
    'k' : [40, 60],
    'sim_options': sim_options,
    'bsl_options': bsl_options
}

# Create the GridSearchCV instance used for the parameter search
# (2 k-values x 4 similarity settings, each scored with 5-fold CV).
gs = GridSearchCV(KNNBasic, param_grid, measures=['rmse', 'mae'], cv=5)

gs.fit(data)


Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing th

เวลาที่ใช้ในการหาค่า Parameter: 1000m+

In [5]:
# Show the best cross-validated scores first, then the parameter sets behind them.
display(gs.best_score)
display(gs.best_params)

{'rmse': 0.8058334145566141, 'mae': 0.6096313249058015}

{'rmse': {'k': 40,
  'sim_options': {'name': 'pearson', 'user_based': False},
  'bsl_options': {'method': 'sgd', 'learning_rate': 0.0005}},
 'mae': {'k': 40,
  'sim_options': {'name': 'pearson', 'user_based': False},
  'bsl_options': {'method': 'sgd', 'learning_rate': 0.0005}}}

In [28]:
# Take the RMSE-optimal parameter set as a new top-level dict, so adding or
# replacing keys here does not touch gs.best_params (the nested dicts are
# still shared with it).
option_KNNBasic = dict(gs.best_params['rmse'])
option_KNNBasic

{'k': 40,
 'sim_options': {'name': 'pearson', 'user_based': False},
 'bsl_options': {'method': 'sgd', 'learning_rate': 0.0005}}

In [3]:
from surprise.model_selection import train_test_split
from surprise import accuracy, KNNBasic

In [8]:
# Hold out 20% of the ratings, with a fixed seed, to score the selected configuration.
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)

# Rebuild KNNBasic from the parameters chosen by the grid search.
# NOTE(review): bsl_options is forwarded as-is; confirm KNNBasic actually reads it.
model_KNNBasic = KNNBasic(
    k=option_KNNBasic['k'],
    sim_options=option_KNNBasic['sim_options'],
    bsl_options=option_KNNBasic['bsl_options'],
)

model_KNNBasic.fit(train_set)

# Score the hold-out set; the metric values are kept in `rmse` and `mae`.
predictions = model_KNNBasic.test(test_set)
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)

Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.8055
MAE:  0.6095


In [9]:
def predict_model(pivot, model=None):
    """Fill the missing cells of a user x movie rating table with predictions.

    Parameters
    ----------
    pivot : pandas.DataFrame
        Rating table whose index holds user ids and whose columns hold movie
        ids. Null cells are treated as "not rated" and get a prediction.
        The input frame is not modified.
    model : object, optional
        Object whose ``predict(user_id, movie_id)`` returns something with an
        ``est`` attribute. Defaults to the notebook-level ``model_KNNBasic``,
        looked up at call time.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``matrix``: copy of ``pivot`` with every null cell replaced by the
        prediction rounded to 4 decimals.
        ``recommendations``: one row per filled cell with columns
        ``UserID``, ``MovieID``, ``prep_Rating``, sorted by ``prep_Rating``
        in descending order.
    """
    if model is None:
        model = model_KNNBasic

    matrix = pivot.copy()
    ratings = []
    for row_pos, (u_id, row) in enumerate(matrix.iterrows()):
        for col_pos, item in enumerate(row):
            if not pd.isnull(item):
                continue
            m_id = matrix.columns[col_pos]
            prep_rating = round(model.predict(u_id, m_id).est, 4)
            ratings.append([u_id, m_id, prep_rating])
            # Write through a single positional setter. The previous chained
            # form `matrix.iloc[r][m_id] = ...` assigns into a temporary row
            # and loses the value whenever that row is a copy (mixed column
            # dtypes, or pandas Copy-on-Write).
            matrix.iat[row_pos, col_pos] = prep_rating

    recommend_unwatched = pd.DataFrame(ratings, columns=['UserID', 'MovieID', 'prep_Rating'])
    return matrix, recommend_unwatched.sort_values(by=['prep_Rating'], ascending=False)

In [10]:
# Reshape the long ratings table into a wide user x movie table;
# user/movie pairs without a rating end up as NaN.
df_pivot = df_ratings.pivot(
    index='UserID',
    columns='MovieID',
    values='userRating',
)

In [11]:
# Demo on a single user: the positional slice 1:2 keeps only the second row
# of the pivot, as a one-row DataFrame.
single_user_pivot = df_pivot.iloc[1:2]
mat, prep = predict_model(single_user_pivot)

In [12]:
# Display the rating row returned by predict_model, with its missing cells filled in.
mat

MovieID,1,2,3,4,5,6,7,8,9,10,...,188301,189203,189333,189713,192385,192389,192803,194448,195159,201773
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,3.0,3.0809,3.2041,3.377,3.2468,3.9273,3.2609,3.0154,3.2272,3.3335,...,2.631,3.5237,4.0,3.8349,3.4657,2.5791,2.7001,3.1027,5.0,2.9218


In [13]:
# First five rows of the prediction table, which predict_model returns
# sorted by prep_Rating in descending order.
prep.head(5)

Unnamed: 0,UserID,MovieID,prep_Rating
2105,4,4848,4.1988
1181,4,2288,4.1717
3387,4,112183,4.1691
3549,4,183869,4.1615
3368,4,108729,4.1475


Save Model

In [14]:
import os

import joblib

# joblib.dump does not create missing parent directories, so make sure the
# target folder exists first (same approach as the parameter-saving cell).
os.makedirs('models', exist_ok=True)

joblib.dump(model_KNNBasic, 'models/model_KNNBasic.pkl')

['models/model_KNNBasic.pkl']

Save Parameter

In [15]:
def human_format(num):
    """Format a number compactly with a K / M / B / T suffix.

    The value is first rounded to 3 significant digits, then divided by 1000
    until its magnitude is below 1000 or the largest suffix ('T') is reached.
    Trailing zeros and a dangling decimal point are stripped.

    Examples: 999 -> '999', 1500 -> '1.5K', 2247560 -> '2.25M'.

    Values of 1e15 and above used to raise IndexError because the suffix list
    ran out; they now stay on the 'T' suffix (e.g. 1e15 -> '1000T').
    """
    suffixes = ['', 'K', 'M', 'B', 'T']
    num = float('{:.3g}'.format(num))
    magnitude = 0
    # Stop scaling at the last suffix so the lookup below can never overflow.
    while abs(num) >= 1000 and magnitude < len(suffixes) - 1:
        magnitude += 1
        num /= 1000.0
    mantissa = '{:f}'.format(num).rstrip('0').rstrip('.')
    return '{}{}'.format(mantissa, suffixes[magnitude])

In [None]:
import json
import os

folder = human_format(df_ratings.shape[0])

# Create the size-specific directory (and ./Parameter itself) if it does not exist yet.
os.makedirs(f'./Parameter/{folder}/', exist_ok=True)

# The same parameter set is written twice: once at the top level and once
# under the folder named after the number of ratings.
output_paths = (
    './Parameter/option_KNNBasic.json',
    f'./Parameter/{folder}/option_KNNBasic.json',
)
for output_path in output_paths:
    with open(output_path, 'w') as f:
        json.dump(option_KNNBasic, f, indent = 6)