In [7]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Project-local data loader.
# NOTE(review): the second line rebinds the name `dataEngineering` from the
# imported class to an instance of it, so the class itself is no longer
# reachable under this name until the import statement runs again.
from Modules.dataEngineering import dataEngineering
dataEngineering = dataEngineering()

In [8]:
# loadRatings() returns a mapping; its 'data' entry is kept as the ratings table.
# presumably a pandas DataFrame (later cells select columns and pivot it) — TODO confirm
df_ratings        = dataEngineering.loadRatings()['data']
# Number of rating rows, shown as the cell output.
df_ratings.shape[0]

2247560

In [11]:
from surprise import Dataset, SVD, Reader
from surprise.model_selection import GridSearchCV

# Wrap the ratings table as a Surprise dataset (user, item, rating columns).
# NOTE(review): this is not the MovieLens-100K sample — the recorded run loaded
# 2,247,560 ratings. rating_scale=(1, 5) assumes userRating lies in [1, 5] — TODO confirm.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df_ratings[['UserID', 'MovieID', 'userRating']], reader)

# Hyper-parameter grid: 2 x 2 x 2 = 8 SVD configurations.
param_grid = {
    "n_epochs": [10, 30], 
    "lr_all": [0.0005, 0.005], 
    "reg_all": [0.4, 0.6]
}

# Build the GridSearchCV object used for the parameter search
# (5-fold cross-validation, scored on RMSE and MAE), then run it.
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=5)
gs.fit(data)

Time spent on the parameter search: 30m

In [11]:
# Show the best cross-validated score per measure and the parameters that produced it.
display(gs.best_score, gs.best_params)

{'rmse': 0.8311952204703804, 'mae': 0.6283979309761831}

{'rmse': {'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4},
 'mae': {'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}}

In [1]:
# Keep the parameter combination that achieved the best RMSE; it configures the final model.
# NOTE(review): this cell depends on `gs` from the grid-search cell, yet its recorded
# execution counter (In [1]) is out of sequence with the surrounding cells —
# verify the notebook still works with Restart Kernel -> Run All.
option_SVD = gs.best_params['rmse']
option_SVD

In [12]:
from surprise.model_selection import train_test_split
from surprise import accuracy

In [13]:
# Hold out 20% of the ratings for evaluation; the split itself is seeded.
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)

# Configure SVD with the grid-search winner.
# NOTE(review): no random_state is passed to SVD here, so the fitted model (and the
# metrics below) may differ slightly between runs — confirm whether that is intended.
model_SVD = SVD(n_epochs=option_SVD['n_epochs'], lr_all=option_SVD['lr_all'], reg_all=option_SVD['reg_all'],)

model_SVD.fit(train_set)

# Score the held-out ratings; accuracy.rmse / accuracy.mae print the metric
# (see the cell output) and the returned values are kept in `rmse` / `mae`.
predictions = model_SVD.test(test_set)
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)

RMSE: 0.8310
MAE:  0.6280


In [14]:
def predict_model(pivot, model=None):
    """Fill the missing cells of a user x movie rating matrix with predictions.

    Parameters
    ----------
    pivot : pandas.DataFrame
        Rating matrix whose index holds user ids and whose columns hold movie
        ids. Cells that are null are treated as "not rated yet".
    model : object, optional
        Anything exposing ``predict(user_id, movie_id)`` that returns an object
        with an ``est`` attribute. When omitted, the notebook-level
        ``model_SVD`` is used, which keeps existing one-argument calls working.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        * a copy of ``pivot`` with every null cell replaced by its predicted
          rating (rounded to 4 decimals); ``pivot`` itself is not modified;
        * a long-format frame with columns ``UserID``, ``MovieID`` and
          ``prep_Rating`` holding only the predicted cells, sorted by
          ``prep_Rating`` in descending order.
    """
    if model is None:
        # Fall back to the model fitted earlier in the notebook.
        model = model_SVD

    matrix = pivot.copy()
    ratings = []
    for u_pos, (u_id, row) in enumerate(matrix.iterrows()):
        for m_pos, item in enumerate(row):
            if not pd.isnull(item):
                continue
            m_id = matrix.columns[m_pos]
            prep_rating = round(model.predict(u_id, m_id).est, 4)
            ratings.append([u_id, m_id, prep_rating])
            # Write through a single positional accessor. The chained form
            # `matrix.iloc[r][c] = v` assigns into a temporary row object and
            # is not guaranteed to reach `matrix`.
            matrix.iat[u_pos, m_pos] = prep_rating

    recomment_unwatched = pd.DataFrame(ratings, columns=['UserID', 'MovieID', 'prep_Rating'])
    return matrix, recomment_unwatched.sort_values(by=['prep_Rating'], ascending=False)

In [15]:
# Reshape the long-format ratings into a UserID x MovieID matrix of userRating values.
# NOTE(review): DataFrame.pivot raises if a (UserID, MovieID) pair occurs more than
# once — confirm the ratings are de-duplicated upstream.
df_pivot = df_ratings.pivot(index='UserID', columns='MovieID' , values='userRating')

In [16]:
# Try the predictor on a single user: iloc[1:2] selects only the second row of the
# pivot by position (UserID 4 in the recorded output).
mat, prep = predict_model(df_pivot.iloc[1:2])

In [17]:
# Display the filled rating matrix returned by predict_model.
mat

MovieID,1,2,3,4,5,6,7,8,9,10,...,188301,189203,189333,189713,192385,192389,192803,194448,195159,201773
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,3.0,3.3341,3.1331,2.813,3.0471,3.7369,3.3041,3.1503,2.9131,3.3166,...,3.4204,3.5288,4.0,3.4761,3.4571,3.2049,3.4732,3.5652,5.0,3.4423


In [18]:
# First 10 predicted ratings for user 4; predict_model returns `prep` sorted by
# prep_Rating in descending order, so these are the highest predictions.
prep[prep["UserID"] == 4].head(10)

Unnamed: 0,UserID,MovieID,prep_Rating
3533,4,170705,4.0555
1496,4,2920,3.9139
1466,4,2859,3.9131
1532,4,3000,3.9116
650,4,1203,3.9102
1966,4,4144,3.909
208,4,318,3.9036
2768,4,44555,3.8996
691,4,1251,3.8967
635,4,1178,3.8949


Save Model

In [19]:
import os

import joblib

# joblib.dump opens the target path for writing and does not create missing
# parent directories, so make sure 'models/' exists before saving.
os.makedirs('models', exist_ok=True)

# Persist the fitted SVD model; the returned list of written file names is the
# cell output.
joblib.dump(model_SVD, 'models/model_SVD.pkl')

['models/model_SVD.pkl']

Save Parameter

In [20]:
def human_format(num):
    """Format a number compactly with a K/M/B/T suffix.

    The value is first rounded to 3 significant digits, then divided by 1000
    until it is below 1000 or the largest suffix ('T') has been reached.
    Trailing zeros and a trailing decimal point are stripped.

    Examples: 999 -> '999', 1500 -> '1.5K', 2247560 -> '2.25M'.

    Parameters
    ----------
    num : int or float
        The number to format.

    Returns
    -------
    str
        The compact representation. Values of 1e15 and above are expressed in
        'T' (e.g. 1e18 -> '1000000T').
    """
    suffixes = ['', 'K', 'M', 'B', 'T']
    num = float('{:.3g}'.format(num))
    magnitude = 0
    # Cap at the last suffix: without this bound, values >= 1e15 would index
    # past the end of `suffixes` and raise IndexError.
    while abs(num) >= 1000 and magnitude < len(suffixes) - 1:
        magnitude += 1
        num /= 1000.0
    mantissa = '{:f}'.format(num).rstrip('0').rstrip('.')
    return '{}{}'.format(mantissa, suffixes[magnitude])

In [None]:
import json
import os

# human_format returns a string such as '2.25M'; use it directly as the folder
# name. It must not be wrapped in braces: that builds a one-element set, whose
# repr ("{'2.25M'}") would end up in the path.
folder = human_format(df_ratings.shape[0])

# Create the per-dataset-size directory if it does not exist yet.
os.makedirs(f'./Parameter/{folder}/', exist_ok=True)

# Store the tuned SVD parameters twice: once as the "current" file and once
# under the folder named after the dataset size.
with open('./Parameter/option_SVD.json', 'w') as f:
    json.dump(option_SVD, f, indent=6)

with open(f'./Parameter/{folder}/option_SVD.json', 'w') as f:
    json.dump(option_SVD, f, indent=6)