In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

from Modules.dataEngineering import dataEngineering
# NOTE(review): this rebinds the name `dataEngineering` from the imported class
# to an instance of it, so the class itself is no longer reachable under that
# name in later cells. Later cells call methods on this instance.
dataEngineering = dataEngineering()

In [2]:
# Load the ratings through the project loader; only the 'data' entry of its
# result is kept. Later cells read the UserID / MovieID / userRating columns.
df_ratings = dataEngineering.loadRatings()['data']

In [3]:
from surprise import Dataset, KNNWithZScore, Reader
from surprise.model_selection import GridSearchCV

# Wrap the ratings frame in a Surprise Dataset.
# NOTE(review): this comment originally said "Use movielens-100K", but the data
# comes from dataEngineering.loadRatings() — confirm which dataset this really is.
# Assumes userRating lies in [1, 5] — TODO confirm against the data.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df_ratings[['UserID', 'MovieID', 'userRating']], reader)

# Similarity settings to search over: two similarity measures, each tried in
# item-based (user_based=False) and user-based (user_based=True) mode.
sim_options = {
    'name': ['cosine', 'pearson'],
    'user_based': [False, True],
}
# Baseline-estimate settings to search over.
# NOTE(review): presumably these only matter for algorithms/similarities that
# compute baselines; if KNNWithZScore with cosine/pearson ignores them, this
# axis just doubles the grid — confirm against the Surprise documentation.
bsl_options = {
    'method': ['sgd'],
    'learning_rate': [0.0005, 0.005]
}
# Full grid handed to GridSearchCV; k is pinned to a single value.
param_grid = {
    'k' : [40],
    'sim_options': sim_options,
    'bsl_options': bsl_options
}

# Build the GridSearchCV object for the hyper-parameter search.
# Left commented out; the cells below keep results from a previous run.
# gs = GridSearchCV(KNNWithZScore, param_grid, measures=['rmse', 'mae'], cv=5)

# gs.fit(data)


In [4]:
# gs.best_score  -- the grid search above is commented out; the result shown below is kept from an earlier run

{'rmse': 0.8927054151336782, 'mae': 0.6742019453132718}

In [5]:
# gs.best_params      

In [6]:
# Best hyper-parameter set per metric (presumably transcribed from
# gs.best_params of an earlier grid-search run — verify before relying on it).
output = {
    'rmse': {
        'k': 40,
        'sim_options': {
            'name': 'pearson',
            'user_based': True,
        },
        'bsl_options': {
            'method': 'sgd',
            'learning_rate': 0.0005,
        },
    },
    'mae': {
        'k': 40,
        'sim_options': {
            'name': 'pearson',
            'user_based': True,
        },
        'bsl_options': {
            'method': 'sgd',
            'learning_rate': 0.0005,
        },
    },
}

In [7]:
# Configuration used to build the final KNNWithZScore model:
# 40 neighbours, user-based Pearson similarity, SGD baseline settings.
option_KNNWithZScore = {
    'k': 40,
    'sim_options': {
        'name': 'pearson',
        'user_based': True,
    },
    'bsl_options': {
        'method': 'sgd',
        'learning_rate': 0.0005,
    },
}

In [8]:
from surprise.model_selection import train_test_split
from surprise import accuracy

In [9]:
# Hold out 20% of the ratings for evaluation; the fixed random_state keeps the
# split reproducible across runs.
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)

# Build the model entirely from option_KNNWithZScore. Previously k was
# hard-coded here, so editing 'k' in the option dict would silently not change
# the fitted model.
model_KNNWithZScore = KNNWithZScore(
    k=option_KNNWithZScore['k'],
    sim_options=option_KNNWithZScore['sim_options'],
    bsl_options=option_KNNWithZScore['bsl_options'],
)

# Fit on the training split only.
# NOTE(review): this same 80%-trained model is what later cells use for
# prediction and persist to disk — confirm that is intended.
model_KNNWithZScore.fit(train_set)

# Score the held-out split; accuracy.rmse / accuracy.mae print the metric
# (see the cell output) and return its value.
predictions = model_KNNWithZScore.test(test_set)
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)

Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.8932
MAE:  0.6708


In [10]:
def predict_model(pivot, model=None):
    """Fill the missing cells of a user x movie rating matrix with predictions.

    Parameters
    ----------
    pivot : pandas.DataFrame
        Rating matrix whose index holds user ids and whose columns hold movie
        ids. Cells that are null are treated as "not rated yet".
    model : object, optional
        Any object exposing ``predict(user_id, movie_id)`` whose result has an
        ``est`` attribute. Defaults to the notebook-level
        ``model_KNNWithZScore`` so existing ``predict_model(pivot)`` calls keep
        working.

    Returns
    -------
    tuple[pandas.DataFrame, pandas.DataFrame]
        ``(matrix, recommendations)`` where ``matrix`` is a copy of ``pivot``
        with every null cell replaced by the predicted rating (rounded to 4
        decimals), and ``recommendations`` lists those filled cells as
        ``UserID`` / ``MovieID`` / ``prep_Rating`` rows sorted by
        ``prep_Rating`` in descending order. ``pivot`` itself is not modified.
    """
    if model is None:
        # Fall back to the model fitted earlier in the notebook.
        model = model_KNNWithZScore

    matrix = pivot.copy()
    ratings = []
    # Iterate the untouched input while writing into the copy, so the frame
    # being looped over is never mutated mid-iteration.
    for u_pos, (u_id, row) in enumerate(pivot.iterrows()):
        for m_pos, (m_id, item) in enumerate(row.items()):
            if pd.isnull(item):
                prep_rating = round(model.predict(u_id, m_id).est, 4)
                ratings.append([u_id, m_id, prep_rating])
                # Positional scalar write. The former chained form
                # `matrix.iloc[u_col][m_id] = ...` assigns into a temporary
                # row and is not guaranteed to reach `matrix` (it never does
                # under pandas Copy-on-Write).
                matrix.iat[u_pos, m_pos] = prep_rating

    recommendations = pd.DataFrame(ratings, columns=['UserID', 'MovieID', 'prep_Rating'])
    return matrix, recommendations.sort_values(by=['prep_Rating'], ascending=False)

In [11]:
# Reshape the long ratings table into a wide matrix: one row per UserID, one
# column per MovieID, cell value = userRating (user/movie pairs with no rating
# come out as NaN).
df_pivot = df_ratings.pivot(
    index='UserID',
    columns='MovieID',
    values='userRating',
)

In [12]:
# iloc[1:2] is a positional slice that keeps only the second row of the pivot
# (UserID 2 in the output below), so predictions are generated for one user.
mat, prep = predict_model(df_pivot.iloc[1:2])

In [13]:
# Display the selected user's row with the missing ratings filled in.
mat

MovieID,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,4.4733,4.0661,3.7473,4.02,3.4166,4.3658,3.3899,3.7846,4.02,4.2609,...,3.8573,3.5018,4.2087,4.2087,3.8573,4.2087,3.8573,3.8573,3.8573,3.5018


In [14]:
# Top 10 predicted ratings for movies the user has not rated yet
# (prep is already sorted by prep_Rating, highest first).
prep.head(10)

Unnamed: 0,UserID,MovieID,prep_Rating
2205,2,2936,5.0
7035,2,69524,5.0
5043,2,7926,5.0
2183,2,2905,5.0
5050,2,7939,5.0
2140,2,2857,5.0
5128,2,8261,5.0
5187,2,8477,5.0
5221,2,8580,5.0
2011,2,2686,5.0


Save Model

In [15]:
import os

import joblib

# Persist the fitted model so it can be reloaded without retraining.
model_path = 'models/model_KNNWithZScore.pkl'

# joblib.dump does not create missing parent directories; make sure the target
# folder exists so a fresh checkout does not fail here.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Kept as the last expression so the cell still displays the written file list.
joblib.dump(model_KNNWithZScore, model_path)

['models/model_KNNWithZScore.pkl']