In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

from Modules.dataEngineering import dataEngineering
dataEngineering = dataEngineering()

In [2]:
df_ratings        = dataEngineering.loadRatings()['data']

In [10]:
from surprise import Dataset, SVD, Reader
from surprise.model_selection import GridSearchCV

# Use movielens-100K
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df_ratings[['UserID', 'MovieID', 'userRating']], reader)

param_grid = {
    "n_epochs": [10, 30], 
    "lr_all": [0.0005, 0.005], 
    "reg_all": [0.4, 0.6]
}

gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=5)
gs.fit(data)

In [11]:
display(gs.best_score, gs.best_params)

{'rmse': 0.8311952204703804, 'mae': 0.6283979309761831}

{'rmse': {'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4},
 'mae': {'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}}

{'rmse': 0.8311952204703804, 'mae': 0.6283979309761831}

In [22]:
output = {
    'rmse': {'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4},
    'mae': {'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}
}

In [23]:
option_SVD = {
    'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4
}

In [24]:
from surprise.model_selection import train_test_split
from surprise import accuracy

In [25]:
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)

model_SVD = SVD(n_epochs=option_SVD['n_epochs'], lr_all=option_SVD['lr_all'], reg_all=option_SVD['reg_all'],)

model_SVD.fit(train_set)

predictions = model_SVD.test(test_set)
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)

RMSE: 0.8310
MAE:  0.6280


In [26]:
def predict_model(pivot):
    matrix = pivot.copy()
    u_col = 0
    ratings = []
    for u_id, row in matrix.iterrows():
        for m_col, item in enumerate(row):
            m_id = matrix.columns[m_col]
            if pd.isnull(item):
                prep_rating = round(model_SVD.predict(u_id, m_id).est, 4)
                ratings.append([u_id, m_id, prep_rating])
                matrix.iloc[u_col][m_id] = prep_rating
        u_col += 1
    
    recomment_unwatched = pd.DataFrame(ratings, columns=['UserID', 'MovieID', 'prep_Rating'])
    return matrix, recomment_unwatched.sort_values(by=['prep_Rating'], ascending=False)

In [27]:
df_pivot = df_ratings.pivot(index='UserID', columns='MovieID' , values='userRating')

In [28]:
mat, prep = predict_model(df_pivot.iloc[1:2])

In [29]:
mat

MovieID,1,2,3,4,5,6,7,8,9,10,...,188301,189203,189333,189713,192385,192389,192803,194448,195159,201773
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,3.0,3.3341,3.1336,2.8127,3.0473,3.7369,3.3042,3.1491,2.9133,3.3165,...,3.4204,3.5287,4.0,3.4763,3.4571,3.2048,3.4734,3.5652,5.0,3.4423


In [30]:
prep[prep["UserID"] == 2].head(10)

Unnamed: 0,UserID,MovieID,prep_Rating


Save Model

In [31]:
import joblib

joblib.dump(model_SVD, 'models/model_SVD.pkl')

['models/model_SVD.pkl']