Reference: https://surpriselib.com/

In [1]:
import os

import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns

from surprise import Dataset, Reader
from surprise import accuracy

import random

import pickle

# Lift every pandas display limit so frames are rendered without truncation.
# NOTE(review): with no row limit, displaying a large frame directly (rather
# than via .head()) will try to render every row.
for option_name in (
    'display.max_rows',
    'display.max_columns',
    'display.width',
    'display.max_colwidth',
):
    pd.set_option(option_name, None)

In [2]:
# Read the prepared ratings table and print its schema / memory footprint.
ratings_csv_path = os.path.join('data', 'prepared_ratings.csv')
df_ratings = pd.read_csv(ratings_csv_path)
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18062183 entries, 0 to 18062182
Data columns (total 4 columns):
 #   Column  Dtype  
---  ------  -----  
 0   id      int64  
 1   userId  int64  
 2   rating  float64
 3   date    object 
dtypes: float64(1), int64(2), object(1)
memory usage: 551.2+ MB


In [3]:
def _load_pickled_model(model_path):
    """Deserialize and return the object stored in the pickle file at ``model_path``.

    SECURITY: ``pickle.load`` can execute arbitrary code embedded in the file.
    Only use this on model files that were produced by a trusted source.

    Raises whatever ``open`` / ``pickle.load`` raise (e.g. ``FileNotFoundError``
    when the file is missing).
    """
    with open(model_path, 'rb') as model_file:
        return pickle.load(model_file)


best_algo_path = os.path.join('data', 'models', 'best_SVD_model.pkl')
base_algo_path = os.path.join('data', 'models', 'base_SVD_model.pkl')

# Both models go through the same loader so the two paths cannot drift apart.
best_algo = _load_pickled_model(best_algo_path)
base_algo = _load_pickled_model(base_algo_path)

1. Select Specific Columns Only

In [4]:
# Keep only the identifier and rating columns; every other column is dropped.
kept_columns = ['id', 'userId', 'rating']
df_ratings = df_ratings.loc[:, kept_columns]
df_ratings.head()

Unnamed: 0,id,userId,rating
0,862,8,4.0
1,862,9,4.5
2,862,12,4.0
3,862,20,4.0
4,862,24,4.0


2. Create Surprise Dataset

In [5]:
# Derive the rating scale for Surprise from the observed ratings.
rating_column = df_ratings['rating']
min_rating, max_rating = rating_column.min(), rating_column.max()
reader = Reader(rating_scale=(min_rating, max_rating))

# Build the Surprise dataset from the frame, passing the columns in
# (user, item, rating) order.
surprise_columns = ['userId', 'id', 'rating']
data = Dataset.load_from_df(df_ratings[surprise_columns], reader)

3. Split Dataset

In [6]:
# Seed for the random number generator, so the shuffle is reproducible.
RANDOM_STATE = 42
# Fraction of the ratings kept for the training set (80%).
TRAIN_SPLIT_SIZE = 0.8

In [7]:
# Seed first so the shuffle below yields the same order on every fresh run.
random.seed(RANDOM_STATE)

# `raw_ratings` is the very list held by `data`, so the shuffle is in place.
raw_ratings = data.raw_ratings
random.shuffle(raw_ratings)

# The first TRAIN_SPLIT_SIZE share becomes set A; the remainder is set B.
threshold = int(TRAIN_SPLIT_SIZE * len(raw_ratings))
A_raw_ratings, B_raw_ratings = raw_ratings[:threshold], raw_ratings[threshold:]

# From here on `data` only contains set A.
# NOTE(review): because `data` is overwritten, re-running this cell without
# rebuilding `data` would shuffle and split the already-reduced set A.
data.raw_ratings = A_raw_ratings

4. Evaluation

In [8]:
# Build a trainset from `data`, whose raw_ratings were replaced by set A in the
# split cell above.
# NOTE(review): in this notebook `trainset` is only referenced by the
# commented-out training-set evaluation — confirm it is still needed.
trainset = data.build_full_trainset()

In [9]:
# Turn the held-out raw ratings (set B) into the testset passed to `.test()` below.
testset = data.construct_testset(B_raw_ratings)  
# trainset_test = trainset.build_testset()

In [10]:
# train_best_predictions = best_algo.test(trainset_test)
# train_best_rmse = accuracy.rmse(train_best_predictions)
# train_best_mae = accuracy.mae(train_best_predictions)
# train_best_fcp = accuracy.fcp(train_best_predictions)

In [11]:
# train_base_predictions = base_algo.test(trainset_test)
# train_base_rmse = accuracy.rmse(train_base_predictions)
# train_base_mae = accuracy.mae(train_base_predictions)
# train_base_fcp = accuracy.fcp(train_base_predictions)

In [12]:
# Score the held-out ratings with the "best" model, then compute RMSE, MAE and
# FCP in that order (each accuracy call also prints its value).
test_best_predictions = best_algo.test(testset)
test_best_rmse, test_best_mae, test_best_fcp = (
    metric(test_best_predictions)
    for metric in (accuracy.rmse, accuracy.mae, accuracy.fcp)
)

RMSE: 0.8177
MAE:  0.6199
FCP:  0.7182


In [13]:
# Score the held-out ratings with the base model, then compute RMSE, MAE and
# FCP in that order (each accuracy call also prints its value).
test_base_predictions = base_algo.test(testset)
test_base_rmse, test_base_mae, test_base_fcp = (
    metric(test_base_predictions)
    for metric in (accuracy.rmse, accuracy.mae, accuracy.fcp)
)

RMSE: 0.8089
MAE:  0.6119
FCP:  0.7265


In [14]:
# # Print evaluation metrics for the best model on the training set
# print("Best Model - Training Set:")
# print(f"RMSE: {train_best_rmse:.4f}")
# print(f"MAE: {train_best_mae:.4f}")
# print(f"FCP: {train_best_fcp:.4f}")
# print()

# # Print evaluation metrics for the base model on the training set
# print("Base Model - Training Set:")
# print(f"RMSE: {train_base_rmse:.4f}")
# print(f"MAE: {train_base_mae:.4f}")
# print(f"FCP: {train_base_fcp:.4f}")
# print()

# Print the test-set metrics of both models, best model first, with one blank
# line separating the two reports.
# NOTE(review): in the recorded output the "best" model is worse than the base
# model on all three metrics (higher RMSE/MAE, lower FCP) — confirm the tuning.
test_reports = [
    ("Best Model - Test Set:", test_best_rmse, test_best_mae, test_best_fcp),
    ("Base Model - Test Set:", test_base_rmse, test_base_mae, test_base_fcp),
]
print("\n\n".join(
    f"{title}\nRMSE: {rmse:.4f}\nMAE: {mae:.4f}\nFCP: {fcp:.4f}"
    for title, rmse, mae, fcp in test_reports
))

Best Model - Test Set:
RMSE: 0.8177
MAE: 0.6199
FCP: 0.7182

Base Model - Test Set:
RMSE: 0.8089
MAE: 0.6119
FCP: 0.7265


5. Sample Prediction

In [15]:
# Take the user/movie pair from the first row of the ratings frame.
# NOTE(review): this row is not guaranteed to be in the held-out set B.
user_id_example = df_ratings['userId'].iat[0]
movie_id_example = df_ratings['id'].iat[0]

# `predict` returns a prediction object; `.est` holds the estimated rating.
predicted_rating = best_algo.predict(user_id_example, movie_id_example)

print(f"\nPredicted rating for User ID {user_id_example} on ID {movie_id_example}: {predicted_rating.est:.2f}")


Predicted rating for User ID 8 on ID 862: 3.25


In [16]:
# Actual rating stored in the first row, for comparison with the prediction above.
df_ratings['rating'].iat[0]

4.0