In [23]:
from pyspark.sql import SparkSession
from surprise import Reader
from surprise import Dataset

In [24]:
# Create (or reuse) the Spark session used to read the JSON inputs below.
spark = (
    SparkSession.builder
    .appName("Best Model for Rec")
    .getOrCreate()
)

In [25]:
# Input files: product records and the ratings that reference them.
product_json_path = 'data/product.json'
rating_json_path = 'data/rating.json'

df_product = spark.read.json(product_json_path)
product_rating = spark.read.json(rating_json_path)

In [26]:
# Inner join on productId: only ratings whose product exists in df_product are kept.
merged_df = product_rating.join(df_product, on='productId', how='inner')

In [27]:
# 70/30 train/test split with a fixed seed (96).
df_train, df_test = merged_df.randomSplit(weights=[0.7, 0.3], seed=96)

# Surprise works on pandas DataFrames, so convert both Spark splits.
df_train_pandas, df_test_pandas = df_train.toPandas(), df_test.toPandas()

In [28]:
# Ratings are declared to Surprise as lying on a 1-to-5 scale.
rating_bounds = (1, 5)
reader = Reader(rating_scale=rating_bounds)

In [29]:
# Columns handed to Surprise, in (user, item, rating) order.
rating_columns = ['userId', 'productId', 'rating']

data_train = Dataset.load_from_df(df_train_pandas[rating_columns], reader)
data_test = Dataset.load_from_df(df_test_pandas[rating_columns], reader)

In [30]:
# Full trainset built from the training split.
trainset = data_train.build_full_trainset()

# The held-out split is first wrapped as a trainset, then turned into the
# testset that the algorithms' test() method is called with below.
heldout_as_trainset = data_test.build_full_trainset()
testset = heldout_as_trainset.build_testset()

In [31]:
from surprise.dump import load, dump

In [32]:
# Load the saved KNN model; element [1] of the loaded dump is the algorithm.
# NOTE(review): surprise.dump.load is pickle-based - only load trusted files.
dump_path = 'model/knn_model.pkl'
knn_dump = load(dump_path)
knn_algo = knn_dump[1]

In [33]:
# Load the saved SVD model; element [1] of the loaded dump is the algorithm.
# NOTE(review): surprise.dump.load is pickle-based - only load trusted files.
dump_path = 'model/svd_model.pkl'
svd_dump = load(dump_path)
svd_algo = svd_dump[1]

In [34]:
# Load the saved SVD++ model; element [1] of the loaded dump is the algorithm.
# NOTE(review): surprise.dump.load is pickle-based - only load trusted files.
dump_path = 'model/svdpp_model.pkl'
svdpp_dump = load(dump_path)
svdpp_algo = svdpp_dump[1]

In [35]:
# Load the saved NMF model; element [1] of the loaded dump is the algorithm.
# NOTE(review): surprise.dump.load is pickle-based - only load trusted files.
dump_path = 'model/nmf_model.pkl'
nmf_dump = load(dump_path)
nmf_algo = nmf_dump[1]

In [36]:
# Load the saved clustering model; element [1] of the loaded dump is the algorithm.
# NOTE(review): surprise.dump.load is pickle-based - only load trusted files.
dump_path = 'model/clustering_model.pkl'
clustering_dump = load(dump_path)
clustering_algo = clustering_dump[1]

In [37]:
# Score the held-out testset with the KNN model (no per-prediction printing).
knn_predictions = knn_algo.test(testset, verbose=False)

In [38]:
# Score the held-out testset with the SVD model (no per-prediction printing).
svd_predictions = svd_algo.test(testset, verbose=False)

In [39]:
# Score the held-out testset with the SVD++ model (no per-prediction printing).
svdpp_predictions = svdpp_algo.test(testset, verbose=False)

In [40]:
# Score the held-out testset with the NMF model (no per-prediction printing).
nmf_predictions = nmf_algo.test(testset, verbose=False)

In [41]:
# Score the held-out testset with the clustering model (no per-prediction printing).
clustering_predictions = clustering_algo.test(testset, verbose=False)

In [42]:
from surprise import accuracy

In [43]:
# Root Mean Squared Error of each model on the held-out testset.
# Each accuracy.rmse call prints its own "RMSE: ..." line under the label
# printed just before it, and returns the value kept in the *_rmse variable.
print('KNN')
knn_rmse = accuracy.rmse(knn_predictions, verbose=True)

print('SVD')
svd_rmse = accuracy.rmse(svd_predictions, verbose=True)

print('SVDpp')
svdpp_rmse = accuracy.rmse(svdpp_predictions, verbose=True)

print('NMF')
nmf_rmse = accuracy.rmse(nmf_predictions, verbose=True)

print('Clustering')
clustering_rmse = accuracy.rmse(clustering_predictions, verbose=True)

KNN
RMSE: 1.4357
SVD
RMSE: 1.4203
SVDpp
RMSE: 1.4207
NMF
RMSE: 1.4613
Clustering
RMSE: 1.4420


In [44]:
# Persist the best-scoring model so it can be re-loaded with
# surprise.dump.load(path)[1], the same way the candidate models are loaded above.
#
# Two fixes over the previous version of this cell:
#   * the algorithm object is dumped, not the list of predictions (passing
#     predictions as `algo=` made the saved "model" unusable after loading);
#   * the winner is chosen from the measured RMSE values (lower is better)
#     instead of being hard-coded.
dump_path_save = 'model/best_model.pkl'

# (rmse, algorithm) for every candidate evaluated above, keyed by display name.
rmse_and_algo_by_name = {
    'KNN': (knn_rmse, knn_algo),
    'SVD': (svd_rmse, svd_algo),
    'SVDpp': (svdpp_rmse, svdpp_algo),
    'NMF': (nmf_rmse, nmf_algo),
    'Clustering': (clustering_rmse, clustering_algo),
}

best_name = min(rmse_and_algo_by_name, key=lambda name: rmse_and_algo_by_name[name][0])
best_rmse, best_algo = rmse_and_algo_by_name[best_name]
print('Best model: {} (RMSE: {:.4f})'.format(best_name, best_rmse))

dump(dump_path_save, algo=best_algo)