In [1]:
import os
import pandas as pd
from surprise import BaselineOnly, Dataset, Reader
from surprise import accuracy, SVD
from surprise.model_selection import KFold
from surprise.model_selection import cross_validate
from surprise.model_selection import GridSearchCV

data_path = os.path.expanduser("~/Desktop/CIL/Project/Data/train_ratings.csv")
sub_path = os.path.expanduser("~/Desktop/CIL/Project/Data/sample_submission.csv")

In [2]:
# Load Data
ratings = pd.read_csv(data_path)
ratings[["sid", "pid"]] = ratings["sid_pid"].str.split("_", expand=True)
ratings.drop(columns=["sid_pid"], inplace=True)
ratings

Unnamed: 0,rating,sid,pid
0,2,0,0
1,3,0,11
2,5,0,13
3,5,0,14
4,4,0,18
...,...,...,...
1128182,5,9999,953
1128183,5,9999,955
1128184,5,9999,970
1128185,5,9999,986


In [3]:
# Instantiate a Surprise Dataset
reader = Reader(rating_scale=(1,5))
dataset = Dataset.load_from_df(ratings[["sid", "pid", "rating"]], reader)


In [4]:

# a cross validation iterator
k_fold = KFold(n_splits=5)

# Model
model = SVD


param_grid = {"n_epochs": [10, 20, 50], "lr_all": [0.002, 0.005, 0.01], "reg_all": [0.1, 0.2, 0.4, 0.6]}
gs = GridSearchCV(model, param_grid, measures=["rmse"], cv=5, n_jobs=-1)

gs.fit(dataset)

# best RMSE score
print("best rmse score:",gs.best_score["rmse"])
print("best params:",gs.best_params["rmse"])



best rmse score: 0.8617198720389627
best params: {'n_epochs': 50, 'lr_all': 0.01, 'reg_all': 0.1}


In [5]:
# Retrain over the whole data
best_model = gs.best_estimator["rmse"]
best_model.fit(dataset.build_full_trainset())



<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f2ba139eb90>

In [10]:
# 
submission_test = pd.read_csv(sub_path)
submission_test[["sid", "pid"]] = submission_test["sid_pid"].str.split("_", expand=True)

testset = list(zip(submission_test['sid'], submission_test['pid'], submission_test['rating']))

# predict
predictions = best_model.test(testset)

# Extract the predicted ratings
submission_test['rating'] = [pred.est for pred in predictions]
submission_test.drop(columns=["sid", "pid"], inplace=True)
submission_test

Unnamed: 0,sid_pid,rating
0,0_2,3.622898
1,0_6,4.026225
2,0_10,4.166465
3,0_16,4.547399
4,0_26,3.749310
...,...,...
1128182,9999_910,4.216586
1128183,9999_934,4.261009
1128184,9999_946,4.222524
1128185,9999_969,4.515935


In [13]:
# Save
output_path = os.path.expanduser("~/Desktop/CIL/Project/Data/submission_1.csv")
submission_test.to_csv(output_path, index=False)