In [3]:
from datetime import datetime
from pathlib import Path

import pandas as pd
import surprise

from recommenders.datasets.python_splitters import python_random_split
from recommenders.evaluation.python_evaluation import (
    rmse,
    mae,
    rsquared,
    exp_var,
)
from recommenders.models.surprise.surprise_utils import predict

In [4]:
# Load the explicit ratings (keeping only the three columns used below) and
# the item catalogue.
ratings = pd.read_json("ratings.jsonl", lines=True)[["UserId", "ItemId", "Rating"]]
content = pd.read_json("content.jsonl", lines=True)

# Map raw ids to dense integer indexes (and back). User indexes come from the
# users seen in `ratings`; item indexes come from the items listed in `content`.
unique_users = ratings["UserId"].unique()
unique_items = content["ItemId"].unique()

user_indexes = dict(zip(unique_users, range(len(unique_users))))
item_indexes = dict(zip(unique_items, range(len(unique_items))))
reversed_user_indexes = {position: raw_id for raw_id, position in user_indexes.items()}
reversed_item_indexes = {position: raw_id for raw_id, position in item_indexes.items()}

# Attach the integer indexes to every rating row.
ratings = ratings.assign(
    UserIndex=ratings["UserId"].map(user_indexes),
    ItemIndex=ratings["ItemId"].map(item_indexes),
)

In [5]:
# Randomly split the ratings: 75% of the rows for training, the rest for testing.
train, test = python_random_split(ratings, ratio=0.75)

In [6]:
# Ratings are declared to Surprise as living on a 1-10 scale.
rating_reader = surprise.Reader(rating_scale=(1, 10))

# Wrap the (user, item, rating) columns of the training split in a Surprise
# dataset, then turn all of it into a trainset (no internal hold-out).
train_dataset = surprise.Dataset.load_from_df(
    train[["UserIndex", "ItemIndex", "Rating"]],
    reader=rating_reader,
)
train_set = train_dataset.build_full_trainset()
train_set

<surprise.trainset.Trainset at 0x104a44040>

In [7]:
# Train an SVD matrix-factorisation model on the training set.
# `random_state` pins the RNG used by the model so that a fresh
# "Restart & Run All" reproduces the same model, metrics and submission;
# without it every run produces different numbers.
svd = surprise.SVD(
    n_factors=100,
    n_epochs=10,
    lr_all=0.005,
    reg_all=0.02,
    random_state=42,
)
svd.fit(train_set)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x13f88eec0>

In [8]:
# Score every (user, item) pair of the held-out test split with the fitted model.
predictions = predict(
    svd,
    test,
    itemcol="ItemIndex",
    usercol="UserIndex",
)
predictions

Unnamed: 0,UserIndex,ItemIndex,prediction
0,24459,19299,4.460061
1,5767,23922,6.940709
2,51409,25866,6.736967
3,2693,22257,6.429496
4,19816,13500,8.249172
...,...,...,...
164925,12047,24523,6.674327
164926,13505,10096,7.266627
164927,6165,24790,7.430336
164928,16119,19161,5.203542


In [9]:
# Column names shared by every metric call below.
metric_columns = dict(col_user="UserIndex", col_item="ItemIndex", col_rating="Rating")

# Compare the true test ratings against the model's predictions.
eval_rmse = rmse(rating_true=test, rating_pred=predictions, **metric_columns)
eval_mae = mae(rating_true=test, rating_pred=predictions, **metric_columns)
eval_rsquared = rsquared(rating_true=test, rating_pred=predictions, **metric_columns)
eval_exp_var = exp_var(rating_true=test, rating_pred=predictions, **metric_columns)

# Report the metrics, one per line.
for metric_label, metric_value in (
    ("RMSE", eval_rmse),
    ("MAE", eval_mae),
    ("R^2", eval_rsquared),
    ("Explained variance", eval_exp_var),
):
    print(f"{metric_label}: {metric_value}")

RMSE: 1.500394164952067
MAE: 1.0994929532634332
R^2: 0.3473552565839232
Explained variance: 0.34744869406063583


In [10]:
# Load the (user, item) pairs to rank and translate their raw ids into the
# integer indexes built from the ratings / content data.
targets = pd.read_csv("targets.csv").assign(
    UserIndex=lambda frame: frame["UserId"].map(user_indexes),
    ItemIndex=lambda frame: frame["ItemId"].map(item_indexes),
)
targets

Unnamed: 0,UserId,ItemId,UserIndex,ItemIndex
0,0006246bee,01d2404d4c,31471,21530
1,0006246bee,03d43fdf92,31471,5449
2,0006246bee,0808a9666b,31471,24104
3,0006246bee,0a5d7dd6f6,31471,26480
4,0006246bee,0bab4a8104,31471,32061
...,...,...,...,...
616195,fffffe98d0,f6e4113a95,29367,35855
616196,fffffe98d0,f8cc22edf7,29367,17530
616197,fffffe98d0,fa71aa74e9,29367,1540
616198,fffffe98d0,fca8263961,29367,19908


In [11]:
# Score every target pair, then translate the integer indexes back to the
# original user / item ids.
predict_df = predict(
    svd,
    targets,
    usercol="UserIndex",
    itemcol="ItemIndex",
).assign(
    UserId=lambda frame: frame["UserIndex"].map(reversed_user_indexes),
    ItemId=lambda frame: frame["ItemIndex"].map(reversed_item_indexes),
)
print(predict_df.head())

   UserIndex  ItemIndex  prediction      UserId      ItemId
0      31471      21530    6.815429  0006246bee  01d2404d4c
1      31471       5449    6.803461  0006246bee  03d43fdf92
2      31471      24104    6.726115  0006246bee  0808a9666b
3      31471      26480    7.113631  0006246bee  0a5d7dd6f6
4      31471      32061    6.488101  0006246bee  0bab4a8104


In [12]:
# Rank each user's items from highest to lowest predicted rating.
predict_df = predict_df.sort_values(by=["UserId", "prediction"], ascending=[True, False])

# `to_csv` does not create missing parent directories, so make sure the output
# folder exists; otherwise this cell fails on a fresh checkout.
submissions_dir = Path("./submissions")
submissions_dir.mkdir(parents=True, exist_ok=True)

# Timestamp the file name so earlier submissions are never overwritten.
submission_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
submission_path = submissions_dir / f"surprise-svd-submission_{submission_stamp}.csv"

# Only the ranked (UserId, ItemId) pairs are written; row order carries the ranking.
predict_df[["UserId", "ItemId"]].to_csv(submission_path, index=False)