In [1]:
# Print the kernel's working directory; the relative "../../" paths used by
# later cells are resolved against it.
!pwd

/Users/ashish1610dhiman/data_projects/fall22_hw/RoboChef/notebooks/recommendation


In [2]:
import pandas as pd
from surprise import Dataset
from surprise import Reader

from surprise import accuracy, Dataset, SVD,SVDpp, NMF 
from surprise.model_selection import train_test_split as train_test_split_surprise
from surprise.model_selection import cross_validate, GridSearchCV
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

import numpy as np


In [3]:
# Split proportions (presumably train+validation vs. test).
# NOTE(review): neither constant is referenced in the cells visible here, and
# the train/test CSVs are loaded already split — confirm they are still needed.
TRAIN__Validation_SIZE = 0.8
TEST_SIZE = 0.2

### Read the train and test sets

In [4]:
# Load the pre-split interaction tables and display their (rows, columns) shapes.
train_validation = pd.read_csv("../../data/recommendation/ad_interaction_train.csv")
test_set = pd.read_csv("../../data/recommendation/ad_interaction_test.csv")
train_validation.shape,test_set.shape

((887483, 5), (244884, 5))

### Cross-validate multiple model types

In [5]:
# Wrap the train+validation frame in a Surprise Dataset.
# The reader declares ratings on a 1-5 scale.
# NOTE(review): confirm no rating falls outside 1-5 (e.g. 0 for "not rated").
reader = Reader(rating_scale=(1, 5))
rating_columns = ["user_id", "recipe_id", "rating"]
cv_data = Dataset.load_from_df(train_validation[rating_columns], reader)

In [6]:
# Hyper-parameter grid searched for SVD (2 values per parameter -> 16 combinations).
param_grid = {
    "n_factors": [5, 25],
    "n_epochs": [20, 250],
    "lr_all": [0.001, 0.006],
    "reg_all": [0.01, 0.08],
}

# Hyper-parameter grid searched for NMF (2 values per parameter -> 16 combinations).
param_grid2 = {
    "n_factors": [5, 25],
    "n_epochs": [2, 20],
    "reg_pu": [0.01, 0.1],
    "reg_qi": [0.01, 0.1],
}

#### SVD, SVDpp, NMF

In [7]:
# Both searches share the same settings: RMSE and MAE as measures, 5-fold CV,
# n_jobs=-2 for parallel fits, and joblib progress logging.
# NOTE(review): no random_state is passed anywhere, so results may vary
# between runs — confirm whether reproducibility matters here.
search_kwargs = dict(measures=["rmse", "mae"], cv=5, n_jobs=-2, joblib_verbose=3)

gs_svd = GridSearchCV(SVD, param_grid, **search_kwargs)
# SVDpp search is currently disabled:
# gs_svdpp = GridSearchCV(SVDpp, param_grid, **search_kwargs)
gs_nmf = GridSearchCV(NMF, param_grid2, **search_kwargs)

In [8]:
# Run the NMF grid search over param_grid2 with 5-fold cross-validation.
gs_nmf.fit(cv_data)

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done  18 tasks      | elapsed:   28.8s
[Parallel(n_jobs=-2)]: Done  80 out of  80 | elapsed:  3.6min finished


In [9]:
# Run the SVD grid search over param_grid with 5-fold cross-validation.
gs_svd.fit(cv_data)

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done  18 tasks      | elapsed:   34.3s
[Parallel(n_jobs=-2)]: Done  80 out of  80 | elapsed:  8.0min finished


In [None]:
# gs_svdpp.fit(cv_data)

#### Compare results

In [10]:
# Best cross-validated RMSE reached by any NMF configuration in the grid.
gs_nmf.best_score["rmse"]

1.293538122735244

In [11]:
# Collect the best [RMSE, MAE] found by each grid search, keyed by algorithm name.
gs_result = {}
searches = {"svd": gs_svd, "nmf": gs_nmf}
for algo, gs in searches.items():
    gs_result[algo] = [gs.best_score[metric] for metric in ("rmse", "mae")]

In [12]:
# Tabulate the best scores: one column per algorithm, one row per metric
# (row order matches the [rmse, mae] order of the lists in gs_result).
gs_result_df = pd.DataFrame(gs_result, index=["rmse", "mae"])
gs_result_df

Unnamed: 0,svd,nmf
rmse,1.210064,1.293538
mae,0.731275,0.649197


In [13]:
# Hyper-parameters of the SVD configuration with the best RMSE.
gs_svd.best_params["rmse"]

{'n_factors': 5, 'n_epochs': 20, 'lr_all': 0.006, 'reg_all': 0.08}

In [14]:
# Hyper-parameters of the NMF configuration with the best MAE
# (note: selected by MAE here, whereas SVD is inspected by RMSE).
gs_nmf.best_params["mae"]

{'n_factors': 25, 'n_epochs': 2, 'reg_pu': 0.01, 'reg_qi': 0.01}

#### Fit the best models on the full train+validation set

In [15]:
# Fit the selected configurations on every rating in the train+validation data.
# The full trainset is built once and shared by both models instead of being
# rebuilt from the same data for each fit.
full_trainset = cv_data.build_full_trainset()

# SVD: the configuration that scored best on RMSE in the grid search.
model1 = gs_svd.best_estimator["rmse"]
model1.fit(full_trainset)

# NMF: the configuration that scored best on MAE in the grid search.
# NOTE(review): the two models are selected on different metrics — confirm
# this is intentional.
model2 = gs_nmf.best_estimator["mae"]
model2.fit(full_trainset)

<surprise.prediction_algorithms.matrix_factorization.NMF at 0x1a58b7250>

#### Write models to pickle

In [16]:
# Add the directory two levels above the working directory to sys.path so the
# project-local `src` package can be imported by the next cell.
import sys
from os.path import dirname
sys.path.append(dirname("../../"))

In [17]:
# NOTE(review): wildcard import — only `write_pickle` (presumably provided by
# src.utilities) is used in the visible cells; consider importing it explicitly.
from src.utilities import *

# Persist both fitted models under ../../models/.
write_pickle(model1, "../../models/reccomender_model1_svd.pkl")
write_pickle(model2, "../../models/reccomender_model2_nmf.pkl")

### Evaluate Rating Prediction

In [18]:
# Convert the held-out test frame into the list of (user, item, rating) tuples
# that `algo.test()` expects.
# Building a full trainset and calling build_testset() keeps every row in a
# deterministic order. The previous train/test split with test_size=1.0
# shuffled the rows with an unseeded RNG only to discard an empty train part.
test_set_surprise = Dataset.load_from_df(test_set[["user_id", "recipe_id", "rating"]], reader)
test_set_surprise = test_set_surprise.build_full_trainset().build_testset()

In [19]:
# Score the SVD model on the full test set; the cell displays (RMSE, MAE).
predictions1 = model1.test(test_set_surprise)
tuple(metric(predictions1, verbose=False) for metric in (accuracy.rmse, accuracy.mae))

(1.2398769202951172, 0.7595275258894878)

In [20]:
# Score the NMF model on the full test set; the cell displays (RMSE, MAE).
predictions2 = model2.test(test_set_surprise)
tuple(metric(predictions2, verbose=False) for metric in (accuracy.rmse, accuracy.mae))

(1.3545639517394856, 0.6612653662869259)

### Evaluate rating prediction on users with at least 5 interactions

In [121]:
# Load the raw interactions file and display its (rows, columns) shape.
raw_interactions = pd.read_csv("../../data/recommendation/RAW_interactions.csv")
raw_interactions.shape

(1132367, 5)

In [122]:
# Number of interactions per user (indexed by user_id).
# NOTE: despite the "_recipes" suffix in the name, the counts are per user.
interaction_cnts_recipes = raw_interactions["user_id"].value_counts()

In [123]:
# Boolean mask over users: True where the user has at least 5 interactions.
# The cell displays how many users pass the threshold.
mask_atleastn_users = interaction_cnts_recipes.ge(5)
mask_atleastn_users.sum()

23086

In [124]:
# Keep only the interactions of users that pass the minimum-interaction mask.
# The qualifying user ids are read straight from the boolean mask's index.
# The earlier version wrapped the mask in a DataFrame and looked up a column
# named "user_id", which depends on value_counts() naming its result after the
# source column; pandas >= 2.0 names it "count", so that lookup raises KeyError.
active_user_ids = mask_atleastn_users[mask_atleastn_users].index
mask1 = raw_interactions["user_id"].isin(active_user_ids)

raw_subset = raw_interactions[mask1]

In [125]:
# Number of distinct users and recipes remaining after the user filter.
raw_subset[["user_id","recipe_id"]].nunique()

user_id       23086
recipe_id    211039
dtype: int64

In [126]:
# (rows, columns) of the filtered interactions.
raw_subset.shape

(872021, 5)

In [127]:
# Sanity check: share of remaining users with at least 5 interactions
# (expected to be 1.0 after the filter).
raw_subset["user_id"].value_counts().ge(5).mean()

1.0

In [128]:
# Share of remaining recipes that have at least 5 interactions
# (recipes were not filtered, so this can be well below 1.0).
raw_subset["recipe_id"].value_counts().ge(5).mean()

0.20148408587986108

In [129]:
# Save the filtered interactions without the index column.
# NOTE(review): this writes directly under ../../data/, whereas the inputs are
# read from ../../data/recommendation/ — confirm the location is intended.
raw_subset.to_csv("../../data/reccomendation_subset_data.csv",index=False)

In [134]:
# Restrict the test set to users that survived the interaction-count filter.
kept_user_ids = set(raw_subset["user_id"])
is_kept_user = test_set["user_id"].isin(kept_user_ids)
test_subset = test_set[is_kept_user]

In [135]:
# Convert the filtered test frame into the list of (user, item, rating) tuples
# that `algo.test()` expects.
# Building a full trainset and calling build_testset() keeps every row in a
# deterministic order. The previous train/test split with test_size=1.0
# shuffled the rows with an unseeded RNG only to discard an empty train part.
test_set_surprise_subset = Dataset.load_from_df(test_subset[["user_id", "recipe_id", "rating"]], reader)
test_set_surprise_subset = test_set_surprise_subset.build_full_trainset().build_testset()

In [136]:
# Score the SVD model on the filtered test subset; the cell displays (RMSE, MAE).
predictions1_sub = model1.test(test_set_surprise_subset)
tuple(metric(predictions1_sub, verbose=False) for metric in (accuracy.rmse, accuracy.mae))

(0.9433350451921445, 0.576491187436348)

In [137]:
# Score the NMF model on the filtered test subset; the cell displays (RMSE, MAE).
predictions2_sub = model2.test(test_set_surprise_subset)
tuple(metric(predictions2_sub, verbose=False) for metric in (accuracy.rmse, accuracy.mae))

(1.0657827857655318, 0.46979172043686335)