In [None]:
import numpy as np
import pandas as pd
from surprise import KNNBaseline, accuracy
from surprise import BaselineOnly
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate
from sklearn.compose import ColumnTransformer
from surprise.model_selection.validation import cross_validate
from surprise.model_selection.split import KFold
from surprise.model_selection import GridSearchCV
from surprise import Reader, Dataset
from sklearn.metrics import mean_squared_error, mean_absolute_error


In [70]:
# Load the two input tables from the data directory (paths are relative to the notebook).
# df_ratings is the rating table used by every cell below; df_users is only loaded here.
ratings_csv, users_csv = "../data/total_df.csv", "../data/users_fixed.csv"
df_ratings = pd.read_csv(ratings_csv)
df_users = pd.read_csv(users_csv)

In [72]:
# Keep only nootropics that have enough ratings to be modelled: items with
# MIN_RATINGS_PER_ITEM ratings or fewer are dropped from df_ratings.
MIN_RATINGS_PER_ITEM = 40

# Count every item once instead of re-scanning the whole frame per item.
item_counts = df_ratings["itemID"].value_counts()
avalaible_nootropics = [
    nootropic
    for nootropic in np.unique(df_ratings["itemID"])
    if item_counts.get(nootropic, 0) > MIN_RATINGS_PER_ITEM
]

# .copy() detaches the filtered frame from the original so that later in-place
# rating edits do not write to a view (avoids SettingWithCopyWarning).
df_ratings = df_ratings[df_ratings["itemID"].isin(avalaible_nootropics)].copy()


In [52]:
def f(row):
    """Map a rating row to the rating value used for training.

    A rating of 0 is replaced by a negative penalty: -3 when the row reports
    an issue (anything other than "None / Unsure"), -1 otherwise.
    Any non-zero rating is returned unchanged.

    Parameters
    ----------
    row : mapping-like with "issue" and "rating" entries (e.g. a DataFrame row).

    Returns
    -------
    The original rating, or -3 / -1 for zero ratings.
    """
    reported_issue = row["issue"] != "None / Unsure"
    if row["rating"] != 0:
        return row["rating"]
    return -3 if reported_issue else -1

def f2(row):
    """Simpler variant of the rating mapping: every 0 rating becomes -1.

    Unlike ``f`` this ignores the "issue" entry; non-zero ratings are
    returned unchanged.
    """
    rating = row["rating"]
    return -1 if rating == 0 else rating
    
        
# Re-label the ratings with f: zero ratings become -3 (issue reported) or -1.
# apply() evaluates f once per row and assign() returns a new frame, so there
# are no per-row .loc writes (slow, ambiguous when index labels repeat, and a
# source of SettingWithCopyWarning when df_ratings is a filtered slice).
# The cell is idempotent: after one pass no rating equals 0 any more.
df_ratings = df_ratings.assign(rating=df_ratings.apply(f, axis=1))
    


In [76]:
# Model evaluated in the cross-validation cells below: KNN with baseline
# ratings, using MSD similarities computed between items (user_based=False)
# and baselines fitted by SGD.
final_model = KNNBaseline(
    k=100,
    min_k=5,
    sim_options={"name": "msd", "user_based": False},
    bsl_options={"method": "sgd", "n_epochs": 100, "reg": 0.002},
    verbose=False,
)
# Alternative kept for comparison (baseline estimates only, no neighbours):
# final_model = BaselineOnly(bsl_options={"method": "sgd"})


In [77]:
# Ratings are declared to Surprise as living on a 0-10 scale.
# NOTE(review): f() above can relabel zero ratings as -3 / -1, which fall
# outside this scale — confirm that keeping (0, 10) is intentional.
RATING_SCALE = (0, 10)
reader = Reader(rating_scale=RATING_SCALE)

# load_from_df expects the columns to be user id, item id and rating, in that
# order. NOTE(review): df_ratings is passed whole — verify it holds exactly
# those three columns at this point.
new_trainset = Dataset.load_from_df(df_ratings, reader)

In [101]:
# Search space for a baseline-only model: baseline solver, number of epochs,
# regularisation and learning rate.
# NOTE(review): params_dic is not passed to any search in the visible cells.
params_dic = {
    "verbose": [False],
    "bsl_options": {
        "method": ["als", "sgd"],
        "n_epochs": [10, 20, 50, 100],
        "reg": [0, 0.02, 0.1],
        "learning_rate": [0.001, 0.005, 0.01],
    },
}

# Search space for KNNBaseline: neighbourhood size is fixed (k=100, min_k=5);
# the search varies the similarity measure, user- vs item-based neighbours and
# the SGD baseline regularisation.
knn_params_dic = {
    "verbose": [False],
    "k": [100],
    "min_k": [5],
    "sim_options": {
        "name": ["msd", "pearson_baseline"],
        "user_based": [False, True],
    },
    "bsl_options": {
        "method": ["sgd"],
        "n_epochs": [100],
        "reg": [0, 0.02, 0.1],
    },
}

In [102]:
# Grid search over the KNNBaseline search space, scored with RMSE, MAE and FCP
# using 3-fold cross-validation.
grid = GridSearchCV(
    KNNBaseline,
    knn_params_dic,
    measures=["rmse", "mae", "fcp"],
    cv=3,
)

In [103]:
# Run the search on the full rating dataset: each parameter combination in the
# grid is cross-validated (cv=3 was set when the search object was built).
grid.fit(new_trainset)

In [104]:
# Best parameter combination found for each requested measure (rmse, mae, fcp).
grid.best_params

{'rmse': {'verbose': False,
  'k': 100,
  'min_k': 5,
  'sim_options': {'name': 'pearson_baseline', 'user_based': True},
  'bsl_options': {'method': 'sgd', 'n_epochs': 100, 'reg': 0.1}},
 'mae': {'verbose': False,
  'k': 100,
  'min_k': 5,
  'sim_options': {'name': 'msd', 'user_based': False},
  'bsl_options': {'method': 'sgd', 'n_epochs': 100, 'reg': 0}},
 'fcp': {'verbose': False,
  'k': 100,
  'min_k': 5,
  'sim_options': {'name': 'pearson_baseline', 'user_based': True},
  'bsl_options': {'method': 'sgd', 'n_epochs': 100, 'reg': 0.02}}}

In [100]:
# Best cross-validated score reached for each requested measure.
# NOTE(review): this cell's saved execution count (100) is lower than the
# grid.fit cell's (103), so the stored output may come from an earlier search;
# re-run after fitting to refresh it.
grid.best_score

{'rmse': 2.455543943929426,
 'mae': 1.9153149644467409,
 'fcp': 0.6049430593763105}

In [75]:
# Cross-validate final_model with Surprise and report the mean of each test
# measure across folds, followed by the raw per-fold results.
res = cross_validate(final_model, new_trainset, measures=["rmse", "mae", "fcp"])
for result_key in ("test_rmse", "test_mae", "test_fcp"):
    print(np.mean(res[result_key]))
print(res)

2.4419146210751093
1.8992345085265119
0.5840285915133409
{'test_rmse': array([2.4576857 , 2.50718708, 2.40659158, 2.4652947 , 2.37281405]), 'test_mae': array([1.91544607, 1.94648298, 1.87868584, 1.91613483, 1.83942282]), 'test_fcp': array([0.58801761, 0.58818417, 0.60130303, 0.57234044, 0.5702977 ]), 'fit_time': (0.15184688568115234, 0.14649391174316406, 0.1557629108428955, 0.2044532299041748, 0.14846014976501465), 'test_time': (0.10621023178100586, 0.11166524887084961, 0.1110830307006836, 0.11615633964538574, 0.1062779426574707)}


In [78]:
# Second cross-validation run of final_model (same call as the cell above):
# prints the fold-averaged RMSE, MAE and FCP, then the raw per-fold results.
res = cross_validate(final_model, new_trainset, measures=["rmse", "mae", "fcp"])
for result_key in ("test_rmse", "test_mae", "test_fcp"):
    print(np.mean(res[result_key]))
print(res)

2.445738454728633
1.9010146151319982
0.5861893264896646
{'test_rmse': array([2.40247625, 2.4640791 , 2.45771417, 2.45001427, 2.45440848]), 'test_mae': array([1.85498009, 1.91510007, 1.91765175, 1.90554702, 1.91179415]), 'test_fcp': array([0.60360608, 0.57632398, 0.56974056, 0.59788485, 0.58339116]), 'fit_time': (0.15703082084655762, 0.1478278636932373, 0.14669013023376465, 0.14349007606506348, 0.14591693878173828), 'test_time': (0.10558724403381348, 0.10519623756408691, 0.1049349308013916, 0.11634111404418945, 0.10721778869628906)}


In [55]:
# Manual cross-validation of final_model where both the true ratings and the
# predictions are floored at 0 before scoring, so values at or below zero all
# count as 0.
cv = KFold()
rmse = []
mae = []
for train, test in cv.split(new_trainset):
    final_model.fit(train)
    y = []
    y_pred = []
    # Each test entry is indexed as (user id, item id, true rating).
    for entry in test:
        # Predict once per entry (the estimate was previously computed twice).
        est = final_model.predict(uid=entry[0], iid=entry[1]).est
        true_rating = entry[2]
        y.append(0 if true_rating <= 0 else true_rating)
        y_pred.append(0 if est <= 0 else est)
    print("########")  # one marker per completed fold
    # RMSE as sqrt(MSE): the `squared=False` argument has been deprecated and
    # then removed in newer scikit-learn releases, so it is avoided here.
    rmse.append(np.sqrt(mean_squared_error(y, y_pred)))
    mae.append(mean_absolute_error(y, y_pred))
print(np.mean(rmse))
print(np.mean(mae))

########
########
########
########
########
2.516999638562034
1.9078746384083956


In [49]:
# Manual cross-validation of final_model where both the true ratings and the
# predictions are floored at 0 before scoring, so values at or below zero all
# count as 0.
cv = KFold()
rmse = []
mae = []
for train, test in cv.split(new_trainset):
    final_model.fit(train)
    y = []
    y_pred = []
    # Each test entry is indexed as (user id, item id, true rating).
    for entry in test:
        # Predict once per entry (the estimate was previously computed twice).
        est = final_model.predict(uid=entry[0], iid=entry[1]).est
        true_rating = entry[2]
        y.append(0 if true_rating <= 0 else true_rating)
        y_pred.append(0 if est <= 0 else est)
    print("########")  # one marker per completed fold
    # RMSE as sqrt(MSE): the `squared=False` argument has been deprecated and
    # then removed in newer scikit-learn releases, so it is avoided here.
    rmse.append(np.sqrt(mean_squared_error(y, y_pred)))
    mae.append(mean_absolute_error(y, y_pred))
print(np.mean(rmse))
print(np.mean(mae))


########
########
########
########
########
2.503880337278288
1.9019261251306006


TypeError: 'KFold' object is not iterable