## Surprise library for recommender systems

http://surpriselib.com/

In [2]:
import surprise
import pandas as pd
import numpy as np

In [3]:
dat_dir = '../data/'

In [4]:
file_path = dat_dir + 'data_train.csv'
ratings_df = pd.read_csv(file_path)
ratings_df.head()

Unnamed: 0,Id,Prediction
0,r44_c1,4
1,r61_c1,3
2,r67_c1,4
3,r72_c1,3
4,r86_c1,5


In [None]:
r_c = np.array(list(map(lambda x:x.split('r_c'), ratings_df.Id)))

In [17]:
ratings_df['User'] = r_c[:,0]
ratings_df['Item'] = r_c[:,1]
ratings_df.head()

Unnamed: 0,Id,Prediction,User,Item
0,r44_c1,4,r44,c1
1,r61_c1,3,r61,c1
2,r67_c1,4,r67,c1
3,r72_c1,3,r72,c1
4,r86_c1,5,r86,c1


In [18]:
reader = surprise.Reader(rating_scale=(1, 5))
ratings = surprise.Dataset.load_from_df(ratings_df[['User', 'Item', 'Prediction']], reader)
ratings.split(n_folds=5)

In [19]:
from surprise import SVD
# lr_all -> learning_rates or gamma, reg -> regularizer term or lambda 
param_grid = {'n_epochs': [30], 'n_factors':[10, 20, 45, 75, 100, 150], 'lr_all': [0.005],
              'reg_pu': [1.0, 0.1, 0.01, 0.001, 0.0001], 'reg_qi': [1.0, 0.1, 0.01, 0.001, 0.0001]} # Add 'biased': [False]
grid_search = surprise.GridSearch(SVD, param_grid, measures=['RMSE'])
grid_search.evaluate(ratings)



Running grid search for the following parameter combinations:
{'n_epochs': 30, 'n_factors': 10, 'lr_all': 0.005, 'reg_pu': 1.0, 'reg_qi': 1.0}
{'n_epochs': 30, 'n_factors': 10, 'lr_all': 0.005, 'reg_pu': 1.0, 'reg_qi': 0.1}
{'n_epochs': 30, 'n_factors': 10, 'lr_all': 0.005, 'reg_pu': 1.0, 'reg_qi': 0.01}
{'n_epochs': 30, 'n_factors': 10, 'lr_all': 0.005, 'reg_pu': 1.0, 'reg_qi': 0.001}
{'n_epochs': 30, 'n_factors': 10, 'lr_all': 0.005, 'reg_pu': 1.0, 'reg_qi': 0.0001}
{'n_epochs': 30, 'n_factors': 10, 'lr_all': 0.005, 'reg_pu': 0.1, 'reg_qi': 1.0}
{'n_epochs': 30, 'n_factors': 10, 'lr_all': 0.005, 'reg_pu': 0.1, 'reg_qi': 0.1}
{'n_epochs': 30, 'n_factors': 10, 'lr_all': 0.005, 'reg_pu': 0.1, 'reg_qi': 0.01}
{'n_epochs': 30, 'n_factors': 10, 'lr_all': 0.005, 'reg_pu': 0.1, 'reg_qi': 0.001}
{'n_epochs': 30, 'n_factors': 10, 'lr_all': 0.005, 'reg_pu': 0.1, 'reg_qi': 0.0001}
{'n_epochs': 30, 'n_factors': 10, 'lr_all': 0.005, 'reg_pu': 0.01, 'reg_qi': 1.0}
{'n_epochs': 30, 'n_factors': 10, 

KeyboardInterrupt: 

In [20]:
results_df = pd.DataFrame.from_dict(grid_search.cv_results)
print(results_df)

Empty DataFrame
Columns: []
Index: []


In [70]:
print("Minimal RMSE: {}".format(grid_search.best_score['RMSE']))
print("Best parameters for minimal RMSE: {}".format(grid_search.best_params['RMSE']))
# Best parameters for minimal RMSE: {'n_epochs': 30, 'n_factors': 150, 'lr_all': 0.005, 'reg_pu': 1.0, 'reg_qi': 0.001}

Minimal RMSE: 0.9878971066684914
Best parameters for minimal RMSE: {'n_epochs': 30, 'n_factors': 150, 'lr_all': 0.005, 'reg_pu': 1.0, 'reg_qi': 0.001}


In [74]:
### Surprise SVD is matrix factorization with bias
algo = SVD(n_factors = 150, n_epochs = 50, lr_all = 0.005, reg_pu = 1.0, reg_qi = 0.001)
avg_rmse = 0
for trainset, testset in ratings.folds():

    # train and test algorithm.
    print("Training")
    algo.train(trainset)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error
    rmse = surprise.accuracy.rmse(predictions, verbose=True)
    avg_rmse += rmse
    
print("avgRMSE = {}".format(avg_rmse/5))

Training
RMSE: 0.9962
Training
RMSE: 0.9951
Training
RMSE: 0.9965
Training
RMSE: 0.9962
Training
RMSE: 0.9951
avgRMSE = 0.9958195855936172


Train on whole data

In [75]:
ratings = surprise.Dataset.load_from_df(ratings_df[['User', 'Item', 'Prediction']], reader)
full_ratings = ratings.build_full_trainset()
algo = SVD(n_factors = 150, n_epochs = 50, lr_all = 0.005, reg_pu = 1.0, reg_qi = 0.001)
algo.train(trainset)

In [76]:
from pre_post_process import load_data, create_csv_submission

sample_ids, _ = load_data('sample_submission.csv')
target = []
for id_ in sample_ids:
    target.append((id_[0], id_[1], 0))

number of items: 1000, number of users: 10000


In [77]:
preds = algo.test(target)
preds

[Prediction(uid=37, iid=1, r_ui=0, est=3.303380133487924, details={'was_impossible': False}),
 Prediction(uid=73, iid=1, r_ui=0, est=3.1278513107130124, details={'was_impossible': False}),
 Prediction(uid=156, iid=1, r_ui=0, est=3.641172521912976, details={'was_impossible': False}),
 Prediction(uid=160, iid=1, r_ui=0, est=3.4410790947212728, details={'was_impossible': False}),
 Prediction(uid=248, iid=1, r_ui=0, est=3.3070704704179947, details={'was_impossible': False}),
 Prediction(uid=256, iid=1, r_ui=0, est=3.4933755839411487, details={'was_impossible': False}),
 Prediction(uid=284, iid=1, r_ui=0, est=3.2809821490171838, details={'was_impossible': False}),
 Prediction(uid=400, iid=1, r_ui=0, est=3.3396604402909498, details={'was_impossible': False}),
 Prediction(uid=416, iid=1, r_ui=0, est=3.6036486325993096, details={'was_impossible': False}),
 Prediction(uid=456, iid=1, r_ui=0, est=3.3991217106593457, details={'was_impossible': False}),
 Prediction(uid=474, iid=1, r_ui=0, est=2.65

### Submission

In [78]:
def getWantedPredictions(preds):
    """Computes array of the wanted predictions given the list of ids of the form [user_id, movie_id]"""
    wanted_predictions = []
    for pred in preds:
        wanted_predictions.append(pred.est)
    return wanted_predictions

In [79]:
predictions = np.array(getWantedPredictions(preds))
predictions[ np.where(predictions > 5.0 ) ] = 5.0
predictions[ np.where(predictions < 1.0)] = 1.0
print("Minimum prediction: {}, Maximum prediction: {}".format(np.min(predictions), np.max(predictions)))

Minimum prediction: 1.0, Maximum prediction: 5.0


In [80]:
create_csv_submission(sample_ids, predictions, "Surprise/SGD_surprise_all_train_best_param.csv")