## Surprise library for recommender systems

http://surpriselib.com/

In [8]:
import surprise
import pandas as pd
import numpy as np
import re
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
dat_dir = '../data/'

In [4]:
file_path = dat_dir + 'data_train.csv'
ratings_df = pd.read_csv(file_path)
ratings_df.head()

Unnamed: 0,Id,Prediction
0,r44_c1,4
1,r61_c1,3
2,r67_c1,4
3,r72_c1,3
4,r86_c1,5


In [23]:
r_c = np.array(list(map(lambda x:re.split("[r_c]", x), ratings_df.Id)))

In [25]:
ratings_df['User'] = r_c[:,1]
ratings_df['Item'] = r_c[:,3]
ratings_df.head()

Unnamed: 0,Id,Prediction,User,Item
0,r44_c1,4,44,1
1,r61_c1,3,61,1
2,r67_c1,4,67,1
3,r72_c1,3,72,1
4,r86_c1,5,86,1


In [26]:
from surprise import SVD
from surprise.model_selection import split
reader = surprise.Reader(rating_scale=(1, 5))
ratings = surprise.Dataset.load_from_df(ratings_df[['User', 'Item', 'Prediction']], reader)
# ratings.split(n_folds=5)

In [32]:
from surprise import SVD
# lr_all -> learning_rates or gamma, reg -> regularizer term or lambda 
param_grid = {'n_epochs': [30], 'n_factors':[10, 20, 45, 75, 100, 150], 'lr_all': [0.005],
              'reg_pu': [1.0, 0.1, 0.01, 0.001, 0.0001], 'reg_qi': [1.0, 0.1, 0.01, 0.001, 0.0001]} # Add 'biased': [False]
grid_search = surprise.GridSearch(SVD, param_grid, measures=['RMSE'])
grid_search.evaluate(ratings)


from surprise.model_selection import GridSearchCV

# Use movielens-100K
data = ratings

param_grid = {'n_epochs': [30], 'n_factors':[10, 20, 45, 75, 100, 150], 'lr_all': [0.005],
              'reg_pu': [1.0, 0.1, 0.01, 0.001, 0.0001], 'reg_qi': [1.0, 0.1, 0.01, 0.001, 0.0001]} # Add 'biased': [False]

gs = GridSearchCV(SVD, data, measures=['rmse', ], cv=3)

gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])



AttributeError: 'GridSearch' object has no attribute 'cross_validate'

In [28]:
results_df = pd.DataFrame.from_dict(grid_search.cv_results)
print(results_df)

Empty DataFrame
Columns: []
Index: []


In [70]:
print("Minimal RMSE: {}".format(grid_search.best_score['RMSE']))
print("Best parameters for minimal RMSE: {}".format(grid_search.best_params['RMSE']))
# Best parameters for minimal RMSE: {'n_epochs': 30, 'n_factors': 150, 'lr_all': 0.005, 'reg_pu': 1.0, 'reg_qi': 0.001}

Minimal RMSE: 0.9878971066684914
Best parameters for minimal RMSE: {'n_epochs': 30, 'n_factors': 150, 'lr_all': 0.005, 'reg_pu': 1.0, 'reg_qi': 0.001}


In [51]:
### Surprise SVD is matrix factorization with bias
algo = SVD(n_factors = 100, n_epochs = 50, lr_all = 0.005, reg_pu = 1, reg_qi = 0.001)
avg_rmse = 0
ratings.split(n_folds=3)
for trainset, testset in ratings.folds():

    # train and test algorithm.
    print("Training")
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error
    rmse = surprise.accuracy.rmse(predictions, verbose=True)
    avg_rmse += rmse
    
print("avgRMSE = {}".format(avg_rmse/3))



Training
RMSE: 1.0763
Training
RMSE: 1.0740
Training
RMSE: 1.0755
avgRMSE = 0.6451634838798868


Train on whole data

In [45]:
ratings = surprise.Dataset.load_from_df(ratings_df[['User', 'Item', 'Prediction']], reader)
full_ratings = ratings.build_full_trainset()
algo = SVD(n_factors = 100, n_epochs = 50, lr_all = 0.025, reg_pu = 0.1, reg_qi = 0.01)
algo.fit(full_ratings)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x13a329f98>

In [46]:
algo

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x13a329f98>

In [57]:
from pre_post_process import load_data, create_csv_submission

sample_ids, _ = load_data('../data/sample_submission.csv')
target = []
for id_ in sample_ids:
    target.append((id_[0], id_[1],0))

number of items: 1000, number of users: 10000


In [64]:
sample_ids[:,0]

array([  37,   73,  156, ..., 9978, 9982, 9996])

In [71]:
preds =algo.predict(sample_ids[100,0],sample_ids[100,1],verbose=True)
preds

user: 3303       item: 1          r_ui = None   est = 3.86   {'was_impossible': False}


Prediction(uid=3303, iid=1, r_ui=None, est=3.8565562331529946, details={'was_impossible': False})

In [58]:
preds = algo.test(target)
preds

[Prediction(uid=37, iid=1, r_ui=0, est=3.8565562331529946, details={'was_impossible': False}),
 Prediction(uid=73, iid=1, r_ui=0, est=3.8565562331529946, details={'was_impossible': False}),
 Prediction(uid=156, iid=1, r_ui=0, est=3.8565562331529946, details={'was_impossible': False}),
 Prediction(uid=160, iid=1, r_ui=0, est=3.8565562331529946, details={'was_impossible': False}),
 Prediction(uid=248, iid=1, r_ui=0, est=3.8565562331529946, details={'was_impossible': False}),
 Prediction(uid=256, iid=1, r_ui=0, est=3.8565562331529946, details={'was_impossible': False}),
 Prediction(uid=284, iid=1, r_ui=0, est=3.8565562331529946, details={'was_impossible': False}),
 Prediction(uid=400, iid=1, r_ui=0, est=3.8565562331529946, details={'was_impossible': False}),
 Prediction(uid=416, iid=1, r_ui=0, est=3.8565562331529946, details={'was_impossible': False}),
 Prediction(uid=456, iid=1, r_ui=0, est=3.8565562331529946, details={'was_impossible': False}),
 Prediction(uid=474, iid=1, r_ui=0, est=3.

### Submission

In [37]:
def getWantedPredictions(preds):
    """Computes array of the wanted predictions given the list of ids of the form [user_id, movie_id]"""
    wanted_predictions = []
    for pred in preds:
        wanted_predictions.append(pred.est)
    return wanted_predictions

In [38]:
predictions = np.array(getWantedPredictions(preds))
predictions[ np.where(predictions > 5.0 ) ] = 5.0
predictions[ np.where(predictions < 1.0)] = 1.0
print("Minimum prediction: {}, Maximum prediction: {}".format(np.min(predictions), np.max(predictions)))

Minimum prediction: 3.8574719455543023, Maximum prediction: 3.8574719455543023


In [80]:
create_csv_submission(sample_ids, predictions, "Surprise/SGD_surprise_all_train_best_param.csv")

In [39]:
predictions

array([3.85747195, 3.85747195, 3.85747195, ..., 3.85747195, 3.85747195,
       3.85747195])