# Recommender System with Surprise



In [None]:
pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit-surprise-1.1.1.tar.gz (11.8 MB)
[K     |████████████████████████████████| 11.8 MB 6.3 MB/s 
Building wheels for collected packages: scikit-surprise


### import necessary packages
### We will use the surprise package to implement our recommendation system

In [None]:
import surprise
import numpy as np
import pandas as pd

### The dataset can be downloaded from <br>
https://www.librec.net/datasets.html

In [None]:
# Space-delimited ratings file; the column names are supplied explicitly,
# so the first line of the file is read as data rather than as a header.
dataset = pd.read_csv(
    'ratings.txt',
    sep=' ',
    names=['user_id', 'item_id', 'rating'],
)

### Let's take a look at the dataset 

In [None]:
dataset.head()

In [None]:
dataset.info()

In [None]:
# Distinct user ids are counted by collapsing the column into a set.
f"There are total {len(set(dataset['user_id']))} users"

In [None]:
# Distinct item ids are counted by collapsing the column into a set.
f"There are total {len(set(dataset['item_id']))} films"

In [None]:
dataset['rating'].describe()

### Load the dataset into the data structure used by the surprise package
There are multiple ways to load your dataset.<br>
See the following link for alternative options:<br>
https://surprise.readthedocs.io/en/stable/dataset.html

In [None]:
# Declare the rating scale as 0.5 (lowest) to 4.0 (highest).
# NOTE(review): presumably this matches the ratings file — compare with the
# min/max shown by dataset['rating'].describe().
reader = surprise.Reader(rating_scale = (0.5, 4.))

In [None]:
# Wrap the ratings DataFrame (user, item, rating columns, in that order)
# in a Surprise Dataset, using the reader's rating scale.
data = surprise.Dataset.load_from_df(dataset, reader)

### different options for similarity measurement and two methods of collaborative filtering 

In [None]:
# Similarity configurations: each metric (pearson_baseline / cosine) paired
# with each value of user_based (True / False).
sim_options = dict(name='pearson_baseline', user_based=True)
sim_options1 = dict(name='cosine', user_based=True)
sim_options2 = dict(name='pearson_baseline', user_based=False)
sim_options3 = dict(name='cosine', user_based=False)

### There are also multiple estimation approaches under surprise:
https://surprise.readthedocs.io/en/stable/knn_inspired.html

### Let's first build a model with the basic KNN method

In [None]:
# KNNBasic with k=20 and the first similarity configuration, fitted on
# every rating in the dataset (no hold-out split).
full_trainset = data.build_full_trainset()
alg0 = surprise.KNNBasic(k=20, sim_options=sim_options)
output = alg0.fit(full_trainset)

### Get predictions for specific user and item

In [None]:
alg0.predict(uid = 10, iid = 30)

In [None]:
alg0.predict(uid = 20, iid = 20)

### Next, let's build a model with the KNN-with-means method

In [None]:
# KNNWithMeans with k=20 and the first similarity configuration, fitted on
# every rating in the dataset (no hold-out split).
full_trainset = data.build_full_trainset()
alg1 = surprise.KNNWithMeans(k=20, sim_options=sim_options)
output = alg1.fit(full_trainset)

In [None]:
alg1.predict(uid = 10, iid = 30)

In [None]:
alg1.predict(uid = 20, iid = 20)

## <span style="color:red"> *Can you replicate this procedure using the item-based approach?* </span>

### Next, let's use SVD to build a model

In [None]:
# SVD model with hand-picked hyper-parameters, fitted on every rating in the
# dataset (no hold-out split).
full_trainset = data.build_full_trainset()
alg2 = surprise.SVD(
    n_factors=20,
    n_epochs=30,
    lr_all=0.01,
    reg_all=0.01,
)
output = alg2.fit(full_trainset)

In [None]:
alg2.predict(uid = 10, iid = 30)

In [None]:
alg2.predict(uid = 20, iid = 20)

### Let's collect all missing (unrated) user–item pairs into predset

In [None]:
# Build the full training set first, then derive its anti-testset from it.
full_trainset = data.build_full_trainset()
predset = full_trainset.build_anti_testset()

In [None]:
# Run the SVD model alg2 over every entry of predset; the resulting list is
# what get_top_n consumes below.
predictions = alg2.test(predset)

### Return the top-N recommendation for each user from a set of predictions.
Args: <br>
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.<br>
        n(int): The number of recommendations to output for each user. Default
            is 10.<br>
Returns:<br>
        A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.

In [None]:
from collections import defaultdict
def get_top_n(predictions, n=10):
    """Return the n highest-estimated items for every user.

    Args:
        predictions: Iterable of 5-element records
            ``(uid, iid, true_r, est, details)``; only ``uid``, ``iid`` and
            ``est`` are used.
        n: Maximum number of (item, estimate) pairs kept per user.
            Defaults to 10.

    Returns:
        A ``defaultdict(list)`` mapping each user id to a list of
        ``(iid, est)`` tuples ordered by ``est`` descending and truncated
        to at most ``n`` entries. Ties keep their original relative order.
    """
    per_user = defaultdict(list)

    # Group the (item, estimate) pairs under the user they belong to.
    for prediction in predictions:
        uid, iid, _true_r, est, _details = prediction
        per_user[uid].append((iid, est))

    # Rank each user's pairs by estimate, best first, and keep the first n.
    # Only existing keys are reassigned, so the dict size never changes
    # while it is being iterated.
    for uid in per_user:
        ranked = sorted(per_user[uid], key=lambda pair: pair[1], reverse=True)
        per_user[uid] = ranked[:n]

    return per_user

In [None]:
top_n = get_top_n(predictions, n=5)

In [None]:
top_n[38]

### We can also apply Grid Search to tune parameters

In [None]:
from surprise.model_selection import GridSearchCV
# Three candidate values for each of four SVD hyper-parameters
# (3 * 3 * 3 * 3 = 81 combinations), each scored with cv=5.
param_grid = {
    'n_factors': [60, 80, 100],
    'n_epochs': [50, 70, 90],
    'lr_all': [0.001, 0.01, 0.05],
    'reg_all': [0.01, 0.05, 0.1],
}
gs = GridSearchCV(
    surprise.SVD,
    param_grid,
    measures=['rmse', 'mae'],
    cv=5,
)

In [None]:
gs.fit(data)

In [None]:
print(gs.best_params['rmse'])

In [None]:
print(gs.best_score['rmse'])

In [None]:
gs.best_params['rmse']

In [None]:
# Build the tuned SVD model directly from the grid-search result instead of
# re-typing the hyper-parameters by hand, so alg3 always matches whatever
# gs.fit(data) selected as best for RMSE in this run.
best_svd_params = gs.best_params['rmse']
alg3 = surprise.SVD(**best_svd_params)
# fit() is assigned to a throwaway name only to keep the cell output quiet.
output = alg3.fit(data.build_full_trainset())

In [None]:
alg3.predict(uid = 10, iid = 30)

In [None]:
alg3.predict(uid = 20, iid = 20)

### We can also use cross-validation to evaluate our models
https://surprise.readthedocs.io/en/stable/model_selection.html

In [None]:
from surprise.model_selection import cross_validate
cross_validate(alg2, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

In [None]:
cross_validate(alg3, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

In [None]:
from surprise import KNNBasic, KNNWithMeans, SVD
from surprise.model_selection import cross_validate

# Candidate models: KNNBasic and KNNWithMeans under each of the four
# similarity configurations, followed by the two SVD settings used above.
candidate_algorithms = [
    surprise.KNNBasic(sim_options = sim_options),
    surprise.KNNWithMeans(sim_options = sim_options),
    surprise.KNNBasic(sim_options = sim_options1),
    surprise.KNNWithMeans(sim_options = sim_options1),
    surprise.KNNBasic(sim_options = sim_options2),
    surprise.KNNWithMeans(sim_options = sim_options2),
    surprise.KNNBasic(sim_options = sim_options3),
    surprise.KNNWithMeans(sim_options = sim_options3),
    surprise.SVD(n_factors = 20, n_epochs=30, lr_all = 0.01, reg_all = 0.01),
    surprise.SVD(n_factors = 80, n_epochs = 70, lr_all = 0.01, reg_all =0.1),
]

benchmark = []
for algorithm in candidate_algorithms:
    # 3-fold cross-validation scored on RMSE.
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)

    # Average every reported column over the folds and tag the row with the
    # algorithm's class name. Rows are collected as plain dicts because
    # Series.append was removed in pandas 2.0; type(...).__name__ replaces
    # parsing the object's string representation.
    row = pd.DataFrame.from_dict(results).mean(axis=0).to_dict()
    row['Algorithm'] = type(algorithm).__name__
    benchmark.append(row)

# Rows appear in the same order as candidate_algorithms; the KNN variants
# share a class name, so use the list order to tell the configurations apart.
pd.DataFrame(benchmark).set_index('Algorithm')