In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import feather
from evaluator import Evaluator
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import preprocessing
from scipy.sparse import csr_matrix
from pandas.api.types import CategoricalDtype
from tqdm import tqdm_notebook as tqdm
from scipy import sparse

# Load data

In [2]:
# Load the train/test rating splits and the per-book tables from Feather files.
training_ratings = feather.read_dataframe('./feather/training_ratings')
testing_ratings = feather.read_dataframe('./feather/testing_ratings')
# Book profiles are indexed by book_id and held in pandas' sparse representation
# with 0 as the fill value.
# NOTE(review): DataFrame.to_sparse was removed in pandas 1.0, so this line
# requires an older pandas — confirm the pinned version.
book_profiles = feather.read_dataframe('./feather/book_profiles').set_index('book_id').to_sparse(fill_value=0)
# Both tables below are indexed by book_id as well.
novelty_scores = feather.read_dataframe('./feather/novelty_scores').set_index('book_id')
books = feather.read_dataframe('./feather/books').set_index('book_id')

In [3]:
# Content-based book-to-book similarity: cosine similarity between every pair
# of book profile rows, labelled with book_id on both axes.
profile_similarity = cosine_similarity(book_profiles, book_profiles)
book_sim = pd.DataFrame(
    profile_similarity,
    index=book_profiles.index,
    columns=book_profiles.index,
)

book_sim.head()

book_id,27,21,2,18,24,3275,3753,54,337,374,...,5111,5296,8713,7443,6428,7523,4594,9569,9580,8892
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
27,1.0,0.793039,0.967074,0.935959,0.932615,0.773161,0.826935,0.443948,0.383335,0.145548,...,0.191713,0.27692,0.220817,0.665664,0.613295,0.679846,0.121582,0.397349,0.165843,0.226996
21,0.793039,1.0,0.781584,0.756703,0.802735,0.606846,0.642205,0.371239,0.290013,0.128655,...,0.132455,0.262205,0.181788,0.4833,0.445195,0.511246,0.116972,0.341285,0.114551,0.164359
2,0.967074,0.781584,1.0,0.954254,0.95135,0.779767,0.8387,0.463165,0.400693,0.146866,...,0.159402,0.254301,0.199595,0.665316,0.612849,0.679313,0.122899,0.366076,0.148561,0.172504
18,0.935959,0.756703,0.954254,1.0,0.919456,0.750132,0.813695,0.444069,0.384745,0.159378,...,0.153586,0.262145,0.192761,0.64641,0.60713,0.660081,0.135276,0.368633,0.142581,0.166399
24,0.932615,0.802735,0.95135,0.919456,1.0,0.741594,0.802514,0.488001,0.413613,0.164158,...,0.157931,0.270466,0.19893,0.636164,0.583238,0.648697,0.13925,0.39871,0.146673,0.171259


In [4]:
# Build the shared evaluator (from the local `evaluator` module) that every
# recommender below is scored with.
# k presumably is the length of the recommendation list being evaluated — TODO confirm.
# NOTE(review): training_ratings is handed over here, before a later cell adds
# the 'adjusted_rating' column to the same frame; whether the evaluator sees
# that column depends on whether it copies the frame — confirm.
evl = Evaluator(
    k = 10,
    training_ratings = training_ratings,
    testing_ratings = testing_ratings,
    book_sim = book_sim,
    novelty_scores = novelty_scores
)

HBox(children=(IntProgress(value=0, max=52363), HTML(value='')))




# Preprocess for the Content-based RS and Item-Item CF RS

In [5]:
# Mean-centre each rating by its user's average rating.
# Only the 'rating' column is aggregated, so re-running this cell never
# averages the derived 'adjusted_rating' column.
users_mean_rating = training_ratings.groupby('user_id')[['rating']].mean()
# Look up each row's user mean by user_id; the result is aligned on the row
# index, so no positional `.values` subtraction is needed.
training_ratings['adjusted_rating'] = (
    training_ratings['rating']
    - training_ratings['user_id'].map(users_mean_rating['rating'])
)
training_ratings.head()

Unnamed: 0,user_id,book_id,rating,adjusted_rating
0,2,4081,4,-0.269231
1,2,2318,3,-1.269231
2,2,26,4,-0.269231
3,2,315,3,-1.269231
4,2,33,4,-0.269231


In [6]:
# Ordered categorical dtypes give every user / book a dense integer code in
# sorted-id order; the codes become the row / column positions of the matrix.
user_c = CategoricalDtype(
    categories=sorted(training_ratings['user_id'].unique()), ordered=True)
book_c = CategoricalDtype(
    categories=sorted(training_ratings['book_id'].unique()), ordered=True)

row = training_ratings['user_id'].astype(user_c).cat.codes
col = training_ratings['book_id'].astype(book_c).cat.codes

# users x books matrix holding the mean-centred ratings
matrix_shape = (user_c.categories.size, book_c.categories.size)
sparse_matrix = csr_matrix(
    (training_ratings["adjusted_rating"], (row, col)),
    shape=matrix_shape,
)

sparse_matrix

<52363x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 5206758 stored elements in Compressed Sparse Row format>

In [7]:
# Item-item collaborative-filtering similarity: cosine similarity between the
# book columns of the user x book matrix (transposed so that books are rows).
book_user_matrix = sparse_matrix.T
cf_sim = pd.DataFrame(
    cosine_similarity(book_user_matrix, book_user_matrix),
    index=book_c.categories,
    columns=book_c.categories,
)
cf_sim.shape

(10000, 10000)

In [8]:
# Preview the first rows of the item-item CF similarity table.
cf_sim.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,9991,9992,9993,9994,9995,9996,9997,9998,9999,10000
1,1.0,0.160908,-0.031925,0.057201,-0.032354,0.097614,0.003601,-0.033949,0.019652,0.062751,...,0.000213,-0.000544,-0.002776,0.006636,-0.001584,-0.004744,-0.003601,-0.001484,-0.008362,-0.001154
2,0.160908,1.0,-0.140739,0.077358,-0.041489,0.055014,0.070666,-0.055633,0.000736,0.079883,...,-0.003834,-0.002613,0.00313,0.008077,-0.00064,-0.003086,0.001596,0.001712,-0.002568,0.001918
3,-0.031925,-0.140739,1.0,-0.102555,0.01804,-0.03082,-0.074245,0.045801,0.092864,-0.042211,...,0.000588,0.000722,-0.001307,-0.012517,0.005616,-0.004577,-0.002103,-0.007541,-0.005134,-0.005772
4,0.057201,0.077358,-0.102555,1.0,0.059419,0.029309,0.040175,0.02102,-0.066932,0.104644,...,-0.003452,-0.002976,0.003553,-0.007009,-0.01098,0.00192,0.001449,0.005182,-0.004473,0.002732
5,-0.032354,-0.041489,0.01804,0.059419,1.0,0.002683,-0.022331,0.143918,-0.00922,0.022881,...,-0.003136,0.001688,-0.004796,0.002315,0.011939,-0.005,0.004715,0.005097,-0.001746,0.007714


In [9]:
# For every book keep its 50 most similar *other* books under CF similarity.
# The book itself is dropped by label instead of assuming it sorts first: with
# ties at the top, or an all-zero rating vector (self-similarity 0), the book
# is not guaranteed to be at position 0 of the sorted row.
cf_top_sim_books = {}
book_ids = cf_sim.index
for book_id in tqdm(book_ids):
    other_books = cf_sim.loc[book_id].drop(book_id)
    # .iloc makes the positional slice explicit on an integer-labelled Series
    cf_top_sim_books[book_id] = other_books.sort_values(ascending=False).iloc[:50]

HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))




In [13]:
# Spot-check: strongest CF neighbours of book 1.
cf_top_sim_books[1].head()

17    0.338596
20    0.172852
2     0.160908
31    0.147547
25    0.146911
Name: 1, dtype: float64

In [10]:
# For every book keep its 50 most similar *other* books under content-based
# similarity. The book itself is dropped by label instead of assuming it sorts
# first: another book with an identical profile also scores 1.0, so the book
# is not guaranteed to be at position 0 of the sorted row.
cb_top_sim_books = {}
book_ids = book_sim.index
for book_id in tqdm(book_ids):
    other_books = book_sim.loc[book_id].drop(book_id)
    # .iloc makes the positional slice explicit on an integer-labelled Series
    cb_top_sim_books[book_id] = other_books.sort_values(ascending=False).iloc[:50]

HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))




In [18]:
# Spot-check: strongest content-based neighbours of book 1.
cb_top_sim_books[1].head()

book_id
20     0.947613
17     0.937363
507    0.910072
12     0.848734
91     0.842863
Name: 1, dtype: float64

In [11]:
# user_id -> list of the book_ids that user rated 5 in the training set
five_star_ratings = training_ratings[training_ratings['rating'] == 5]
list_of_5_ratings = five_star_ratings.groupby('user_id')['book_id'].apply(list)

# Hybrid Recommender System

We'll make the hybrid recommender flexible by letting its constructor take a `rate` argument: the weight applied to the collaborative-filtering similarity scores relative to the content-based ones.

In [22]:
class HybridRecommender:
    """Hybrid recommender pooling content-based and CF book neighbours.

    For each user, the neighbour lists of every book the user rated 5 are
    pooled: content-based similarities are used as-is, collaborative-filtering
    similarities are multiplied by ``rate``. Scores are summed per candidate
    book, books the user has already rated are removed, and the ``k``
    highest-scoring remaining books become the user's recommendations.

    Reads the notebook-level lookups ``cb_top_sim_books`` and
    ``cf_top_sim_books`` (book_id -> Series of similarities indexed by
    neighbour book_id).

    Parameters
    ----------
    rate : number, default 1
        Multiplier applied to the CF similarity scores.
    k : int, default 10
        Maximum number of recommendations kept per user.
    """

    name = "Hybrid CF RS"

    def __init__(self, rate=1, k=10):
        self.rate = rate
        self.k = k
        self.name = "Hybrid CF RS (rate=" + str(rate) + ")"
        # Per-instance store (user_id -> list of book_ids), filled by fit().
        self.preds = {}

    def fit(self, training_ratings):
        """Compute the top-``k`` recommendations for every user.

        Parameters
        ----------
        training_ratings : pandas.DataFrame
            Needs the columns ``user_id``, ``book_id`` and ``rating``.

        Raises
        ------
        KeyError
            If a 5-star book is missing from one of the neighbour lookups.
        """
        # Group once up front instead of filtering the whole frame per user.
        rated_by_user = (
            training_ratings.groupby('user_id')['book_id'].unique().to_dict())
        # The 5-star lists come from the frame being fitted, so the result
        # does not depend on notebook state computed from another frame.
        liked_by_user = (
            training_ratings[training_ratings.rating == 5]
            .groupby('user_id')['book_id']
            .apply(list)
            .to_dict()
        )

        self.preds = {}
        for user_id in tqdm(training_ratings.user_id.unique().tolist()):
            self.preds[user_id] = self._recommend(
                liked_by_user.get(user_id, []),
                rated_by_user.get(user_id, []),
            )

    def _recommend(self, liked_books, excluded_books):
        """Rank the pooled neighbours of ``liked_books``, minus ``excluded_books``."""
        # A user without any 5-star rating has no seed books to expand from.
        if len(liked_books) == 0:
            return []

        # Collect the pieces and concatenate once; growing a Series inside the
        # loop copies all accumulated data on every iteration.
        pieces = []
        for book_id in liked_books:
            pieces.append(cb_top_sim_books[book_id])
            pieces.append(cf_top_sim_books[book_id] * self.rate)
        scores = pd.concat(pieces)

        # A book that neighbours several liked books accumulates its scores.
        ranked = np.array(
            scores.groupby(level=0).sum().sort_values(ascending=False).index)
        recommendable = ranked[~np.isin(ranked, excluded_books)]
        return recommendable[:self.k].tolist()

    def recommendation_for_user(self, user_id):
        """Return the recommended book_ids for ``user_id`` ([] if unknown)."""
        return self.preds.get(user_id, [])

    def all_recommendation(self):
        """Return the full user_id -> recommended book_ids mapping."""
        return self.preds

## rate = 1

In [24]:
# Baseline hybrid: CF similarities weighted equally with content-based ones.
hb_rec = HybridRecommender(rate=1)
# Display the label generated in the constructor.
hb_rec.name

'Hybrid CF RS (rate=1)'

In [25]:
# Score the rate=1 hybrid with the shared evaluator and print the metrics table.
evl.evaluate(hb_rec)
evl.print_result()

Calculating recommendations:


HBox(children=(IntProgress(value=0, max=52363), HTML(value='')))


Calculating metrics:


HBox(children=(IntProgress(value=0, max=52363), HTML(value='')))


                       Hybrid CF RS (rate=1)
Mean Average Precision                10.79%
Coverage                              81.78%
Novelty Score                           6.16
Diversity Score                         2.62
Personalization Score                   9.88


## rate = 2

In [26]:
# Double the weight of the CF similarities; the printed table also keeps the
# columns of the recommenders evaluated earlier on `evl`.
hb_rec2 = HybridRecommender(rate=2)
evl.evaluate(hb_rec2)
evl.print_result()

Calculating recommendations:


HBox(children=(IntProgress(value=0, max=52363), HTML(value='')))


Calculating metrics:


HBox(children=(IntProgress(value=0, max=52363), HTML(value='')))


                       Hybrid CF RS (rate=1) Hybrid CF RS (rate=2)
Mean Average Precision                10.79%                13.38%
Coverage                              81.78%                82.50%
Novelty Score                           6.16                  6.04
Diversity Score                         2.62                  2.78
Personalization Score                   9.88                  9.89


## rate = 4

In [27]:
# CF similarities weighted 4x relative to content-based ones.
hb_rec4 = HybridRecommender(rate=4)
evl.evaluate(hb_rec4)
evl.print_result()

Calculating recommendations:


HBox(children=(IntProgress(value=0, max=52363), HTML(value='')))


Calculating metrics:


HBox(children=(IntProgress(value=0, max=52363), HTML(value='')))


                       Hybrid CF RS (rate=1) Hybrid CF RS (rate=2)  \
Mean Average Precision                10.79%                13.38%   
Coverage                              81.78%                82.50%   
Novelty Score                           6.16                  6.04   
Diversity Score                         2.62                  2.78   
Personalization Score                   9.88                  9.89   

                       Hybrid CF RS (rate=4)  
Mean Average Precision                16.90%  
Coverage                              83.23%  
Novelty Score                           5.83  
Diversity Score                         3.08  
Personalization Score                   9.88  


# Alternate version: add `rate` to the CF similarities instead of multiplying

In [29]:
class AltHybridRecommender:
    """Hybrid recommender that boosts CF neighbours by a constant offset.

    For each user, the neighbour lists of every book the user rated 5 are
    pooled: content-based similarities are used as-is, while ``rate`` is
    *added* to every collaborative-filtering similarity. Scores are summed per
    candidate book, so a book gains ``rate`` once for each liked book whose CF
    neighbour list contains it. Books the user has already rated are removed
    and the ``k`` highest-scoring remaining books are recommended.

    Reads the notebook-level lookups ``cb_top_sim_books`` and
    ``cf_top_sim_books`` (book_id -> Series of similarities indexed by
    neighbour book_id).

    Parameters
    ----------
    rate : number, default 1
        Constant added to each CF similarity score.
    k : int, default 10
        Maximum number of recommendations kept per user.
    """

    name = "Alt Hybrid RS"

    def __init__(self, rate=1, k=10):
        self.rate = rate
        self.k = k
        self.name = "Alt Hybrid RS (rate=" + str(rate) + ")"
        # Per-instance store (user_id -> list of book_ids), filled by fit().
        self.preds = {}

    def fit(self, training_ratings):
        """Compute the top-``k`` recommendations for every user.

        Parameters
        ----------
        training_ratings : pandas.DataFrame
            Needs the columns ``user_id``, ``book_id`` and ``rating``.

        Raises
        ------
        KeyError
            If a 5-star book is missing from one of the neighbour lookups.
        """
        # Group once up front instead of filtering the whole frame per user.
        rated_by_user = (
            training_ratings.groupby('user_id')['book_id'].unique().to_dict())
        # The 5-star lists come from the frame being fitted, so the result
        # does not depend on notebook state computed from another frame.
        liked_by_user = (
            training_ratings[training_ratings.rating == 5]
            .groupby('user_id')['book_id']
            .apply(list)
            .to_dict()
        )

        self.preds = {}
        for user_id in tqdm(training_ratings.user_id.unique().tolist()):
            self.preds[user_id] = self._recommend(
                liked_by_user.get(user_id, []),
                rated_by_user.get(user_id, []),
            )

    def _recommend(self, liked_books, excluded_books):
        """Rank the pooled neighbours of ``liked_books``, minus ``excluded_books``."""
        # A user without any 5-star rating has no seed books to expand from.
        if len(liked_books) == 0:
            return []

        # Collect the pieces and concatenate once; growing a Series inside the
        # loop copies all accumulated data on every iteration.
        pieces = []
        for book_id in liked_books:
            pieces.append(cb_top_sim_books[book_id])
            pieces.append(cf_top_sim_books[book_id] + self.rate)
        scores = pd.concat(pieces)

        # A book that neighbours several liked books accumulates its scores.
        ranked = np.array(
            scores.groupby(level=0).sum().sort_values(ascending=False).index)
        recommendable = ranked[~np.isin(ranked, excluded_books)]
        return recommendable[:self.k].tolist()

    def recommendation_for_user(self, user_id):
        """Return the recommended book_ids for ``user_id`` ([] if unknown)."""
        return self.preds.get(user_id, [])

    def all_recommendation(self):
        """Return the full user_id -> recommended book_ids mapping."""
        return self.preds

In [30]:
# Evaluate the additive variant with its default rate of 1.
ahb_rec = AltHybridRecommender()
evl.evaluate(ahb_rec)
evl.print_result()

Calculating recommendations:


HBox(children=(IntProgress(value=0, max=52363), HTML(value='')))


Calculating metrics:


HBox(children=(IntProgress(value=0, max=52363), HTML(value='')))


                       Hybrid CF RS (rate=1) Hybrid CF RS (rate=2)  \
Mean Average Precision                10.79%                13.38%   
Coverage                              81.78%                82.50%   
Novelty Score                           6.16                  6.04   
Diversity Score                         2.62                  2.78   
Personalization Score                   9.88                  9.89   

                       Hybrid CF RS (rate=4) Alt Hybrid RS (rate=1)  
Mean Average Precision                16.90%                 16.33%  
Coverage                              83.23%                 88.06%  
Novelty Score                           5.83                   5.06  
Diversity Score                         3.08                   3.75  
Personalization Score                   9.88                   9.73  


# Experiment

In [32]:
# Rate sweep of the multiplicative hybrid: rate=7.
hb_rec7 = HybridRecommender(rate=7)
evl.evaluate(hb_rec7)
evl.print_result()

Calculating recommendations:


HBox(children=(IntProgress(value=0, max=52363), HTML(value='')))


Calculating metrics:


HBox(children=(IntProgress(value=0, max=52363), HTML(value='')))


                       Hybrid CF RS (rate=1) Hybrid CF RS (rate=2)  \
Mean Average Precision                10.79%                13.38%   
Coverage                              81.78%                82.50%   
Novelty Score                           6.16                  6.04   
Diversity Score                         2.62                  2.78   
Personalization Score                   9.88                  9.89   

                       Hybrid CF RS (rate=4) Alt Hybrid RS (rate=1)  \
Mean Average Precision                16.90%                 16.33%   
Coverage                              83.23%                 88.06%   
Novelty Score                           5.83                   5.06   
Diversity Score                         3.08                   3.75   
Personalization Score                   9.88                   9.73   

                       Hybrid CF RS (rate=7)  
Mean Average Precision                19.74%  
Coverage                              83.73%  
Novelty Sc

In [33]:
# Rate sweep of the multiplicative hybrid: rate=10.
hb_rec10 = HybridRecommender(rate=10)
evl.evaluate(hb_rec10)
evl.print_result()

Calculating recommendations:


HBox(children=(IntProgress(value=0, max=52363), HTML(value='')))


Calculating metrics:


HBox(children=(IntProgress(value=0, max=52363), HTML(value='')))


                       Hybrid CF RS (rate=1) Hybrid CF RS (rate=2)  \
Mean Average Precision                10.79%                13.38%   
Coverage                              81.78%                82.50%   
Novelty Score                           6.16                  6.04   
Diversity Score                         2.62                  2.78   
Personalization Score                   9.88                  9.89   

                       Hybrid CF RS (rate=4) Alt Hybrid RS (rate=1)  \
Mean Average Precision                16.90%                 16.33%   
Coverage                              83.23%                 88.06%   
Novelty Score                           5.83                   5.06   
Diversity Score                         3.08                   3.75   
Personalization Score                   9.88                   9.73   

                       Hybrid CF RS (rate=7) Hybrid CF RS (rate=10)  
Mean Average Precision                19.74%                 21.13%  
Coverage   

In [35]:
# Rate sweep of the multiplicative hybrid: rate=15.
hb_rec15 = HybridRecommender(rate=15)
evl.evaluate(hb_rec15)
evl.print_result()

Calculating recommendations:


HBox(children=(IntProgress(value=0, max=52363), HTML(value='')))


Calculating metrics:


HBox(children=(IntProgress(value=0, max=52363), HTML(value='')))


                       Hybrid CF RS (rate=1) Hybrid CF RS (rate=2)  \
Mean Average Precision                10.79%                13.38%   
Coverage                              81.78%                82.50%   
Novelty Score                           6.16                  6.04   
Diversity Score                         2.62                  2.78   
Personalization Score                   9.88                  9.89   

                       Hybrid CF RS (rate=4) Alt Hybrid RS (rate=1)  \
Mean Average Precision                16.90%                 16.33%   
Coverage                              83.23%                 88.06%   
Novelty Score                           5.83                   5.06   
Diversity Score                         3.08                   3.75   
Personalization Score                   9.88                   9.73   

                       Hybrid CF RS (rate=7) Hybrid CF RS (rate=10)  \
Mean Average Precision                19.74%                 21.13%   
Coverage 

In [36]:
# Rate sweep of the multiplicative hybrid: rate=20.
hb_rec20 = HybridRecommender(rate=20)
evl.evaluate(hb_rec20)
evl.print_result()

Calculating recommendations:


HBox(children=(IntProgress(value=0, max=52363), HTML(value='')))


Calculating metrics:


HBox(children=(IntProgress(value=0, max=52363), HTML(value='')))


                       Hybrid CF RS (rate=1) Hybrid CF RS (rate=2)  \
Mean Average Precision                10.79%                13.38%   
Coverage                              81.78%                82.50%   
Novelty Score                           6.16                  6.04   
Diversity Score                         2.62                  2.78   
Personalization Score                   9.88                  9.89   

                       Hybrid CF RS (rate=4) Alt Hybrid RS (rate=1)  \
Mean Average Precision                16.90%                 16.33%   
Coverage                              83.23%                 88.06%   
Novelty Score                           5.83                   5.06   
Diversity Score                         3.08                   3.75   
Personalization Score                   9.88                   9.73   

                       Hybrid CF RS (rate=7) Hybrid CF RS (rate=10)  \
Mean Average Precision                19.74%                 21.13%   
Coverage 

In [37]:
# Rate sweep of the multiplicative hybrid: rate=30.
hb_rec30 = HybridRecommender(rate=30)
evl.evaluate(hb_rec30)
evl.print_result()

Calculating recommendations:


HBox(children=(IntProgress(value=0, max=52363), HTML(value='')))


Calculating metrics:


HBox(children=(IntProgress(value=0, max=52363), HTML(value='')))


                       Hybrid CF RS (rate=1) Hybrid CF RS (rate=2)  \
Mean Average Precision                10.79%                13.38%   
Coverage                              81.78%                82.50%   
Novelty Score                           6.16                  6.04   
Diversity Score                         2.62                  2.78   
Personalization Score                   9.88                  9.89   

                       Hybrid CF RS (rate=4) Alt Hybrid RS (rate=1)  \
Mean Average Precision                16.90%                 16.33%   
Coverage                              83.23%                 88.06%   
Novelty Score                           5.83                   5.06   
Diversity Score                         3.08                   3.75   
Personalization Score                   9.88                   9.73   

                       Hybrid CF RS (rate=7) Hybrid CF RS (rate=10)  \
Mean Average Precision                19.74%                 21.13%   
Coverage 

In [38]:
# Rate sweep of the multiplicative hybrid: rate=50.
hb_rec50 = HybridRecommender(rate=50)
evl.evaluate(hb_rec50)
evl.print_result()

Calculating recommendations:


HBox(children=(IntProgress(value=0, max=52363), HTML(value='')))


Calculating metrics:


HBox(children=(IntProgress(value=0, max=52363), HTML(value='')))


                       Hybrid CF RS (rate=1) Hybrid CF RS (rate=2)  \
Mean Average Precision                10.79%                13.38%   
Coverage                              81.78%                82.50%   
Novelty Score                           6.16                  6.04   
Diversity Score                         2.62                  2.78   
Personalization Score                   9.88                  9.89   

                       Hybrid CF RS (rate=4) Alt Hybrid RS (rate=1)  \
Mean Average Precision                16.90%                 16.33%   
Coverage                              83.23%                 88.06%   
Novelty Score                           5.83                   5.06   
Diversity Score                         3.08                   3.75   
Personalization Score                   9.88                   9.73   

                       Hybrid CF RS (rate=7) Hybrid CF RS (rate=10)  \
Mean Average Precision                19.74%                 21.13%   
Coverage 