In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import feather
import scipy.sparse as sp
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm_notebook as tqdm

In [2]:
# Load the pre-computed artefacts from the local feather directory.
# pandas' own reader is used (instead of the deprecated standalone `feather`
# wrapper) so reading matches the `DataFrame.to_feather` call used for writing.
FEATHER_DIR = './feather'

book_profiles = pd.read_feather(FEATHER_DIR + '/book_profiles').set_index('book_id')
books = pd.read_feather(FEATHER_DIR + '/books').set_index('book_id')
training_ratings = pd.read_feather(FEATHER_DIR + '/training_ratings')
testing_ratings = pd.read_feather(FEATHER_DIR + '/testing_ratings')

In [3]:
# Pairwise cosine similarity between all book profiles, labelled by book_id
# on both axes so it can be sliced with `.loc[book_ids, book_ids]`.
profile_similarity = cosine_similarity(book_profiles, book_profiles)
book_sim = pd.DataFrame(profile_similarity, index=book_profiles.index, columns=book_profiles.index)

book_sim.head()

book_id,27,21,2,18,24,3275,3753,54,337,374,...,5111,5296,8713,7443,6428,7523,4594,9569,9580,8892
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
27,1.0,0.793039,0.967074,0.935959,0.932615,0.773161,0.826935,0.443948,0.383335,0.145548,...,0.191713,0.27692,0.220817,0.665664,0.613295,0.679846,0.121582,0.397349,0.165843,0.226996
21,0.793039,1.0,0.781584,0.756703,0.802735,0.606846,0.642205,0.371239,0.290013,0.128655,...,0.132455,0.262205,0.181788,0.4833,0.445195,0.511246,0.116972,0.341285,0.114551,0.164359
2,0.967074,0.781584,1.0,0.954254,0.95135,0.779767,0.8387,0.463165,0.400693,0.146866,...,0.159402,0.254301,0.199595,0.665316,0.612849,0.679313,0.122899,0.366076,0.148561,0.172504
18,0.935959,0.756703,0.954254,1.0,0.919456,0.750132,0.813695,0.444069,0.384745,0.159378,...,0.153586,0.262145,0.192761,0.64641,0.60713,0.660081,0.135276,0.368633,0.142581,0.166399
24,0.932615,0.802735,0.95135,0.919456,1.0,0.741594,0.802514,0.488001,0.413613,0.164158,...,0.157931,0.270466,0.19893,0.636164,0.583238,0.648697,0.13925,0.39871,0.146673,0.171259


# 0.5. Measuring novelty

Paper: https://arxiv.org/pdf/0808.2670.pdf

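For our application, the novelty score of a recommendation list is measured by the average, over the recommended items, of the log of the inverse fraction of users who have rated each item: $\log_2(N_{users} / N_{ratings}(item))$. For that we need to create a new novelty score dataframe.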

In [4]:
# Number of distinct users present in the training ratings.
n_users = training_ratings['user_id'].unique().size
n_users

52363

In [5]:
# How many (non-null) ratings each book received in the training set.
rating_count = training_ratings.groupby('book_id')[['rating']].count()
rating_count.head()

Unnamed: 0_level_0,rating
book_id,Unnamed: 1_level_1
1,18732
2,17460
3,15434
4,15439
5,14841


In [6]:
# Novelty of a book = log2 of the inverse fraction of users who rated it:
# rarely rated books get a high score, blockbusters a low one.
book_novelty = np.log2(n_users / rating_count['rating'])
rating_count.loc[:, 'novelty_score'] = book_novelty

In [7]:
# Least novel (most rated) books first.
sorted_scores = rating_count.sort_values('novelty_score')
sorted_scores.head()

Unnamed: 0_level_0,rating,novelty_score
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,18732,1.483043
2,17460,1.584494
4,15439,1.761968
3,15434,1.762436
5,14841,1.818959


In [8]:
# The five most novel (least rated) books.
sorted_scores.tail(5)

Unnamed: 0_level_0,rating,novelty_score
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
9553,29,10.818279
1935,27,10.921373
9486,20,11.354332
9345,10,12.354332
7803,8,12.67626


In [9]:
# Keep only the score column (as a one-column frame indexed by book_id).
novelty_scores = sorted_scores.loc[:, ['novelty_score']]
novelty_scores.head(10)

Unnamed: 0_level_0,novelty_score
book_id,Unnamed: 1_level_1
1,1.483043
2,1.584494
4,1.761968
3,1.762436
5,1.818959
17,1.885708
20,1.886625
7,2.007597
23,2.013703
8,2.022631


In [10]:
# Average novelty of the ten most novel books: the score obtained by
# recommending exactly those ten books.
novelty_scores['novelty_score'].tail(10).mean()

11.040854404462621

# 1. The new evaluator

In [33]:
class Evaluator():
    """Scores a recommender on accuracy and on "beyond accuracy" metrics.

    For every model passed to `evaluate` the following are stored in
    `self.result[model.name]` as formatted strings:

    - Mean Average Precision (percentage)
    - Coverage (percentage of training books recommended at least once)
    - Novelty Score, Diversity Score, Personalization Score (scaled by 10)

    Parameters
    ----------
    k : int
        Cut-off used when normalising average precision.
    training_ratings : DataFrame, optional
        Needs `user_id` and `book_id` columns; handed to `model.fit` and used
        to count the books for the coverage metric.
    testing_ratings : DataFrame, optional
        Needs `user_id` and `book_id` columns; the held-out books per user.
    book_sim : DataFrame, optional
        Square book-to-book similarity matrix labelled by book id on both axes.
    novelty_scores : DataFrame, optional
        Indexed by book id with a `novelty_score` column.

    `evaluate` needs all of the optional arguments to have been supplied.
    """

    def __init__(self, k=10, training_ratings=None, testing_ratings=None, book_sim=None, novelty_scores=None):
        self.k = k
        self.book_sim = book_sim
        self.novelty_scores = novelty_scores
        if training_ratings is not None:
            self.training_ratings = training_ratings
            self.num_users = len(self.training_ratings.user_id.unique())
            self.num_books = len(self.training_ratings.book_id.unique())
        if testing_ratings is not None:
            self.testing_ratings = testing_ratings
            # One pass over the frame instead of one boolean scan per user.
            self.testing_idx = {
                user_id: user_books.values
                for user_id, user_books in testing_ratings.groupby('user_id', sort=False).book_id
            }
        self.result = {}

    def _average_precision(self, pred, truth):
        """Average precision of the ranked list `pred` against the set `truth`.

        The sum of precision-at-hit values is normalised by
        `min(len(truth), self.k)`. Returns 0.0 when that normaliser is zero
        (no relevant items), instead of dividing by zero.
        """
        n_relevant = min(len(truth), self.k)
        if n_relevant == 0:
            return 0.0
        in_arr = np.isin(pred, truth)
        score = 0.0
        num_hits = 0.0
        for idx, correct in enumerate(in_arr):
            if correct:
                num_hits += 1
                # precision at this rank (ranks are 1-based)
                score += num_hits / (idx + 1)
        return score / n_relevant

    def _novelty_score(self, pred):
        """Mean novelty score of the recommended books, capped at 10.0.

        Recommending the ten most novel books yields roughly 11, so the cap
        barely affects real lists and keeps the score on a 0-10 scale.
        An empty list scores 0.0 (the mean would otherwise be NaN).
        """
        if len(pred) == 0:
            return 0.0
        return min(self.novelty_scores.loc[pred].novelty_score.mean(), 10.0)

    def _diversity_score(self, pred):
        """(1 - mean pairwise similarity of the recommended books) * 10.

        A list with fewer than two books has no pairs; it scores 0.0 rather
        than the NaN a mean over zero pairs would produce.
        """
        if len(pred) < 2:
            return 0.0
        matrix = self.book_sim.loc[pred, pred].values
        # strict upper triangle = every unordered pair exactly once
        ils = matrix[np.triu_indices(len(pred), k=1)].mean()
        return (1 - ils) * 10

    def _personalization_score(self, preds, user_ids, book_ids):
        """(1 - mean cosine similarity between users' recommendation lists) * 10.

        `preds` maps user id -> recommended book ids, `user_ids` are the users
        to compare and `book_ids` must contain every recommended book. When
        more than 3000 users are given, a fixed random sample of 3000 is used.
        """
        if len(user_ids) > 3000:
            # A private generator yields the same sample as seeding the global
            # NumPy RNG with 42, without clobbering the caller's random state.
            user_ids = np.random.RandomState(42).permutation(user_ids)[:3000]

        # Build the binary user x book matrix directly in sparse form; a dense
        # frame of this shape can take hundreds of megabytes.
        column_of = {book_id: col for col, book_id in enumerate(book_ids)}
        rows = []
        cols = []
        for row, user_id in enumerate(user_ids):
            for book_id in preds[user_id]:
                rows.append(row)
                cols.append(column_of[book_id])
        matrix = sp.csr_matrix(
            (np.ones(len(rows)), (rows, cols)),
            shape=(len(user_ids), len(book_ids))
        )
        # A book repeated within one list still counts once.
        matrix.data[:] = 1.0

        #calculate similarity for every user's recommendation list
        similarity = cosine_similarity(X=matrix, dense_output=False)

        #get indicies for upper right triangle w/o diagonal
        upper_right = np.triu_indices(similarity.shape[0], k=1)

        #calculate average similarity
        personalization = np.mean(similarity[upper_right])

        return (1 - personalization) * 10

    def evaluate(self, model):
        """Compute all metrics for `model` and store them under `model.name`.

        The model is fitted on the training ratings first if its `preds` is
        empty. Per-user metrics are averaged over the users the model returned
        recommendations for; users without test ratings contribute an average
        precision of 0.

        Raises
        ------
        ValueError
            If the model returns no recommendations at all.
        """
        print("Calculating recommendations:")
        if len(model.preds) == 0:
            model.fit(self.training_ratings)
        preds = model.all_recommendation()
        if len(preds) == 0:
            raise ValueError("model returned no recommendations to evaluate")
        user_ids = list(preds.keys())
        book_ids = np.unique(np.concatenate(list(preds.values())))
        n_evaluated = len(user_ids)
        no_truth = np.array([])
        ap_sum = 0
        nov_score_sum = 0
        div_score_sum = 0
        print("Calculating metrics:")
        for user_id in tqdm(user_ids):
            pred = preds[user_id]
            truth = self.testing_idx.get(user_id, no_truth)
            ap_sum += self._average_precision(pred, truth)
            nov_score_sum += self._novelty_score(pred)
            div_score_sum += self._diversity_score(pred)

        self.result[model.name] = {}
        self.result[model.name]['Mean Average Precision'] = "%.2f%%" % (ap_sum / n_evaluated * 100)
        self.result[model.name]['Coverage'] = "%.2f%%" % (len(book_ids) / self.num_books * 100)
        self.result[model.name]['Novelty Score'] = "%.2f" % (nov_score_sum / n_evaluated)
        self.result[model.name]['Diversity Score'] = "%.2f" % (div_score_sum / n_evaluated)
        self.result[model.name]['Personalization Score'] = "%.2f" % self._personalization_score(preds, user_ids, book_ids)

    def print_result(self):
        """Print one column per evaluated model, one row per metric."""
        print(pd.DataFrame(self.result).loc[['Mean Average Precision', 'Coverage', 'Novelty Score', 'Diversity Score', 'Personalization Score']])

In [34]:
# Wire the evaluator up with the ratings splits and its two injected
# dependencies (book similarity matrix and novelty scores).
evaluator_settings = dict(
    k=10,
    training_ratings=training_ratings,
    testing_ratings=testing_ratings,
    book_sim=book_sim,
    novelty_scores=novelty_scores,
)
evl = Evaluator(**evaluator_settings)

HBox(children=(IntProgress(value=0, max=52363), HTML(value='')))




# 2. Test the new evaluator

In [38]:
class RandomRecommender():
    """Baseline that recommends random books the user has not rated yet.

    Parameters
    ----------
    n_recommendations : int
        Maximum length of each user's recommendation list.
    seed : int
        Seed of the private random generator used by `fit`, which makes the
        recommendations reproducible without touching NumPy's global RNG.
    """
    name = 'Random-based RS'

    def __init__(self, n_recommendations=10, seed=42):
        self.n_recommendations = n_recommendations
        self.seed = seed
        # Per-instance dict: a class-level `preds = {}` would be shared by
        # every instance.
        self.preds = {}

    def fit(self, training_ratings):
        """Draw a random list of unrated books for every user in `training_ratings`.

        `training_ratings` needs `user_id` and `book_id` columns.
        """
        user_ids = training_ratings.user_id.unique()
        book_ids = training_ratings.book_id.unique()
        # Group once instead of filtering the whole frame for every user.
        rated_by_user = {
            user_id: user_books.values
            for user_id, user_books in training_ratings.groupby('user_id', sort=False).book_id
        }
        # RandomState(seed) produces the same stream as np.random.seed(seed)
        # followed by np.random.permutation calls.
        rng = np.random.RandomState(self.seed)
        self.preds = {}
        for user_id in tqdm(user_ids):
            excluded_books = rated_by_user.get(user_id, ())
            recommendable = book_ids[~np.isin(book_ids, excluded_books)]
            self.preds[user_id] = rng.permutation(recommendable)[:self.n_recommendations]

    def recommendation_for_user(self, user_id):
        """Return the recommendations for `user_id`, or [] for an unknown user."""
        return self.preds.get(user_id, [])

    def all_recommendation(self):
        """Return the dict mapping every fitted user id to its recommendations."""
        return self.preds
    
class PopularityRecommender():
    """Baseline that recommends the books with the most 5-star ratings.

    Parameters
    ----------
    n_recommendations : int
        Maximum length of each user's recommendation list.
    n_candidates : int
        Size of the pool of top-ranked books that recommendations are drawn
        from after removing the books the user already rated.
    """
    name = "Popularity-based RS"

    def __init__(self, n_recommendations=10, n_candidates=200):
        self.n_recommendations = n_recommendations
        self.n_candidates = n_candidates
        # Per-instance dict: a class-level `preds = {}` would be shared by
        # every instance.
        self.preds = {}

    def fit(self, training_ratings):
        """Rank books by number of 5-star ratings and build every user's list.

        `training_ratings` needs `user_id`, `book_id` and `rating` columns.
        """
        user_ids = training_ratings.user_id.unique().tolist()
        five_ratings = training_ratings[training_ratings.rating==5]
        ranked_books = five_ratings.groupby('book_id').count()[['rating']].rename(columns={'rating': 'weight'})
        ranked_books = ranked_books.sort_values(by='weight', ascending=False)
        top_books = ranked_books.iloc[:self.n_candidates]
        book_ids = np.array(top_books.index.tolist())
        # Group once instead of filtering the whole frame for every user.
        rated_by_user = {
            user_id: user_books.values
            for user_id, user_books in training_ratings.groupby('user_id', sort=False).book_id
        }
        self.preds = {}
        for user_id in tqdm(user_ids):
            excluded_books = rated_by_user.get(user_id, ())
            # boolean masking keeps the popularity order of the candidates
            recommendable = book_ids[~np.isin(book_ids, excluded_books)]
            self.preds[user_id] = recommendable[:self.n_recommendations]

    def recommendation_for_user(self, user_id):
        """Return the recommendations for `user_id`, or [] for an unknown user
        (same contract as RandomRecommender.recommendation_for_user)."""
        return self.preds.get(user_id, [])

    def all_recommendation(self):
        """Return the dict mapping every fitted user id to its recommendations."""
        return self.preds

In [39]:
# The two baseline recommenders used to sanity-check the evaluator.
rd_rec, pop_rec = RandomRecommender(), PopularityRecommender()

In [40]:
# Score each baseline in turn, then show the metrics side by side.
for recommender in (rd_rec, pop_rec):
    evl.evaluate(recommender)
evl.print_result()

Calculating recommendations:


HBox(children=(IntProgress(value=0, max=52363), HTML(value='')))


Calculating metrics:


HBox(children=(IntProgress(value=0, max=52363), HTML(value='')))


Calculating recommendations:


HBox(children=(IntProgress(value=0, max=52363), HTML(value='')))


Calculating metrics:


HBox(children=(IntProgress(value=0, max=52363), HTML(value='')))


                       Random-based RS Popularity-based RS
Mean Average Precision           0.04%               5.39%
Coverage                       100.00%               0.47%
Novelty Score                     7.62                2.04
Diversity Score                   7.36                4.21
Personalization Score             9.99                3.84


# 3. Wrap up

That's it for our updated evaluator. The novelty, diversity and personalization scores are all scaled to a 0–10 range, while precision and coverage are reported as percentages.

The evaluator's external dependencies:
- A novelty score dataframe
- A book similarity matrix

We will bundle the evaluator inside a module (a Python file) and inject the two dependencies through the constructor.

Finally, we will save all the useful dataframes that we want to use later on.

In [41]:
# Move book_id from the index back to a regular column before writing the
# frame to feather.
novelty_scores_flat = novelty_scores.reset_index()
novelty_scores_flat.to_feather('./feather/novelty_scores')