In [6]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import feather
from tqdm import tqdm_notebook as tqdm

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Copied from notebook 3
class Evaluator():
    def __init__(self, k=10, training_ratings=None, testing_ratings=None):
        self.k = k
        if training_ratings is not None:
            self.training_ratings = training_ratings
            self.num_users = len(self.training_ratings.user_id.unique())
            self.num_books = len(self.training_ratings.book_id.unique())
        if testing_ratings is not None:
            self.testing_ratings = testing_ratings
            self.testing_idx = {}
            for user_id in testing_ratings.user_id.unique():
                self.testing_idx[user_id] = testing_ratings[testing_ratings.user_id==user_id].book_id.values
        self.result = {}
    
    def _average_precision(self, pred, truth):
        in_arr = np.in1d(pred, truth)
        score = 0.0
        num_hits = 0.0
        for idx, correct in enumerate(in_arr):
            if correct:
                num_hits += 1
                score += num_hits / (idx + 1)
        return score / min(len(truth), self.k)
    
    def evaluate(self, model):
        model.fit(self.training_ratings)
        preds = model.all_recommendation()
        user_ids = list(preds.keys())
        ap_sum = 0
        for user_id in preds.keys():
            pred = preds[user_id][:self.k]
            truth = self.testing_idx[user_id]
            ap_sum += self._average_precision(pred, truth)
        
        self.result[model.name] = {}
        self.result[model.name]['Mean Average Precision'] = "%.2f%%" % (ap_sum / self.num_users * 100)
        
    def print_result(self):
        print(pd.DataFrame(self.result).loc[['Mean Average Precision']])

In [3]:
# Import the data
books = feather.read_dataframe('./feather/books')
training_ratings = feather.read_dataframe('./feather/training_ratings')
testing_ratings = feather.read_dataframe('./feather/testing_ratings')
(books.shape, training_ratings.shape, testing_ratings.shape)

((10000, 4), (5206758, 3), (660656, 3))

# Random Recommender

In [13]:
book_ids = training_ratings.book_id.unique()
excluded_books = training_ratings[training_ratings.user_id==80].book_id.unique()

In [17]:
(len(book_ids), len(excluded_books))

(10000, 62)

In [15]:
len(book_ids[~np.in1d(book_ids, excluded_books)])

9938

In [18]:
class RandomRecommender():
    name = 'Random-based RS'
    
    def fit(self, training_ratings):
        user_ids = training_ratings.user_id.unique()
        book_ids = training_ratings.book_id.unique()
        self.preds = {}
        np.random.seed(42)
        for user_id in tqdm(user_ids):
            excluded_books = training_ratings[training_ratings.user_id==user_id].book_id.unique().tolist()
            recommendable = book_ids[~np.in1d(book_ids, excluded_books)]
            self.preds[user_id] = np.random.permutation(recommendable)[:10]
        
    def recommendation_for_user(self, user_id):
        if user_id not in self.preds:
            return []
        return self.preds[user_id]
    
    def all_recommendation(self):
        return self.preds

In [8]:
evl = Evaluator(k=10, training_ratings=training_ratings, testing_ratings=testing_ratings)

In [19]:
rd_rec = RandomRecommender()
evl.evaluate(rd_rec)
evl.print_result()

HBox(children=(IntProgress(value=0, max=52363), HTML(value='')))


                       Random-based RS
Mean Average Precision           0.04%


# Popular-based Recommender

In [27]:
five_ratings = training_ratings[training_ratings.rating==5]
ranked_books = five_ratings.groupby('book_id').count()[['rating']].rename(columns={'rating': 'weight'})
ranked_books = ranked_books.sort_values(by='weight', ascending=False)
ranked_books.head(10)

Unnamed: 0_level_0,weight
book_id,Unnamed: 1_level_1
2,8047
1,7522
4,6754
25,6586
18,5979
24,5886
27,5818
21,5487
23,4887
7,4749


In [28]:
books.loc[ranked_books.index[:10]]

Unnamed: 0_level_0,book_id,goodreads_book_id,authors,title
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,3,41865,Stephenie Meyer,"Twilight (Twilight, #1)"
1,2,3,"J.K. Rowling, Mary GrandPré",Harry Potter and the Sorcerer's Stone (Harry P...
4,5,4671,F. Scott Fitzgerald,The Great Gatsby
25,26,968,Dan Brown,"The Da Vinci Code (Robert Langdon, #2)"
18,19,34,J.R.R. Tolkien,The Fellowship of the Ring (The Lord of the Ri...
24,25,136251,"J.K. Rowling, Mary GrandPré",Harry Potter and the Deathly Hallows (Harry Po...
27,28,7624,William Golding,Lord of the Flies
21,22,12232938,Alice Sebold,The Lovely Bones
23,24,6,"J.K. Rowling, Mary GrandPré",Harry Potter and the Goblet of Fire (Harry Pot...
7,8,5107,J.D. Salinger,The Catcher in the Rye


In [32]:
class PopularityRecommender():
    name = "Popularity-based RS"
    
    def fit(self, training_ratings):
        user_ids = training_ratings.user_id.unique().tolist()
        top_books = ranked_books[:100]
        book_ids = np.array(top_books.index.tolist())
        self.preds = {}
        for user_id in tqdm(user_ids):
            excluded_books = training_ratings[training_ratings.user_id==user_id].book_id.unique().tolist()
            recommendable = book_ids[~np.in1d(book_ids, excluded_books)]
            self.preds[user_id] = recommendable[:10]
        
    def recommendation_for_user(self, user_id):
        return self.preds[user_id]
    
    def all_recommendation(self):
        return self.preds

In [33]:
pop_rec = PopularityRecommender()
evl.evaluate(pop_rec)
evl.print_result()

HBox(children=(IntProgress(value=0, max=52363), HTML(value='')))


                       Random-based RS Popularity-based RS
Mean Average Precision           0.04%               5.39%
