In [25]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import feather

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [60]:
# Copy from notebook 3.
class Evaluator():
    def __init__(self, k=10, training_set=None, testing_set=None):
        self.k = k
        self.training_set = training_set
        self.testing_set = testing_set
        self.result = {}
    
    def _precision(self):
        precisions = np.array([])
        for user_id in self.preds.columns:
            pred = self.preds[user_id]
            truth = self.testing_set[self.testing_set.user_id==user_id].book_id
            precisions = np.append(precisions, np.in1d(pred, truth).sum() / self.k)
        return precisions.mean()
    
    def _recall(self):
        recalls = np.array([])
        for user_id in self.preds.columns:
            pred = self.preds[user_id]
            truth = self.testing_set[self.testing_set.user_id==user_id].book_id
            recalls = np.append(recalls, np.in1d(pred, truth).sum() / truth.count())
        return recalls.mean()
    
    def evaluate(self, model):
        model.fit(self.training_set)
        self.preds = model.recommendation()
        self.result['precision'] = self._precision()
        self.result['recall'] = self._recall()
        
    def print_result(self):
        print(self.result)

In [3]:
# Import the data
books = feather.read_dataframe('./data/books_small')
training_ratings = feather.read_dataframe('./data/training_ratings')
testing_ratings = feather.read_dataframe('./data/testing_ratings')
(books.shape, training_ratings.shape, testing_ratings.shape)

((9590, 23), (97889, 3), (12445, 3))

# Random Recommender

In [48]:
class RandomRecommender():
    def fit(self, training_ratings, excluded_book_ids=None):
        self.user_ids = training_ratings.user_id.unique().tolist()
        self.book_ids = training_ratings.book_id.unique().tolist()
        if excluded_book_ids == None:
            self.books_excluded_for_user = {user_id: training_ratings[training_ratings.user_id==user_id].book_id.unique().tolist() for user_id in self.user_ids}
        else:
            self.books_excluded_for_user = excluded_book_ids
        self.preds = pd.DataFrame()
        
    def recommendation_for_user(self, user_id):
        np.random.seed(42)
        if user_id not in self.preds:
            self.preds[user_id] = np.random.choice(np.setdiff1d(self.book_ids, self.books_excluded_for_user[user_id]), 10, replace=False)
        return self.preds[user_id]
    
    def recommendation(self):
        for user_id in self.user_ids:
            if user_id not in self.preds:
                self.preds[user_id] = self.recommendation_for_user(user_id)  
        return self.preds

In [61]:
evl = Evaluator(k=10, training_set=training_ratings, testing_set=testing_ratings)
rd_rec = RandomRecommender()

In [62]:
evl.evaluate(rd_rec)

In [63]:
evl.print_result()

{'precision': 0.0017293997965412006, 'recall': 0.001079580456620326}


# Popular-based Recommender

IMDb formula:

Popularity weighted rating = (v ÷ (v+m)) × R + (m ÷ (v+m)) × C

Where: 

R = average for the movie (mean) = (rating)

v = number of votes for the movie = (votes)

m = minimum votes required to be listed in the Top Rated list 

C = the mean vote across the whole report


In [85]:
C = books.average_rating.mean()
m = books.ratings_count.quantile(0.95)

top_books = books[books.ratings_count > m]

def weighted_rating(x):
    v = x.ratings_count
    R = x.average_rating
    return (v/(v+m) * R) + (m/(m+v) * C)

top_books['weighted_rating'] = top_books.apply(weighted_rating, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


In [91]:
top_books.sort_values(by=['weighted_rating'], ascending=False, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


So we came up with 10 most popular books to recommend

In [92]:
top_books.head(10)[['book_id', 'title', 'weighted_rating']]

Unnamed: 0,book_id,title,weighted_rating
24,25,Harry Potter and the Deathly Hallows (Harry Po...,4.554188
26,27,Harry Potter and the Half-Blood Prince (Harry ...,4.4888
17,18,Harry Potter and the Prisoner of Azkaban (Harr...,4.483602
23,24,Harry Potter and the Goblet of Fire (Harry Pot...,4.481684
1,2,Harry Potter and the Sorcerer's Stone (Harry P...,4.423813
20,21,Harry Potter and the Order of the Phoenix (Har...,4.417691
30,31,The Help,4.40368
38,39,"A Game of Thrones (A Song of Ice and Fire, #1)",4.397096
134,135,"A Storm of Swords (A Song of Ice and Fire, #3)",4.39281
421,422,"Harry Potter Boxset (Harry Potter, #1-7)",4.384479


In [94]:
class PopularityRecommender():
    def fit(self, training_ratings, excluded_book_ids=None):
        # Probably add the process of creating the list here
        self.user_ids = training_ratings.user_id.unique().tolist()
        self.preds = pd.DataFrame()
        
    def recommendation_for_user(self, user_id):
        return top_books.head(10).book_id.tolist()
    
    def recommendation(self):
        for user_id in self.user_ids:
            if user_id not in self.preds:
                self.preds[user_id] = self.recommendation_for_user(user_id)  
        return self.preds

In [95]:
evl = Evaluator(k=10, training_set=training_ratings, testing_set=testing_ratings)
pop_rec = PopularityRecommender()

In [96]:
evl.evaluate(pop_rec)

In [97]:
evl.print_result()

{'precision': 0.04821973550356053, 'recall': 0.04170457844159438}


In [98]:
books.columns

Index(['book_id', 'goodreads_book_id', 'best_book_id', 'work_id',
       'books_count', 'isbn', 'isbn13', 'authors', 'original_publication_year',
       'original_title', 'title', 'language_code', 'average_rating',
       'ratings_count', 'work_ratings_count', 'work_text_reviews_count',
       'ratings_1', 'ratings_2', 'ratings_3', 'ratings_4', 'ratings_5',
       'image_url', 'small_image_url'],
      dtype='object')