In [1]:
import pandas as pd
import numpy as np
import feather
from evaluator import Evaluator
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import preprocessing

# 1. Setting up 

In [2]:
# Load the training / testing rating tables from feather files.
training_ratings = feather.read_dataframe('./data/training_ratings')
testing_ratings = feather.read_dataframe('./data/testing_ratings')
# Book-level tables are indexed by book_id so rows can be looked up by id with .loc.
book_profiles = feather.read_dataframe('./data/book_profiles').set_index('book_id')
novelty_scores = feather.read_dataframe('./data/novelty_scores').set_index('book_id')
books = feather.read_dataframe('./data/books_small').set_index('book_id')

In [3]:
# Cosine similarity between every pair of rows of book_profiles,
# labelled by book_id on both the rows and the columns.
book_sim = pd.DataFrame(
    data = cosine_similarity(book_profiles, book_profiles),
    index = book_profiles.index,
    columns = book_profiles.index
)

book_sim.head()

book_id,27,21,2,18,24,3275,3753,54,337,374,...,5884,5296,8713,7443,6428,7523,4594,9569,9580,8892
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
27,1.0,0.793039,0.967074,0.935959,0.932615,0.773161,0.826935,0.443948,0.383335,0.145548,...,0.385404,0.279734,0.220817,0.665664,0.613295,0.679846,0.121582,0.397349,0.165843,0.226996
21,0.793039,1.0,0.781584,0.756703,0.802735,0.606846,0.642205,0.371239,0.290013,0.128655,...,0.299041,0.26487,0.181788,0.4833,0.445195,0.511246,0.116972,0.341285,0.114551,0.164359
2,0.967074,0.781584,1.0,0.954254,0.95135,0.779767,0.8387,0.463165,0.400693,0.146866,...,0.399699,0.256885,0.199595,0.665316,0.612849,0.679313,0.122899,0.366076,0.148561,0.172504
18,0.935959,0.756703,0.954254,1.0,0.919456,0.750132,0.813695,0.444069,0.384745,0.159378,...,0.389971,0.264809,0.192761,0.64641,0.60713,0.660081,0.135276,0.368633,0.142581,0.166399
24,0.932615,0.802735,0.95135,0.919456,1.0,0.741594,0.802514,0.488001,0.413613,0.164158,...,0.393429,0.273214,0.19893,0.636164,0.583238,0.648697,0.13925,0.39871,0.146673,0.171259


In [6]:
# Single evaluator instance shared by every recommender evaluated below.
# NOTE(review): Evaluator lives in the external `evaluator` module; k is presumably
# the length of the recommendation list being scored -- confirm there.
evl = Evaluator(
    k = 10,
    training_set = training_ratings,
    testing_set = testing_ratings,
    book_sim = book_sim,
    novelty_scores = novelty_scores
)

In [7]:
# Table of books indexed by book_id; PopularityRecommender below recommends in this table's row order.
top_books = feather.read_dataframe('./data/top_books').set_index('book_id')
class PopularityRecommender():
    """Non-personalized baseline: recommend the top-ranked books a user has not rated yet.

    Parameters
    ----------
    ranked_book_ids : sequence of book ids, optional
        Candidate books ordered from most to least recommendable. When omitted,
        ``fit`` falls back to the index order of the notebook-level ``top_books``
        frame, so ``PopularityRecommender()`` keeps working as before.
    k : int, default 10
        Number of recommendations kept per user.
    """
    name = "Popularity-based RS"

    def __init__(self, ranked_book_ids=None, k=10):
        self.ranked_book_ids = ranked_book_ids
        self.k = k
        self.preds = {}

    def fit(self, training_ratings):
        """Compute the top-``k`` unseen books for every user in ``training_ratings``.

        ``training_ratings`` must expose ``user_id`` and ``book_id`` columns.
        Results are stored in ``self.preds`` as ``{user_id: array of book ids}``,
        with users in order of first appearance.
        """
        if self.ranked_book_ids is None:
            # Backward-compatible default: rely on the notebook-level frame.
            book_ids = np.array(top_books.index.tolist())
        else:
            book_ids = np.array(list(self.ranked_book_ids))

        # One grouped pass instead of re-filtering the whole ratings frame per user.
        # sort=False keeps users in order of first appearance, like .unique() did.
        rated_by_user = training_ratings.groupby('user_id', sort=False)['book_id'].unique()

        self.preds = {}
        for user_id, excluded_books in zip(rated_by_user.index.tolist(), rated_by_user.tolist()):
            # np.isin replaces np.in1d, which is deprecated as of NumPy 2.0.
            recommendable = book_ids[~np.isin(book_ids, excluded_books)]
            self.preds[user_id] = recommendable[:self.k]

    def recommendation_for_user(self, user_id):
        """Return the recommendations for ``user_id``, or ``[]`` for an unknown user."""
        if user_id not in self.preds:
            return []
        return self.preds[user_id]

    def all_recommendation(self):
        """Return the full ``{user_id: recommendations}`` mapping built by ``fit``."""
        return self.preds

In [8]:
%%time
# Run the shared evaluator on the popularity baseline and print the metrics table.
pop_rec = PopularityRecommender()
evl.evaluate(pop_rec)
evl.print_result()

                       Popularity-based RS
Mean Average Precision               2.48%
Coverage                             0.25%
Novelty Score                         2.69
Diversity Score                       5.94
Personalization Score                 2.35
Wall time: 7.75 s


# 2. Content-based recommender system

Our content-based RS is deliberately simple: it takes a nearest-neighbor approach, matching book profiles against user profiles built from the books each user rated 5 in the training set.

First, we build the user profiles.

In [19]:
%%time
def build_user_profiles(training_ratings, book_profiles):
    """Build one normalized profile per training user from the books they rated 5.

    Parameters
    ----------
    training_ratings : DataFrame with ``user_id``, ``book_id`` and ``rating`` columns.
    book_profiles : DataFrame of book feature vectors indexed by book id.

    Returns
    -------
    DataFrame with one row per distinct ``user_id`` (in order of first appearance)
    and the same columns as ``book_profiles``. Each row is the sum of the profiles
    of the user's 5-rated books, passed through ``preprocessing.normalize``
    (scikit-learn's default row-wise normalization). A user with no 5-star rating
    gets an all-zero row instead of raising ``KeyError``.
    """
    user_ids = training_ratings.user_id.unique()
    list_of_5_ratings = (
        training_ratings[training_ratings.rating == 5]
        .groupby('user_id')['book_id']
        .apply(list)
    )

    # Fill a preallocated array instead of growing a DataFrame one row at a
    # time with .loc, which re-allocates on every insert.
    summed = np.zeros((len(user_ids), book_profiles.shape[1]), dtype=float)
    for row, user_id in enumerate(user_ids):
        liked_books = list_of_5_ratings.get(user_id)
        if liked_books is None:
            # No 5-star rating for this user: keep the zero vector.
            continue
        summed[row] = book_profiles.loc[liked_books].sum().values

    return pd.DataFrame(
        data = preprocessing.normalize(summed),
        columns = book_profiles.columns,
        index = user_ids
    )

# Notebook-level user profile table; ContentBasedRecommender.fit reads this global.
user_profiles = build_user_profiles(training_ratings, book_profiles)

Wall time: 29.9 s


In [10]:
# Peek at the first few user profile rows (rows are user ids, columns are the book_profiles features).
user_profiles.head()

Unnamed: 0,19th-century,20th-century,abandoned,action,action-adventure,adult,adult-fiction,adventure,africa,agatha-christie,...,NanSilver,PeterMatthiessen,StevePieczenik,TerriBlackstock,JohnRawls,OscarHijuelos,BenOkri,MilesCameron,IanMortimer,PeggyOrenstein
80,0.0,0.163132,0.122763,0.0,0.0,0.117316,0.074145,0.032049,0.041004,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
361,0.042644,0.105118,0.082549,0.0,0.0,0.178995,0.133917,0.042714,0.0,0.012,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
384,0.0,0.058989,0.12676,0.007301,0.004618,0.24563,0.199378,0.042252,0.084424,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
400,0.022883,0.094497,0.135923,0.009889,0.008559,0.174072,0.138972,0.0599,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
624,0.0,0.110668,0.071442,0.0,0.0,0.127079,0.056899,0.090005,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
class ContentBasedRecommender():
    """Recommend, for each user, the unseen books whose profile is most similar to the user's.

    Parameters
    ----------
    item_profiles : DataFrame of book feature vectors indexed by book id.
    user_profiles : DataFrame of user feature vectors indexed by user id, optional.
        Must share its columns with ``item_profiles``. When omitted, ``fit`` falls
        back to the notebook-level ``user_profiles`` variable, so existing
        ``ContentBasedRecommender(book_profiles)`` calls keep working.
    k : int, default 10
        Number of recommendations kept per user.
    """
    name = "Content-based RS"

    def __init__(self, item_profiles, user_profiles=None, k=10):
        self.item_profiles = item_profiles
        self.user_profiles = user_profiles
        self.k = k
        self.preds = {}

    def fit(self, training_ratings):
        """Rank books by user/book cosine similarity and keep the top ``k`` unseen ones.

        ``training_ratings`` must expose ``user_id`` and ``book_id`` columns.
        Users that have no row in the user profile table are skipped;
        ``recommendation_for_user`` returns ``[]`` for them.
        """
        profiles = self.user_profiles
        if profiles is None:
            # Backward-compatible default: rely on the notebook-level frame.
            profiles = user_profiles

        sim = pd.DataFrame(
            data = cosine_similarity(profiles, self.item_profiles),
            index = profiles.index,
            columns = self.item_profiles.index
        )

        # One grouped pass instead of re-filtering the whole ratings frame per user.
        # sort=False keeps users in order of first appearance, like .unique() did.
        rated_by_user = training_ratings.groupby('user_id', sort=False)['book_id'].unique()

        self.preds = {}
        for user_id, excluded_books in zip(rated_by_user.index.tolist(), rated_by_user.tolist()):
            if user_id not in sim.index:
                continue
            book_ids = np.array(sim.loc[user_id].sort_values(ascending=False).index)
            # np.isin replaces np.in1d, which is deprecated as of NumPy 2.0.
            recommendable = book_ids[~np.isin(book_ids, excluded_books)]
            self.preds[user_id] = recommendable[:self.k]

    def recommendation_for_user(self, user_id):
        """Return the recommendations for ``user_id``, or ``[]`` for an unknown user."""
        if user_id not in self.preds:
            return []
        return self.preds[user_id]

    def all_recommendation(self):
        """Return the full ``{user_id: recommendations}`` mapping built by ``fit``."""
        return self.preds

In [20]:
%%time
# Run the shared evaluator on the content-based recommender; the printed table
# also shows the previously evaluated popularity baseline for comparison.
ctb_rec = ContentBasedRecommender(book_profiles)
evl.evaluate(ctb_rec)
evl.print_result()

                       Popularity-based RS Content-based RS
Mean Average Precision               2.48%            7.62%
Coverage                             0.25%           23.91%
Novelty Score                         2.69             5.67
Diversity Score                       5.94             2.49
Personalization Score                 2.35             9.84
Wall time: 10.8 s


In [22]:
# Case study, user 80: titles of the books this user rated 5 in the training set.
books.loc[training_ratings[(training_ratings.user_id==80) & (training_ratings.rating==5)].book_id].title

book_id
213                                     The Metamorphosis
162                                          The Stranger
3048                                       My Name is Red
4336     Labyrinths:  Selected Stories and Other Writings
4468    Haroun and the Sea of Stories (Khalifa Brother...
Name: title, dtype: object

In [178]:
# Titles of the books user 80 has in the testing set.
books.loc[testing_ratings[testing_ratings.user_id==80].book_id].title

book_id
1055    Midnight's Children
872              The Plague
Name: title, dtype: object

In [177]:
# Book ids the content-based recommender stored for user 80.
ctb_rec.recommendation_for_user(80)

array([2097, 3020, 1055, 1015, 9754, 2024, 9460, 4504,  576, 2810],
      dtype=int64)

In [179]:
# Same recommendations, shown as titles for easier reading.
books.loc[ctb_rec.recommendation_for_user(80)].title

book_id
2097    Demian. Die Geschichte von Emil Sinclairs Jugend
3020                 The Metamorphosis and Other Stories
1055                                 Midnight's Children
1015                                         Steppenwolf
9754                           Invitation to a Beheading
2024                                            The Fall
9460                                 Too Loud a Solitude
4504                                     Winesburg, Ohio
576                                              Candide
2810                                           The Magus
Name: title, dtype: object

In [184]:
# Load the to-read table; the next cell filters it by user_id and reads its book_id column.
to_read = pd.read_csv('./data/to_read.csv')

In [187]:
# Titles of the books listed for user 80 in the to-read table.
books.loc[to_read[to_read.user_id==80].book_id].title

book_id
8367    Embroideries
Name: title, dtype: object

# 3. Playground

In [27]:
class DefaultPopularityRecommender():
    """Non-personalized baseline that recommends unseen books in catalogue order.

    Parameters
    ----------
    ranked_book_ids : sequence of book ids, optional
        Candidate books in the order they should be recommended. When omitted,
        ``fit`` falls back to the index order of the notebook-level ``books``
        frame, so ``DefaultPopularityRecommender()`` keeps working as before.
    k : int, default 10
        Number of recommendations kept per user.
    """
    name = "Popularity-based RS ver2"

    def __init__(self, ranked_book_ids=None, k=10):
        self.ranked_book_ids = ranked_book_ids
        self.k = k
        self.preds = {}

    def fit(self, training_ratings):
        """Compute the first ``k`` unseen books for every user in ``training_ratings``.

        ``training_ratings`` must expose ``user_id`` and ``book_id`` columns.
        Results are stored in ``self.preds`` as ``{user_id: array of book ids}``,
        with users in order of first appearance.
        """
        if self.ranked_book_ids is None:
            # Backward-compatible default: rely on the notebook-level frame.
            book_ids = np.array(books.index.tolist())
        else:
            book_ids = np.array(list(self.ranked_book_ids))

        # One grouped pass instead of re-filtering the whole ratings frame per user.
        # sort=False keeps users in order of first appearance, like .unique() did.
        rated_by_user = training_ratings.groupby('user_id', sort=False)['book_id'].unique()

        self.preds = {}
        for user_id, excluded_books in zip(rated_by_user.index.tolist(), rated_by_user.tolist()):
            # np.isin replaces np.in1d, which is deprecated as of NumPy 2.0.
            recommendable = book_ids[~np.isin(book_ids, excluded_books)]
            self.preds[user_id] = recommendable[:self.k]

    def recommendation_for_user(self, user_id):
        """Return the recommendations for ``user_id``, or ``[]`` for an unknown user."""
        if user_id not in self.preds:
            return []
        return self.preds[user_id]

    def all_recommendation(self):
        """Return the full ``{user_id: recommendations}`` mapping built by ``fit``."""
        return self.preds

In [28]:
%%time
# Run the shared evaluator on the catalogue-order baseline; the printed table
# places it next to the recommenders evaluated earlier.
def_pop_rec = DefaultPopularityRecommender()
evl.evaluate(def_pop_rec)
evl.print_result()

                       Popularity-based RS Content-based RS  \
Mean Average Precision               2.48%            7.62%   
Coverage                             0.25%           23.91%   
Novelty Score                         2.69             5.67   
Diversity Score                       5.94             2.49   
Personalization Score                 2.35             9.84   

                       Popularity-based RS ver2  
Mean Average Precision                    3.92%  
Coverage                                  0.38%  
Novelty Score                              1.82  
Diversity Score                            5.52  
Personalization Score                      3.78  
Wall time: 8.69 s


In [29]:
# Inspect top_books in its stored row order, which is the order PopularityRecommender recommends from.
top_books[['title', 'ratings_count', 'average_rating', 'weighted_rating']]

Unnamed: 0_level_0,title,ratings_count,average_rating,weighted_rating
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,Harry Potter and the Sorcerer's Stone (Harry P...,4602479,4.44,4.419512
31,The Help,1531753,4.45,4.392109
39,"A Game of Thrones (A Song of Ice and Fire, #1)",1319204,4.45,4.384036
192,The Name of the Wind (The Kingkiller Chronicle...,400101,4.55,4.349967
1,"The Hunger Games (The Hunger Games, #1)",4780653,4.34,4.324357
19,The Fellowship of the Ring (The Lord of the Ri...,1766803,4.34,4.300449
47,The Book Thief,1159741,4.36,4.299851
85,The Giving Tree,702332,4.38,4.285011
267,The Nightingale,253606,4.54,4.280549
144,"Unbroken: A World War II Story of Survival, Re...",487775,4.40,4.269676


In [26]:
# Same view for the books table, whose index order DefaultPopularityRecommender recommends from.
books[['title', 'ratings_count', 'average_rating']]

Unnamed: 0_level_0,title,ratings_count,average_rating
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,"The Hunger Games (The Hunger Games, #1)",4780653,4.34
2,Harry Potter and the Sorcerer's Stone (Harry P...,4602479,4.44
3,"Twilight (Twilight, #1)",3866839,3.57
4,To Kill a Mockingbird,3198671,4.25
5,The Great Gatsby,2683664,3.89
6,The Fault in Our Stars,2346404,4.26
7,The Hobbit,2071616,4.25
8,The Catcher in the Rye,2044241,3.79
9,"Angels & Demons (Robert Langdon, #1)",2001311,3.85
10,Pride and Prejudice,2035490,4.24
