In [31]:
import pandas as pd
import numpy as np
import feather
from evaluator import Evaluator
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import preprocessing
from scipy.sparse import csr_matrix
from pandas.api.types import CategoricalDtype

# 1. Setting up

In [32]:
training_ratings = feather.read_dataframe('./data/training_ratings')
testing_ratings = feather.read_dataframe('./data/testing_ratings')
book_profiles = feather.read_dataframe('./data/book_profiles').set_index('book_id').to_sparse(fill_value=0)
novelty_scores = feather.read_dataframe('./data/novelty_scores').set_index('book_id')
books = feather.read_dataframe('./data/books_small').set_index('book_id')

In [33]:
book_sim = pd.DataFrame(
    data = cosine_similarity(book_profiles, book_profiles),
    index = book_profiles.index,
    columns = book_profiles.index
)

book_sim.head()

book_id,27,21,2,18,24,3275,3753,54,337,374,...,5884,5296,8713,7443,6428,7523,4594,9569,9580,8892
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
27,1.0,0.793039,0.967074,0.935959,0.932615,0.773161,0.826935,0.443948,0.383335,0.145548,...,0.385404,0.279734,0.220817,0.665664,0.613295,0.679846,0.121582,0.397349,0.165843,0.226996
21,0.793039,1.0,0.781584,0.756703,0.802735,0.606846,0.642205,0.371239,0.290013,0.128655,...,0.299041,0.26487,0.181788,0.4833,0.445195,0.511246,0.116972,0.341285,0.114551,0.164359
2,0.967074,0.781584,1.0,0.954254,0.95135,0.779767,0.8387,0.463165,0.400693,0.146866,...,0.399699,0.256885,0.199595,0.665316,0.612849,0.679313,0.122899,0.366076,0.148561,0.172504
18,0.935959,0.756703,0.954254,1.0,0.919456,0.750132,0.813695,0.444069,0.384745,0.159378,...,0.389971,0.264809,0.192761,0.64641,0.60713,0.660081,0.135276,0.368633,0.142581,0.166399
24,0.932615,0.802735,0.95135,0.919456,1.0,0.741594,0.802514,0.488001,0.413613,0.164158,...,0.393429,0.273214,0.19893,0.636164,0.583238,0.648697,0.13925,0.39871,0.146673,0.171259


In [34]:
evl = Evaluator(
    k = 10,
    training_set = training_ratings,
    testing_set = testing_ratings,
    book_sim = book_sim,
    novelty_scores = novelty_scores
)

# 2. Collaborative Filtering RS

The procedure will be followed by the paper Item-based Collaborative Filtering Recommendation Algorithms published by GroupLens. The prediction will be done a little different for performance concerns. 30 most similar books to each user's 5-ratings book will be selected as recommendable items and then their predicted rating for them will be calculated.

In [35]:
# We'll use the whole ratings dataframe for making the similarity matrix
ratings = pd.read_csv('./data/ratings.csv')
ratings.head()

Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,4081,4
2,2,260,5
3,2,9296,5
4,2,2318,3


In [36]:
users_mean_rating = ratings.groupby('user_id').mean()[['rating']]
users_mean_rating.head()

Unnamed: 0_level_0,rating
user_id,Unnamed: 1_level_1
1,3.589744
2,4.415385
3,1.736264
4,3.768657
5,4.04


In [37]:
ratings['centered_rating'] = ratings[['rating']] - users_mean_rating.loc[ratings.user_id].values
ratings.head()

Unnamed: 0,user_id,book_id,rating,centered_rating
0,1,258,5,1.410256
1,2,4081,4,-0.415385
2,2,260,5,0.584615
3,2,9296,5,0.584615
4,2,2318,3,-1.415385


In [38]:
user_c = CategoricalDtype(sorted(ratings.user_id.unique()), ordered=True)
book_c = CategoricalDtype(sorted(ratings.book_id.unique()), ordered=True)

row = ratings.user_id.astype(user_c).cat.codes
col = ratings.book_id.astype(book_c).cat.codes
sparse_matrix = csr_matrix((ratings["centered_rating"], (row, col)), \
                           shape=(user_c.categories.size, book_c.categories.size))

sparse_matrix

<53424x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 5976479 stored elements in Compressed Sparse Row format>

In [39]:
cf_sim = pd.DataFrame(
    data = cosine_similarity(sparse_matrix.T, sparse_matrix.T),
    index = book_c.categories,
    columns = book_c.categories)
cf_sim.shape

(10000, 10000)

In [40]:
cf_sim.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,9991,9992,9993,9994,9995,9996,9997,9998,9999,10000
1,1.0,0.219846,-0.030342,0.068965,-0.037641,0.124883,0.008722,-0.042303,0.02437,0.081773,...,0.000583,0.000259,-0.00317,0.009361,3e-05,-0.004743,-0.008482,-0.003357,-0.008805,-0.000506
2,0.219846,1.0,-0.165438,0.105912,-0.051877,0.072421,0.090009,-0.066261,-0.001201,0.110649,...,-0.003374,-0.002885,0.00942,0.005849,0.001385,-0.002147,0.002363,0.001087,-0.002718,-1.1e-05
3,-0.030342,-0.165438,1.0,-0.117277,0.017926,-0.036126,-0.078446,0.0459,0.09972,-0.053001,...,0.001944,0.002571,-0.001174,-0.008087,0.004327,-0.002859,-0.003937,-0.005496,-0.002997,-0.006616
4,0.068965,0.105912,-0.117277,1.0,0.068429,0.044079,0.054306,0.039292,-0.080769,0.137959,...,-0.001739,-0.005643,0.002721,-0.005675,-0.012013,-0.001773,0.006072,0.008745,-0.001713,0.006923
5,-0.037641,-0.051877,0.017926,0.068429,1.0,0.000708,-0.031094,0.165989,-0.007932,0.021908,...,-0.00406,0.003121,-0.003067,0.001758,0.009504,-0.003627,0.003723,0.002042,0.000852,0.009425


In [41]:
books.loc[2].title

"Harry Potter and the Sorcerer's Stone (Harry Potter, #1)"

In [42]:
books.loc[cf_sim.loc[2].sort_values(ascending=False)[1:11].index].title

18     Harry Potter and the Prisoner of Azkaban (Harr...
23     Harry Potter and the Chamber of Secrets (Harry...
24     Harry Potter and the Goblet of Fire (Harry Pot...
27     Harry Potter and the Half-Blood Prince (Harry ...
25     Harry Potter and the Deathly Hallows (Harry Po...
21     Harry Potter and the Order of the Phoenix (Har...
1                The Hunger Games (The Hunger Games, #1)
31                                              The Help
422             Harry Potter Boxset (Harry Potter, #1-7)
17                  Catching Fire (The Hunger Games, #2)
Name: title, dtype: object

Seems okay for now

In [99]:
top_sim_books = {}
trace_back = {}
book_ids = ratings.book_id.unique()
for book_id in book_ids:
    top_sim_books[book_id] = cf_sim.loc[book_id].sort_values(ascending=False)[1:31]
    for traceback_id in top_sim_books[book_id].index:
        if traceback_id not in trace_back:
            trace_back[traceback_id] = []
        trace_back[traceback_id].append(book_id)

In [169]:
from tqdm import tqdm

In [44]:
training_book_ids = training_ratings.book_id.unique()

In [208]:
class ItemBasedCFRecommender():
    name = "Item-based CF RS"
    
    def fit(self, training_ratings):
        user_ids = training_ratings.user_id.unique().tolist()
        self.preds = {}
        training_book_ids = training_ratings.book_id.unique()
        for user_id in tqdm(user_ids):
            user_ratings = training_ratings[training_ratings.user_id==user_id].set_index('book_id')[['rating']]
            excluded_books = training_ratings[training_ratings.user_id==user_id].book_id.unique().tolist()
            most_similar_books = np.array([])
            for book_id in list_of_5_ratings[user_id]:
                most_similar_books = np.concatenate([most_similar_books, trace_back[book_id]])

            most_similar_books = np.unique(most_similar_books)
            recommendable = most_similar_books[~np.in1d(most_similar_books, excluded_books)]
            predicted_ratings = pd.DataFrame(columns=['rating', 'total_sim'])
            for book_id in recommendable:
                if book_id not in training_book_ids:
                    continue
                u_ratings = user_ratings.reindex(top_sim_books[book_id].index)
                u_ratings['sim'] = top_sim_books[book_id]
                u_ratings.dropna(inplace=True)
                total_sim = u_ratings.sim.sum()
                predicted_rating = (u_ratings.rating * u_ratings.sim).sum() / u_ratings.sim.abs().sum()
                predicted_ratings.loc[int(book_id)] = [predicted_rating, total_sim]
            self.preds[user_id] = predicted_ratings.sort_values(by=['rating', 'total_sim'], ascending=[False, False]).index.tolist()[:10]
        
    def recommendation_for_user(self, user_id):
        if user_id not in self.preds:
            return []
        return self.preds[user_id]
    
    def all_recommendation(self):
        return self.preds

In [None]:
user_ratings = training_ratings[training_ratings.user_id==80].set_index('book_id')[['rating']]
excluded_books = training_ratings[training_ratings.user_id==80].book_id.unique().tolist()
most_similar_books = np.array([])
for book_id in list_of_5_ratings[80]:
    most_similar_books = np.concatenate([most_similar_books, trace_back[book_id]])

most_similar_books = np.unique(most_similar_books)
recommendable = most_similar_books[~np.in1d(most_similar_books, excluded_books)]

predicted_ratings = pd.DataFrame(columns=['rating', 'total_sim'])
for book_id in recommendable:
    u_ratings = user_ratings.reindex(top_sim_books[book_id].index)
    u_ratings['sim'] = top_sim_books[book_id]
    u_ratings.dropna(inplace=True)
    total_sim = u_ratings.sim.sum()
    predicted_rating = (u_ratings.rating * u_ratings.sim).sum() / u_ratings.sim.abs().sum()
    predicted_ratings.loc[book_id] = [predicted_rating, total_sim]

In [209]:
ibcf_rec = ItemBasedCFRecommender()
evl.evaluate(ibcf_rec)
evl.print_result()

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 983/983 [32:39<00:00,  4.18s/it]


                       Item-based CF RS
Mean Average Precision            2.93%
Coverage                         44.01%
Novelty Score                      6.13
Diversity Score                    6.07
Personalization Score              9.95
