In [8]:
import pandas as pd
import numpy as np
import feather
from evaluator import Evaluator
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import preprocessing
from scipy.sparse import csr_matrix
from pandas.api.types import CategoricalDtype

# 1. Setting up

In [3]:
training_ratings = feather.read_dataframe('./data/training_ratings')
testing_ratings = feather.read_dataframe('./data/testing_ratings')
book_profiles = feather.read_dataframe('./data/book_profiles').set_index('book_id').to_sparse(fill_value=0)
novelty_scores = feather.read_dataframe('./data/novelty_scores').set_index('book_id')
books = feather.read_dataframe('./data/books_small').set_index('book_id')

In [4]:
book_sim = pd.DataFrame(
    data = cosine_similarity(book_profiles, book_profiles),
    index = book_profiles.index,
    columns = book_profiles.index
)

book_sim.head()

book_id,27,21,2,18,24,3275,3753,54,337,374,...,5884,5296,8713,7443,6428,7523,4594,9569,9580,8892
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
27,1.0,0.793039,0.967074,0.935959,0.932615,0.773161,0.826935,0.443948,0.383335,0.145548,...,0.385404,0.279734,0.220817,0.665664,0.613295,0.679846,0.121582,0.397349,0.165843,0.226996
21,0.793039,1.0,0.781584,0.756703,0.802735,0.606846,0.642205,0.371239,0.290013,0.128655,...,0.299041,0.26487,0.181788,0.4833,0.445195,0.511246,0.116972,0.341285,0.114551,0.164359
2,0.967074,0.781584,1.0,0.954254,0.95135,0.779767,0.8387,0.463165,0.400693,0.146866,...,0.399699,0.256885,0.199595,0.665316,0.612849,0.679313,0.122899,0.366076,0.148561,0.172504
18,0.935959,0.756703,0.954254,1.0,0.919456,0.750132,0.813695,0.444069,0.384745,0.159378,...,0.389971,0.264809,0.192761,0.64641,0.60713,0.660081,0.135276,0.368633,0.142581,0.166399
24,0.932615,0.802735,0.95135,0.919456,1.0,0.741594,0.802514,0.488001,0.413613,0.164158,...,0.393429,0.273214,0.19893,0.636164,0.583238,0.648697,0.13925,0.39871,0.146673,0.171259


In [5]:
evl = Evaluator(
    k = 10,
    training_set = training_ratings,
    testing_set = testing_ratings,
    book_sim = book_sim,
    novelty_scores = novelty_scores
)

# 2. Collaborative Filtering RS

The procedure will be followed by the paper Item-based Collaborative Filtering Recommendation Algorithms published by GroupLens. Neighborhood size k will be 30 and Model size l will be 100. The number is only an estimation and should be refined through trial-and-error in actual production. Since optimization is not of our concerns for this project, we won't need to do experiments for varied values of k and l.

In [6]:
# We'll use the whole ratings dataframe for making the similarity matrix
ratings = pd.read_csv('./data/ratings.csv')
ratings.head()

Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,4081,4
2,2,260,5
3,2,9296,5
4,2,2318,3


In [10]:
user_c = CategoricalDtype(sorted(ratings.user_id.unique()), ordered=True)
book_c = CategoricalDtype(sorted(ratings.book_id.unique()), ordered=True)

row = ratings.user_id.astype(user_c).cat.codes
col = ratings.book_id.astype(book_c).cat.codes
sparse_matrix = csr_matrix((ratings["rating"], (row, col)), \
                           shape=(user_c.categories.size, book_c.categories.size))

sparse_matrix

<53424x10000 sparse matrix of type '<class 'numpy.int64'>'
	with 5976479 stored elements in Compressed Sparse Row format>

In [15]:
ratings_table = pd.SparseDataFrame(data = sparse_matrix,
                                   index = user_c.categories,
                                   columns = book_c.categories)

ratings_table.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,9991,9992,9993,9994,9995,9996,9997,9998,9999,10000
1,,,,5.0,,,,,,4.0,...,,,,,,,,,,
2,,5.0,,,5.0,,,4.0,,5.0,...,,,,,,,,,,
3,,,,3.0,,,,,,,...,,,,,,,,,,
4,,5.0,,4.0,4.0,,4.0,4.0,,5.0,...,,,,,,,,,,
5,,,,,,4.0,,,,,...,,,,,,,,,,


In [49]:
users_mean_rating = ratings.groupby('user_id').mean()[['rating']]
users_mean_rating.head()

Unnamed: 0_level_0,rating
user_id,Unnamed: 1_level_1
1,3.589744
2,4.415385
3,1.736264
4,3.768657
5,4.04


In [53]:
book_ids = ratings.book_id.unique()
print(len(book_ids))
book_ids

10000


array([ 258, 4081,  260, ..., 9580, 8892, 9548], dtype=int64)

In [108]:
top_cf_sim = {}
for book_id1 in book_ids:
    sim_series = pd.Series([])
    for book_id2 in book_ids:
        if book_id1 == book_id2:
            continue
        user_ids = np.array(ratings_table.index)[(ratings_table.loc[:, book_id1].notna() & ratings_table.loc[:, book_id2].notna()).to_dense()]
        centered_ratings_for_book_1 = ratings_table.loc[user_ids, book_id1].to_dense().values - users_mean_rating.loc[user_ids].values
        centered_ratings_for_book_2 = ratings_table.loc[user_ids, book_id2].to_values - users_mean_rating.loc[user_ids].values
        sim_series.loc[book_id2] = cosine_similarity(centered_ratings_for_book_1, centered_ratings_for_book_2)

TypeError: expected dimension <= 1 data

In [114]:
np.array([1, 2, 3]) - np.array([2, 4, 5])

array([-1, -2, -2])

In [120]:
ratings_table.loc[user_ids, 1].to_dense().values - users_mean_rating.loc[user_ids].values.flatten()

array([ 0.5530303 , -0.16304348,  0.49586777, ..., -0.21538462,
       -0.45454545, -0.40601504])

array([3.4469697 , 4.16304348, 3.50413223, ..., 4.21538462, 4.45454545,
       4.40601504])

In [118]:
user_ids = np.array(ratings_table.index)[(ratings_table.loc[:, 1].notna() & ratings_table.loc[:, 2].notna()).to_dense()]
ratings_table.loc[user_ids, 1].values

[4.0, 4.0, 4.0, 5.0, 4.0, 4.0, 3.0, 4.0, 5.0, 4.0, 2.0, 4.0, 4.0, 4.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 2.0, 5.0, 5.0, 2.0, 3.0, 4.0, 4.0, 5.0, 4.0, 5.0, 5.0, 4.0, 4.0, 5.0, 3.0, 4.0, 4.0, 5.0, 3.0, 4.0, 5.0, 5.0, 5.0, 3.0, 4.0, 4.0, 4.0, 4.0, 5.0, 4.0, 5.0, 5.0, 5.0, 4.0, 4.0, 2.0, 4.0, 5.0, 4.0, 5.0, 5.0, 5.0, 4.0, 4.0, 5.0, 3.0, 5.0, 5.0, 5.0, 5.0, 4.0, 5.0, 3.0, 4.0, 5.0, 4.0, 5.0, 5.0, 4.0, 3.0, 5.0, 5.0, 3.0, 5.0, 3.0, 5.0, 2.0, 4.0, 4.0, 4.0, ...]
Fill: nan
BlockIndex
Block locations: array([0])
Block lengths: array([13146])

In [95]:
ratings_table.loc[user_ids, [1, 2]]

Unnamed: 0,1,2
9,4.0,4.0
30,4.0,4.0
31,4.0,3.0
32,5.0,4.0
40,4.0,5.0
55,4.0,3.0
58,3.0,3.0
61,4.0,4.0
74,5.0,4.0
75,4.0,5.0


In [104]:
cosine_similarity(pd.Series([0.5530303, 0.49586777]), pd.Series([0.5530303, -0.50413223]))

ValueError: Expected 2D array, got 1D array instead:
array=[0.5530303  0.49586777].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [105]:
pd.Series([0.5530303, 0.49586777])

0    0.553030
1    0.495868
dtype: float64

In [80]:
a = pd.Series([])
a.loc[1] = 2
a.loc[5] = 4
a

1    2
5    4
dtype: int64