# Imports

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import pairwise_distances

# Data

In [4]:
DATA_DIR = os.path.join("..", "data", "final_dataset")

In [5]:
df = pd.read_parquet(os.path.join(DATA_DIR, 'ratings.parquet'))
books= pd.read_parquet(os.path.join(DATA_DIR, 'books_all.parquet'))
df = df[df["isbn"].isin(books["isbn"])]
df = df.query("provided_rating!=0")
df.reset_index(drop=True, inplace=True)
print(f"Number of ratings: {len(df)}")
print(f"Number of unique users: {df['user_id'].nunique()}")
print(f"Number of books: {df['isbn'].nunique()}")
df.head()

Number of ratings: 104756
Number of unique users: 31940
Number of books: 22020


Unnamed: 0,user_id,isbn,provided_rating
0,17,891075275,6
1,17,553264990,5
2,26,449005615,9
3,39,671888587,7
4,69,1853260053,8


## Smaller Dataset

In [6]:
num_ratings = df.groupby('isbn')['provided_rating'].count().sort_values(ascending=False)
most_rated_books = num_ratings.index[:10]
num_ratings.head()

isbn
0316666343    707
0060928336    320
0671027360    269
067976402X    256
0786868716    242
Name: provided_rating, dtype: int64

In [11]:
df.groupby('user_id')['provided_rating'].count().sort_values(ascending=False)

user_id
11676     1593
98391      595
189835     371
76499      333
153662     322
          ... 
59675        1
157184       1
59685        1
59697        1
278854       1
Name: provided_rating, Length: 31940, dtype: int64

In [14]:
ratings = pd.DataFrame(df.groupby('isbn')['provided_rating'].mean())
ratings['num_ratings'] = pd.DataFrame(df.groupby('isbn')['provided_rating'].count())
ratings.head()

Unnamed: 0_level_0,provided_rating,num_ratings
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1
2163578,5.0,1
2190915,9.5,2
2210479,6.0,1
2222469,8.0,1
2241358,8.0,1


In [15]:
min_ratings = 5
books_ = ratings.query(f"num_ratings > {min_ratings}").index
print(f"Number of books_ with more than {min_ratings} ratings: {len(books_)}")
print(f"Original number of books_: {df['isbn'].nunique()}")
print(f"Number of rows in the original dataset: {df.shape[0]}")
df_small = df[df['isbn'].isin(books_)]
print(f"Number of rows in the new dataset: {df_small.shape[0]}")

Number of books_ with more than 5 ratings: 3823
Original number of books_: 22020
Number of rows in the original dataset: 104756
Number of rows in the new dataset: 72190


In [17]:
min_ratings = 10
books_ = ratings.query(f"num_ratings > {min_ratings}").index
print(f"Number of books_ with more than {min_ratings} ratings: {len(books_)}")
print(f"Original number of books_: {df['isbn'].nunique()}")
print(f"Number of rows in the original dataset: {df.shape[0]}")
df_small = df[df['isbn'].isin(books_)]
unique_users = df_small['user_id'].nunique()
print(f"Number of rows in the new dataset: {df_small.shape[0]}")
print(f"Number of unique users in the new dataset: {unique_users}")

Number of books_ with more than 10 ratings: 1963
Original number of books_: 22020
Number of rows in the original dataset: 104756
Number of rows in the new dataset: 58166
Number of unique users in the new dataset: 22560


## Preprocessing

In [45]:
n_users = df_small.user_id.nunique()
n_items = df_small.isbn.nunique()

print('Num. of Users: '+ str(n_users))
print('Num of Movies: '+str(n_items))

Num. of Users: 22560
Num of Movies: 1963


In [55]:
user_id_map = dict(zip(df_small.user_id.unique(), list(range(n_users))))
book_id_map = dict(zip(df_small.isbn.unique(), list(range(n_items))))
user_id_map_df  = pd.DataFrame(
    {
        "user_id":user_id_map.keys(),
        "user_id_new": user_id_map.values(),
    }
)
book_id_map_df  = pd.DataFrame(
    {
        "isbn":book_id_map.keys(),
        "isbn_new": book_id_map.values(),
    }
)
df_small["user_id"] = df_small["user_id"].map(user_id_map)
df_small["isbn"] = df_small["isbn"].map(book_id_map)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_small["user_id"] = df_small["user_id"].map(user_id_map)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_small["isbn"] = df_small["isbn"].map(book_id_map)


# Model

In [57]:
train_data, test_data = train_test_split(df_small, test_size=0.25)

In [58]:
train_mat = train_data.pivot_table(index='user_id', columns='isbn', values='provided_rating').fillna(0)
test_mat = test_data.pivot_table(index='user_id', columns='isbn', values='provided_rating').fillna(0)

In [59]:
user_similarity = pairwise_distances(train_mat, metric='cosine')
item_similarity = pairwise_distances(train_mat.T, metric='cosine')

In [65]:
user_similarity.shape, item_similarity.shape

((18672, 18672), (1963, 1963))

In [72]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #You use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis]) 
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])     
    return pred

In [86]:
item_prediction = predict(train_mat.values, item_similarity, type='item')
user_prediction = predict(train_mat.values, user_similarity, type='user')

In [87]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten() 
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [88]:
print('User-based CF RMSE: ' + str(rmse(user_prediction, test_mat.values)))
print('Item-based CF RMSE: ' + str(rmse(item_prediction, test_mat.values)))

User-based CF RMSE: 7.963856306151568
Item-based CF RMSE: 7.966272299339274


In [109]:
import scipy.sparse as sp
from scipy.sparse.linalg import svds

In [117]:
train_mat.values.shape

(18672, 1963)

In [125]:
M = train_mat.values

In [127]:
np.dot(M.T, M)

array([[1026.,    0.,    0., ...,    0.,    0.,    0.],
       [   0., 6822.,    0., ...,    0.,   63.,    0.],
       [   0.,    0., 1485., ...,    0.,    0.,    0.],
       ...,
       [   0.,    0.,    0., ...,  498.,    0.,    0.],
       [   0.,   63.,    0., ...,    0.,  495.,    0.],
       [   0.,    0.,    0., ...,    0.,    0.,  604.]])

In [128]:
np.dot(M, M.T)

array([[36.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0., 81.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0., 49., ...,  0.,  0.,  0.],
       ...,
       [ 0.,  0.,  0., ..., 36.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0., 25.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0., 64.]])

In [130]:
from numpy.linalg import svd
u, s, vt = svd(train_mat.values, full_matrices=False)
s_diag_matrix=np.diag(s)
X_pred = np.dot(np.dot(u, s_diag_matrix), vt)
print('User-based CF MSE: ' + str(rmse(X_pred, test_mat.values)))

User-based CF MSE: 7.970228954084404


In [139]:
def cosine_similarity(v,u):
    return (v @ u)/ (np.linalg.norm(v) * np.linalg.norm(u))
 
highest_similarity = -np.inf
highest_sim_col = -1
for col in range(1,vt.shape[1]):
    similarity = cosine_similarity(vt[:,0], vt[:,col])
    if similarity > highest_similarity:
        highest_similarity = similarity
        highest_sim_col = col
 
print("Column %d is most similar to column 0" % highest_sim_col)

Column 332 is most similar to column 0


In [140]:
highest_similarity

4.102621020685147e-16

In [137]:
(vt[:, 0]@vt[:, 1])/(np.linalg.norm(vt[:, 0])*np.linalg.norm(vt[:, 1]))

7.372574772901429e-17

In [138]:
(vt[:, 0]@vt[:, 111])/(np.linalg.norm(vt[:, 0])*np.linalg.norm(vt[:, 111]))

-1.3877787807814457e-16

In [131]:
u.shape, s.shape, vt.shape

((18672, 1963), (1963,), (1963, 1963))

In [118]:
train_mat.shape, u.shape, s.shape, vt.shape, s_diag_matrix.shape

((18672, 1963), (18672, 20), (20,), (20, 1963), (20, 20))

In [124]:
u[:, 0].sum()

6.583999588269197

In [121]:
np.dot(u, vt).sum()

1448.3715529218027

In [112]:
s

array([ 97.74726998,  99.58960591, 100.69085555, 102.15672598,
       102.62926858, 104.23556462, 105.07571802, 105.59436242,
       106.80904882, 107.74733948, 109.90060574, 111.88388   ,
       115.40445331, 116.53484794, 118.66014087, 121.01264356,
       127.31038341, 139.98254513, 191.92553525, 220.74064862])

In [115]:
train_mat.values

array([[6., 0., 0., ..., 0., 0., 0.],
       [0., 9., 0., ..., 0., 0., 0.],
       [0., 0., 7., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [116]:
(np.dot(np.dot(u, s_diag_matrix), vt))

array([[ 8.27140881e-03,  8.75903989e-03, -2.84281293e-03, ...,
         3.54610696e-19,  1.46381976e-03, -9.31381750e-04],
       [ 1.31385598e-02,  1.59702384e-01, -1.95126404e-02, ...,
         1.97560243e-18,  3.27939362e-03, -5.12065381e-03],
       [-3.31661509e-03, -1.51764981e-02,  2.27507442e-02, ...,
        -2.22316789e-18, -3.80513158e-04,  2.68852015e-03],
       ...,
       [ 5.10681271e-03,  3.71445908e-02, -3.34748985e-03, ...,
         4.34220762e-19,  1.36896697e-03, -2.75107947e-04],
       [ 6.31754541e-04, -3.82561136e-03, -5.69476821e-04, ...,
         1.00360971e-18,  5.27169051e-04,  3.68500242e-04],
       [ 1.44219086e-03, -1.50396559e-02,  1.19180450e-02, ...,
        -1.59040351e-19, -3.07661845e-04,  6.63603267e-03]])