# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from tqdm.auto import tqdm

# Data

In [3]:
DATA_DIR = os.path.join("..", "data", "final_dataset")

In [4]:
df = pd.read_parquet(os.path.join(DATA_DIR, 'ratings.parquet'))
books= pd.read_parquet(os.path.join(DATA_DIR, 'books_all.parquet'))
df = df[df["isbn"].isin(books["isbn"])]
df = df.query("provided_rating!=0")
df.reset_index(drop=True, inplace=True)
print(f"Number of ratings: {len(df)}")
print(f"Number of unique users: {df['user_id'].nunique()}")
print(f"Number of books: {df['isbn'].nunique()}")
df.head()

Number of ratings: 104756
Number of unique users: 31940
Number of books: 22020


Unnamed: 0,user_id,isbn,provided_rating
0,17,891075275,6
1,17,553264990,5
2,26,449005615,9
3,39,671888587,7
4,69,1853260053,8


# Smaller Dataset

In [5]:
num_ratings = df.groupby('isbn')['provided_rating'].count().sort_values(ascending=False)
most_rated_books = num_ratings.index[:10]
num_ratings.head()

isbn
0316666343    707
0060928336    320
0671027360    269
067976402X    256
0786868716    242
Name: provided_rating, dtype: int64

In [None]:
ratings = pd.DataFrame(df.groupby('isbn')['provided_rating'].mean())
ratings['num_ratings'] = pd.DataFrame(df.groupby('isbn')['provided_rating'].count())
ratings.head()

In [7]:
min_ratings = 5
books_ = ratings.query(f"num_ratings > {min_ratings}").index
print(f"Number of books_ with more than {min_ratings} ratings: {len(books_)}")
print(f"Original number of books_: {df['isbn'].nunique()}")
print(f"Number of rows in the original dataset: {df.shape[0]}")
df_small = df[df['isbn'].isin(books_)]
print(f"Number of rows in the new dataset: {df_small.shape[0]}")

Number of books_ with more than 5 ratings: 3823
Original number of books_: 22020
Number of rows in the original dataset: 104756
Number of rows in the new dataset: 72190


In [8]:
min_ratings = 10
books_ = num_ratings.query(f"num_ratings > {min_ratings}").index
print(f"Number of books_ with more than {min_ratings} ratings: {len(books_)}")
print(f"Original number of books_: {df['isbn'].nunique()}")
print(f"Number of rows in the original dataset: {df.shape[0]}")
df_small = df[df['isbn'].isin(books_)]
unique_users = df_small['user_id'].nunique()
print(f"Number of rows in the new dataset: {df_small.shape[0]}")
print(f"Number of unique users in the new dataset: {unique_users}")

Number of books_ with more than 10 ratings: 1963
Original number of books_: 22020
Number of rows in the original dataset: 104756
Number of rows in the new dataset: 58166
Number of unique users in the new dataset: 22560


In [9]:
rating_mat = df_small.pivot_table(index='user_id',columns='isbn',values='provided_rating')
rating_mat.head()

isbn,0020442602,0020519001,0020532105,002542730X,0060002050,0060002492,0060008024,0060012781,0060080841,0060081961,...,2070360024,2226135022,2253140872,2290302155,3257229534,8408039369,8420483532,8483101610,8495501074,8807813025
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
17,,,,,,,,,,,...,,,,,,,,,,
26,,,,,,,,,,,...,,,,,,,,,,
39,,,,,,,,,,,...,,,,,,,,,,
78,,,,,,,,,,,...,,,,,,,,,,
97,,,,,,,,,,,...,,,,,,,,,,


# Model

In [12]:
most_rated_book = most_rated_books[0]
second_most_rated_book = most_rated_books[1]
most_rated_book_ratings = rating_mat[most_rated_book]
second_most_rated_book_ratings = rating_mat[second_most_rated_book]

most_rated_book_ratings.notna().sum(), second_most_rated_book_ratings.notna().sum()

(707, 320)

In [13]:
def get_similarity_between_two_books(matrix, isbn1, isbn2, min_common = 3):
    ratings_1 = matrix[isbn1]
    ratings_2 = matrix[isbn2]
    temp_df = pd.DataFrame({
        'ratings_1': ratings_1,
        'ratings_2': ratings_2
    })
    temp_df.dropna(inplace=True)
    temp_df.drop_duplicates(inplace=True)
    if len(temp_df) < min_common:
        return None
    return temp_df.corr().iloc[0,1]

In [15]:
get_similarity_between_two_books(rating_mat, most_rated_books[0], most_rated_books[1])

0.1631131900760396

In [17]:
book_isbn = most_rated_books[2]
book_rating = rating_mat[book_isbn]
rated = book_rating[book_rating.notna()]
temp_rating_mat = rating_mat.loc[rated.index]
isbns = temp_rating_mat.columns
correlations = []
for isbn in tqdm(isbns, desc="calculating correlations"):
    if isbn == book_isbn:
        continue
    corr = get_similarity_between_two_books(temp_rating_mat, isbn, book_isbn, min_common = 5)
    correlations.append((isbn, book_isbn, corr))
correlation_df = pd.DataFrame(correlations, columns=['isbn', 'book_isbn', 'corr'])
correlation_df.dropna(inplace=True)

calculating correlations:   0%|          | 0/1963 [00:00<?, ?it/s]

In [18]:
print(correlation_df.shape)
correlation_df.sort_values('corr', ascending=False).head(10)

(55, 3)


Unnamed: 0,isbn,book_isbn,corr
542,0380731851,671027360,0.909718
738,0399501487,671027360,0.900149
651,0385503822,671027360,0.831655
911,0440234743,671027360,0.798549
208,0140293248,671027360,0.786567
1059,0449005615,671027360,0.771744
33,0060392452,671027360,0.760469
1359,0553572997,671027360,0.73994
230,0142002267,671027360,0.730769
870,044021145X,671027360,0.719023


In [23]:
second_isbn = correlation_df.iloc[0]["isbn"]
d = pd.concat([temp_rating_mat[book_isbn], temp_rating_mat[second_isbn]], axis = 1)
d.dropna(inplace=True)
d

Unnamed: 0_level_0,0671027360,0060392452
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
62966,10.0,10.0
68984,8.0,6.0
85757,5.0,7.0
95359,9.0,10.0
110912,9.0,10.0
164323,9.0,8.0
225763,5.0,5.0
245410,7.0,8.0


In [24]:
d2 = pd.concat([books.query(f"isbn == '{book_isbn}'"), books.query(f"isbn == '{second_isbn}'")], axis = 0)
d2.T

Unnamed: 0,13973,195881
num_pages,569,282
star_rating_1,92112,606
star_rating_2,168138,1975
star_rating_3,541297,6291
star_rating_4,877490,4549
star_rating_5,887249,1720
average_rating,3.9,3.32
total_ratings,2566286,15141
total_reviews,1023,54
isbn,0671027360,0060392452
