# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from tqdm.auto import tqdm
from scipy import sparse
import torch

import warnings
warnings.filterwarnings('ignore')

# Data

## Load data

In [2]:
DATA_DIR = os.path.join("..", "data", "final_dataset")

In [3]:
interactions = pd.read_parquet(os.path.join(DATA_DIR, 'ratings.parquet'))
books= pd.read_parquet(os.path.join(DATA_DIR, 'books_all.parquet'))
users = pd.read_parquet(os.path.join(DATA_DIR, 'users.parquet'))
interactions = interactions[interactions["isbn"].isin(books["isbn"])]
interactions = interactions.query("provided_rating!=0")
interactions.reset_index(drop=True, inplace=True)
print(f"Number of ratings: {len(interactions)}")
print(f"Number of unique users: {interactions['user_id'].nunique()}")
print(f"Number of books: {interactions['isbn'].nunique()}")
print(f"Shape of interactions: {interactions.shape}")
print(f"Shape of books: {books.shape}")
print(f"Shape of users: {users.shape}")
interactions.head()

Number of ratings: 104756
Number of unique users: 31940
Number of books: 22020
Shape of interactions: (104756, 3)
Shape of books: (215397, 18)
Shape of users: (278858, 3)


Unnamed: 0,user_id,isbn,provided_rating
0,17,891075275,6
1,17,553264990,5
2,26,449005615,9
3,39,671888587,7
4,69,1853260053,8


We will remove the extra users and books that are not in the interaction.

In [4]:
user_ids = interactions["user_id"].unique().tolist()
isbns = interactions["isbn"].unique().tolist()
users = users.query(f"user_id in {user_ids}")
books = books.query(f"isbn in {isbns}")
print(f"Shape of books: {books.shape}")
print(f"Shape of users: {users.shape}")

Shape of books: (22058, 18)
Shape of users: (31940, 3)


We have left with a very few books and users.

In [5]:
books.head()

Unnamed: 0,num_pages,star_rating_1,star_rating_2,star_rating_3,star_rating_4,star_rating_5,average_rating,total_ratings,total_reviews,isbn,publication_date,title,series,book_number_in_series,authors,categories,format,lang
1,870,12455,37005,211781,604283,1493113,4.5,2358637,29770,0439358078,2004-09-01,Harry Potter and the Order of the Phoenix,Harry Potter,5,[J.K. Rowling],"[Fantasy, Fantasy Books, Fantasy Books for Kids]",CD,en
3,352,11896,49353,288821,706082,1504505,4.42,2560657,244,0439554896,2003-11-01,Harry Potter and the Chamber of Secrets,Harry Potter,2,[J.K. Rowling],"[Fantasy, Fantasy Books, Fantasy Books for Kids]",CD,en
4,435,10128,24849,194848,630534,1749958,4.57,2610317,37093,043965548X,2004-05-01,Harry Potter and the Prisoner of Azkaban,Harry Potter,3,[J.K. Rowling],"[Fantasy, Fantasy Books, Fantasy Books for Kids]",CD,en
8,815,3443,7613,30030,75683,157499,4.37,274268,4119,0345453743,2002-04-30,The Ultimate Hitchhiker's Guide to the Galaxy,Hitchhiker's Guide to the Galaxy,5,[Douglas Adams],"[Contemporary Fiction, Science Fiction, Scienc...",Audio,en
11,55,249,985,3342,2409,1408,3.45,8393,503,0767915062,2002-12-03,Bill Bryson's African Diary,Standalone,1,[Bill Bryson],"[British & Irish History, Guidebooks, Travel W...",CD,en


In [6]:
users.head()

Unnamed: 0,user_id,location,age
16,17,"chesapeake, virginia, usa",
25,26,"bellevue, washington, usa",
38,39,"cary, north carolina, usa",
68,69,"vancouver, british columbia, canada",
77,78,"oakland, california, usa",18.0


## Preprocessing

As a part of preprocessing, we will do the following steps:
1. Create new columns, city, state and country from the location column.
2. Remove the location column.
3. Fill the age column with mean of the group of city, state and country. If some are remaining, use overall mean.
4. Select columns to be used from book details. (For more details, see the next section)

### Users Dataset

In [7]:
users["city"] = users["location"].str.split(", ").str[0]
users["state"] = users["location"].str.split(", ").str[1]
users["country"] = users["location"].str.split(", ").str[2]

In [8]:
groups = users.groupby(["city", "state", "country"]).groups
users_final = pd.DataFrame()
for group in tqdm(groups):
    temp_df = users.loc[groups[group]]
    if len(temp_df["age"].notnull()) == 0:
        users_final = pd.concat([users_final, temp_df], ignore_index=True)
    temp_df["age"].fillna(temp_df["age"].mean(), inplace=True)
    users_final = users_final.append(temp_df)


  0%|          | 0/10425 [00:00<?, ?it/s]

In [9]:
users_final["age"].isna().sum(), users["age"].isna().sum()

(3302, 12423)

In [10]:
users_final["age"].fillna(users_final["age"].mean(), inplace=True)
users_final["age"].isna().sum()

0

In [11]:
users_final = users_final[["user_id", "age", "city", "state", "country"]]
users_final["age"] = users_final["age"].astype(int)
users_final.sort_values(by="user_id", inplace=True)
users_final.reset_index(drop=True, inplace=True)
users_final.head()

Unnamed: 0,user_id,age,city,state,country
0,17,46,chesapeake,virginia,usa
1,26,38,bellevue,washington,usa
2,39,37,cary,north carolina,usa
3,69,34,vancouver,british columbia,canada
4,78,18,oakland,california,usa


We will be taking top 10 cities, states and countries and will be replacing the rest with 'other'.

In [12]:
top_cities = users_final["city"].value_counts().index[:10]
top_states = users_final["state"].value_counts().index[:10]
top_countries = users_final["country"].value_counts().index[:10]

In [13]:
# take top 10 cities, states and countries and replace all other values with "other"
users_final["city"] = users_final["city"].apply(lambda x: x if x in top_cities else "other")
users_final["state"] = users_final["state"].apply(lambda x: x if x in top_states else "other")
users_final["country"] = users_final["country"].apply(lambda x: x if x in top_countries else "other")

In [14]:
users_final.head()

Unnamed: 0,user_id,age,city,state,country
0,17,46,other,other,usa
1,26,38,other,washington,usa
2,39,37,other,other,usa
3,69,34,vancouver,british columbia,canada
4,78,18,other,california,usa


Finally, one-hot encode the city, state and country columns.

In [15]:
users_final = pd.get_dummies(users_final)

In [16]:
users_final.head()

Unnamed: 0,user_id,age,city_chicago,city_houston,city_london,city_new york,city_other,city_portland,city_san diego,city_san francisco,...,country_canada,country_france,country_germany,country_italy,country_netherlands,country_new zealand,country_other,country_spain,country_united kingdom,country_usa
0,17,46,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,26,38,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,39,37,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,69,34,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,78,18,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1


### The Books Dataset

We will be keeping all the rating columns, the review column and `num_pages` as they are just numbers. We will extract the year from publication date and keep it as a separate column. For series, we will create a boolean column that will be 1 if the book has a series and 0 otherwise. For authors, categories and columns, we will use one hot encoding if feasible.

In [17]:
columns_to_take = [
    "num_pages",
    "star_rating_1",
    "star_rating_2",
    "star_rating_3",
    "star_rating_4",
    "star_rating_5",
    "average_rating",
    "total_ratings",
    "total_reviews",
    "isbn",
    "publication_date",
    "series",
    "book_number_in_series",
    "authors",
    "categories",
    "format",
    "lang",
]

In [18]:
books = books[columns_to_take]
books["publication_date"] = pd.to_datetime(books["publication_date"])
books["publication_year"] = books["publication_date"].dt.year
books.drop(columns=["publication_date"], inplace=True)
books.head().T

Unnamed: 0,1,3,4,8,11
num_pages,870,352,435,815,55
star_rating_1,12455,11896,10128,3443,249
star_rating_2,37005,49353,24849,7613,985
star_rating_3,211781,288821,194848,30030,3342
star_rating_4,604283,706082,630534,75683,2409
star_rating_5,1493113,1504505,1749958,157499,1408
average_rating,4.5,4.42,4.57,4.37,3.45
total_ratings,2358637,2560657,2610317,274268,8393
total_reviews,29770,244,37093,4119,503
isbn,0439358078,0439554896,043965548X,0345453743,0767915062


In [19]:
all_authors = set()
for authors in books["authors"].tolist():
    all_authors.update(authors)

len(all_authors)

9968

There are about 10k distinct authors. I don't think we can use one hot encoding for this.

In [20]:
all_categories = set()
for categories in books["categories"].tolist():
    all_categories.update(categories)

len(all_categories)

1560

Genre are also a lot. We will leave them for now.

In [21]:
all_format = set()
for format in books["format"].tolist():
    all_format.update(format)

len(all_format)

34

In [22]:
all_lang = set()
for lang in books["lang"].tolist():
    if pd.isna(lang):
        continue
    all_lang.update(lang)

len(all_lang)

17

We will leave these columns too. Later, we can try incorporating them by considering only the top 10 authors, genres, etc.

In [23]:
books.drop(columns=["authors", "categories", "format", "lang"], inplace=True)

In [24]:
books["series"] = books["series"].map(lambda x: 0 if x == "Standalone" else 1)

In [25]:
books.head().T

Unnamed: 0,1,3,4,8,11
num_pages,870.0,352.0,435,815.0,55.0
star_rating_1,12455.0,11896.0,10128,3443.0,249.0
star_rating_2,37005.0,49353.0,24849,7613.0,985.0
star_rating_3,211781.0,288821.0,194848,30030.0,3342.0
star_rating_4,604283.0,706082.0,630534,75683.0,2409.0
star_rating_5,1493113.0,1504505.0,1749958,157499.0,1408.0
average_rating,4.5,4.42,4.57,4.37,3.45
total_ratings,2358637.0,2560657.0,2610317,274268.0,8393.0
total_reviews,29770.0,244.0,37093,4119.0,503.0
isbn,439358078.0,439554896.0,043965548X,345453743.0,767915062.0


These are the columns we will be using.

## Mapping the IDs

This section will map the ids and isbns to integers. This will be helpful in creating the dataset.

In [26]:
books.head()

Unnamed: 0,num_pages,star_rating_1,star_rating_2,star_rating_3,star_rating_4,star_rating_5,average_rating,total_ratings,total_reviews,isbn,series,book_number_in_series,publication_year
1,870,12455,37005,211781,604283,1493113,4.5,2358637,29770,0439358078,1,5,2004
3,352,11896,49353,288821,706082,1504505,4.42,2560657,244,0439554896,1,2,2003
4,435,10128,24849,194848,630534,1749958,4.57,2610317,37093,043965548X,1,3,2004
8,815,3443,7613,30030,75683,157499,4.37,274268,4119,0345453743,1,5,2002
11,55,249,985,3342,2409,1408,3.45,8393,503,0767915062,0,1,2002


In [27]:
users_final.head()

Unnamed: 0,user_id,age,city_chicago,city_houston,city_london,city_new york,city_other,city_portland,city_san diego,city_san francisco,...,country_canada,country_france,country_germany,country_italy,country_netherlands,country_new zealand,country_other,country_spain,country_united kingdom,country_usa
0,17,46,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,26,38,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,39,37,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,69,34,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,78,18,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [28]:
interactions.head()

Unnamed: 0,user_id,isbn,provided_rating
0,17,891075275,6
1,17,553264990,5
2,26,449005615,9
3,39,671888587,7
4,69,1853260053,8


In [29]:
n_users = interactions.user_id.nunique()
n_items = interactions.isbn.nunique()

print('Num. of Users: '+ str(n_users))
print('Num of Movies: '+str(n_items))

Num. of Users: 31940
Num of Movies: 22020


In [30]:
user_id_map = dict(zip(interactions.user_id.unique(), list(range(n_users))))
book_id_map = dict(zip(interactions.isbn.unique(), list(range(n_items))))
user_id_map_df  = pd.DataFrame(
    {
        "user_id":user_id_map.keys(),
        "user_id_new": user_id_map.values(),
    }
)
book_id_map_df  = pd.DataFrame(
    {
        "isbn":book_id_map.keys(),
        "isbn_new": book_id_map.values(),
    }
)
interactions["user_id"] = interactions["user_id"].map(user_id_map)
interactions["isbn"] = interactions["isbn"].map(book_id_map)

In [31]:
interactions.head()

Unnamed: 0,user_id,isbn,provided_rating
0,0,0,6
1,0,1,5
2,1,2,9
3,2,3,7
4,3,4,8


In [32]:
users_final["user_id"] = users_final["user_id"].map(user_id_map)
books["isbn"] = books["isbn"].map(book_id_map)

In [33]:
books.rename(columns={"isbn": "book_id"}, inplace=True)

In [34]:
books.reset_index(drop=True, inplace=True)
users_final.reset_index(drop=True, inplace=True)
interactions.reset_index(drop=True, inplace=True)

In [46]:
interactions.rename(columns={"isbn": "book_id"}, inplace=True)

In [35]:
books.head()

Unnamed: 0,num_pages,star_rating_1,star_rating_2,star_rating_3,star_rating_4,star_rating_5,average_rating,total_ratings,total_reviews,book_id,series,book_number_in_series,publication_year
0,870,12455,37005,211781,604283,1493113,4.5,2358637,29770,15885,1,5,2004
1,352,11896,49353,288821,706082,1504505,4.42,2560657,244,7304,1,2,2003
2,435,10128,24849,194848,630534,1749958,4.57,2610317,37093,3161,1,3,2004
3,815,3443,7613,30030,75683,157499,4.37,274268,4119,2728,1,5,2002
4,55,249,985,3342,2409,1408,3.45,8393,503,3485,0,1,2002


In [36]:
users_final.head()

Unnamed: 0,user_id,age,city_chicago,city_houston,city_london,city_new york,city_other,city_portland,city_san diego,city_san francisco,...,country_canada,country_france,country_germany,country_italy,country_netherlands,country_new zealand,country_other,country_spain,country_united kingdom,country_usa
0,0,46,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1,38,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,2,37,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,3,34,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,4,18,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [37]:
interactions.head()

Unnamed: 0,user_id,isbn,provided_rating
0,0,0,6
1,0,1,5
2,1,2,9
3,2,3,7
4,3,4,8


In [38]:
user_id_map_df.head()

Unnamed: 0,user_id,user_id_new
0,17,0
1,26,1
2,39,2
3,69,3
4,78,4


In [39]:
book_id_map_df.rename(columns={"isbn_new": "book_id"}, inplace=True)
book_id_map_df

Unnamed: 0,isbn,book_id
0,0891075275,0
1,0553264990,1
2,0449005615,2
3,0671888587,3
4,1853260053,4
...,...,...
22015,0684174707,22015
22016,020140964X,22016
22017,0770422071,22017
22018,1887166092,22018


Finally, let's save these datasets so that we don't have to do the preprocessing again.

In [47]:
interactions.to_parquet(os.path.join(DATA_DIR, "ratings_final.parquet"))
users_final.to_parquet(os.path.join(DATA_DIR, "users_final.parquet"))
books.to_parquet(os.path.join(DATA_DIR, "books_final.parquet"))
# save the maps too in CSV
user_id_map_df.to_csv(os.path.join(DATA_DIR, "user_id_map.csv"), index=False)
book_id_map_df.to_csv(os.path.join(DATA_DIR, "book_id_map.csv"), index=False)

Let's read the datasets again to make sure everything is fine.

In [48]:
interactions = pd.read_parquet(os.path.join(DATA_DIR, "ratings_final.parquet"))
users_final = pd.read_parquet(os.path.join(DATA_DIR, "users_final.parquet"))
books = pd.read_parquet(os.path.join(DATA_DIR, "books_final.parquet"))
user_id_map_df = pd.read_csv(os.path.join(DATA_DIR, "user_id_map.csv"))
book_id_map_df = pd.read_csv(os.path.join(DATA_DIR, "book_id_map.csv"))

In [49]:
print(f"Shape of interactions: {interactions.shape}")
print(f"Shape of books: {books.shape}")
print(f"Shape of users: {users_final.shape}")
print(f"Shape of user_id_map: {user_id_map_df.shape}")
print(f"Shape of book_id_map: {book_id_map_df.shape}")

Shape of interactions: (104756, 3)
Shape of books: (22058, 13)
Shape of users: (31940, 35)
Shape of user_id_map: (31940, 2)
Shape of book_id_map: (22020, 2)
