# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from tqdm.auto import tqdm
from scipy import sparse
import torch

import warnings
warnings.filterwarnings('ignore')

# Data

## Load data

In [2]:
# Folder holding the parquet files, located one level above the notebook's working directory.
DATA_DIR = os.path.join(os.pardir, "data", "final_dataset")

In [10]:
# Read the three parquet tables from DATA_DIR.
books = pd.read_parquet(os.path.join(DATA_DIR, 'books_all.parquet'))
users = pd.read_parquet(os.path.join(DATA_DIR, 'users.parquet'))
raw_ratings = pd.read_parquet(os.path.join(DATA_DIR, 'ratings.parquet'))

# Keep only ratings whose ISBN is present in the books table and whose
# provided_rating is non-zero, then renumber the rows from 0.
has_book_details = raw_ratings["isbn"].isin(books["isbn"])
interactions = (
    raw_ratings[has_book_details]
    .query("provided_rating!=0")
    .reset_index(drop=True)
)

summary_lines = [
    f"Number of ratings: {len(interactions)}",
    f"Number of unique users: {interactions['user_id'].nunique()}",
    f"Number of books: {interactions['isbn'].nunique()}",
    f"Shape of interactions: {interactions.shape}",
    f"Shape of books: {books.shape}",
    f"Shape of users: {users.shape}",
]
print("\n".join(summary_lines))
interactions.head()

Number of ratings: 104756
Number of unique users: 31940
Number of books: 22020
Shape of interactions: (104756, 3)
Shape of books: (215397, 18)
Shape of users: (278858, 3)


Unnamed: 0,user_id,isbn,provided_rating
0,17,891075275,6
1,17,553264990,5
2,26,449005615,9
3,39,671888587,7
4,69,1853260053,8


We will remove the users and books that do not appear in any interaction.

In [11]:
# Restrict the user and book tables to the entities that appear in `interactions`.
user_ids = interactions["user_id"].unique().tolist()
isbns = interactions["isbn"].unique().tolist()

# `.isin` does the membership test directly instead of pasting tens of thousands
# of literals into a query string (slow to parse, and fragile if an ISBN ever
# contains a quote character). `.copy()` gives each table its own data so the
# column assignments in later cells do not write into a filtered slice.
users = users[users["user_id"].isin(user_ids)].copy()
books = books[books["isbn"].isin(isbns)].copy()

print(f"Shape of books: {books.shape}")
print(f"Shape of users: {users.shape}")

Shape of books: (22058, 18)
Shape of users: (31940, 3)


We are left with only a small fraction of the original books and users.

In [12]:
# Preview the first five rows of the filtered book table.
books.head(n=5)

Unnamed: 0,num_pages,star_rating_1,star_rating_2,star_rating_3,star_rating_4,star_rating_5,average_rating,total_ratings,total_reviews,isbn,publication_date,title,series,book_number_in_series,authors,categories,format,lang
1,870,12455,37005,211781,604283,1493113,4.5,2358637,29770,0439358078,2004-09-01,Harry Potter and the Order of the Phoenix,Harry Potter,5,[J.K. Rowling],"[Fantasy, Fantasy Books, Fantasy Books for Kids]",CD,en
3,352,11896,49353,288821,706082,1504505,4.42,2560657,244,0439554896,2003-11-01,Harry Potter and the Chamber of Secrets,Harry Potter,2,[J.K. Rowling],"[Fantasy, Fantasy Books, Fantasy Books for Kids]",CD,en
4,435,10128,24849,194848,630534,1749958,4.57,2610317,37093,043965548X,2004-05-01,Harry Potter and the Prisoner of Azkaban,Harry Potter,3,[J.K. Rowling],"[Fantasy, Fantasy Books, Fantasy Books for Kids]",CD,en
8,815,3443,7613,30030,75683,157499,4.37,274268,4119,0345453743,2002-04-30,The Ultimate Hitchhiker's Guide to the Galaxy,Hitchhiker's Guide to the Galaxy,5,[Douglas Adams],"[Contemporary Fiction, Science Fiction, Scienc...",Audio,en
11,55,249,985,3342,2409,1408,3.45,8393,503,0767915062,2002-12-03,Bill Bryson's African Diary,Standalone,1,[Bill Bryson],"[British & Irish History, Guidebooks, Travel W...",CD,en


In [13]:
# Preview the first five rows of the filtered user table.
users.head(n=5)

Unnamed: 0,user_id,location,age
16,17,"chesapeake, virginia, usa",
25,26,"bellevue, washington, usa",
38,39,"cary, north carolina, usa",
68,69,"vancouver, british columbia, canada",
77,78,"oakland, california, usa",18.0


## Preprocessing

As a part of preprocessing, we will do the following steps:
1. Create new columns, city, state and country from the location column.
2. Remove the location column.
3. Fill the age column with mean of the group of city, state and country. If some are remaining, use overall mean.
4. Select columns to be used from book details. (For more details, see the next section)

### Users Dataset

In [14]:
# Split "city, state, country" once and reuse the pieces; a location with fewer
# than three parts yields NaN for the missing ones.
location_parts = users["location"].str.split(", ")

# `assign` returns a new frame, so nothing is written into the filtered slice
# that `users` was built from.
users = users.assign(
    city=location_parts.str[0],
    state=location_parts.str[1],
    country=location_parts.str[2],
)

In [15]:
# Fill each missing age with the mean age of the users that share the same
# (city, state, country). Groups without a single known age stay NaN here and
# are handled by the overall-mean fill in a later cell.
#
# This replaces a per-group Python loop that
#   * tested `len(series.notnull()) == 0`, which is the group size and therefore
#     never zero (and, had it been true, would have added the group twice),
#   * relied on `DataFrame.append`, which was removed in pandas 2.0,
#   * used a chained `fillna(..., inplace=True)` on a column selection, and
#   * grew the result frame one group at a time.
group_cols = ["city", "state", "country"]

# `groupby` skips rows whose key contains NaN, so the loop never emitted users
# with an incomplete location. Drop them explicitly to keep the same set of users.
users_final = users.dropna(subset=group_cols).copy()

group_mean_age = users_final.groupby(group_cols)["age"].transform("mean")
users_final["age"] = users_final["age"].fillna(group_mean_age)


  0%|          | 0/10425 [00:00<?, ?it/s]

In [16]:
# Missing ages after the per-location fill versus before it.
missing_after_fill = users_final["age"].isna().sum()
missing_before_fill = users["age"].isna().sum()
(missing_after_fill, missing_before_fill)

(3302, 12423)

In [17]:
# Fall back to the overall mean age for users whose location group had no known age.
# The result is assigned back to the column: an in-place `fillna` on a column
# selection is a chained assignment and does not update the frame when pandas
# Copy-on-Write is active.
overall_mean_age = users_final["age"].mean()
users_final["age"] = users_final["age"].fillna(overall_mean_age)
users_final["age"].isna().sum()

0

In [18]:
# Keep the model-facing columns, cast age to int, and order the rows by user_id
# with a fresh 0..n-1 index.
users_final = (
    users_final[["user_id", "age", "city", "state", "country"]]
    .astype({"age": int})
    .sort_values(by="user_id")
    .reset_index(drop=True)
)
users_final.head()

Unnamed: 0,user_id,age,city,state,country
0,17,46,chesapeake,virginia,usa
1,26,38,bellevue,washington,usa
2,39,37,cary,north carolina,usa
3,69,34,vancouver,british columbia,canada
4,78,18,oakland,california,usa


### The Books Dataset

We will be keeping all the rating columns, the review column and `num_pages` as they are just numbers. We will extract the year from publication date and keep it as a separate column. For series, we will create a boolean column that will be 1 if the book has a series and 0 otherwise. For authors, categories and columns, we will use one hot encoding if feasible.

In [21]:
# Book columns carried forward into feature engineering.
columns_to_take = [
    "num_pages",
    # star_rating_1 ... star_rating_5
    *[f"star_rating_{stars}" for stars in range(1, 6)],
    "average_rating",
    "total_ratings",
    "total_reviews",
    "isbn",
    "publication_date",
    "series",
    "book_number_in_series",
    "authors",
    "categories",
    "format",
    "lang",
]

In [22]:
# Select the chosen columns and replace the publication date with its year.
books = (
    books[columns_to_take]
    .assign(
        publication_date=lambda frame: pd.to_datetime(frame["publication_date"]),
        publication_year=lambda frame: frame["publication_date"].dt.year,
    )
    .drop(columns=["publication_date"])
)
books.head().T

Unnamed: 0,1,3,4,8,11
num_pages,870,352,435,815,55
star_rating_1,12455,11896,10128,3443,249
star_rating_2,37005,49353,24849,7613,985
star_rating_3,211781,288821,194848,30030,3342
star_rating_4,604283,706082,630534,75683,2409
star_rating_5,1493113,1504505,1749958,157499,1408
average_rating,4.5,4.42,4.57,4.37,3.45
total_ratings,2358637,2560657,2610317,274268,8393
total_reviews,29770,244,37093,4119,503
isbn,0439358078,0439554896,043965548X,0345453743,0767915062


In [28]:
# Distinct author names across all books (each row holds a collection of authors).
all_authors = {
    author_name
    for book_authors in books["authors"].tolist()
    for author_name in book_authors
}

len(all_authors)

9968

There are about 10k distinct authors. I don't think we can use one hot encoding for this.

In [29]:
# Distinct category labels across all books (each row holds a collection of categories).
all_categories = {
    category_name
    for book_categories in books["categories"].tolist()
    for category_name in book_categories
}

len(all_categories)

1560

There are also a lot of distinct categories. We will try to keep only the most frequent ones and use one hot encoding for them.

In [30]:
# Count how many books carry each category label; keys are inserted in
# first-seen order.
cat_counts = {}
for book_categories in books["categories"].tolist():
    for label in book_categories:
        cat_counts[label] = cat_counts.get(label, 0) + 1

In [34]:
# Reorder the counts from most to least frequent category.
cat_counts = dict(
    sorted(cat_counts.items(), key=lambda entry: entry[1], reverse=True)
)

In [36]:
# Drop every category whose name contains "books" (case-insensitive).
cat_counts_u = {
    label: count
    for label, count in cat_counts.items()
    if "books" not in label.lower()
}

{'Contemporary Fiction': 5996,
 'Science Fiction': 2564,
 'Classic Books & Novels': 2143,
 'Crime': 1991,
 'Crime Fiction': 1991,
 'Thriller Books': 1949,
 'Thrillers': 1949,
 'Adult & Contemporary Romance': 1568,
 'Horror': 1336,
 'Historical Fiction': 1260,
 'Historical Romance': 1146,
 "Children's Fiction": 1100,
 'Romance': 1086,
 'Romance Books': 1086,
 'Adventure Books': 1063,
 'Biography: General': 993,
 'Fantasy': 900,
 'Fantasy Books': 900,
 'Short Story Books': 872,
 'Sagas': 836,
 'Funny Books & Stories': 677,
 'Espionage': 622,
 'Political & Legal': 582,
 'Classic Books for Children': 533,
 'Memoirs': 532,
 'Anthologies (non-poetry)': 529,
 'Literary Studies: General': 527,
 'Space Opera': 490,
 'Mind, Body, Spirit: Thought & Practice': 483,
 "Children's General Story Books": 482,
 'History Of The Americas': 466,
 'Poetry By Individual Poets': 451,
 'Family': 450,
 'Religious & Spiritual Fiction': 441,
 'Adventure Books for Kids': 436,
 'Biography: Historical, Political & M

## Smaller Dataset

In [5]:
# `df` was never assigned anywhere in the notebook, so this section failed on a
# fresh kernel. The row, user and book counts printed in this section match the
# filtered `interactions` frame, so `df` is bound to it here.
# NOTE(review): confirm that `interactions` is the intended source.
df = interactions

# Number of ratings per book, most rated first (the statements were duplicated before).
num_ratings = df.groupby('isbn')['provided_rating'].count().sort_values(ascending=False)
most_rated_books = num_ratings.index[:10]
num_ratings.head()

isbn
0316666343    707
0060928336    320
0671027360    269
067976402X    256
0786868716    242
Name: provided_rating, dtype: int64

In [6]:
# Number of ratings given by each user, most active users first.
ratings_per_user = df.groupby('user_id')['provided_rating'].count()
ratings_per_user.sort_values(ascending=False)

user_id
11676     1593
98391      595
189835     371
76499      333
153662     322
          ... 
59675        1
157184       1
59685        1
59697        1
278854       1
Name: provided_rating, Length: 31940, dtype: int64

In [7]:
# Per-book summary: mean provided rating and number of ratings, indexed by isbn.
rating_by_book = df.groupby('isbn')['provided_rating']
ratings = rating_by_book.mean().to_frame()
ratings['num_ratings'] = rating_by_book.count()
ratings.head()

Unnamed: 0_level_0,provided_rating,num_ratings
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1
2163578,5.0,1
2190915,9.5,2
2210479,6.0,1
2222469,8.0,1
2241358,8.0,1


In [8]:
# Keep only the books that received more than `min_ratings` ratings.
min_ratings = 5
books_ = ratings.index[ratings["num_ratings"] > min_ratings]
print(f"Number of books_ with more than {min_ratings} ratings: {len(books_)}")
print(f"Original number of books_: {df['isbn'].nunique()}")
print(f"Number of rows in the original dataset: {df.shape[0]}")

# `.copy()` gives df_small its own data, so the id re-mapping done in a later
# cell does not assign into a filtered slice of `df`.
df_small = df[df['isbn'].isin(books_)].copy()
unique_users = df_small['user_id'].nunique()
print(f"Number of rows in the new dataset: {df_small.shape[0]}")
print(f"Number of unique users in the new dataset: {unique_users}")

Number of books_ with more than 5 ratings: 3823
Original number of books_: 22020
Number of rows in the original dataset: 104756
Number of rows in the new dataset: 72190
Number of unique users in the new dataset: 25812


## Preprocessing

In [9]:
# Number of distinct users and items (books) in the reduced dataset.
n_users = df_small["user_id"].nunique()
n_items = df_small["isbn"].nunique()

print(f"Num. of Users: {n_users}")
# The items are books; the label previously read "Movies".
print(f"Num of Books: {n_items}")

Num. of Users: 25812
Num of Movies: 3823


In [10]:
# Map raw user ids and ISBNs to consecutive integers, numbered in order of
# first appearance in df_small.
user_id_map = dict(zip(df_small["user_id"].unique(), range(n_users)))
book_id_map = dict(zip(df_small["isbn"].unique(), range(n_items)))

# Lookup tables pairing each original id with its new integer id.
user_id_map_df = pd.DataFrame(
    {
        "user_id": list(user_id_map.keys()),
        "user_id_new": list(user_id_map.values()),
    }
)
book_id_map_df = pd.DataFrame(
    {
        "isbn": list(book_id_map.keys()),
        "isbn_new": list(book_id_map.values()),
    }
)

# Overwrite the id columns of df_small with the new integer ids.
for id_column, id_lookup in (("user_id", user_id_map), ("isbn", book_id_map)):
    df_small[id_column] = df_small[id_column].map(id_lookup)

In [28]:
# Preview the first five re-indexed interactions.
df_small.head(n=5)

Unnamed: 0,user_id,isbn,provided_rating
0,0,0,6
1,0,1,5
2,1,2,9
3,2,3,7
4,3,4,8


In [11]:
def split_dataframe(df, holdout_fraction=0.1, random_state=None):
    """Splits a DataFrame into training and test sets.

    Rows are assigned to the test set by sampling without replacement; every
    row whose index label was not sampled goes to the training set.

    Args:
      df: a dataframe.
      holdout_fraction: fraction of dataframe rows to use in the test set.
      random_state: optional seed (or random state object) forwarded to
        `DataFrame.sample`. Pass a fixed value to get the same split on every
        run; the default `None` keeps the previous unseeded behaviour.
    Returns:
      train: dataframe for training
      test: dataframe for testing
    """
    test = df.sample(frac=holdout_fraction, replace=False, random_state=random_state)
    # Membership is decided by index label, so rows keep their original index.
    train = df[~df.index.isin(test.index)]
    return train, test

# Seed NumPy's global RNG so the train/test split is the same on every run:
# `DataFrame.sample` draws from it when no `random_state` is passed.
SPLIT_SEED = 42
np.random.seed(SPLIT_SEED)
train_df_, test_df_ = split_dataframe(df_small)

# Theory

Using neural networks instead of matrix factorization (MF) for recommendation systems has a number of advantages. See the notes section for detail. Here, we will be providing the model architecture that will be used for the recommendation system.

The model will consist of two steps:

## The Embedding Layer

This layer will take the user and item IDs in one-hot encoded form, along with any other user and item features, and will pass them through a fully connected layer. The output of this layer will be the latent representation of the user and item. Let us denote the user input by $\mathbf{u}$ and the item input by $\mathbf{v}$. Their dimensions will be $m+m_{uf}$ and $n+n_{if}$ respectively, where $m$ and $n$ are the number of users and items and $m_{uf}$ and $n_{if}$ are the number of user and item features respectively. The output of the embedding layer will be the latent vectors $\mathbf{e}_u \in \mathbb{R}^d$ and $\mathbf{e}_v \in \mathbb{R}^d$, where $d$ is the dimension of the latent space.

We will have two different layers, one for users and the other for items. This is required because the numbers of users and items are different and we want to learn different embeddings for them. The user embedding layer will have $m+m_{uf}$ neurons and the item embedding layer will have $n+n_{if}$ neurons.

## CF Layers

CF layers, or collaborative filtering layers, consist of one or more fully connected layers. The input to these layers will be the concatenation of the user and item latent representations. The output of the CF layers will be the predicted rating.

![](images/dl_01.png)

## Loss Function

The loss function can either be MSE or cross entropy. We will be experimenting with both.

## Data

We will be using all the positive data and a random sample of the negative data. The ratio of positive to negative data will be decided by a parameter.

# Model

In [27]:
# Preview the first five rows of the training split.
train_df_.head(n=5)

Unnamed: 0,user_id,isbn,provided_rating
0,0,0,6
1,0,1,5
2,1,2,9
3,2,3,7
4,3,4,8
