# Imports

In [1]:
import pandas as pd
import numpy as np
import os
from datasets import load_dataset
from datasets import Dataset
from tqdm.auto import tqdm
import re
import glob
tqdm.pandas()

This notebook will create the book dataset only. We will merge books from [1] and [2].

The folder `2_final` contains parquet files with the books from [2], each annotated with the ID of the matching book in [1] and in [3]. We will use them to create the final book dataset.

The idea is to use the author and genre information from [1] and the rest of the information from [2] for books. These will then be merged with [3] to give us a final book dataset along with the ratings of the books for different users.

## Loading Data and Preprocessing

### Books from [1]

Some preprocessing needs to be done before we can merge the data. This section will deal with it.

In [3]:
# Read the books from [1] with pandas and wrap the frame in a `datasets.Dataset`.
books1 = Dataset.from_pandas(pd.read_parquet("1_2/all_books_with_id.parquet"))

In [4]:
# Look at a single record to see which fields [1] provides.
books1[1]

{'authors': '[2, 3]',
 'categories': '[235, 3386]',
 'format': 1.0,
 'isbn10': '184454737X',
 'lang': 'en',
 'publication-date': '2009-03-13 00:00:00',
 'title': 'Underbelly : The Gangland War',
 'authors_': 'John Silvester: Andrew Rule',
 'ID': 1}

Let's remove the following columns, as they will not be used later:
- authors_
- isbn10 ([3] will be used)
- title (will be used from [2])
- publication-date (will be used from [2])

In [5]:
# Fields of [1] that are not needed for the merge.
columns_to_remove = [
    'isbn10',            # the ISBN from [3] will be used instead
    'publication-date',  # taken from [2]
    'title',             # taken from [2]
    'authors_',          # not used later
]
books1 = books1.remove_columns(columns_to_remove)
books1

Dataset({
    features: ['authors', 'categories', 'format', 'lang', 'ID'],
    num_rows: 1109383
})

In [6]:
# Spot-check another record after the column removal.
books1[1213]

{'authors': '[631]',
 'categories': '[2978, 2452, 3385]',
 'format': 9.0,
 'lang': 'en',
 'ID': 1213}

The `ID` column will be used to match the books. It is a simple integer, so it can be used as the index.

### Books from [2]

Books from [2] are divided into many files. Let's use a sample file to work with.

In [20]:
# Load one file of [2] as a working sample and wrap it in a `datasets.Dataset`.
books2 = Dataset.from_pandas(pd.read_parquet("2_final/book400k-500k.parquet"))
books2

Dataset({
    features: ['id', 'title', 'authors', 'pages', 'pages.1', 'language', 'star_1', 'star_2', 'star_3', 'star_4', 'star_5', 'rating', 'total_ratings', 'total_reviews', 'isbn', 'publication_date', 'title_', 'series', 'book_num', 'book_id', 'score', 'text', 'book_id_3', 'score_3'],
    num_rows: 8169
})

In [21]:
# Look at a single record to see which fields [2] provides.
books2[5]

{'id': 400054,
 'title': 'One Day in the Desert',
 'authors': 'Jean Craighead George',
 'pages': 64,
 'pages.1': 64,
 'language': None,
 'star_1': 2,
 'star_2': 4,
 'star_3': 24,
 'star_4': 26,
 'star_5': 17,
 'rating': 3.71,
 'total_ratings': 73,
 'total_reviews': 14,
 'isbn': '0064420388',
 'publication_date': '1996-04-12',
 'title_': 'One Day in the Desert',
 'series': 'Standalone',
 'book_num': 1,
 'book_id': 1003093,
 'score': 7.20892333984375,
 'text': 'One Day in the Desert by Jean Craighead George',
 'book_id_3': -1,
 'score_3': 16.608840942382812}

We will be dropping the following columns:
- pages.1 (Duplicate of pages)
- authors (To be used from [1])
- language (most of them are Null, we will use [1] to get the language)
- title (the columns title_ and series will be used instead)
- text (redundant column)
- isbn (will be used from [3])
- book_id_3
- score_3

In [22]:
# Fields of the [2] sample that are dropped before merging.
columns_to_remove = [
    "title",      # `title_` and `series` are used instead
    "pages.1",    # duplicate of `pages`
    "language",   # mostly null; the language comes from [1]
    "text",       # redundant
    "authors",    # taken from [1]
    "isbn",       # taken from [3]
    "book_id_3",
    "score_3",
]
books2 = books2.remove_columns(columns_to_remove)
books2

Dataset({
    features: ['id', 'pages', 'star_1', 'star_2', 'star_3', 'star_4', 'star_5', 'rating', 'total_ratings', 'total_reviews', 'publication_date', 'title_', 'series', 'book_num', 'book_id', 'score'],
    num_rows: 8169
})

We will also need to rename some columns:

In [23]:
# Old column name in [2] -> name used in the merged dataset.
rename_map = dict(
    title_="title",
    book_id="book_id_1",
    score="score_1",
)

# Apply the renames one column at a time.
for old_name, new_name in rename_map.items():
    books2 = books2.rename_column(old_name, new_name)

In [24]:
# Confirm the renamed columns.
books2

Dataset({
    features: ['id', 'pages', 'star_1', 'star_2', 'star_3', 'star_4', 'star_5', 'rating', 'total_ratings', 'total_reviews', 'publication_date', 'title', 'series', 'book_num', 'book_id_1', 'score_1'],
    num_rows: 8169
})

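We will wrap all these steps in a function, since they need to be repeated for every file from [2]. Note that, unlike the exploratory cell above, the function does not drop `isbn`, so that column is kept in its output.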

In [25]:
def load_and_preprocess_2(file_dir):
    """Load one parquet file of [2] and normalise its columns.

    The file is read with pandas and converted to a ``datasets.Dataset``. The
    columns that are not needed downstream are removed and the remaining
    helper columns are renamed (``title_`` -> ``title``,
    ``book_id`` -> ``book_id_1``, ``score`` -> ``score_1``).
    The ``isbn`` column is intentionally left in place.

    Args:
        file_dir: Path of the parquet file to load.

    Returns:
        The preprocessed ``Dataset``.

    Raises:
        ValueError: Propagated from ``remove_columns`` / ``rename_columns``
            when one of the mandatory columns is missing from the file.
    """
    books2 = Dataset.from_pandas(pd.read_parquet(file_dir))

    # Columns that every file must contain and that are dropped.
    columns_to_remove = ["title", "language", "text", "authors", "book_id_3", "score_3"]
    # "pages.1" is treated as optional: drop it only when the file has it.
    # (Checking explicitly avoids using an exception plus a position-dependent
    # `pop` to detect the missing column.)
    if "pages.1" in books2.column_names:
        columns_to_remove.append("pages.1")
    books2 = books2.remove_columns(columns_to_remove)

    rename_map = {
        "title_": "title",
        "book_id": "book_id_1",
        "score": "score_1",
    }
    return books2.rename_columns(rename_map)

In [26]:
# Re-create the sample through the helper and inspect one record of its output.
books2 = load_and_preprocess_2("2_final/book400k-500k.parquet")
books2[10]

{'id': 400074,
 'pages': 6,
 'star_1': 301,
 'star_2': 1213,
 'star_3': 5878,
 'star_4': 10626,
 'star_5': 7925,
 'rating': 3.95,
 'total_ratings': 25943,
 'total_reviews': 16,
 'isbn': '1572704438',
 'publication_date': '2008-01-22',
 'title': 'The Thin Man',
 'series': 'Standalone',
 'book_num': 1,
 'book_id_1': 364146,
 'score_1': 0.0}

## Merging the Dataframes

The books from [2] have the IDs of the matching books in [1] and [3]. If no match was found, -1 has been imputed. This ID is simply the row index of the book in dataframe [1] or [3], provided they have not been shuffled. Using these IDs, the next section deals with merging the dataframes into one.

We will start by keeping only those rows in [2] for which a match has been found in [1] (i.e. `book_id_1` is not -1); a match in [3] is not required at this stage:

In [27]:
def filter_function(row):
    """Return True when the row has a match in [1].

    A ``book_id_1`` of -1 marks a row without a match; such rows are rejected.
    """
    return not (row["book_id_1"] == -1)

In [28]:
# Keep only the rows with a match in [1] and report how many rows survive.
books2_filtered = books2.filter(filter_function)
print(f"Number of books before filtering: {len(books2)}")
print(f"Number of books after filtering: {len(books2_filtered)}")

Filter:   0%|          | 0/8169 [00:00<?, ? examples/s]

Number of books before filtering: 8169
Number of books after filtering: 8169


We can see that far more books match this way than when all three datasets were required to match.

In [29]:
def match_book(row):
    """Attach the information from [1] to one row of [2].

    ``row["book_id_1"]`` is used as a positional index into the module-level
    ``books1`` dataset. Every field of the record found there, except ``ID``,
    is merged into the row; on a key clash the value from [1] wins.

    Args:
        row: One example (a dict) of the preprocessed [2] dataset.

    Returns:
        A new dict holding the fields of ``row`` plus those of the matched book.

    Raises:
        ValueError: If ``book_id_1`` is negative, or if the record found at
            that position carries a different ``ID`` (which would mean that
            ``books1`` is no longer in its original order).
    """
    book1_id = row["book_id_1"]
    # -1 is the "no match" sentinel; a negative index would not point at the
    # intended book, so refuse it instead of merging the wrong record.
    if book1_id < 0:
        raise ValueError(f"Row has no valid match in [1]: book_id_1={book1_id}")

    info_from_book1 = books1[book1_id]
    matched_id = info_from_book1.pop("ID")
    # The lookup is positional, so it is only correct while ID == row position.
    if matched_id != book1_id:
        raise ValueError(
            f"books1[{book1_id}] has ID {matched_id}; positional lookup is not valid"
        )
    return {**row, **info_from_book1}

In [30]:
# Enrich every filtered row with the matching record from [1].
books2_matched = books2_filtered.map(match_book)

Map:   0%|          | 0/8169 [00:00<?, ? examples/s]

In [31]:
# Preview the first five merged books, transposed so that each book is a column.
pd.DataFrame(books2_matched[:5]).T

Unnamed: 0,0,1,2,3,4
id,400006,400019,400026,400034,400052
pages,2,304,137,242,64
star_1,2,5,0,5,1
star_2,2,16,0,3,7
star_3,5,106,0,28,37
star_4,8,128,0,48,31
star_5,1,86,1,91,31
rating,3.22,3.8,5.0,4.24,3.79
total_ratings,18,341,1,175,107
total_reviews,0,54,0,0,2


This is working as intended. Let's make a final function to be used for each dataframe from [2].

In [32]:
# Directory that receives one merged parquet file per input file of [2].
to_save_dir = os.path.join("merged_dataset", "books_only")

In [33]:
def match_one_book_df(file_dir):
    """Merge one parquet file of [2] with [1] and save the result.

    The file is loaded with ``load_and_preprocess_2``, filtered with
    ``filter_function``, enriched with ``match_book`` and written to
    ``to_save_dir`` under the same file name as the input.

    Args:
        file_dir: Path of the parquet file of [2] to process.

    Returns:
        None. The merged dataset is written to disk and progress is printed.
    """
    books2 = load_and_preprocess_2(file_dir)
    books2_filtered = books2.filter(filter_function)
    print(f"Number of books before filtering: {len(books2)}")
    print(f"Number of books after filtering: {len(books2_filtered)}")
    books2_matched = books2_filtered.map(match_book)

    # Make sure the target directory exists before writing into it.
    os.makedirs(to_save_dir, exist_ok=True)
    # `basename` strips the directory part whichever separator the platform
    # accepts; splitting on `os.path.sep` alone misses "/" paths on Windows.
    file_name = os.path.join(to_save_dir, os.path.basename(file_dir))
    books2_matched.to_parquet(file_name)
    print(f"Saved to {file_name}")

In [34]:
# Collect every file of [2], in lexicographic order.
all_books_dir = sorted(glob.glob("2_final/book*.parquet"))
len(all_books_dir)

22

In [35]:
# Index of the first file to process.
start_id = 0
for current_id in range(start_id, len(all_books_dir)):
    print(f"CURRENTLY WORKING ON: {current_id}")
    match_one_book_df(all_books_dir[current_id])
    # Two separator lines mark the end of each file's log.
    print("----"*10, "----"*10, sep="\n")

CURRENTLY WORKING ON: 0


Filter:   0%|          | 0/16146 [00:00<?, ? examples/s]

Number of books before filtering: 16146
Number of books after filtering: 16146


Map:   0%|          | 0/16146 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/17 [00:00<?, ?ba/s]

Saved to merged_dataset\books_only\book1-100k.parquet
----------------------------------------
----------------------------------------
CURRENTLY WORKING ON: 1


Filter:   0%|          | 0/6331 [00:00<?, ? examples/s]

Number of books before filtering: 6331
Number of books after filtering: 6331


Map:   0%|          | 0/6331 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/7 [00:00<?, ?ba/s]

Saved to merged_dataset\books_only\book1000k-1100k.parquet
----------------------------------------
----------------------------------------
CURRENTLY WORKING ON: 2


Filter:   0%|          | 0/11662 [00:00<?, ? examples/s]

Number of books before filtering: 11662
Number of books after filtering: 11662


Map:   0%|          | 0/11662 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Saved to merged_dataset\books_only\book100k-200k.parquet
----------------------------------------
----------------------------------------
CURRENTLY WORKING ON: 3


Filter:   0%|          | 0/6467 [00:00<?, ? examples/s]

Number of books before filtering: 6467
Number of books after filtering: 6467


Map:   0%|          | 0/6467 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/7 [00:00<?, ?ba/s]

Saved to merged_dataset\books_only\book1100k-1200k.parquet
----------------------------------------
----------------------------------------
CURRENTLY WORKING ON: 4


Filter:   0%|          | 0/6136 [00:00<?, ? examples/s]

Number of books before filtering: 6136
Number of books after filtering: 6136


Map:   0%|          | 0/6136 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/7 [00:00<?, ?ba/s]

Saved to merged_dataset\books_only\book1200k-1300k.parquet
----------------------------------------
----------------------------------------
CURRENTLY WORKING ON: 5


Filter:   0%|          | 0/5125 [00:00<?, ? examples/s]

Number of books before filtering: 5125
Number of books after filtering: 5125


Map:   0%|          | 0/5125 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Saved to merged_dataset\books_only\book1300k-1400k.parquet
----------------------------------------
----------------------------------------
CURRENTLY WORKING ON: 6


Filter:   0%|          | 0/4700 [00:00<?, ? examples/s]

Number of books before filtering: 4700
Number of books after filtering: 4700


Map:   0%|          | 0/4700 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Saved to merged_dataset\books_only\book1400k-1500k.parquet
----------------------------------------
----------------------------------------
CURRENTLY WORKING ON: 7


Filter:   0%|          | 0/3869 [00:00<?, ? examples/s]

Number of books before filtering: 3869
Number of books after filtering: 3869


Map:   0%|          | 0/3869 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Saved to merged_dataset\books_only\book1500k-1600k.parquet
----------------------------------------
----------------------------------------
CURRENTLY WORKING ON: 8


Filter:   0%|          | 0/3725 [00:00<?, ? examples/s]

Number of books before filtering: 3725
Number of books after filtering: 3725


Map:   0%|          | 0/3725 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Saved to merged_dataset\books_only\book1600k-1700k.parquet
----------------------------------------
----------------------------------------
CURRENTLY WORKING ON: 9


Filter:   0%|          | 0/3801 [00:00<?, ? examples/s]

Number of books before filtering: 3801
Number of books after filtering: 3801


Map:   0%|          | 0/3801 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Saved to merged_dataset\books_only\book1700k-1800k.parquet
----------------------------------------
----------------------------------------
CURRENTLY WORKING ON: 10


Filter:   0%|          | 0/4418 [00:00<?, ? examples/s]

Number of books before filtering: 4418
Number of books after filtering: 4418


Map:   0%|          | 0/4418 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Saved to merged_dataset\books_only\book1800k-1900k.parquet
----------------------------------------
----------------------------------------
CURRENTLY WORKING ON: 11


Filter:   0%|          | 0/4783 [00:00<?, ? examples/s]

Number of books before filtering: 4783
Number of books after filtering: 4783


Map:   0%|          | 0/4783 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Saved to merged_dataset\books_only\book1900k-2000k.parquet
----------------------------------------
----------------------------------------
CURRENTLY WORKING ON: 12


Filter:   0%|          | 0/38540 [00:00<?, ? examples/s]

Number of books before filtering: 38540
Number of books after filtering: 38540


Map:   0%|          | 0/38540 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/39 [00:00<?, ?ba/s]

Saved to merged_dataset\books_only\book2000k-3000k.parquet
----------------------------------------
----------------------------------------
CURRENTLY WORKING ON: 13


Filter:   0%|          | 0/10157 [00:00<?, ? examples/s]

Number of books before filtering: 10157
Number of books after filtering: 10157


Map:   0%|          | 0/10157 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/11 [00:00<?, ?ba/s]

Saved to merged_dataset\books_only\book200k-300k.parquet
----------------------------------------
----------------------------------------
CURRENTLY WORKING ON: 14


Filter:   0%|          | 0/21446 [00:00<?, ? examples/s]

Number of books before filtering: 21446
Number of books after filtering: 21446


Map:   0%|          | 0/21446 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/22 [00:00<?, ?ba/s]

Saved to merged_dataset\books_only\book3000k-4000k.parquet
----------------------------------------
----------------------------------------
CURRENTLY WORKING ON: 15


Filter:   0%|          | 0/18425 [00:00<?, ? examples/s]

Number of books before filtering: 18425
Number of books after filtering: 18425


Map:   0%|          | 0/18425 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/19 [00:00<?, ?ba/s]

Saved to merged_dataset\books_only\book4000k-5000k.parquet
----------------------------------------
----------------------------------------
CURRENTLY WORKING ON: 16


Filter:   0%|          | 0/8169 [00:00<?, ? examples/s]

Number of books before filtering: 8169
Number of books after filtering: 8169


Map:   0%|          | 0/8169 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

Saved to merged_dataset\books_only\book400k-500k.parquet
----------------------------------------
----------------------------------------
CURRENTLY WORKING ON: 17


Filter:   0%|          | 0/8889 [00:00<?, ? examples/s]

Number of books before filtering: 8889
Number of books after filtering: 8889


Map:   0%|          | 0/8889 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

Saved to merged_dataset\books_only\book500k-600k.parquet
----------------------------------------
----------------------------------------
CURRENTLY WORKING ON: 18


Filter:   0%|          | 0/7973 [00:00<?, ? examples/s]

Number of books before filtering: 7973
Number of books after filtering: 7973


Map:   0%|          | 0/7973 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Saved to merged_dataset\books_only\book600k-700k.parquet
----------------------------------------
----------------------------------------
CURRENTLY WORKING ON: 19


Filter:   0%|          | 0/7926 [00:00<?, ? examples/s]

Number of books before filtering: 7926
Number of books after filtering: 7926


Map:   0%|          | 0/7926 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Saved to merged_dataset\books_only\book700k-800k.parquet
----------------------------------------
----------------------------------------
CURRENTLY WORKING ON: 20


Filter:   0%|          | 0/9384 [00:00<?, ? examples/s]

Number of books before filtering: 9384
Number of books after filtering: 9384


Map:   0%|          | 0/9384 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

Saved to merged_dataset\books_only\book800k-900k.parquet
----------------------------------------
----------------------------------------
CURRENTLY WORKING ON: 21


Filter:   0%|          | 0/7325 [00:00<?, ? examples/s]

Number of books before filtering: 7325
Number of books after filtering: 7325


Map:   0%|          | 0/7325 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Saved to merged_dataset\books_only\book900k-1000k.parquet
----------------------------------------
----------------------------------------


## Creating a Single Dataframe

We will be creating a single big dataframe for all the books. This may later be cleaned and post-processed to create a final dataset.

In [2]:
# Collect every merged file written in the previous section, in lexicographic order.
all_books_dir = sorted(glob.glob("merged_dataset/books_only/book*.parquet"))
len(all_books_dir)

22

In [6]:
# Current column name -> more descriptive name used in the final dataframe.
rename_map = {
    "pages": "num_pages",
    "star_1": "star_rating_1",
    "star_2": "star_rating_2",
    "star_3": "star_rating_3",
    "star_4": "star_rating_4",
    "star_5": "star_rating_5",
    "rating": "average_rating",
    "book_num": "book_number_in_series",
}
# Columns removed before building the final dataframe.
columns_to_drop = [
        'id', 'book_id_1', 'score_1'
    ]

In [7]:
def read_and_preprocess(file_dir):
    """Read one merged parquet file and bring it into its final shape.

    The columns listed in the module-level ``columns_to_drop`` are removed,
    ``publication_date`` is cut at its first space, and the columns are renamed
    according to the module-level ``rename_map``.

    Args:
        file_dir: Path of the merged parquet file to read.

    Returns:
        The preprocessed ``pandas.DataFrame``.
    """
    return (
        pd.read_parquet(file_dir)
        .drop(columns=columns_to_drop)
        .assign(
            # Keep only the text before the first space. The `.str` accessor is
            # vectorised and leaves missing values missing instead of raising,
            # unlike a per-row `x.split(...)`.
            publication_date=lambda frame: frame["publication_date"]
            .str.split(" ", n=1)
            .str[0]
        )
        .rename(columns=rename_map)
    )

In [11]:
# Preprocess every merged file, then stack them into one dataframe with a fresh index.
dfs = []
for file_path in tqdm(all_books_dir, desc='Reading and preprocessing books'):
    df = read_and_preprocess(file_path)
    # Print each shape to check that all files end up with the same column count.
    print(df.shape)
    dfs.append(df)
final_df = pd.concat(dfs, ignore_index=True)

Reading and preprocessing books:   0%|          | 0/22 [00:00<?, ?it/s]

(16146, 18)
(6331, 18)
(11662, 18)
(6467, 18)
(6136, 18)
(5125, 18)
(4700, 18)
(3869, 18)
(3725, 18)
(3801, 18)
(4418, 18)
(4783, 18)
(38540, 18)
(10157, 18)
(21446, 18)
(18425, 18)
(8169, 18)
(8889, 18)
(7973, 18)
(7926, 18)
(9384, 18)
(7325, 18)


In [12]:
# Check dtypes and non-null counts of the combined dataframe.
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215397 entries, 0 to 215396
Data columns (total 18 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   num_pages              215397 non-null  int64  
 1   star_rating_1          215397 non-null  int64  
 2   star_rating_2          215397 non-null  int64  
 3   star_rating_3          215397 non-null  int64  
 4   star_rating_4          215397 non-null  int64  
 5   star_rating_5          215397 non-null  int64  
 6   average_rating         215397 non-null  float64
 7   total_ratings          215397 non-null  int64  
 8   total_reviews          215397 non-null  int64  
 9   isbn                   214196 non-null  object 
 10  publication_date       215397 non-null  object 
 11  title                  215397 non-null  object 
 12  series                 215397 non-null  object 
 13  book_number_in_series  215397 non-null  int64  
 14  authors                215397 non-nu

In this case, we get over 210k books!

In [13]:
# Persist the combined book dataset next to the per-file outputs.
final_df.to_parquet('merged_dataset/books_only/all_books.parquet')