In [1]:
import ast
import glob
import os
import re

import numpy as np
import pandas as pd
from datasets import Dataset
from datasets import load_dataset
from tqdm.auto import tqdm
tqdm.pandas()

This notebook builds the final dataset from the outputs of the previous steps, applying some light preprocessing along the way.


# Books


In [2]:
# Load the merged books table from parquet.
all_books = pd.read_parquet("merged_dataset/all_books.parquet")

In [3]:
# Column overview: names, non-null counts and dtypes.
all_books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60812 entries, 0 to 60811
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   num_pages              60812 non-null  int64  
 1   star_rating_1          60812 non-null  int64  
 2   star_rating_2          60812 non-null  int64  
 3   star_rating_3          60812 non-null  int64  
 4   star_rating_4          60812 non-null  int64  
 5   star_rating_5          60812 non-null  int64  
 6   average_rating         60812 non-null  float64
 7   total_ratings          60812 non-null  int64  
 8   total_reviews          60812 non-null  int64  
 9   isbn                   60812 non-null  object 
 10  publication_date       60812 non-null  object 
 11  title                  60812 non-null  object 
 12  series                 60812 non-null  object 
 13  book_number_in_series  60812 non-null  int64  
 14  authors                60812 non-null  object 
 15  ca

## Add Authors


In [4]:
# Read the author lookup table, discarding rows that have any missing value.
# Row labels are kept as read, so dropped rows leave gaps in the index.
authors = pd.read_csv("1_clean/authors.csv").dropna()
authors.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 654019 entries, 0 to 654020
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   author_id    654019 non-null  int64 
 1   author_name  654019 non-null  object
dtypes: int64(1), object(1)
memory usage: 15.0+ MB


In [5]:
# Preview the first rows of the author table.
authors.head()

Unnamed: 0,author_id,author_name
0,1,Mike Coburn
1,2,John Silvester
2,3,Andrew Rule
3,4,Julia Quinn
4,5,Andrew Hunter Murray


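The preview shows `author_id` equal to the row label plus one, so the name for author id `i` can be looked up at index label `i - 1`. This assumes the same alignment holds for the whole table.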


In [6]:
def get_author_name(author_ids):
    """Translate a stringified list of author ids into a list of author names.

    Parameters
    ----------
    author_ids : str
        Text of a Python literal sequence of ids, e.g. ``"[1, 4]"``.

    Returns
    -------
    list
        One entry per id, in input order. An id whose row label is missing
        from ``authors`` becomes ``'Unknown'``. If ``author_ids`` cannot be
        parsed as a literal (including when it is not a string at all),
        ``['Unknown']`` is returned.

    Raises
    ------
    TypeError, ValueError
        If the parsed value is not iterable or holds items ``int()`` rejects.

    Notes
    -----
    Names are read from the module-level ``authors`` frame at index label
    ``id - 1``; this assumes row labels line up with ``author_id - 1``
    (TODO confirm for the whole table).
    """
    try:
        # literal_eval accepts literals only, so text stored in the column can
        # never be executed as code (eval() would run any expression).
        parsed_ids = ast.literal_eval(author_ids)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return ['Unknown']
    names = []
    for id_ in [int(raw_id) for raw_id in parsed_ids]:
        try:
            names.append(authors.at[id_ - 1, 'author_name'])
        except KeyError:
            # No row carries that label (e.g. it was dropped for missing values).
            names.append('Unknown')
    return names

In [7]:
# Replace each book's author-id text with the list of author names.
# NOTE: run once per load. On a second run the column already holds lists,
# get_author_name fails to parse them and every row becomes ['Unknown'].
all_books["authors"] = all_books["authors"].progress_apply(get_author_name)

  0%|          | 0/60812 [00:00<?, ?it/s]

## Add Categories


In [8]:
# Read the category lookup table, discarding rows that have any missing value.
categories = pd.read_csv("1_clean/categories.csv").dropna()
categories.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2775 entries, 0 to 2774
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   category_id    2775 non-null   int64 
 1   category_name  2775 non-null   object
dtypes: int64(1), object(1)
memory usage: 43.5+ KB


In [9]:
def get_category_name(category_ids):
    """Translate a stringified list of category ids into category names.

    Parameters
    ----------
    category_ids : str
        Text of a Python literal sequence of ids, e.g. ``"[3, 17]"``.

    Returns
    -------
    list
        One entry per id, in input order, taken from the first row of the
        module-level ``categories`` frame whose ``category_id`` equals the id.
        An id with no matching row becomes ``'Unknown'``. If ``category_ids``
        cannot be parsed as a literal (including when it is not a string),
        ``['Unknown']`` is returned.

    Raises
    ------
    TypeError, ValueError
        If the parsed value is not iterable or holds items ``int()`` rejects.
    """
    try:
        # literal_eval accepts literals only, so text stored in the column can
        # never be executed as code (eval() would run any expression).
        parsed_ids = ast.literal_eval(category_ids)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return ['Unknown']
    names = []
    for id_ in [int(raw_id) for raw_id in parsed_ids]:
        matches = categories.loc[categories["category_id"] == id_, "category_name"]
        # Explicit emptiness check instead of catching every exception.
        names.append('Unknown' if matches.empty else matches.values[0])
    return names

In [10]:
# Replace each book's category-id text with the list of category names.
# NOTE: run once per load. On a second run the column already holds lists,
# get_category_name fails to parse them and every row becomes ['Unknown'].
all_books["categories"] = all_books["categories"].progress_apply(get_category_name)

  0%|          | 0/60812 [00:00<?, ?it/s]

In [11]:
# Preview the first three rows, transposed so each column shows up as a row.
all_books.head(3).T

Unnamed: 0,0,1,2
num_pages,870,309,352
star_rating_1,12455,108202,11896
star_rating_2,37005,130310,49353
star_rating_3,211781,567458,288821
star_rating_4,604283,1513191,706082
star_rating_5,1493113,4268227,1504505
average_rating,4.5,4.47,4.42
total_ratings,2358637,6587388,2560657
total_reviews,29770,75911,244
isbn,043935806X,1594130000,0439554896


## Add Format


In [12]:
# Read the format lookup table, discarding rows that have any missing value.
formats = pd.read_csv("1_clean/formats.csv").dropna()
formats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49 entries, 0 to 48
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   format_id    49 non-null     int64 
 1   format_name  49 non-null     object
dtypes: int64(1), object(1)
memory usage: 912.0+ bytes


In [13]:
# Spot-check the lookup table: the name stored for format id 1.
formats.loc[formats["format_id"] == 1, "format_name"].values[0]

'Paperback'

In [14]:
def get_format_name(format_id):
    """Translate a format id into its format name.

    Parameters
    ----------
    format_id : int or float
        Id to look up in the module-level ``formats`` frame. Missing values
        (anything ``pd.isna`` reports as missing) are accepted.

    Returns
    -------
    str
        The ``format_name`` of the first row whose ``format_id`` matches, or
        ``"Unknown"`` when the id is missing or has no row in ``formats``.

    Raises
    ------
    ValueError, TypeError
        If ``format_id`` is present but ``int()`` cannot convert it.
    """
    if pd.isna(format_id):
        return "Unknown"
    format_id = int(format_id)
    matches = formats.loc[formats["format_id"] == format_id, "format_name"]
    # An id absent from the table maps to "Unknown", like the author and
    # category helpers, instead of raising IndexError part-way through an apply.
    if matches.empty:
        return "Unknown"
    return matches.iloc[0]

In [15]:
# Replace each book's format id with the format name.
# NOTE: run once per load. On a second run the column already holds names,
# and get_format_name raises ValueError when it calls int() on them.
all_books["format"] = all_books["format"].progress_apply(get_format_name)

  0%|          | 0/60812 [00:00<?, ?it/s]

In [17]:
# Preview the first three rows, transposed so each column shows up as a row.
all_books.head(3).T

Unnamed: 0,0,1,2
num_pages,870,309,352
star_rating_1,12455,108202,11896
star_rating_2,37005,130310,49353
star_rating_3,211781,567458,288821
star_rating_4,604283,1513191,706082
star_rating_5,1493113,4268227,1504505
average_rating,4.5,4.47,4.42
total_ratings,2358637,6587388,2560657
total_reviews,29770,75911,244
isbn,043935806X,1594130000,0439554896


In [23]:
# Spell out the abbreviated column name.
all_books = all_books.rename(columns={"lang": "language"})

In [27]:
# Column layout for the published books table.
correct_order = [
    "title",
    "authors",
    "categories",
    "language",
    "format",
    "num_pages",
    "star_rating_1",
    "star_rating_2",
    "star_rating_3",
    "star_rating_4",
    "star_rating_5",
    "average_rating",
    "total_ratings",
    "total_reviews",
    "isbn",
    "publication_date",
    "series",
    "book_number_in_series",
]
# Guard against typos: no name listed twice, and the list names exactly the
# columns present. Comparing counts alone would let a misspelt name through.
assert len(correct_order) == len(set(correct_order)), "duplicate names in correct_order"
assert len(correct_order) == len(all_books.columns), "column count mismatch"
assert set(correct_order) == set(all_books.columns), (
    "correct_order and all_books.columns differ: "
    f"{sorted(set(correct_order) ^ set(all_books.columns))}"
)

In [28]:
# Put the columns in the order listed in correct_order.
all_books = all_books.loc[:, correct_order]

In [31]:
# Create the output folder if it is missing (fresh checkout), then write the
# books table. index=False matches how the users and ratings tables are written.
os.makedirs('final_dataset', exist_ok=True)
all_books.to_parquet('final_dataset/books.parquet', index=False)

# Users


In [19]:
# Load the users table from 3_clean and preview it.
users = pd.read_csv("3_clean/Users.csv")
users.head()

Unnamed: 0,user_id,location,age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


No changes needed.


In [32]:
# Write the users table as loaded, without the index.
users.to_parquet('final_dataset/users.parquet', index=False)

# Ratings


In [20]:
# Load the ratings table from 3_clean and preview it.
# NOTE(review): the preview shows isbn values such as 195153448 with no leading
# zero, while the books table stores isbn as text (e.g. 043935806X). Confirm
# whether isbn should be read with dtype=str so the two tables can be joined.
ratings = pd.read_csv("3_clean/Ratings.csv")
ratings.head()

Unnamed: 0,user_id,isbn,provided_rating
0,2,195153448,0
1,7,34542252,0
2,8,771025661,0
3,8,1881320189,7
4,8,1575663937,6


Again, no changes required.

In [33]:
# Write the ratings table as loaded, without the index.
ratings.to_parquet('final_dataset/ratings.parquet', index=False)

## Books Only

In [37]:
# Load the books-only variant of the merged books table from parquet.
books = pd.read_parquet("merged_dataset/books_only/all_books.parquet")

In [38]:
# Column overview: names, non-null counts and dtypes.
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215397 entries, 0 to 215396
Data columns (total 18 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   num_pages              215397 non-null  int64  
 1   star_rating_1          215397 non-null  int64  
 2   star_rating_2          215397 non-null  int64  
 3   star_rating_3          215397 non-null  int64  
 4   star_rating_4          215397 non-null  int64  
 5   star_rating_5          215397 non-null  int64  
 6   average_rating         215397 non-null  float64
 7   total_ratings          215397 non-null  int64  
 8   total_reviews          215397 non-null  int64  
 9   isbn                   214196 non-null  object 
 10  publication_date       215397 non-null  object 
 11  title                  215397 non-null  object 
 12  series                 215397 non-null  object 
 13  book_number_in_series  215397 non-null  int64  
 14  authors                215397 non-nu

In [39]:
# Replace author-id text with lists of author names.
# NOTE: run once per load; a second run turns every row into ['Unknown'].
books["authors"] = books["authors"].progress_apply(get_author_name)

  0%|          | 0/215397 [00:00<?, ?it/s]

In [41]:
# Replace category-id text with lists of category names.
# NOTE: run once per load; a second run turns every row into ['Unknown'].
books["categories"] = books["categories"].progress_apply(get_category_name)

  0%|          | 0/215397 [00:00<?, ?it/s]

In [44]:
# Replace format ids with format names.
# NOTE: run once per load; a second run calls int() on the names and raises.
books["format"] = books["format"].progress_apply(get_format_name)

  0%|          | 0/215397 [00:00<?, ?it/s]

In [48]:
# Create the output folder if it is missing (fresh checkout), then write the
# full books table. index=False matches the users and ratings writes.
os.makedirs('final_dataset', exist_ok=True)
books.to_parquet('final_dataset/books_all.parquet', index=False)