# Data Exploration

## 0 Global settings

In [1]:
import gzip
import json
import os
import re
import sys
from typing import Dict, List, Optional

import numpy as np
import pandas as pd

# Report the interpreter and library versions used for this run
print(f"System version: {sys.version}")
print(f"Pandas version: {pd.__version__}")
print(f"Numpy version: {np.__version__}")
# `json.__version__` and `re.__version__` are not part of the documented API of
# these standard-library modules and may be missing on other interpreters, so
# read them defensively instead of letting the setup cell fail.
print(f"json version: {getattr(json, '__version__', 'n/a')}")
print(f"re version: {getattr(re, '__version__', 'n/a')}")
print("Setup Complete")


System version: 3.11.5 (tags/v3.11.5:cce6ba9, Aug 24 2023, 14:38:34) [MSC v.1936 64 bit (AMD64)]
Pandas version: 2.2.2
Numpy version: 1.26.4
json version: 2.0.9
re version: 2.2.1
Setup Complete


## 1 Load Datasets

### 1.1 Specify Dataset Directory

In [2]:
# Relative locations of the raw, interim and processed data folders
raw_path, interim_path, processed_path = (
    f'../data/{stage}' for stage in ('raw', 'interim', 'processed')
)


### 1.2 Load Datasets: books, genres, authors, and pg_catalog

In [3]:
def load_data(file_path: str) -> List[Dict]:
    """
    Load records from a gzipped JSON Lines file (one JSON document per line).

    The file is always decoded as UTF-8, independent of the platform's default
    encoding, and blank lines are skipped.

    Args:
        file_path (str): The path to the gzipped JSON file.

    Returns:
        list: One parsed record per non-blank line. An empty list is returned
        (and a message printed) when the file does not exist or its content
        cannot be decoded.
    """
    try:
        # Explicit encoding: text mode would otherwise use the locale default
        # (e.g. cp1252 on Windows) and mis-decode non-ASCII characters.
        with gzip.open(file_path, 'rt', encoding='utf-8') as file:
            # Skip blank lines so a stray empty line does not discard the
            # whole file through a JSONDecodeError.
            data = [json.loads(line) for line in file if line.strip()]
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
        return []
    except (json.JSONDecodeError, UnicodeDecodeError):
        print(f"Error: Unable to decode data in '{file_path}'.")
        return []
    return data


In [4]:
# Parse the three gzipped JSON dumps found in the raw data folder
books, genres, authors = (
    load_data(os.path.join(raw_path, archive))
    for archive in (
        'goodreads_books_children.json.gz',
        'goodreads_book_genres_initial.json.gz',
        'goodreads_book_authors.json.gz',
    )
)


In [5]:
# Turn the parsed records into DataFrames.
# Books are flattened on their "authors" list: one row per author entry,
# with the book-level fields repeated alongside it.
books_df = pd.json_normalize(
    books,
    record_path="authors",
    meta=["book_id", "title", "publication_year"],
)
genres_df = pd.DataFrame(data=genres)
authors_df = pd.DataFrame(data=authors)


In [6]:
# Read the Project Gutenberg catalog CSV (public domain books) from the raw data folder
pgbooks_df = pd.read_csv(
    os.path.join(raw_path, 'pg_catalog.csv'),
    low_memory=False,
)


## 2 Preprocessing Data

### 2.1 Project Gutenberg Catalog

In [7]:
def clean_data_pg(df: pd.DataFrame) -> pd.DataFrame:
    """
    Reduce the Project Gutenberg catalog to English-language children's story texts.

    Args:
        df (pd.DataFrame): The catalog DataFrame. It must contain the columns
            'Text#', 'Type', 'Issued', 'Title', 'Language', 'Subjects', 'LoCC'
            and 'Bookshelves'.

    Returns:
        pd.DataFrame: The matching rows, with 'Title' stored as pandas string
        dtype and the helper columns removed.

    Steps:
    1. Keep rows whose 'Subjects' contains the literal text "Children's stories",
        whose 'Type' equals 'Text' and whose 'Language' equals 'en'.
    2. Cast 'Title' to the pandas 'string' dtype.
    3. Remove 'Text#', 'Type', 'Issued', 'Subjects', 'LoCC', 'Bookshelves' and 'Language'.
    """
    # One boolean Series per criterion; missing subjects count as "no match"
    is_childrens_story = df['Subjects'].str.contains(
        "Children's stories", regex=False, na=False)
    is_text = df['Type'].eq("Text")
    is_english = df['Language'].eq("en")

    selected = df.loc[is_childrens_story & is_text & is_english]

    # Titles become pandas strings before the helper columns are removed
    selected = selected.astype({'Title': 'string'})

    return selected.drop(
        columns=['Text#', 'Type', 'Issued', 'Subjects',
                 'LoCC', 'Bookshelves', 'Language']
    )


In [8]:
# Filter the catalog (working on a copy of the loaded frame) and preview the result
public_domain_books = clean_data_pg(df=pgbooks_df.copy())
public_domain_books.head()


Unnamed: 0,Title,Authors
10,Alice's Adventures in Wonderland,"Carroll, Lewis, 1832-1898"
11,Through the Looking-Glass,"Carroll, Lewis, 1832-1898"
56,Aladdin and the Magic Lamp,Unknown
127,The Arabian Nights Entertainments,"Lang, Andrew, 1844-1912"
162,Flower Fables,"Alcott, Louisa May, 1832-1888"


In [9]:
def save_to_csv(df: pd.DataFrame, path: str, filename: str) -> None:
    """
    Write a DataFrame to ``path/filename`` as CSV, without the index column.

    The target directory is created (and reported) when it does not exist yet,
    and a confirmation message is printed once the file has been written.

    Args:
        df (pd.DataFrame): The DataFrame to write.
        path (str): Directory that will hold the file.
        filename (str): Name of the CSV file inside ``path``.

    Returns:
        None
    """
    directory_missing = not os.path.exists(path)
    if directory_missing:
        # Build the whole directory chain before writing into it
        os.makedirs(path)
        print(f"Path '{path}' created.")

    destination = os.path.join(path, filename)
    df.to_csv(destination, index=False)

    print(f"{filename} file saved in {destination}")


In [21]:
# Persist the filtered catalog in the interim data folder
save_to_csv(
    df=public_domain_books,
    path=interim_path,
    filename='public_domain_books.csv',
)


public_domain_books.csv file saved in ../data/interim\public_domain_books.csv


### 2.2 Books Features

In [12]:
def clean_books_data(
    df: pd.DataFrame,
    public_domain_books_df: Optional[pd.DataFrame] = None,
) -> pd.DataFrame:
    """
    Clean the books DataFrame and keep only titles present in the public domain catalog.

    Args:
        df (pd.DataFrame): The input DataFrame containing the books data. It must
            provide the columns 'role', 'author_id', 'book_id', 'title' and
            'publication_year'.
        public_domain_books_df (pd.DataFrame, optional): A DataFrame whose 'Title'
            column lists the public domain titles. When omitted, the notebook-level
            ``public_domain_books`` DataFrame is used, which keeps existing
            single-argument calls working.

    Returns:
        pd.DataFrame: The cleaned DataFrame. The input DataFrame is not modified.

    This function performs the following operations:
    1. Drops the 'role' column.
    2. Replaces empty strings with missing values (pd.NA).
    3. Drops rows with missing values in the 'publication_year' column.
    4. Drops duplicate rows based on the 'title' column (the first occurrence is kept).
    5. Keeps only rows whose 'title' exactly matches a 'Title' of the public domain catalog.
    6. Resets the index.
    7. Converts the 'book_id' column to string and the 'publication_year' column to int32.
    8. Reorders the columns to ['book_id', 'author_id', 'title', 'publication_year'].
    """
    # Fall back to the notebook-level catalog when no explicit one is given
    if public_domain_books_df is None:
        public_domain_books_df = public_domain_books

    # Drop column: 'role'
    df = df.drop(columns=['role'])

    # Treat empty strings as missing values so dropna() can remove them
    df = df.replace("", pd.NA)

    # Drop rows with missing data in column: 'publication_year'
    df = df.dropna(subset=['publication_year'])

    # Drop duplicate rows in column: 'title'
    df = df.drop_duplicates(subset=['title'])

    # Filter for books in the public domain (exact title match)
    public_domain_titles = set(public_domain_books_df['Title'])
    df = df[df['title'].isin(public_domain_titles)]

    # Reset index without mutating the filtered selection in place
    df = df.reset_index(drop=True)

    # Convert 'publication_year' to int and 'book_id' to string
    df = df.astype({'book_id': 'string', 'publication_year': 'int32'})

    # Reorder columns
    return df[['book_id', 'author_id', 'title', 'publication_year']]


In [13]:
# Run the books cleaning step on a copy of the loaded frame and preview the result
books_df_clean = clean_books_data(df=books_df.copy())
books_df_clean.head()


Unnamed: 0,book_id,author_id,title,publication_year
0,581169,3843,The Wind in the Willows,2009
1,11881251,3843,Dream Days,2010
2,20256651,64709,'Twas the Night Before Christmas,2014
3,2223427,17538,A Traveler in Time,1964
4,2429311,1082741,Indian Tales,2007


In [14]:
# Persist the cleaned books in the interim data folder
save_to_csv(
    df=books_df_clean,
    path=interim_path,
    filename='books_curated.csv',
)


books_curated.csv file saved in ../data/interim\books_curated.csv


### 2.3 Books Genres 

In [15]:
def clean_genres_data(
    df: pd.DataFrame,
    books_df: Optional[pd.DataFrame] = None,
) -> pd.DataFrame:
    """
    Clean the genres DataFrame and keep only the books present in the cleaned books data.

    Args:
        df (pd.DataFrame): The input DataFrame containing the 'book_id' and 'genres'
            columns, where each 'genres' value is a dictionary keyed by genre name.
        books_df (pd.DataFrame, optional): The cleaned books DataFrame whose 'book_id'
            column lists the books to keep. When omitted, the notebook-level
            ``books_df_clean`` DataFrame is used, which keeps existing
            single-argument calls working.

    Returns:
        pd.DataFrame: A new DataFrame with 'book_id' as text and 'genres' as a
        comma-separated string of genre names. The input DataFrame is not modified.

    This function performs the following operations:
    1. Converts the 'book_id' column to string type.
    2. Filters the DataFrame to include only book IDs found in the books data.
    3. Converts each 'genres' dictionary to a comma-separated string of its keys.
    4. Resets the index of the resulting DataFrame.
    """
    # Fall back to the notebook-level cleaned books when none is given
    if books_df is None:
        books_df = books_df_clean

    # assign() builds a new frame, so the caller's DataFrame is left untouched
    df = df.assign(book_id=df['book_id'].astype(str))

    # Filter for book ids present in the books data (compared as text)
    books_ids = set(books_df['book_id'].astype(str))
    # Explicit copy: the column assignment below must target an independent
    # frame rather than a slice of the unfiltered data
    df = df[df['book_id'].isin(books_ids)].copy()

    # Joining a dict iterates over its keys, i.e. the genre names; values that
    # are neither a dict nor a list (e.g. missing data) are passed through as-is
    df['genres'] = df['genres'].apply(
        lambda x: ', '.join(x) if isinstance(x, (dict, list)) else x)

    # Reset index
    return df.reset_index(drop=True)


In [16]:
# Run the genres cleaning step on a copy of the loaded frame and preview the result
genres_df_clean = clean_genres_data(df=genres_df.copy())
genres_df_clean.head()


Unnamed: 0,book_id,genres
0,581169,"fiction, children, fantasy, paranormal, young-..."
1,11881251,"fiction, fantasy, paranormal, children, young-..."
2,20256651,"children, poetry, fiction, fantasy, paranormal..."
3,2223427,"fiction, history, historical fiction, biograph..."
4,2429311,"children, history, historical fiction, biograp..."


In [17]:
# Persist the cleaned genres in the interim data folder
save_to_csv(
    df=genres_df_clean,
    path=interim_path,
    filename='genres_curated.csv',
)


genres_curated.csv file saved in ../data/interim\genres_curated.csv


### 2.4 Books Authors

In [18]:
def clean_authors_data(
    df: pd.DataFrame,
    books_df: Optional[pd.DataFrame] = None,
) -> pd.DataFrame:
    """
    Clean the authors data by dropping unnecessary columns, converting data types,
    and filtering for authors present in the books data.

    Args:
        df (pd.DataFrame): The authors data DataFrame. It must provide the columns
            'author_id', 'name', 'average_rating', 'text_reviews_count' and
            'ratings_count'.
        books_df (pd.DataFrame, optional): The cleaned books DataFrame whose
            'author_id' column lists the authors to keep. When omitted, the
            notebook-level ``books_df_clean`` DataFrame is used, which keeps
            existing single-argument calls working.

    Returns:
        pd.DataFrame: The cleaned authors data DataFrame. The input DataFrame is
        not modified.
    """
    # Fall back to the notebook-level cleaned books when none is given
    if books_df is None:
        books_df = books_df_clean

    # Drop unnecessary columns
    columns_to_drop = ['average_rating',
                       'text_reviews_count', 'ratings_count']
    df = df.drop(columns=columns_to_drop)

    # Change column type to string for columns: 'author_id', 'name'
    df = df.astype({'author_id': 'string', 'name': 'string'})

    # Keep only authors referenced by the books data (ids compared as text)
    authors_ids = set(books_df['author_id'].astype(str))
    df = df[df['author_id'].isin(authors_ids)]

    # Reset index without mutating the filtered selection in place
    return df.reset_index(drop=True)

In [19]:
# Run the authors cleaning step on a copy of the loaded frame and preview the result
authors_df_clean = clean_authors_data(df=authors_df.copy())
authors_df_clean.head()


Unnamed: 0,author_id,name
0,3041852,Alfred J. Church
1,5411,Cynthia Rylant
2,11593,Beatrix Potter
3,8164,Lewis Carroll
4,3420,Elizabeth Enright


In [20]:
# Persist the cleaned authors in the interim data folder
save_to_csv(
    df=authors_df_clean,
    path=interim_path,
    filename='authors_curated.csv',
)


authors_curated.csv file saved in ../data/interim\authors_curated.csv
