# Data Transform

## 0 Global settings

In [1]:
import os
import sys
import pandas as pd

# Report the interpreter and pandas versions, then confirm the setup finished
print(
    f"System version: {sys.version}",
    f"Pandas version: {pd.__version__}",
    "Setup Complete",
    sep="\n",
)


System version: 3.11.5 (tags/v3.11.5:cce6ba9, Aug 24 2023, 14:38:34) [MSC v.1936 64 bit (AMD64)]
Pandas version: 2.2.2
Setup Complete


## 1 Load Datasets

### 1.1 Specify Dataset Directory

In [2]:
# Relative locations of the raw, interim, and processed data directories
raw_path, interim_path, processed_path = (
    '../data/raw',
    '../data/interim',
    '../data/processed',
)


### 1.2 Load Datasets of Books, Genres, and Authors

In [3]:
# Read the curated books, genres, and authors CSV files from the interim directory
books_df, genres_df, authors_df = (
    pd.read_csv(os.path.join(interim_path, csv_name))
    for csv_name in ('books_curated.csv', 'genres_curated.csv', 'authors_curated.csv')
)


## 2 Item Features Dataset

In [4]:
def transform_items_data(books: pd.DataFrame, authors: pd.DataFrame, genres: pd.DataFrame) -> pd.DataFrame:
    """
    Build a single item table by joining books with their authors and genres.

    Args:
        books (pandas.DataFrame): Book records; joined to `authors` on 'author_id'.
        authors (pandas.DataFrame): Author records; joined to `books` on 'author_id'.
        genres (pandas.DataFrame): Genre records; joined to the result on 'book_id'.

    Returns:
        pandas.DataFrame: A new DataFrame with the columns 'book_id', 'title',
        'author', 'genres' and 'publication_year', where every ', ' inside
        'genres' has been replaced by '|'.
    """
    output_columns = ['book_id', 'title', 'name', 'genres', 'publication_year']

    # Attach author details via 'author_id', then genre details via 'book_id'
    merged = books.merge(authors, on='author_id').merge(genres, on='book_id')

    # Keep only the output columns, in their final order, and expose 'name' as 'author'
    items = merged[output_columns].rename(columns={'name': 'author'})

    # Use '|' instead of ', ' as the separator inside the 'genres' column
    items['genres'] = items['genres'].str.replace(', ', '|')

    return items


In [5]:
# Build the item features dataset from the curated books, authors, and genres
item_feature_df = transform_items_data(
    books=books_df,
    authors=authors_df,
    genres=genres_df,
)
item_feature_df.head()


Unnamed: 0,book_id,title,author,genres,publication_year
0,581169,The Wind in the Willows,Kenneth Grahame,fiction|children|fantasy|paranormal|young-adult,2009
1,65832,Anne of Green Gables,L.M. Montgomery,fiction|young-adult|children|history|historica...,1984
2,6987114,Mary Louise and the Liberty Girls,Edith Van Dyne,mystery|thriller|crime|fiction|children,2009
3,852724,Heidi,Johanna Spyri,children|fiction|young-adult|history|historica...,1998
4,15983158,The Little Lame Prince,Dinah Maria Mulock Craik,fantasy|paranormal|fiction|children|young-adult,1874


In [6]:
def save_to_csv(df: pd.DataFrame, path: str, filename: str) -> None:
    """
    Saves a DataFrame to a CSV file, creating the target directory if needed.

    Args:
        df (pd.DataFrame): The DataFrame to save. Its index is not written.
        path (str): The directory to save the file in. An empty string means
            the current working directory.
        filename (str): The name of the file.

    Returns:
        None
    """
    # An empty path means "current directory": os.makedirs('') would raise,
    # so directory creation is only attempted for a non-empty path.
    if path and not os.path.exists(path):
        # exist_ok=True tolerates the directory being created by someone else
        # between the existence check above and this call.
        os.makedirs(path, exist_ok=True)
        print(f"Path '{path}' created.")

    # Create the full file path
    file_path = os.path.join(path, filename)

    # Save the DataFrame to a CSV file without the index column
    df.to_csv(file_path, index=False)

    print(f"{filename} file saved in {file_path}")

In [7]:
# Write the item features dataset to the processed data directory
save_to_csv(df=item_feature_df, path=processed_path, filename='item-features.csv')


item-features.csv file saved in ../data/processed\item-features.csv


## 3 User-Item-Ratings Dataset

In [8]:
def transform_iteractions_data(path: str, chunk_size: int, books: pd.DataFrame) -> pd.DataFrame:
    """
    Processes a JSON Lines file of book interactions in chunks and returns one DataFrame.

    Each chunk is reduced to the 'user_id', 'book_id' and 'rating' columns and
    filtered to the books listed in `books` before it is kept, so only the
    filtered rows are accumulated in memory.

    Args:
        path (str): The path to the JSON Lines file.
        chunk_size (int): The number of lines to read per chunk.
        books (pd.DataFrame): DataFrame whose 'book_id' column holds the IDs
            of the books to keep.

    Returns:
        pd.DataFrame: The de-duplicated interactions with the columns
        'user_id', 'book_id' and 'rating'. When the file yields no chunks,
        an empty DataFrame with those columns is returned.

    Raises:
        FileNotFoundError: Propagated unchanged from the reader when `path`
            cannot be found.
        KeyError: If `books` has no 'book_id' column or a chunk lacks one of
            the three expected columns.
    """
    columns = ["user_id", "book_id", "rating"]

    # Compute the wanted IDs once so they are not re-converted for every chunk
    wanted_ids = books["book_id"].unique()

    chunks = []

    # The chunked reader keeps a file handle open; the context manager closes
    # it even if processing a chunk raises.
    with pd.read_json(path, lines=True, chunksize=chunk_size) as reader:
        for counter, chunk in enumerate(reader, start=1):
            chunk = chunk[columns]
            chunks.append(chunk[chunk["book_id"].isin(wanted_ids)])

            # Show the number of chunks processed so far
            print(f"Chunks processed: {counter}")

    if chunks:
        # Concatenate the processed chunks and drop any duplicate rows
        df = pd.concat(chunks, ignore_index=True).drop_duplicates()
    else:
        # pd.concat rejects an empty list, so build the empty result directly
        df = pd.DataFrame(columns=columns)

    print("Done")

    return df


In [9]:
# Build the user-item-ratings dataset from the raw interactions file
path_data = os.path.join(raw_path, 'goodreads_interactions_children.json.gz')
interactions_df = transform_iteractions_data(
    path=path_data,
    chunk_size=1_000_000,
    books=books_df,
)


Chunks processed: 1
Chunks processed: 2
Chunks processed: 3
Chunks processed: 4
Chunks processed: 5
Chunks processed: 6
Chunks processed: 7
Chunks processed: 8
Chunks processed: 9
Chunks processed: 10
Chunks processed: 11


In [10]:
# Write the user-item-ratings dataset to the processed data directory
save_to_csv(df=interactions_df, path=processed_path, filename='user-item-ratings.csv')


user-item-ratings.csv file saved in /content/drive/MyDrive/Data/processed/user-item-ratings.csv
