# Datasets Pre-Processing
### Importing Libraries

In [36]:
import os
import concurrent.futures
import pandas as pd
import polars as pl
from tqdm import tqdm
from datasets import load_dataset

# Downloading Datasets and saving them locally
This section is from older versions of the code. Once I had the originals I started using the next version.

In [37]:
# Control variables for code execution
DOWNLOAD_ORIGINAL_DATASETS = False
SAVE_LOCAL_DATASETS = False

if DOWNLOAD_ORIGINAL_DATASETS:
    # Get all available configurations (sub-categories)
    categories = [
        "raw_review_All_Beauty",
        "raw_review_Electronics",
        "raw_review_Office_Products",
        "raw_meta_All_Beauty",
        "raw_meta_Electronics",
        "raw_meta_Office_Products",
    ]

    # Initialize an empty list to hold datasets
    datasets_list = []

    # Loop over each configuration and download the dataset
    for category in categories:
        dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", category)
        datasets_list.append(dataset)

if SAVE_LOCAL_DATASETS:
    # Save each dataset locally
    for ds, category in zip(datasets_list, categories):
        ds.save_to_disk(f"./{category}_dataset")

### Loading all individual Dataframes from local CSVs
Executed in parallel to reduce load time

In [38]:
# Parallel loading of pre-saved dataframes
file_paths = [
    "./dataframes/small_df.csv",
    "./dataframes/beauty_df.csv",
    "./dataframes/electronics_df.csv",
    "./dataframes/office_df.csv",
]


def load_csv(path: str) -> pl.DataFrame:
    """
    Loads a csv file into a polars dataframe

    Args:
        path (str): Path to the csv file

    Returns:
        pl.DataFrame
    """
    return pl.read_csv(path)


with concurrent.futures.ThreadPoolExecutor() as executor:
    dataframes = list(executor.map(load_csv, file_paths))

# Pre-Processing
### Defining Functions for Removing short Reviews, Cleaning nulls and Duplicates ans Slicing dataframes

In [39]:
# Remove rows where the text of the review is too short
def remove_short_reviews(dataframe: pl.DataFrame, min_word_count: int) -> pl.DataFrame:
    """
    Remove rows from a polars dataframe where the text of the 'text' column
    is shorter than 'min_word_count'

    Args:
        dataframe (pl.DataFrame)
        min_word_count (int)

    Returns:
        pl.DataFrame
    """
    return dataframe.filter(pl.col("text").str.split(" ").list.len() >= min_word_count)


# Remove Duplicates and Nulls.
# Lazy is used for more efficient execution of the multiple operations in the dataframes
def clean_data(dataframe: pl.DataFrame) -> pl.DataFrame:
    """
    Removes Duplicates and None values from a pandas dataframe

    Args:
        dataframe (pl.DataFrame)

    Returns:
        pl.DataFrame
    """
    df_cleaned = dataframe.lazy().unique(subset=["text"]).drop_nulls()
    return df_cleaned.collect()


# Split into n parts using slicing
# Used in previous version sfor processing large datasets
def slice_pl_df(dataframe: pl.DataFrame, slices: int) -> list[pl.DataFrame]:
    """
    Slices a polars dataframes into 'slices' parts

    Args:
        dataframe (pl.DataFrame)
        slices (int): Number of slices or parts

    Returns:
        list[pl.DataFrame]: 
    """
    slice_size = dataframe.shape[0] // slices
    dfs = []
    for df_slice in tqdm(range(slices)):
        start_idx = df_slice * slice_size
        if df_slice == slices - 1:  # Ensure the last slice takes all remaining rows
            dfs.append(dataframe[start_idx:])
        else:
            end_idx = (df_slice + 1) * slice_size
            dfs.append(dataframe[start_idx:end_idx])

    return dfs

### Processing each Dataframe

In [40]:
# Drop unwanted columns and null values
selected_columns = [
    "title",
    "text",
    "rating",
    "id",
    "parent_asin",
    "name",
    "categories",
]
dataframes = [df[selected_columns] for df in dataframes]

# Slice large dataframes into smaller chunks to improve memory efficiency
for i, df in enumerate(tqdm(dataframes, desc="Slicing dataframes")):
    if df.height > 1_000_000:
        dataframes[i] = slice_pl_df(df, 10)

# Clean and preprocess the dataframes.
# If a dataframe was sliced in the previous step, clean and preprocess each slice separately
for i, df in enumerate(tqdm(dataframes, desc="Cleaning dataframes")):
    if isinstance(df, list):
        cleaned_dfs = []
        for sub_df in df:
            sub_df = clean_data(sub_df)
            sub_df = remove_short_reviews(sub_df, 25)
            cleaned_dfs.append(sub_df)
        dataframes[i] = pl.concat(cleaned_dfs)
    else:
        df = clean_data(df)
        df = remove_short_reviews(df, 25)
        dataframes[i] = df

# Convert back to pandas for further processing.
# I didn't have the time to adapt the next cell to use polars or my slicing function properly.
dataframes = [
    df.to_pandas() for df in tqdm(dataframes, desc="Converting back to pandas")
]

100%|██████████| 10/10 [00:00<00:00, 9986.44it/s], ?it/s]
100%|██████████| 10/10 [00:00<?, ?it/s]
Slicing dataframes: 100%|██████████| 4/4 [00:00<00:00, 251.85it/s]
Cleaning dataframes: 100%|██████████| 4/4 [00:29<00:00,  7.46s/it]
Converting back to pandas: 100%|██████████| 4/4 [00:48<00:00, 12.23s/it]


# Create the final dataset
The individual datasets are combined into one by sampling with an equal distribution of review ratings and individual dataframes

In [43]:
# Constants
FINAL_DF_ROWS = 120_000
LARGE_SAMPLE_SIZE = 1_000_000
groups = ["Negative", "Neutral", "Positive"]
rating_to_group = {
    1: "Negative",
    2: "Negative",
    3: "Neutral",
    4: "Positive",
    5: "Positive",
}

# Replace the large datasets with sampled subsets
small_dataframes = [
    (
        df.sample(n=LARGE_SAMPLE_SIZE, random_state=42)
        if df.shape[0] > LARGE_SAMPLE_SIZE
        else df
    )
    for df in tqdm(dataframes, desc="Sampling large dataframes")
]

# Assign each rating to a group
small_dataframes = [
    df.assign(group=df["rating"].map(rating_to_group)) for df in small_dataframes
]

# Determine total required rows per group
all_group_counts = pd.concat([df["group"] for df in small_dataframes]).value_counts()
rows_per_group = FINAL_DF_ROWS // len(all_group_counts)

# Initialize a list to collect samples
final_samples = []

# Allocate and sample data for each group
for group in tqdm(groups, desc="Sampling groups"):
    # Calculate the total available rows for the group across all datasets
    total_available = sum(df[df["group"] == group].shape[0] for df in small_dataframes)
    required_rows_adjusted = min(rows_per_group, total_available)

    # Ideally, each dataset contributes equally
    per_dataset_required = required_rows_adjusted // len(small_dataframes)

    # Sample data from each dataset for the group
    for df in small_dataframes:
        df_group = df[df["group"] == group]
        available_rows = df_group.shape[0]
        sample_size = min(per_dataset_required, available_rows)
        if sample_size > 0:
            sample_df = df_group.sample(n=sample_size, random_state=42)
            final_samples.append(sample_df)

# Concatenate all sampled data into the final DataFrame and drop the 'group' column
final_df = pd.concat(final_samples, ignore_index=True)
final_df = final_df.drop(columns=["group"]).reset_index(drop=True)

# Verify the final DataFrame
print(f"\nFinal DataFrame shape: {final_df.shape}")
print("\nGroup distribution:")
print(final_df["group"].value_counts())

Sampling large dataframes: 100%|██████████| 4/4 [00:00<00:00,  4.03it/s]
Sampling groups: 100%|██████████| 3/3 [00:01<00:00,  2.41it/s]


Final DataFrame shape: (97115, 8)

Group distribution:
group
Positive    36357
Neutral     30464
Negative    30294
Name: count, dtype: int64





# Save as a CSV file

In [7]:
# Save to CSV or Pickle atomically
FORMAT = "csv"

if FORMAT == "csv":
    final_df.to_csv("final_dataset.csv.tmp", index=False)
    os.replace("final_dataset.csv.tmp", "final_dataset.csv")
else:
    final_df.to_pickle("final_dataset.pkl.tmp")
    os.replace("final_dataset.pkl.tmp", "final_dataset.pkl")