In [None]:
import pandas as pd

# Notebook-wide settings: where the raw CSV is read from, where the JSON
# output is written, and the random seed used for sampling/shuffling.
CONFIG = dict(
    dataset_path="IMDB Dataset.csv",
    output_path="../../../internal_classifier_data/sentiment/IMDB.json",
    seed=42,
)

def load_and_preprocess_data(config):
    """
    Load the IMDB reviews CSV and return a class-balanced, shuffled DataFrame.

    Only rows whose ``sentiment`` is exactly ``"positive"`` or ``"negative"``
    are kept. Each class is randomly sampled down to the size of the smaller
    class, the two samples are concatenated and shuffled, sentiments are
    encoded as integers, and the columns are renamed.

    Args:
        config (dict): Must provide ``"dataset_path"`` (CSV file containing
            ``review`` and ``sentiment`` columns) and ``"seed"`` (random state
            used for every sampling/shuffling step). Other keys are ignored.

    Returns:
        DataFrame: Two columns, ``text`` (the review) and ``label``
        (int64: 1 for positive, 0 for negative). The index still carries the
        row labels assigned when the CSV was read, in shuffled order.
    """
    label_by_sentiment = {"positive": 1, "negative": 0}
    seed = config["seed"]

    # Load the dataset and keep only the columns used downstream
    df_train = pd.read_csv(config["dataset_path"])
    df_train = df_train[["review", "sentiment"]]

    # Separate positive and negative reviews
    df_positive = df_train[df_train["sentiment"] == "positive"]
    df_negative = df_train[df_train["sentiment"] == "negative"]

    # Balance the classes by sampling both down to the smaller class size
    min_count = min(len(df_positive), len(df_negative))
    df_positive_sampled = df_positive.sample(n=min_count, random_state=seed)
    df_negative_sampled = df_negative.sample(n=min_count, random_state=seed)

    # Merge samples and reshuffle
    df_balanced = pd.concat([df_positive_sampled, df_negative_sampled]).sample(
        frac=1, random_state=seed
    )

    # Encode labels with an explicit mapping and dtype. Series.replace on an
    # object column relies on implicit downcasting, which pandas deprecates
    # (FutureWarning in 2.2+). Every remaining value is a key of the mapping
    # because of the filters above, so the integer cast cannot hit NaN.
    labels = df_balanced["sentiment"].map(label_by_sentiment).astype("int64")
    df_balanced = df_balanced.assign(sentiment=labels)

    # Rename columns to more clearly reflect content
    return df_balanced.rename(columns={"review": "text", "sentiment": "label"})

In [None]:
def save_to_json(df, config):
    """
    Save a DataFrame as a JSON Lines file (one record object per line).

    Non-ASCII characters are written as-is rather than escaped. When the
    output target is a local filesystem path, its parent directory is created
    first so the write does not fail on a missing folder.

    Args:
        df (DataFrame): The DataFrame to save.
        config (dict): A configuration dictionary whose ``"output_path"``
            entry is the target handed to ``DataFrame.to_json``.
    """
    import os

    output_path = config["output_path"]

    # Only create directories for plain local paths; file-like buffers and
    # URL-style targets (e.g. "s3://...") are passed through untouched.
    if isinstance(output_path, (str, os.PathLike)):
        local_path = os.fspath(output_path)
        if isinstance(local_path, str) and "://" not in local_path:
            parent_dir = os.path.dirname(local_path)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)

    df.to_json(output_path, orient='records', lines=True, force_ascii=False)

In [None]:
# Build the balanced dataset from the notebook-level settings; the trailing
# bare expression makes the notebook render the resulting frame.
config = CONFIG
df_preprocessed = load_and_preprocess_data(config=config)
df_preprocessed

In [None]:
# Persist the balanced dataset to the configured output path, then confirm.
save_to_json(df=df_preprocessed, config=config)
print("Data processing complete, saved to JSON.")