In [1]:
from pathlib import Path

import pandas as pd

# Notebook-wide settings: input CSV location, output JSON location, and RNG seed.
CONFIG = dict(
    dataset_path="jigsaw-toxic-comment-train.csv",  # read with pd.read_csv
    output_path="../../../internal_classifier_data/toxic/jigsaw-toxic-comment.json",  # written by save_to_json
    seed=42,  # used as random_state for every DataFrame.sample call
)

def load_and_preprocess_data(config):
    """
    Load the Jigsaw Toxic Comment CSV and return a class-balanced, shuffled subset.

    The CSV at ``config["dataset_path"]`` must contain the columns
    ``comment_text`` and ``toxic``; every other column (including ``id``) is
    ignored. Rows are split on ``toxic == 1`` and ``toxic == 0`` (rows with any
    other value end up in neither group), the larger group is downsampled to
    the size of the smaller one, and the combined rows are shuffled. Every
    random draw uses ``config["seed"]``, so the result is reproducible for a
    given input file.

    Args:
        config (dict): Configuration dictionary; ``"dataset_path"`` and
            ``"seed"`` are read here.

    Returns:
        DataFrame: Two columns, ``text`` and ``label``, with the same number of
        rows for label 0 and label 1. Rows keep the index labels they had in
        the loaded CSV (the index is not reset).

    Raises:
        KeyError: If the CSV lacks ``comment_text`` or ``toxic``.
    """
    # Keep only the columns that are actually used; requiring extra columns
    # would make otherwise valid inputs fail.
    df_train = pd.read_csv(config["dataset_path"])[["comment_text", "toxic"]]
    seed = config["seed"]

    # Separate toxic and non-toxic comments
    df_toxic = df_train[df_train["toxic"] == 1]
    df_nontoxic = df_train[df_train["toxic"] == 0]

    # Downsample both groups to the size of the smaller one so the classes
    # are balanced 50/50.
    min_count = min(len(df_toxic), len(df_nontoxic))
    df_toxic_sampled = df_toxic.sample(n=min_count, random_state=seed)
    df_nontoxic_sampled = df_nontoxic.sample(n=min_count, random_state=seed)

    # Concatenation leaves the rows grouped by class; frac=1 shuffles all of them.
    df_balanced = pd.concat([df_toxic_sampled, df_nontoxic_sampled]).sample(
        frac=1, random_state=seed
    )

    # Rename columns to more clearly reflect content and fix the column order
    df_balanced = df_balanced.rename(columns={"comment_text": "text", "toxic": "label"})
    return df_balanced[["text", "label"]]

def save_to_json(df, config):
    """
    Write a DataFrame to ``config["output_path"]`` in JSON Lines format.

    Each row is written as one JSON object on its own line
    (``orient='records'``, ``lines=True``), and non-ASCII characters are kept
    as-is instead of being escaped (``force_ascii=False``). Missing parent
    directories of the output path are created first, so the save does not
    fail on a fresh checkout where the output folder does not exist yet.

    Args:
        df (DataFrame): The DataFrame to save.
        config (dict): Configuration dictionary; only ``"output_path"`` is read.
    """
    output_path = Path(config["output_path"])
    # DataFrame.to_json does not create directories; do it explicitly.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_json(output_path, orient='records', lines=True, force_ascii=False)


In [2]:
# Run the pipeline with the notebook-level CONFIG. `config` and
# `df_preprocessed` are reused by the save cell further down.
config = CONFIG
df_preprocessed = load_and_preprocess_data(config)
# Bare last expression: the notebook renders the balanced frame as this cell's output.
df_preprocessed

Unnamed: 0,text,label
95914,"""\n\n:The Drawbridge newspaper\nA tag has been...",0
186689,""" \n\n == External link to photos of students ...",0
54438,Why don't you read about what I said about bei...,1
83283,David Duke\n\nWhy did you remove truth from th...,1
218632,in published dictionaries and,0
...,...,...
175275,""" \n\n """"He's a monkey with no mic skills, and...",1
218982,fuck my ass so hard and make me cum— hurt my...,1
132051,""" Talk \np.s. why have you copied my username ...",0
29228,"""Vandalism==\nI'm damn sick and tired of idiot...",1


In [3]:
# Write the balanced frame to config["output_path"]; relies on `config` and
# `df_preprocessed` defined in the preprocessing cell.
save_to_json(df_preprocessed, config)
print("Data processing complete, saved to JSON.")

Data processing complete, saved to JSON.
