In [7]:
import os
import pyarrow.parquet as pq
import pandas as pd
import numpy as np

### ToxiGen

In [8]:
root_dir1 = 'original/ToxiGen/demonstrations'
root_dir2 = 'original/ToxiGen/annotated'

df_toxigen = []

for f_name in os.listdir(root_dir1):
    if f_name.startswith('hate'):
        f = open(f"{root_dir1}/{f_name}", 'r')
        texts = [text for text in f.read().split('\n')]
        # print("hate", len(texts))
        df_toxigen += [[text, 1.0] for text in texts]
    elif f_name.startswith('neutral'): 
        f = open(f"{root_dir1}/{f_name}", 'r')
        texts = [text for text in f.read().split('\n')]
        # print("neutral", len(texts))
        df_toxigen += [[text, 0.0] for text in texts]

print(f'Number of examples taked from ToxiGen-Demonstrations: {len(df_toxigen)}')

pq_train = pq.read_table(f'{root_dir2}/train-00000-of-00001.parquet')
pq_test = pq.read_table(f'{root_dir2}/test-00000-of-00001.parquet')
df_train = pq_train.to_pandas()
df_test = pq_test.to_pandas()

df_train = df_train.loc[:, ['text', 'toxicity_human']]
df_test = df_test.loc[:, ['text', 'toxicity_human']]
min_val = min(np.min(df_train['toxicity_human']), np.min(df_test['toxicity_human']))
max_val = max(np.max(df_train['toxicity_human']), np.max(df_test['toxicity_human']))
df_train['toxicity_human'] = np.round((df_train['toxicity_human'] - min_val) / (max_val - min_val), 2)
df_test['toxicity_human'] = np.round((df_test['toxicity_human'] - min_val) / (max_val - min_val), 2)

df_train = pd.DataFrame(df_toxigen + df_train.values.tolist(), columns=['text', 'label'])
df_test = df_test.rename(columns={'toxicity_human': 'label'})

df_train = df_train.drop_duplicates().reset_index(drop=True)
df_test = df_test.drop_duplicates().reset_index(drop=True)

print(f"Overall dimensions of the ToxiGen dataset:")
print(f"\t Train: {df_train.shape}")
print(f"\t Test: {df_test.shape}")
print(f"\t Unique values in the 'label' column: {sorted(list(set(list(df_train['label'].values) + list(df_test['label'].values))))}")

df_train.to_csv('processed/ToxiGen/train.csv', index=False)
df_test.to_csv('processed/ToxiGen/test.csv', index=False)

Number of examples taked from ToxiGen-Demonstrations: 654
Overall dimensions of the ToxiGen dataset:
	 Train: (9546, 2)
	 Test: (940, 2)
	 Unique values in the 'label' column: [0.0, 0.08, 0.17, 0.25, 0.33, 0.42, 0.5, 0.58, 0.67, 0.75, 0.83, 0.92, 1.0]


### SBIC

In [9]:
def preprocess_sbic(df: pd.DataFrame):
    df = df.rename(columns={'post': 'text', 'offensiveYN': 'label'})
    df['label'] = np.round(df['label'], 2)
    df = df.drop_duplicates().reset_index(drop=True)
    return df

df_train = preprocess_sbic(pd.read_csv('original/SBIC/SBIC.v2.agg.trn.csv')[['post', 'offensiveYN']])
df_dev = preprocess_sbic(pd.read_csv('original/SBIC/SBIC.v2.agg.dev.csv')[['post', 'offensiveYN']])
df_test = preprocess_sbic(pd.read_csv('original/SBIC/SBIC.v2.agg.tst.csv')[['post', 'offensiveYN']])

print(f"Overall dimensions of the SBIC dataset:")
print(f"\t Train: {df_train.shape}")
print(f"\t Validation: {df_dev.shape}")
print(f"\t Test: {df_test.shape}")
print(f"\t Unique values in the 'label' column: {sorted(list(set(list(df_train['label'].values) + list(df_dev['label'].values) + list(df_test['label'].values))))}")

df_train.to_csv('processed/SBIC/train.csv', index=False)
df_dev.to_csv('processed/SBIC/validation.csv', index=False)
df_test.to_csv('processed/SBIC/test.csv', index=False)

Overall dimensions of the SBIC dataset:
	 Train: (35424, 2)
	 Validation: (4666, 2)
	 Test: (4691, 2)
	 Unique values in the 'label' column: [0.0, 0.08, 0.1, 0.12, 0.17, 0.2, 0.25, 0.3, 0.33, 0.38, 0.4, 0.42, 0.43, 0.5, 0.57, 0.58, 0.6, 0.62, 0.64, 0.67, 0.7, 0.71, 0.72, 0.73, 0.75, 0.77, 0.78, 0.79, 0.8, 0.81, 0.83, 0.86, 0.88, 0.89, 0.9, 0.91, 0.92, 0.93, 1.0]


### DHate

In [10]:
def preprocess_dhate(df: pd.DataFrame):
    df['label'] = df['label'].replace({'hate': 1.0, 'nothate': 0.0})
    return df

df = pd.read_csv('original/DHate/Dynamically Generated Hate Dataset v0.2.3.csv')[['text', 'label', 'split']]
df = preprocess_dhate(df)

df_train = df[df['split'] == 'train']
df_dev = df[df['split'] == 'dev']
df_test = df[df['split'] == 'test']

df_train.drop(columns=['split'], axis=1)
df_dev.drop(columns=['split'], axis=1)
df_test.drop(columns=['split'], axis=1)

print(f"Overall dimensions of the DHate dataset:")
print(f"\t Train: {df_train.shape}")
print(f"\t Validation: {df_dev.shape}")
print(f"\t Test: {df_test.shape}")
print(f"\t Unique values in the 'label' column: {sorted(list(set(list(df_train['label'].values) + list(df_dev['label'].values) + list(df_test['label'].values))))}")

df_train.to_csv('processed/DHate/train.csv', index=False)
df_dev.to_csv('processed/DHate/validation.csv', index=False)
df_test.to_csv('processed/DHate/test.csv', index=False)

  df['label'] = df['label'].replace({'hate': 1.0, 'nothate': 0.0})


Overall dimensions of the DHate dataset:
	 Train: (32924, 3)
	 Validation: (4100, 3)
	 Test: (4120, 3)
	 Unique values in the 'label' column: [0.0, 1.0]
