## Process and prepare EXIST data
    - For now, we are focusing just on the binary classification task
    - We are not leveraging the Learning With Disagreement (LWD) paradigm. Therefore, examples for which there is no majority vote are discarded
    - Since the test set is provided without gold labels, we hold out a portion of the train set as the validation set and use the dev set as the test set

In [None]:
import json

# Dataset edition; interpolated into every file name below.
year = "2024"

# Tweet collections for the training and dev partitions.
train_file_path = f"../data/EXIST{year}_training.json"
dev_file_path = f"../data/EXIST{year}_dev.json"

# "gold_hard" label files: one per partition for each of tasks 1-3.
train_gold_labels_task1_path = f"../data/golds/EXIST{year}_training_task1_gold_hard.json"
dev_gold_labels_task1_path = f"../data/golds/EXIST{year}_dev_task1_gold_hard.json"
train_gold_labels_task2_path = f"../data/golds/EXIST{year}_training_task2_gold_hard.json"
dev_gold_labels_task2_path = f"../data/golds/EXIST{year}_dev_task2_gold_hard.json"
train_gold_labels_task3_path = f"../data/golds/EXIST{year}_training_task3_gold_hard.json"
dev_gold_labels_task3_path = f"../data/golds/EXIST{year}_dev_task3_gold_hard.json"


def _read_json(path):
    """Return the object decoded from the UTF-8 JSON file at ``path``."""
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)


# Files are read in the same order as they are declared per task:
# tweets first, then the train/dev gold labels of tasks 1, 2 and 3.
train_data = _read_json(train_file_path)
dev_data = _read_json(dev_file_path)
train_gold_labels_task1 = _read_json(train_gold_labels_task1_path)
dev_gold_labels_task1 = _read_json(dev_gold_labels_task1_path)
train_gold_labels_task2 = _read_json(train_gold_labels_task2_path)
dev_gold_labels_task2 = _read_json(dev_gold_labels_task2_path)
train_gold_labels_task3 = _read_json(train_gold_labels_task3_path)
dev_gold_labels_task3 = _read_json(dev_gold_labels_task3_path)

def convert_gold_dict(gold_dict):
    """Index a sequence of gold-label records by their identifier.

    Args:
        gold_dict: Iterable of mappings, each providing an ``'id'`` and a
            ``'value'`` key. Despite the parameter name, it is iterated as
            a sequence of records rather than used as a dict.

    Returns:
        A dict mapping every record's ``'id'`` to its ``'value'``. If an id
        occurs more than once, the last record wins.

    Raises:
        KeyError: If a record lacks the ``'id'`` or ``'value'`` key.
    """
    return {record['id']: record['value'] for record in gold_dict}

# Rebind each raw list of gold records to its id -> value lookup table,
# keeping the original variable names and conversion order.
(
    train_gold_labels_task1,
    dev_gold_labels_task1,
    train_gold_labels_task2,
    dev_gold_labels_task2,
    train_gold_labels_task3,
    dev_gold_labels_task3,
) = [
    convert_gold_dict(raw_records)
    for raw_records in (
        train_gold_labels_task1,
        dev_gold_labels_task1,
        train_gold_labels_task2,
        dev_gold_labels_task2,
        train_gold_labels_task3,
        dev_gold_labels_task3,
    )
]

In [None]:
# Dump the whole task-1 training label table for a quick visual inspection.
print(train_gold_labels_task1)

In [None]:
# Sanity check: print the size of each partition followed by the sizes of its
# three gold-label tables (training first, then dev), one number per line.
for sized in (
    train_data,
    train_gold_labels_task1,
    train_gold_labels_task2,
    train_gold_labels_task3,
    dev_data,
    dev_gold_labels_task1,
    dev_gold_labels_task2,
    dev_gold_labels_task3,
):
    print(len(sized))

In [None]:
# Ids that have a gold label in all three task tables, per partition.
train_common_ids = set(train_gold_labels_task1).intersection(
    train_gold_labels_task2,
    train_gold_labels_task3,
)
dev_common_ids = set(dev_gold_labels_task1).intersection(
    dev_gold_labels_task2,
    dev_gold_labels_task3,
)

# Report how many ids survive the intersection in each partition.
for shared_ids in (train_common_ids, dev_common_ids):
    print(len(shared_ids))

In [None]:
import pandas as pd

def json_to_df(data: dict) -> pd.DataFrame:
    """Flatten tweet entries into a DataFrame together with their gold labels.

    Only entries whose ``'id_EXIST'`` is contained in the module-level
    ``train_common_ids`` or ``dev_common_ids`` are kept. Each label is read
    from the module-level training gold table when the id is present there,
    and from the corresponding dev gold table otherwise.

    Args:
        data: Mapping whose values are entry dicts providing the keys
            ``'id_EXIST'``, ``'lang'`` and ``'tweet'``. The mapping's own
            keys are ignored.

    Returns:
        A DataFrame with the columns ``id``, ``lang``, ``text``,
        ``label_sexist`` (``"sexist"`` or ``"not sexist"``),
        ``label_category`` and ``label_vector``, holding one row per kept
        entry in iteration order. The same six columns are present even when
        no entry is kept.

    Raises:
        ValueError: If the task-1 label of a kept entry is neither ``"YES"``
            nor ``"NO"``.
    """
    columns = ['id', 'lang', 'text', 'label_sexist', 'label_category', 'label_vector']

    def gold_label(train_labels, dev_labels, entry_id):
        # Prefer the training table; fall back to the dev table.
        if entry_id in train_labels:
            return train_labels[entry_id]
        return dev_labels[entry_id]

    # Rows are gathered in a plain list and the frame is built once at the
    # end: concatenating onto a DataFrame inside the loop re-copies every
    # previously added row on each iteration.
    rows = []
    for entry in data.values():
        entry_id = entry['id_EXIST']
        if entry_id not in train_common_ids and entry_id not in dev_common_ids:
            continue

        lang = entry['lang']
        text = entry['tweet']
        label_sexist = gold_label(train_gold_labels_task1, dev_gold_labels_task1, entry_id)
        label_category = gold_label(train_gold_labels_task2, dev_gold_labels_task2, entry_id)
        label_vector = gold_label(train_gold_labels_task3, dev_gold_labels_task3, entry_id)

        if label_sexist not in ("YES", "NO"):
            raise ValueError(f"Unexpected label_sexist value: {label_sexist} for id: {entry_id}")

        rows.append({
            'id': entry_id,
            'lang': lang,
            'text': text,
            # Map the raw YES/NO annotation to a readable class name.
            'label_sexist': "sexist" if label_sexist == "YES" else "not sexist",
            'label_category': label_category,
            'label_vector': label_vector,
        })

    return pd.DataFrame(rows, columns=columns)

# Build the labelled frames for the training and dev partitions (in that order).
train_df, dev_df = [json_to_df(partition) for partition in (train_data, dev_data)]

In [None]:
# Write the unsplit training and dev frames to CSV, without the index column.
for frame, csv_path in (
    (train_df, "../data/original_train.csv"),
    (dev_df, "../data/original_dev.csv"),
):
    frame.to_csv(csv_path, index=False)

In [None]:
# Split training data into train and validation sets
from sklearn.model_selection import train_test_split

# Combined stratification key: language, binary label, task-2 category and the
# first element of the task-3 label vector. It is kept as a standalone Series
# instead of being written into train_df, so train_df never carries a helper
# column that could leak into a later use or re-export of that frame.
stratify_key = (
    train_df['lang'].astype(str)
    + '_' + train_df['label_sexist'].astype(str)
    + '_' + train_df['label_category'].astype(str)
    + '_' + train_df['label_vector'].str[0].astype(str)
)

# Split train_df into train and validation (80/20 split)
train_split_df, val_split_df = train_test_split(
    train_df,
    test_size=0.2,
    random_state=42,
    stratify=stratify_key,  # Use combined key for stratification
)

# Detach the splits from train_df so that columns added to them afterwards
# are written to independent frames.
train_split_df = train_split_df.copy()
val_split_df = val_split_df.copy()

print(f"Original training set size: {len(train_df)}")
print(f"New training set size: {len(train_split_df)}")
print(f"Validation set size: {len(val_split_df)}")
print(f"Dev set (test) size: {len(dev_df)}")

# Check label distribution
print("\nLabel distribution in splits:")
for title, frame in (
    ("Training split:", train_split_df),
    ("Validation split:", val_split_df),
    ("Dev set (test):", dev_df),
):
    print(
        title,
        frame['label_sexist'].value_counts().to_dict(),
        frame['label_category'].value_counts().to_dict(),
    )

In [None]:
# Record each frame's role in a 'split' column: the held-out part of the
# training data is tagged 'dev' and the official dev set is tagged 'test'.
for frame, role in (
    (train_split_df, 'train'),
    (val_split_df, 'dev'),
    (dev_df, 'test'),
):
    frame['split'] = role

# Stack the three frames with a fresh 0..n-1 index and write the result.
aggregated_df = pd.concat([train_split_df, val_split_df, dev_df], ignore_index=True)
aggregated_df.to_csv("../data/aggregated_data.csv", index=False)

# Per-language subsets of the aggregate, each written to its own file.
is_english = aggregated_df['lang'] == 'en'
is_spanish = aggregated_df['lang'] == 'es'
aggregated_df_en = aggregated_df[is_english]
aggregated_df_es = aggregated_df[is_spanish]
aggregated_df_en.to_csv("../data/aggregated_data_en.csv", index=False)
aggregated_df_es.to_csv("../data/aggregated_data_es.csv", index=False)