# Data Processing
In this notebook we load the provided datasets and output a version that is ready to work with, split into a training set and a test set.

In [1]:
import pandas as pd
import pathlib
from algorithms import *

# Folder (relative to this notebook) that every read and write below goes through
DATASET_DIR = pathlib.Path('..') / 'data'
# Route DataFrame/Series `.plot` calls through plotly
pd.options.plotting.backend = 'plotly'

## Dataset Loading
In this section we load the dataset, which is divided into a training set and a dev set. We also load the gold labels, which are provided in separate files.

In [2]:
# Each split file is read, transposed, given a fresh positional index, has its
# `id_EXIST` column renamed to `id`, and has `id` cast to the nullable Int64 dtype.
tr_df = (
    pd.read_json(DATASET_DIR / 'training/EXIST2024_training.json')
    .T
    .reset_index(drop=True)
    .rename(columns={'id_EXIST': 'id'})
    .astype({'id': 'Int64'})
)
dev_df = (
    pd.read_json(DATASET_DIR / 'dev/EXIST2024_dev.json')
    .T
    .reset_index(drop=True)
    .rename(columns={'id_EXIST': 'id'})
    .astype({'id': 'Int64'})
)

# Dev rows first, then training rows, under one new positional index
df = pd.concat([dev_df, tr_df], sort=False, ignore_index=True)

# Gold labels for task 3 come in separate files: hard and soft, per split
tr_gold_hard = pd.read_json(DATASET_DIR / 'evaluation/golds/EXIST2024_training_task3_gold_hard.json')
tr_gold_soft = pd.read_json(DATASET_DIR / 'evaluation/golds/EXIST2024_training_task3_gold_soft.json')
dev_gold_hard = pd.read_json(DATASET_DIR / 'evaluation/golds/EXIST2024_dev_task3_gold_hard.json')
dev_gold_soft = pd.read_json(DATASET_DIR / 'evaluation/golds/EXIST2024_dev_task3_gold_soft.json')

# Stack training + dev golds, name the `value` column after the label kind,
# and discard the `test_case` column
df_gold_hard = (
    pd.concat([tr_gold_hard, dev_gold_hard], sort=False, ignore_index=True)
    .rename(columns={'value': 'gold_hard'})
    .drop(columns='test_case')
)
df_gold_soft = (
    pd.concat([tr_gold_soft, dev_gold_soft], sort=False, ignore_index=True)
    .rename(columns={'value': 'gold_soft'})
    .drop(columns='test_case')
)

## Merge and cleaning
We merge the datasets: for each sample we attach the hard and soft gold labels. We then drop the irrelevant columns.

In [3]:
# Left-join both gold tables onto the samples by `id`; a sample with no
# matching gold row is kept and gets a missing value in that column.
df = (
    df
    .merge(df_gold_hard, on='id', how='left')
    .merge(df_gold_soft, on='id', how='left')
)

# Distinct values found in the annotator-count column
df['number_annotators'].unique()

array([6], dtype=object)

Removal of superfluous labels

In [4]:
# Per-task raw labels and annotator metadata that are removed from the frame
superfluous_columns = [
    'labels_task2',
    'labels_task1',
    'labels_task3',
    'annotators',
    'number_annotators',
    'gender_annotators',
    'age_annotators',
    'ethnicities_annotators',
    'study_levels_annotators',
    'countries_annotators',
]
df = df.drop(columns=superfluous_columns)
df.sample()

Unnamed: 0,id,lang,tweet,split,gold_hard,gold_soft
3488,102451,es,"@joseantoniokast Más que misoginia, comentario...",TRAIN_ES,,"{'NO': 0.5, 'IDEOLOGICAL-INEQUALITY': 0.333333..."


In [5]:
# Show the soft-label column as a raw array (one dictionary per sample)
df['gold_soft'].to_numpy()

array([{'NO': 0.5, 'MISOGYNY-NON-SEXUAL-VIOLENCE': 0.5, 'SEXUAL-VIOLENCE': 0.16666666666666602, 'IDEOLOGICAL-INEQUALITY': 0.0, 'STEREOTYPING-DOMINANCE': 0.0, 'OBJECTIFICATION': 0.0},
       {'IDEOLOGICAL-INEQUALITY': 0.33333333333333304, 'STEREOTYPING-DOMINANCE': 0.33333333333333304, 'MISOGYNY-NON-SEXUAL-VIOLENCE': 0.5, 'NO': 0.16666666666666602, 'OBJECTIFICATION': 0.16666666666666602, 'SEXUAL-VIOLENCE': 0.0},
       {'NO': 1.0, 'IDEOLOGICAL-INEQUALITY': 0.0, 'STEREOTYPING-DOMINANCE': 0.0, 'MISOGYNY-NON-SEXUAL-VIOLENCE': 0.0, 'SEXUAL-VIOLENCE': 0.0, 'OBJECTIFICATION': 0.0},
       ...,
       {'NO': 0.33333333333333304, 'OBJECTIFICATION': 0.16666666666666602, 'SEXUAL-VIOLENCE': 0.33333333333333304, 'STEREOTYPING-DOMINANCE': 0.16666666666666602, 'IDEOLOGICAL-INEQUALITY': 0.0, 'MISOGYNY-NON-SEXUAL-VIOLENCE': 0.0},
       {'OBJECTIFICATION': 0.6666666666666661, 'SEXUAL-VIOLENCE': 0.5, 'MISOGYNY-NON-SEXUAL-VIOLENCE': 0.5, 'IDEOLOGICAL-INEQUALITY': 0.16666666666666602, 'STEREOTYPING-DOMINAN

In [6]:
# Expand the per-sample `gold_soft` dictionaries into a frame with one column
# per label key.
# - The frame is built in a single constructor call instead of a hand-written
#   accumulation loop, and no longer rebinds the built-in name `sum`.
# - An entry that is not a dictionary (e.g. a missing soft label left behind by
#   the left merge) becomes an empty record, i.e. a row of NaN, instead of
#   raising inside the loop.
# - A key absent from some dictionary yields NaN for that row rather than an
#   error.
# - The result reuses df's index so the columns line up row by row with df.
soft_label_records = [
    entry if isinstance(entry, dict) else {}
    for entry in df['gold_soft']
]
new_col = pd.DataFrame(soft_label_records, index=df.index)
new_col.sample()

Unnamed: 0,NO,MISOGYNY-NON-SEXUAL-VIOLENCE,SEXUAL-VIOLENCE,IDEOLOGICAL-INEQUALITY,STEREOTYPING-DOMINANCE,OBJECTIFICATION
6005,0.5,0.166667,0.333333,0.166667,0.0,0.0


In [7]:
# Append the soft-label columns to the right of the existing columns;
# rows are matched through the index of the two frames.
frames_side_by_side = [df, new_col]
df = pd.concat(frames_side_by_side, axis='columns')
df.sample()

Unnamed: 0,id,lang,tweet,split,gold_hard,gold_soft,NO,MISOGYNY-NON-SEXUAL-VIOLENCE,SEXUAL-VIOLENCE,IDEOLOGICAL-INEQUALITY,STEREOTYPING-DOMINANCE,OBJECTIFICATION
2177,101140,es,#SCJN - TESIS - Derecho a la información. No p...,TRAIN_ES,[NO],"{'NO': 0.833333333333333, 'SEXUAL-VIOLENCE': 0...",0.833333,0.0,0.166667,0.0,0.0,0.0


In [8]:
# The dictionary column is no longer needed once it has been expanded into
# separate columns; the remaining label/language columns get their final names.
df = (
    df
    .drop(columns='gold_soft')
    .rename(columns={'gold_hard': 'hard_label', 'lang': 'language'})
)
df.sample()

Unnamed: 0,id,language,tweet,split,hard_label,NO,MISOGYNY-NON-SEXUAL-VIOLENCE,SEXUAL-VIOLENCE,IDEOLOGICAL-INEQUALITY,STEREOTYPING-DOMINANCE,OBJECTIFICATION
789,400241,en,"Ladies,don't let anyone body shame you in any ...",DEV_EN,[NO],0.666667,0.0,0.0,0.0,0.0,0.333333


## Training test splitting
We save the resulting dataset to the file `merged_dataset.csv`.
Then we perform the train/test split, using 80% of the samples for the training set and 20% for the test set.

In [9]:
# Write the merged frame as a ';'-separated UTF-8 CSV, without the index column
merged_csv_path = DATASET_DIR / 'merged_dataset.csv'
df.to_csv(merged_csv_path, index=False, sep=';', encoding='utf-8')

In [10]:
from sklearn.model_selection import train_test_split

# Seed for the shuffle performed by train_test_split. Without it every
# Restart & Run All produces a different 80/20 partition, so the split files
# written below (and anything evaluated on them) would not be reproducible.
SPLIT_RANDOM_STATE = 42

# 80% of the rows go to `train`, 20% to `test`
train, test = train_test_split(df, test_size=0.2, random_state=SPLIT_RANDOM_STATE)

In [11]:
# Write both partitions with the same CSV settings (';' separator, UTF-8, no index)
for split_frame, split_file in ((train, 'training_split.csv'), (test, 'test_split.csv')):
    split_frame.to_csv(DATASET_DIR / split_file, sep=';', encoding='utf-8', index=False)

## Pre-Processing
Process the tweets and save the new data.

In [12]:
# All four files share the same CSV dialect: ';' separator, UTF-8 encoding
csv_read_options = {'sep': ';', 'encoding': 'utf-8'}

test = pd.read_csv(DATASET_DIR / 'test_split.csv', **csv_read_options)
train = pd.read_csv(DATASET_DIR / 'training_split.csv', **csv_read_options)
challenge_test = pd.read_csv(DATASET_DIR / 'real_test.csv', **csv_read_options)
merged = pd.read_csv(DATASET_DIR / 'merged_dataset.csv', **csv_read_options)

Applying the process_tweet function to each dataset

In [13]:
# Add a `processed_tweet` column to every dataset by running process_tweet
# on each value of its `tweet` column.
for tweets_frame in (train, test, challenge_test, merged):
    tweets_frame['processed_tweet'] = tweets_frame['tweet'].apply(process_tweet)

Saving the datasets to csv files

In [14]:
# Each processed frame goes to its own `*_proc.csv` file, all written with the
# same settings (';' separator, UTF-8, no index column).
processed_outputs = (
    (train, 'training_split_proc.csv'),
    (test, 'test_split_proc.csv'),
    (challenge_test, 'real_test_proc.csv'),
    (merged, 'merged_dataset_proc.csv'),
)
for processed_frame, output_name in processed_outputs:
    processed_frame.to_csv(DATASET_DIR / output_name, sep=';', encoding='utf-8', index=False)