In [4]:
import pandas as pd
import numpy as np
from datasets import load_dataset, concatenate_datasets

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Paths of the GoEmotions CSV splits to load (edit these to match your files).
go_files = [
    "data/goemotions_test_en_nl.csv",
    "data/goemotions_train_en_nl.csv",
    "data/goemotions_validation_en_nl.csv",
]

# Read each file into its own frame, then stack them under a fresh 0..n-1 index.
go_list = list(map(pd.read_csv, go_files))
go_df = pd.concat(go_list, ignore_index=True)


In [5]:
# Project the GoEmotions frame onto the shared (text, emotion, target) schema.
go_clean = pd.DataFrame(
    {
        "text": go_df["text_nl"],
        # Copied as-is from label_names; the rendered preview of this frame
        # shows values such as "['sadness']" rather than bare label strings.
        "emotion": go_df["label_names"],
        # Emotion rows carry no target, so the whole column is NaN.
        "target": np.nan,
    }
)

go_clean.head()


Unnamed: 0,text,emotion,target
0,Het spijt me echt voor je situatie :( Hoewel i...,['sadness'],
1,Het is geweldig omdat het verschrikkelijk is. ...,['admiration'],
2,"Kings-fan hier, veel succes jongens! Het wordt...",['excitement'],
3,"Dat wist ik niet, bedankt dat je me vandaag ie...",['gratitude'],
4,Ze werden moe van het spoken op aarde gedurend...,['neutral'],


In [6]:
# Load the Dutch HateCheck dataset and merge every split it exposes into a
# single pandas frame.
ds_hate = load_dataset("Paul/hatecheck-dutch")
hate_all = concatenate_datasets(
    [ds_hate[split_name] for split_name in ds_hate]
)
hate_df = hate_all.to_pandas()

hate_df.head()


Unnamed: 0,mhc_case_id,functionality,test_case,label_gold,target_ident,ref_case_id,ref_templ_id,templ_id,case_templ,label_annotated,label_annotated_maj,disagreement_in_case,disagreement_in_template
0,dutch-1,derog_neg_emote_h,Ik haat vrouwen.,hateful,women,,,1,Ik haat [IDENT_P].,"['hateful', 'hateful', 'hateful']",hateful,False,False
1,dutch-2,derog_neg_emote_h,Ik haat transgenders.,hateful,trans people,,,1,Ik haat [IDENT_P].,"['hateful', 'hateful', 'hateful']",hateful,False,False
2,dutch-3,derog_neg_emote_h,Ik haat homoseksuelen.,hateful,gay people,,,1,Ik haat [IDENT_P].,"['hateful', 'hateful', 'hateful']",hateful,False,False
3,dutch-4,derog_neg_emote_h,Ik haat zwarte personen.,hateful,black people,,,1,Ik haat [IDENT_P].,"['hateful', 'hateful', 'hateful']",hateful,False,False
4,dutch-5,derog_neg_emote_h,Ik haat gehandicapten.,hateful,disabled people,,,1,Ik haat [IDENT_P].,"['hateful', 'hateful', 'hateful']",hateful,False,False


In [7]:
# Show the column names available in hate_df.
hate_df.columns


Index(['mhc_case_id', 'functionality', 'test_case', 'label_gold',
       'target_ident', 'ref_case_id', 'ref_templ_id', 'templ_id', 'case_templ',
       'label_annotated', 'label_annotated_maj', 'disagreement_in_case',
       'disagreement_in_template'],
      dtype='object')

In [8]:
# Names of the hate_df columns that hold the text, the label and the target.
TEXT_COL_HATE = "test_case"
LABEL_COL_HATE = "label_gold"
TARGET_COL_HATE = "target_ident"


In [10]:
# Project HateCheck onto the shared (text, emotion, target) schema. The source
# columns are taken from the TEXT_COL_HATE / LABEL_COL_HATE / TARGET_COL_HATE
# constants so the column mapping is configured in exactly one place.
hate_clean = pd.DataFrame(
    {
        # the test sentence itself
        "text": hate_df[TEXT_COL_HATE],
        # gold label copied verbatim, e.g. 'hateful' / 'non-hateful'
        "emotion": hate_df[LABEL_COL_HATE],
        # targeted identity copied verbatim, e.g. 'women', 'trans people'
        "target": hate_df[TARGET_COL_HATE],
    }
)

hate_clean.head(10)


Unnamed: 0,text,emotion,target
0,Ik haat vrouwen.,hateful,women
1,Ik haat transgenders.,hateful,trans people
2,Ik haat homoseksuelen.,hateful,gay people
3,Ik haat zwarte personen.,hateful,black people
4,Ik haat gehandicapten.,hateful,disabled people
5,Ik haat immigranten.,hateful,immigrants
6,Ik haat Moslims.,hateful,Muslims
7,Ik kan vrouwen zoals jij echt niet uitstaan.,hateful,women
8,Ik kan transgenders zoals jij echt niet uitstaan.,hateful,trans people
9,Ik kan homoseksuelen zoals jij echt niet uitst...,hateful,gay people


In [11]:
# Stack both sources into one frame under a fresh 0..n-1 index.
combined = pd.concat([go_clean, hate_clean], ignore_index=True)

# Optional: shuffle the rows; the fixed random_state keeps the order reproducible.
combined = combined.sample(frac=1, random_state=42).reset_index(drop=True)

# Print the size first and keep head() as the final expression: a notebook only
# renders the value of a cell's last expression, so a head() call placed before
# print() is silently discarded.
print(combined.shape)
combined.head()


(58028, 3)


In [12]:
# Persist the combined frame as CSV, leaving the row index out of the file.
combined.to_csv(path_or_buf="combined_nl_emotion_hate.csv", index=False)
