# Preparing Go-Emotions Dataset

I will use dataset with reddit comments and corresponding emotions from google - https://github.com/google-research/google-research/tree/master/goemotions

### Imports

In [53]:
!pip install iterative-stratification


Collecting iterative-stratification
  Downloading iterative_stratification-0.1.9-py3-none-any.whl.metadata (1.3 kB)
Downloading iterative_stratification-0.1.9-py3-none-any.whl (8.5 kB)
Installing collected packages: iterative-stratification
Successfully installed iterative-stratification-0.1.9



[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [54]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import glob
import requests
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

### Create folders

In [6]:
CURRENT_DIR = os.getcwd()

PROJECT_ROOT = os.path.abspath(os.path.join(CURRENT_DIR, "..", ".."))

FULL_DATA_DIR = os.path.join(PROJECT_ROOT, "data", "full_dataset")
PROCESSED_DIR = os.path.join(PROJECT_ROOT, "data", "processed")

os.makedirs(FULL_DATA_DIR, exist_ok=True)
os.makedirs(PROCESSED_DIR, exist_ok=True)

print("FULL_DATA_DIR =", FULL_DATA_DIR)
print("PROCESSED_DIR =", PROCESSED_DIR)

FULL_DATA_DIR = C:\Users\user\Documents\f25\pmldl\pmldl_assignment1\data\full_dataset
PROCESSED_DIR = C:\Users\user\Documents\f25\pmldl\pmldl_assignment1\data\processed


### Download datasets

In [9]:
urls = [
    "https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_1.csv",
    "https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_2.csv",
    "https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_3.csv"
]

for url in urls:
    filename = os.path.join(FULL_DATA_DIR, url.split("/")[-1])
    if not os.path.exists(filename):
        print(f"Downloading {filename} ...")
        r = requests.get(url)
        with open(filename, "wb") as f:
            f.write(r.content)
    else:
        print(f"{filename} already downloaded")

Downloading C:\Users\user\Documents\f25\pmldl\pmldl_assignment1\data\full_dataset\goemotions_1.csv ...
Downloading C:\Users\user\Documents\f25\pmldl\pmldl_assignment1\data\full_dataset\goemotions_2.csv ...
Downloading C:\Users\user\Documents\f25\pmldl\pmldl_assignment1\data\full_dataset\goemotions_3.csv ...


### Read, see & concat

In [10]:
csv_files = [
    os.path.join(FULL_DATA_DIR, f"goemotions_{i}.csv") for i in range(1, 4)
]

dfs = [pd.read_csv(f) for f in csv_files]
df = pd.concat(dfs, ignore_index=True)


In [12]:
df.head()

Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,admiration,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1548381000.0,1,False,0,...,0,0,0,0,0,0,0,1,0,0
1,>sexuality shouldn’t be a grouping category I...,eemcysk,TheGreen888,unpopularopinion,t3_ai4q37,t3_ai4q37,1548084000.0,37,True,0,...,0,0,0,0,0,0,0,0,0,0
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,t3_abru74,t1_ed2m7g7,1546428000.0,37,False,0,...,0,0,0,0,0,0,0,0,0,1
3,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,t3_ahulml,t3_ahulml,1547965000.0,18,False,0,...,1,0,0,0,0,0,0,0,0,0
4,"[NAME] was nowhere near them, he was by the Fa...",eda6yn6,American_Fascist713,starwarsspeculation,t3_ackt2f,t1_eda65q2,1546669000.0,2,False,0,...,0,0,0,0,0,0,0,0,0,1


In [17]:
df.columns

Index(['text', 'id', 'author', 'subreddit', 'link_id', 'parent_id',
       'created_utc', 'rater_id', 'example_very_unclear', 'admiration',
       'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion',
       'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust',
       'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy',
       'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief',
       'remorse', 'sadness', 'surprise', 'neutral'],
      dtype='object')

#### Filter only clear emotions

In [18]:
df_filtered = df[df['example_very_unclear'] != 1]

### Take only needed cols

In [47]:
selected_ems = ['anger', 'confusion','disgust',
                'excitement', 'fear', 'joy',
                'love', 'sadness', 'surprise','neutral']

In [48]:
df_ems = df_filtered[['text', 'id'] + selected_ems]
df_ems.head()

Unnamed: 0,text,id,anger,confusion,disgust,excitement,fear,joy,love,sadness,surprise,neutral
0,That game hurt.,eew5j0j,0,0,0,0,0,0,0,1,0,0
2,"You do right, if you don't care then fuck 'em!",ed2mah1,0,0,0,0,0,0,0,0,0,1
3,Man I love reddit.,eeibobj,0,0,0,0,0,0,1,0,0,0
4,"[NAME] was nowhere near them, he was by the Fa...",eda6yn6,0,0,0,0,0,0,0,0,0,1
5,Right? Considering it’s such an important docu...,eespn2i,0,0,0,0,0,0,0,0,0,0


In [49]:
df_ems = df_ems[df_ems[selected_ems].sum(axis=1) > 0]
df_ems.head()

Unnamed: 0,text,id,anger,confusion,disgust,excitement,fear,joy,love,sadness,surprise,neutral
0,That game hurt.,eew5j0j,0,0,0,0,0,0,0,1,0,0
2,"You do right, if you don't care then fuck 'em!",ed2mah1,0,0,0,0,0,0,0,0,0,1
3,Man I love reddit.,eeibobj,0,0,0,0,0,0,1,0,0,0
4,"[NAME] was nowhere near them, he was by the Fa...",eda6yn6,0,0,0,0,0,0,0,0,0,1
10,"I have, and now that you mention it, I think t...",ed9w1hm,0,0,0,0,0,0,0,0,0,1


### Balance emotions

In [50]:
df_counts = df_ems[selected_ems].sum()
min_count = df_counts.min()
print(min_count)

3197


In [51]:
df_balans = pd.concat(
    [df_ems[df_ems[emo] == 1].sample(min_count, random_state=42)
     for emo in selected_ems]
).drop_duplicates()
df_balans.head()

Unnamed: 0,text,id,anger,confusion,disgust,excitement,fear,joy,love,sadness,surprise,neutral
196200,Your link won't load and are you seriously cit...,edhoces,1,0,0,0,0,0,0,0,0,0
44388,"Don't just distance yourself, BLOCK them",edksr1y,1,0,0,0,0,0,0,0,0,0
173462,I once overheard my grandpa telling my brother...,eeaos50,1,0,0,0,0,0,0,0,0,0
76641,Shh don't spoil it for us,edv2a1w,1,0,0,0,0,0,0,0,0,0
205425,Why are you so angry?,eehtgt8,1,0,0,0,0,0,0,0,0,0


### Divide to train-val-split

In [52]:
df_balans.size

317472

In [58]:
X = df_balans[['text']]
y = df_balans[selected_ems]

# train + temp (80/20)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)

# temp -> val/test (50/50)
mskf = MultilabelStratifiedKFold(n_splits=2, shuffle=True, random_state=42)

val_idx, test_idx = next(mskf.split(X_temp, y_temp))

X_val = X_temp.iloc[val_idx]
y_val = y_temp.iloc[val_idx]
X_test = X_temp.iloc[test_idx]
y_test = y_temp.iloc[test_idx]

In [59]:
print("Train:", len(X_train))
print("Validation:", len(X_val))
print("Test:", len(X_test))

Train: 21164
Validation: 2649
Test: 2643


In [61]:
X_train.assign(**y_train).to_csv(os.path.join(PROCESSED_DIR, "train.tsv"), sep="\t", index=False)
X_val.assign(**y_val).to_csv(os.path.join(PROCESSED_DIR, "val.tsv"), sep="\t", index=False)
X_test.assign(**y_test).to_csv(os.path.join(PROCESSED_DIR, "test.tsv"), sep="\t", index=False)