In [25]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
import nltk
import torch
from transformers import MarianMTModel, MarianTokenizer
from tqdm import tqdm

In [11]:
df1 = pd.read_csv("../data/goemotions_1.csv")
df2 = pd.read_csv("../data/goemotions_2.csv")
df3 = pd.read_csv("../data/goemotions_3.csv")

In [12]:
df = pd.concat([df1,df2,df3] , ignore_index = True)

In [17]:
df.drop(columns = ['id', 'author', 'subreddit', 'link_id', 'parent_id',
            'created_utc', 'rater_id', 'example_very_unclear'] , inplace = True)

In [19]:
df[:1]

Unnamed: 0,text,admiration,amusement,anger,annoyance,approval,caring,confusion,curiosity,desire,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,That game hurt.,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [23]:
EMOTION_MAP = {
    "sadness": "sad",
    "grief": "sad",

    "nervousness": "anxious",
    "fear": "anxious",

    "anger": "angry",
    "annoyance": "angry",

    "disappointment": "stressed",
    "remorse": "stressed",

    "joy": "happy",
    "love": "happy",
    "optimism": "happy",

    "neutral": "neutral"
}

EMOTION_COLS = list(EMOTION_MAP.keys())

def get_main_emotion(row):
    """
    Chuyển multi-label → single-label
    Ưu tiên theo thứ tự trong EMOTION_MAP
    """
    for emo in EMOTION_COLS:
        if row.get(emo, 0) == 1:
            return EMOTION_MAP[emo]
    return "neutral"

df["emotion"] = df.apply(get_main_emotion, axis=1)


CRISIS_KEYWORDS = [
    "kill myself", "end my life", "suicide",
    "dont want to live", "want to die",
    "self harm", "cut myself"
]

def assign_risk(text, emotion):
    text = text.lower()
    if any(k in text for k in CRISIS_KEYWORDS):
        return "crisis"
    if emotion in ["sad", "stressed", "anxious"]:
        return "warning"
    return "normal"

df["risk"] = df.apply(lambda x: assign_risk(x["text"], x["emotion"]), axis=1)


def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

df["text"] = df["text"].astype(str).apply(clean_text)


df_final = df[["text", "emotion", "risk"]]

print("Final dataset shape:", df_final.shape)
print(df_final.head())


train_df, temp_df = train_test_split(
    df_final,
    test_size=0.3,
    random_state=42,
    stratify=df_final["emotion"]
)

val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=42,
    stratify=temp_df["emotion"]
)

print("Train:", train_df.shape)
print("Val:", val_df.shape)
print("Test:", test_df.shape)

train_df.to_csv("../data/train.csv", index=False)
val_df.to_csv("../data/val.csv", index=False)
test_df.to_csv("../data/test.csv", index=False)

print("✅ Dataset preparation DONE!")

Final dataset shape: (211225, 3)
                                                text  emotion     risk
1  >sexuality shouldn’t be a grouping category it...  neutral   normal
2     you do right, if you don't care then fuck 'em!  neutral   normal
3                                 man i love reddit.    happy   normal
4  [name] was nowhere near them, he was by the fa...  neutral   normal
Train: (147857, 3)
Val: (31684, 3)
Test: (31684, 3)
✅ Dataset preparation DONE!
