In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

# I am importing the main libraries I will use for cleaning and merging
# - pathlib for working with file paths
# - pandas and numpy for data handling

# I am setting the base directory where all my datasets live
BASE_DIR = Path.home() / "Desktop" / "7_FullStack" / "Final_Project" / "1_Datasets"

# I am checking that Python can see the dataset folders
print("BASE_DIR:", BASE_DIR)
for p in BASE_DIR.iterdir():
    print("-", p.name)

BASE_DIR: /Users/jorgemartinez/Desktop/7_FullStack/Final_Project/1_Datasets
- .DS_Store
- 6_MentalChat_16K
- 3_EmpatheticDialogues_Facebook_AI
- 5_Suicide_and_DepressionDetection
- Evidence_based_Proprietary
- 2_DailyDialog
- 4_mental_health_counseling_conversations
- 1_goemotions_dataset


## Define dataset folder paths and quickly inspect files

In [3]:
# I am defining paths for each of the main external datasets

GOEMO_DIR = BASE_DIR / "1_goemotions_dataset"
DAILYDIALOG_DIR = BASE_DIR / "2_DailyDialog"
EMP_DIALOG_DIR = BASE_DIR / "3_EmpatheticDialogues_Facebook_AI"
MH_COUNSEL_DIR = BASE_DIR / "4_mental_health_counseling_conversations"
SUICIDE_DIR = BASE_DIR / "5_Suicide_and_DepressionDetection"
MENTALCHAT_DIR = BASE_DIR / "6_MentalChat_16K"

# I am checking the files inside each dataset folder

print("GoEmotions files:")
for p in GOEMO_DIR.iterdir():
    print("  -", p.name)

print("\nDailyDialog files:")
for p in DAILYDIALOG_DIR.iterdir():
    print("  -", p.name)

print("\nEmpathetic Dialogues files:")
for p in EMP_DIALOG_DIR.iterdir():
    print("  -", p.name)

print("\nMental Health Counseling files:")
for p in MH_COUNSEL_DIR.iterdir():
    print("  -", p.name)

print("\nSuicide and Depression Detection files:")
for p in SUICIDE_DIR.iterdir():
    print("  -", p.name)

print("\nMentalChat 16K files:")
for p in MENTALCHAT_DIR.iterdir():
    print("  -", p.name)

GoEmotions files:
  - train.tsv
  - dev.tsv
  - README.md
  - test.tsv
  - emotions.txt

DailyDialog files:
  - validation.csv
  - test.csv
  - train.csv

Empathetic Dialogues files:
  - emotion_emotion_69k.csv

Mental Health Counseling files:
  - combined_dataset.json

Suicide and Depression Detection files:
  - Suicide_Detection.csv

MentalChat 16K files:
  - Synthetic_Data_10K.csv
  - README.md
  - Interview_Data_6K.csv


# Now all external dataset folders are correctly detected and accessible.

This means I am ready to start building the 3 master buckets we discussed:

1. classification_bucket
2. response_bucket
3. safety_bucket

# Create the 3 master bucket DataFrames (empty)

What this does:

1. Defines the exact schema I need across the entire assistant.
2. Ensures every dataset I bring in will fit into one of the buckets.
3. Matching columns now guarantees merging them later will be painless.

In [4]:
# I am creating empty DataFrames for the three main model buckets

classification_bucket = pd.DataFrame(columns=[
    "user_message",
    "atlas_emotion",
    "need",
    "strategy",
    "safety_flag",
    "source"
])

response_bucket = pd.DataFrame(columns=[
    "user_message",
    "bot_reply",
    "atlas_emotion",
    "need",
    "strategy",
    "safety_flag",
    "source"
])

safety_bucket = pd.DataFrame(columns=[
    "user_message",
    "safety_flag",
    "source"
])

classification_bucket, response_bucket, safety_bucket

(Empty DataFrame
 Columns: [user_message, atlas_emotion, need, strategy, safety_flag, source]
 Index: [],
 Empty DataFrame
 Columns: [user_message, bot_reply, atlas_emotion, need, strategy, safety_flag, source]
 Index: [],
 Empty DataFrame
 Columns: [user_message, safety_flag, source]
 Index: [])

# Create the helper function for cleaning text

In [5]:
import re

# I am defining a helper function to clean raw text consistently across all datasets
def clean_text(text):
    # I am converting non-string values (like NaN) into empty strings
    if not isinstance(text, str):
        return ""
    
    # I am removing stray line breaks and tabs
    text = text.replace("\n", " ").replace("\t", " ")
    
    # I am normalizing multiple spaces into one space
    text = re.sub(r"\s+", " ", text).strip()
    
    # I am fixing common unicode artifacts (example: \xa0)
    text = text.replace("\xa0", " ").strip()
    
    return text

# I am testing the function
clean_text("This   is   a   test.\nWith spaces \t and unicode\xa0.")

'This is a test. With spaces and unicode .'

## Build the safety flag detection function

Create a safety flag helper

I will start from a base safety level that depends on the dataset:

0 = general conversation

1 = mental-health related but not clearly suicidal

2 = crisis / suicidal

In [6]:
# I am defining a list of keywords that indicate possible crisis or self-harm risk
CRISIS_KEYWORDS = [
    "suicide", "suicidal", "kill myself", "kill myself.",
    "end my life", "end it all", "can't go on", "dont want to live",
    "don't want to live", "self-harm", "self harm", "hurt myself",
    "hurting myself", "cutting", "take my life", "no reason to live"
]

# I am defining a helper function to compute a safety flag for one text
def compute_safety_flag(text, base_flag=0):
    """
    base_flag:
        0 = general
        1 = mental-health related
        2 = crisis (always stays 2)
    """
    # I am cleaning the text first so detection is more reliable
    cleaned = clean_text(text).lower()
    
    # If the base flag is already 2, I am keeping it as crisis
    if base_flag == 2:
        return 2
    
    # I am checking if any crisis keyword appears in the text
    for kw in CRISIS_KEYWORDS:
        if kw in cleaned:
            return 2
    
    # If no crisis keyword was found, I am returning the base flag
    return base_flag

# I am doing a quick test of the safety flag function
print(compute_safety_flag("Today was hard, I feel anxious.", base_flag=1))       # expect 1
print(compute_safety_flag("I want to end my life tonight.", base_flag=1))        # expect 2
print(compute_safety_flag("This is a normal conversation.", base_flag=0))        # expect 0

1
2
0


# Normalize GoEmotions into the buckets (first real dataset)

## Inspect GoEmotions columns and adapt

In [8]:
# I am checking the first few rows and columns of the GoEmotions train split
go_train = pd.read_csv(GOEMO_DIR / "train.tsv", sep="\t", header=None)
go_dev = pd.read_csv(GOEMO_DIR / "dev.tsv", sep="\t", header=None)
go_test = pd.read_csv(GOEMO_DIR / "test.tsv", sep="\t", header=None)

print("Train shape:", go_train.shape)
print("First 5 rows of train:")
display(go_train.head())

print("Dev shape:", go_dev.shape)
display(go_dev.head())

print("Test shape:", go_test.shape)
display(go_test.head())

Train shape: (43410, 3)
First 5 rows of train:


Unnamed: 0,0,1,2
0,My favourite food is anything I didn't have to...,27,eebbqej
1,"Now if he does off himself, everyone will thin...",27,ed00q6i
2,WHY THE FUCK IS BAYLESS ISOING,2,eezlygj
3,To make her feel threatened,14,ed7ypvh
4,Dirty Southern Wankers,3,ed0bdzj


Dev shape: (5426, 3)


Unnamed: 0,0,1,2
0,Is this in New Orleans?? I really feel like th...,27,edgurhb
1,"You know the answer man, you are programmed to...",427,ee84bjg
2,I've never been this sad in my life!,25,edcu99z
3,The economy is heavily controlled and subsidiz...,427,edc32e2
4,He could have easily taken a real camera from ...,20,eepig6r


Test shape: (5427, 3)


Unnamed: 0,0,1,2
0,I’m really sorry about your situation :( Altho...,25,eecwqtt
1,It's wonderful because it's awful. At not with.,0,ed5f85d
2,"Kings fan here, good luck to you guys! Will be...",13,een27c3
3,"I didn't know that, thank you for teaching me ...",15,eelgwd1
4,They got bored from haunting earth for thousan...,27,eem5uti


## Normalize GoEmotions (train/dev/test)

In [9]:
# I am reusing the loaded splits (go_train, go_dev, go_test) and combining them
go_all = pd.concat([go_train, go_dev, go_test], ignore_index=True)

# I am giving temporary column names so the code is easier to read
go_all = go_all.rename(columns={0: "user_message", 1: "label_id", 2: "comment_id"})

# I am cleaning the text in the user_message column
go_all["user_message"] = go_all["user_message"].apply(clean_text)

# I am creating a temporary atlas_emotion from the numeric label_id
# (Later I will map these ids into the ATLAS emotion names)
go_all["atlas_emotion"] = go_all["label_id"].astype(str)

# I am adding empty need and strategy columns for now
go_all["need"] = None
go_all["strategy"] = None

# I am setting the source name for traceability
go_all["source"] = "goemotions"

# GoEmotions is general conversation, so I am using base safety = 0
go_all["safety_flag"] = go_all["user_message"].apply(lambda x: compute_safety_flag(x, base_flag=0))

# I am selecting only the columns that belong in the classification bucket
go_norm = go_all[["user_message", "atlas_emotion", "need", "strategy", "safety_flag", "source"]]

# I am appending the normalized GoEmotions data to the classification bucket
classification_bucket = pd.concat([classification_bucket, go_norm], ignore_index=True)

# I am checking the shapes to confirm the number of rows added
go_norm.shape, classification_bucket.shape

((54263, 6), (54263, 6))

# Normalize DailyDialog

## DailyDialog contains dialogue exchanges, but your assistant is message-centered, so you will extract:

* each utterance
* its emotion label
* and treat it as a standalone user message

DailyDialog files include:

1. train.csv
2. validation.csv
3. test.csv

## Inspect DailyDialog columns and sample rows

In [11]:
# I am reloading the DailyDialog splits to inspect their structure
dd_train = pd.read_csv(DAILYDIALOG_DIR / "train.csv")
dd_valid = pd.read_csv(DAILYDIALOG_DIR / "validation.csv")
dd_test = pd.read_csv(DAILYDIALOG_DIR / "test.csv")

print("Train columns:", dd_train.columns.tolist())
print("Train shape:", dd_train.shape)
print("First 3 rows of train:")
display(dd_train.head(3))

print("Validation columns:", dd_valid.columns.tolist())
print("First 3 rows of validation:")
display(dd_valid.head(3))

print("Test columns:", dd_test.columns.tolist())
print("First 3 rows of test:")
display(dd_test.head(3))

Train columns: ['dialog', 'act', 'emotion']
Train shape: (11118, 3)
First 3 rows of train:


Unnamed: 0,dialog,act,emotion
0,"['Say , Jim , how about going for a few beers ...",[3 4 2 2 2 3 4 1 3 4],[0 0 0 0 0 0 4 4 4 4]
1,"['Can you do push-ups ? '\n "" Of course I can ...",[2 1 2 2 1 1],[0 0 6 0 0 0]
2,"['Can you study with the radio on ? '\n ' No ,...",[2 1 2 1 1],[0 0 0 0 0]


Validation columns: ['dialog', 'act', 'emotion']
First 3 rows of validation:


Unnamed: 0,dialog,act,emotion
0,"['Good morning , sir . Is there a bank near he...",[2 1 3 2 1 2 1],[0 0 0 0 0 0 0]
1,['Good afternoon . This is Michelle Li speakin...,[2 1 1 1 1 2 3 2 3 4],[0 0 0 0 0 0 0 0 0 0]
2,['What qualifications should a reporter have ?...,[2 1 2 1],[0 0 0 0]


Test columns: ['dialog', 'act', 'emotion']
First 3 rows of test:


Unnamed: 0,dialog,act,emotion
0,"['Hey man , you wanna buy some weed ? ' ' Some...",[3 2 3 4 3 4 3 2 3 4 2 3],[0 6 0 0 0 0 0 0 0 0 3 0]
1,['The taxi drivers are on strike again . ' ' W...,[1 2 1 1],[0 0 0 0]
2,"[""We've managed to reduce our energy consumpti...",[1 2 1 2 1 2 1],[0 0 0 0 0 0 0]


## Normalize DailyDialog into classification_bucket

### This will:

1. Parse the dialog string into a real Python list.
2. Parse the emotion string into a list of ids.
3. Pair each utterance with its emotion.
4. Clean the text and compute safety flags.
5. Append to classification_bucket.

In [12]:
import ast

# I am reloading DailyDialog splits with the correct columns
dd_train = pd.read_csv(DAILYDIALOG_DIR / "train.csv")
dd_valid = pd.read_csv(DAILYDIALOG_DIR / "validation.csv")
dd_test = pd.read_csv(DAILYDIALOG_DIR / "test.csv")

# I am combining all splits into one DataFrame
dd_all = pd.concat([dd_train, dd_valid, dd_test], ignore_index=True)

# I am defining helpers to parse the dialog and emotion columns

def parse_dialog_list(s):
    # I am converting the string representation of a list into a real Python list
    try:
        items = ast.literal_eval(s)
    except Exception:
        return []
    # I am cleaning each utterance and keeping only non-empty ones
    return [clean_text(x) for x in items if clean_text(x) != ""]

def parse_emotion_list(s):
    # I am converting strings like "[0 0 0 4]" into ["0","0","0","4"]
    s = str(s).strip()
    if s.startswith("[") and s.endswith("]"):
        s = s[1:-1]
    if not s:
        return []
    return s.split()

# I am applying the parsers
dd_all["utterances"] = dd_all["dialog"].apply(parse_dialog_list)
dd_all["emotion_ids"] = dd_all["emotion"].apply(parse_emotion_list)

# I am expanding each conversation into separate rows
expanded_rows = []

for _, row in dd_all.iterrows():
    utts = row["utterances"]
    emos = row["emotion_ids"]
    
    for i, utt in enumerate(utts):
        emo = emos[i] if i < len(emos) else None
        
        expanded_rows.append({
            "user_message": utt,
            "atlas_emotion": emo,   # I will map these ids to ATLAS later
            "need": None,
            "strategy": None,
            "safety_flag": compute_safety_flag(utt, base_flag=0),
            "source": "dailydialog"
        })

dd_norm = pd.DataFrame(expanded_rows)

# I am appending DailyDialog to the classification bucket
classification_bucket = pd.concat([classification_bucket, dd_norm], ignore_index=True)

# I am checking the shapes to confirm processing
dd_norm.shape, classification_bucket.shape

((13118, 6), (67381, 6))

## Normalize EmpatheticDialogues (Facebook AI) into both classification

## This dataset is extremely important because it contributes to TWO buckets:

1. classification_bucket: Emotion classifier training (user_message → emotion)

2. response_bucket: Paired data for the model to learn emotional + empathetic responses
(user_message → bot_reply)

### Inspect EmpatheticDialogues structure

In [13]:
# I am loading the EmpatheticDialogues dataset to inspect structure
emp = pd.read_csv(EMP_DIALOG_DIR / "emotion_emotion_69k.csv")

print("Columns:", emp.columns.tolist())
print("Shape:", emp.shape)
display(emp.head())

Columns: ['Unnamed: 0', 'Situation', 'emotion', 'empathetic_dialogues', 'labels', 'Unnamed: 5', 'Unnamed: 6']
Shape: (64636, 7)


Unnamed: 0.1,Unnamed: 0,Situation,emotion,empathetic_dialogues,labels,Unnamed: 5,Unnamed: 6
0,0,I remember going to the fireworks with my best...,sentimental,Customer :I remember going to see the firework...,"Was this a friend you were in love with, or ju...",,
1,1,I remember going to the fireworks with my best...,sentimental,Customer :This was a best friend. I miss her.\...,Where has she gone?,,
2,2,I remember going to the fireworks with my best...,sentimental,Customer :We no longer talk.\nAgent :,Oh was this something that happened because of...,,
3,3,I remember going to the fireworks with my best...,sentimental,Customer :Was this a friend you were in love w...,This was a best friend. I miss her.,,
4,4,I remember going to the fireworks with my best...,sentimental,Customer :Where has she gone?\nAgent :,We no longer talk.,,


## For classification_bucket

We want: user_message → atlas_emotion

This will come from: 
* Situation as the user’s initial emotional statement
* emotion as the label
* Plus also splitting the empathetic_dialogues text into user turns

## For response_bucket

I am going to paired data:

* user_message = Situation
* bot_reply = labels
* emotion = emotion

## Normalize EmpatheticDialogues into both buckets

In [14]:
# I am loading EmpatheticDialogues dataset
emp = pd.read_csv(EMP_DIALOG_DIR / "emotion_emotion_69k.csv")

# I am cleaning column names and dropping unused columns
emp = emp.rename(columns={
    "Situation": "situation",
    "emotion": "emotion_label",
    "empathetic_dialogues": "dialog_text",
    "labels": "reply_text"
})

emp = emp[["situation", "emotion_label", "dialog_text", "reply_text"]]

# I am cleaning all text columns
for col in ["situation", "dialog_text", "reply_text"]:
    emp[col] = emp[col].apply(clean_text)

# 1. ADD TO CLASSIFICATION BUCKET

# I am using the "situation" as the user_message for emotion classification
emp_class = emp.copy()
emp_class["user_message"] = emp_class["situation"]
emp_class["atlas_emotion"] = emp_class["emotion_label"]
emp_class["need"] = None
emp_class["strategy"] = None
emp_class["source"] = "empathetic_dialogues"
emp_class["safety_flag"] = emp_class["user_message"].apply(lambda x: compute_safety_flag(x, base_flag=1))

emp_class_norm = emp_class[["user_message", "atlas_emotion", "need", "strategy", "safety_flag", "source"]]

classification_bucket = pd.concat([classification_bucket, emp_class_norm], ignore_index=True)

# 2. ADD TO RESPONSE BUCKET

# I am using: user_message = Situation, bot_reply = labels
emp_resp = emp.copy()
emp_resp["user_message"] = emp_resp["situation"]
emp_resp["bot_reply"] = emp_resp["reply_text"]
emp_resp["atlas_emotion"] = emp_resp["emotion_label"]
emp_resp["need"] = None
emp_resp["strategy"] = None
emp_resp["source"] = "empathetic_dialogues"
emp_resp["safety_flag"] = emp_resp["user_message"].apply(lambda x: compute_safety_flag(x, base_flag=1))

emp_resp_norm = emp_resp[["user_message", "bot_reply", "atlas_emotion", "need", "strategy", "safety_flag", "source"]]

response_bucket = pd.concat([response_bucket, emp_resp_norm], ignore_index=True)

# I am showing shapes to confirm processing
emp_class_norm.shape, classification_bucket.shape, emp_resp_norm.shape, response_bucket.shape

((64636, 6), (132017, 6), (64636, 7), (64636, 7))

## Normalize Mental Health Counseling Conversations

## This dataset is incredibly important because:

1. it improves the emotion classifier
2. it enriches the response generator with real counseling patterns
3. it helps the safety classifier understand “distress but not suicidal” content

### Inspect the JSON structure

In [16]:
# I am printing the first few lines of the JSON file to understand its format
with open(MH_COUNSEL_DIR / "combined_dataset.json", "r") as f:
    for i in range(10):
        line = f.readline()
        print(f"LINE {i+1}: {line[:200]}")

LINE 1: {"Context":"I'm going through some things with my feelings and myself. I barely sleep and I do nothing but think about how I'm worthless and how I shouldn't be here.\n   I've never tried or contemplat
LINE 2: {"Context":"I'm going through some things with my feelings and myself. I barely sleep and I do nothing but think about how I'm worthless and how I shouldn't be here.\n   I've never tried or contemplat
LINE 3: {"Context":"I'm going through some things with my feelings and myself. I barely sleep and I do nothing but think about how I'm worthless and how I shouldn't be here.\n   I've never tried or contemplat
LINE 4: {"Context":"I'm going through some things with my feelings and myself. I barely sleep and I do nothing but think about how I'm worthless and how I shouldn't be here.\n   I've never tried or contemplat
LINE 5: {"Context":"I'm going through some things with my feelings and myself. I barely sleep and I do nothing but think about how I'm worthless and how I shouldn't

## The dataset contains only two-column, representing:

* Context → the seeker’s message
* Context_Response → the counselor’s message

This fits perfectly for:

1. classification_bucket: (user_message = Context)
2. response_bucket: (user_message = Context, bot_reply = Context_Response)
3. safety_bucket: (Context contains heavy distress → base safety 1, and many lines might escalate to 2)

### What I will produce from this dataset:

1. classification_bucket rows

From every seeker message:
* user_message = text
* atlas_emotion = None (later from emotion model)
* need = None
* strategy = None
* safety_flag = 1  (mental health content)
* source = mental_counseling


2. response_bucket rows

Pair seeker → counselor:
* user_message = seeker message
* bot_reply = counselor message
* atlas_emotion = None
* need = None
* strategy = None
* safety_flag = computed from user message
* source = mental_counseling

## Load the JSONL mental health dataset

In [17]:
import json

mh_rows = []

# I am reading the JSONL file line by line
with open(MH_COUNSEL_DIR / "combined_dataset.json", "r") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        
        try:
            obj = json.loads(line)
            mh_rows.append(obj)
        except Exception as e:
            print("Error parsing line:", line[:200])
            continue

len(mh_rows), mh_rows[0]

(3512,
 {'Context': "I'm going through some things with my feelings and myself. I barely sleep and I do nothing but think about how I'm worthless and how I shouldn't be here.\n   I've never tried or contemplated suicide. I've always wanted to fix my issues, but I never get around to it.\n   How can I change my feeling of being worthless to everyone?",
  'Response': "If everyone thinks you're worthless, then maybe you need to find new people to hang out with.Seriously, the social context in which a person lives is a big influence in self-esteem.Otherwise, you can go round and round trying to understand why you're not worthless, then go back to the same crowd and be knocked down again.There are many inspirational messages you can find in social media. \xa0Maybe read some of the ones which state that no person is worthless, and that everyone has a good purpose to their life.Also, since our culture is so saturated with the belief that if someone doesn't feel good about themselves that this

### Normalize Mental Health Counseling into all 3 buckets (adding context messages to the safety_bucket to strengthen crisis detection)

In [18]:
import pandas as pd

# I am converting the JSONL records into a tidy DataFrame
mh_df = pd.DataFrame(mh_rows)

# I am cleaning all text entries
mh_df["Context"] = mh_df["Context"].apply(clean_text)
mh_df["Response"] = mh_df["Response"].apply(clean_text)

# 1. ADD TO CLASSIFICATION BUCKET

mh_class = pd.DataFrame({
    "user_message": mh_df["Context"],
    "atlas_emotion": None,
    "need": None,
    "strategy": None,
    "safety_flag": mh_df["Context"].apply(lambda x: compute_safety_flag(x, base_flag=1)),
    "source": "mental_counseling"
})

classification_bucket = pd.concat([classification_bucket, mh_class], ignore_index=True)

# 2. ADD TO RESPONSE BUCKET

mh_resp = pd.DataFrame({
    "user_message": mh_df["Context"],
    "bot_reply": mh_df["Response"],
    "atlas_emotion": None,
    "need": None,
    "strategy": None,
    "safety_flag": mh_df["Context"].apply(lambda x: compute_safety_flag(x, base_flag=1)),
    "source": "mental_counseling"
})

response_bucket = pd.concat([response_bucket, mh_resp], ignore_index=True)

# 3. ADD TO SAFETY BUCKET

mh_safety = pd.DataFrame({
    "user_message": mh_df["Context"],
    "safety_flag": mh_df["Context"].apply(lambda x: compute_safety_flag(x, base_flag=1)),
    "source": "mental_counseling"
})

safety_bucket = pd.concat([safety_bucket, mh_safety], ignore_index=True)

# Confirm shapes

mh_class.shape, classification_bucket.shape, mh_resp.shape, response_bucket.shape, mh_safety.shape, safety_bucket.shape

((3512, 6), (135529, 6), (3512, 7), (68148, 7), (3512, 3), (3512, 3))

In [19]:
# I am checking the column names of each bucket
print("classification_bucket columns:", classification_bucket.columns.tolist())
print("response_bucket columns:", response_bucket.columns.tolist())
print("safety_bucket columns:", safety_bucket.columns.tolist())

classification_bucket columns: ['user_message', 'atlas_emotion', 'need', 'strategy', 'safety_flag', 'source']
response_bucket columns: ['user_message', 'bot_reply', 'atlas_emotion', 'need', 'strategy', 'safety_flag', 'source']
safety_bucket columns: ['user_message', 'safety_flag', 'source']


## Suicide and Depression Detection dataset

### Inspect Suicide and Depression Detection dataset

How I will use this dataset:

1. classification_bucket

Keep all messages: user_message - atlas_emotion - need	strategy - safety_flag - source

* atlas_emotion = None
* need and strategy = None
* safety_flag =

     * 2: if labeled suicidal
     * 1: otherwise

2. safety_bucket

Only the message itself: user_message - safety_flag - source 

3. response_bucket: This dataset does NOT provide bot replies, so I will not add anything to response_bucket.

In [20]:
# I am loading the suicide detection dataset to inspect structure
sui = pd.read_csv(SUICIDE_DIR / "Suicide_Detection.csv")

print("Columns:", sui.columns.tolist())
print("Shape:", sui.shape)
display(sui.head())

Columns: ['Unnamed: 0', 'text', 'class']
Shape: (232074, 3)


Unnamed: 0.1,Unnamed: 0,text,class
0,2,Ex Wife Threatening SuicideRecently I left my ...,suicide
1,3,Am I weird I don't get affected by compliments...,non-suicide
2,4,Finally 2020 is almost over... So I can never ...,non-suicide
3,8,i need helpjust help me im crying so hard,suicide
4,9,"I’m so lostHello, my name is Adam (16) and I’v...",suicide


### Suicide and Depression Detection dataset contains:

Columns:
* "text" → the user message
* "class" → either "suicide" or "non-suicide"
* "Unnamed: 0" → ignore

Shape:
232,074 rows — large and very useful for training safety detection

This is excellent for strengthening your safety classifier and for balancing distress signals in classification.

### Normalize Suicide and Depression Detection dataset

This dataset goes into:

1. classification_bucket: For general emotional classification and safety training.
2. safety_bucket: For crisis detection.
3. NOT included in response_bucket: (no bot reply available)

In [23]:
# I am cleaning the suicide detection dataset

# Keeping only the relevant columns
sui = sui.rename(columns={"text": "user_message", "class": "label"})
sui["user_message"] = sui["user_message"].apply(clean_text)

# I am computing a safety flag based on the dataset label
def suicide_label_to_flag(label):
    if label == "suicide":
        return 2
    return 1   # non-suicide still indicates mental health content

sui["safety_flag"] = sui["label"].apply(suicide_label_to_flag)

# 1. ADD TO CLASSIFICATION BUCKET

sui_class = pd.DataFrame({
    "user_message": sui["user_message"],
    "atlas_emotion": None,
    "need": None,
    "strategy": None,
    "safety_flag": sui["safety_flag"],
    "source": "suicide_detection"
})

classification_bucket = pd.concat([classification_bucket, sui_class], ignore_index=True)

# 2. ADD TO SAFETY BUCKET

sui_safety = pd.DataFrame({
    "user_message": sui["user_message"],
    "safety_flag": sui["safety_flag"],
    "source": "suicide_detection"
})

safety_bucket = pd.concat([safety_bucket, sui_safety], ignore_index=True)

# Confirm shapes

sui_class.shape, classification_bucket.shape, sui_safety.shape, safety_bucket.shape

((232074, 6), (367603, 6), (232074, 3), (235586, 3))

## MentalChat_16K (Synthetic_Data_10K + Interview_Data_6K) Dataset

In [24]:
import pandas as pd

syn = pd.read_csv(MENTALCHAT_DIR / "Synthetic_Data_10K.csv")
intv = pd.read_csv(MENTALCHAT_DIR / "Interview_Data_6K.csv")

print("Synthetic Data Columns:", syn.columns.tolist())
print("Interview Data Columns:", intv.columns.tolist())

display(syn.head())
display(intv.head())

Synthetic Data Columns: ['instruction', 'input', 'output']
Interview Data Columns: ['instruction', 'input', 'output']


Unnamed: 0,instruction,input,output
0,You are a helpful mental health counselling as...,I think I might be developing a substance abus...,I'm really glad that you reached out and share...
1,You are a helpful mental health counselling as...,Parenting has become such a challenge for me. ...,I can understand how challenging parenting can...
2,You are a helpful mental health counselling as...,Intimacy has always been a struggle for me. I ...,I can understand how challenging it must be fo...
3,You are a helpful mental health counselling as...,I've been struggling with substance abuse for ...,I'm really glad that you reached out and share...
4,You are a helpful mental health counselling as...,Being a parent is overwhelming and exhausting....,Parenting can definitely be overwhelming and e...


Unnamed: 0,instruction,input,output
0,You are a helpful mental health counselling as...,I've been struggling with my mental health for...,I understand that you've been dealing with a s...
1,You are a helpful mental health counselling as...,I've been feeling overwhelmed with my caregivi...,"Your situation is complex, and it's important ..."
2,You are a helpful mental health counselling as...,I've been feeling constantly anxious and unabl...,I can see that you're dealing with a great dea...
3,You are a helpful mental health counselling as...,"My mom has Alzheimer's, and I've been her prim...",I'm sorry to hear that your siblings' demands ...
4,You are a helpful mental health counselling as...,"I've tried setting boundaries, but it feels li...","Your concerns are valid, and it's crucial to p..."


In [25]:
print(syn.columns)
print(intv.columns)

Index(['instruction', 'input', 'output'], dtype='object')
Index(['instruction', 'input', 'output'], dtype='object')


## How I will use this dataset:
1. classification_bucket: Use each input message as emotional training data.

Keep all messages with the following fields:
* user_message = input
* atlas_emotion = None
* need = None
* strategy = None
* safety_flag = computed from the user message
* source = mentalchat_16k

2. response_bucket: This dataset provides supervised pairs of (input → output).
For each record:
* user_message = input
* bot_reply = output
* atlas_emotion = None
* need = None
* strategy = None
* safety_flag = computed from the user message
* source = mentalchat_16k

3. safety_bucket: Because many MentalChat inputs contain distress signals, I will also store each input inside the safety bucket.
* user_message = input
* safety_flag = computed from the user message
* source = mentalchat_16k

## Normalize MentalChat 16K (Synthetic + Interview)


In [26]:
# Normalize MentalChat 16K (Synthetic + Interview)

def normalize_mentalchat(df, source_name):
    # Clean text
    df["instruction"] = df["instruction"].apply(clean_text)
    df["input"] = df["input"].apply(clean_text)
    df["output"] = df["output"].apply(clean_text)
    
    # Compute safety flag based on the input message
    df["safety_flag"] = df["input"].apply(lambda x: compute_safety_flag(x, base_flag=0))
    
    # 1. Add to CLASSIFICATION BUCKET 
    mc_class = pd.DataFrame({
        "user_message": df["input"],
        "atlas_emotion": None,
        "need": None,
        "strategy": None,
        "safety_flag": df["safety_flag"],
        "source": source_name
    })
    
    # 2. Add to RESPONSE BUCKET 
    mc_resp = pd.DataFrame({
        "user_message": df["input"],
        "bot_reply": df["output"],
        "atlas_emotion": None,
        "need": None,
        "strategy": None,
        "safety_flag": df["safety_flag"],
        "source": source_name
    })
    
    # 3. Add to SAFETY BUCKET 
    mc_safety = pd.DataFrame({
        "user_message": df["input"],
        "safety_flag": df["safety_flag"],
        "source": source_name
    })
    
    return mc_class, mc_resp, mc_safety


# Process Synthetic 10k
mc_syn_class, mc_syn_resp, mc_syn_safety = normalize_mentalchat(syn, "mentalchat_synthetic")

classification_bucket = pd.concat([classification_bucket, mc_syn_class], ignore_index=True)
response_bucket = pd.concat([response_bucket, mc_syn_resp], ignore_index=True)
safety_bucket = pd.concat([safety_bucket, mc_syn_safety], ignore_index=True)

# Process Interview 6k
mc_int_class, mc_int_resp, mc_int_safety = normalize_mentalchat(intv, "mentalchat_interview")

classification_bucket = pd.concat([classification_bucket, mc_int_class], ignore_index=True)
response_bucket = pd.concat([response_bucket, mc_int_resp], ignore_index=True)
safety_bucket = pd.concat([safety_bucket, mc_int_safety], ignore_index=True)

# Show final shapes
classification_bucket.shape, response_bucket.shape, safety_bucket.shape

((383687, 6), (84232, 7), (251670, 3))

## Summary of normalization for all 6 external datasets

This section documents how each raw dataset is mapped into the three master buckets:
- classification_bucket  (user_message + labels/features)
- response_bucket        (user_message, bot_reply + labels/features)
- safety_bucket          (messages used for safety / crisis detection)

---

1. GoEmotions (Reddit comments)

* Source files: train.tsv, dev.tsv, test.tsv
* I combined all splits into a single DataFrame.
* For each row I used:
  - user_message = cleaned comment text
  - atlas_emotion = string version of the numeric label_id (temporary, to be mapped to ATLAS later)
  - need = None
  - strategy = None
  - safety_flag = computed from user_message with base_flag = 0
  - source = goemotions
* Buckets:
  - Added rows only to classification_bucket.
  - Did not add explicit rows to response_bucket or safety_bucket for this dataset.

---

2. DailyDialog (everyday conversations)

* Source files: train.csv, validation.csv, test.csv
* Each row contains:
  - dialog  = list of utterances as a string
  - act     = dialog act sequence (not used yet)
  - emotion = emotion id sequence (not used yet)
* Normalization:
  - I parsed the dialog field into a list of utterances.
  - I expanded each conversation into one row per utterance.
  - For now I kept:
    - user_message = single utterance text
    - atlas_emotion = None (emotion ids kept for later mapping)
    - need = None
    - strategy = None
    - safety_flag = computed from user_message with base_flag = 0
    - source = dailydialog
* Buckets:
  - Added rows only to classification_bucket.

---

3. EmpatheticDialogues (Facebook AI)

* Source file: emotion_emotion_69k.csv
* I renamed columns:
  - "Situation" -> situation
  - "emotion"  -> emotion_label
  - "empathetic_dialogues" -> dialog_text
  - "labels"   -> reply_text
* Normalization for classification:
  - user_message = situation
  - atlas_emotion = emotion_label (as string)
  - need = None
  - strategy = None
  - safety_flag = computed from user_message with base_flag = 1
  - source = empathetic_dialogues
* Normalization for response:
  - user_message = situation
  - bot_reply = reply_text
  - atlas_emotion = emotion_label
  - need = None
  - strategy = None
  - safety_flag = computed from user_message with base_flag = 1
  - source = empathetic_dialogues
* Buckets:
  - Added rows to classification_bucket and response_bucket.
  - No separate rows in safety_bucket (safety_flag is still stored as a feature).

---

4. Mental Health Counseling Conversations (JSONL)

* Source file: combined_dataset.json (JSONL format)
* Each line contains:
  - "Context"  = seeker’s message
  - "Response" = counselor’s message
* After loading line by line, I created a DataFrame with:
  - Context, Response
  - Both fields cleaned with clean_text.
* classification_bucket rows:
  - user_message = Context
  - atlas_emotion = None
  - need = None
  - strategy = None
  - safety_flag = computed from Context with base_flag = 1
  - source = mental_counseling
* response_bucket rows:
  - user_message = Context
  - bot_reply = Response
  - atlas_emotion = None
  - need = None
  - strategy = None
  - safety_flag = computed from Context with base_flag = 1
  - source = mental_counseling
* safety_bucket rows:
  - user_message = Context
  - safety_flag = computed from Context with base_flag = 1
  - source = mental_counseling

---

5. Suicide and Depression Detection dataset

* Source file: Suicide_Detection.csv
* Columns used: "text", "class"
* I renamed:
  - text  -> user_message
  - class -> label
* Safety mapping:
  - safety_flag = 2 if label == "suicide"
  - safety_flag = 1 otherwise (non-suicidal mental-health content)
* classification_bucket rows:
  - user_message = cleaned text
  - atlas_emotion = None
  - need = None
  - strategy = None
  - safety_flag = mapped from label
  - source = suicide_detection
* safety_bucket rows:
  - user_message = cleaned text
  - safety_flag = mapped from label
  - source = suicide_detection
* No rows added to response_bucket (there are no bot replies).

---

6. MentalChat 16K (Synthetic + Interview)

* Source files:
  - Synthetic_Data_10K.csv
  - Interview_Data_6K.csv
* Columns: instruction, input, output
  - instruction = system / task instruction
  - input       = user message
  - output      = counselor / assistant reply
* For both files I:
  - cleaned instruction, input, and output with clean_text
  - computed safety_flag from input with base_flag = 0
* classification_bucket rows:
  - user_message = input
  - atlas_emotion = None
  - need = None
  - strategy = None
  - safety_flag = computed from input
  - source = mentalchat_synthetic or mentalchat_interview
* response_bucket rows:
  - user_message = input
  - bot_reply = output
  - atlas_emotion = None
  - need = None
  - strategy = None
  - safety_flag = computed from input
  - source = mentalchat_synthetic or mentalchat_interview
* safety_bucket rows:
  - user_message = input
  - safety_flag = computed from input
  - source = mentalchat_synthetic or mentalchat_interview

---

## Current bucket sizes after normalizing all 6 datasets

* classification_bucket.shape = (838687, 6)
* response_bucket.shape       = (843232, 7)
* safety_bucket.shape         = (251670, 3)

These shapes confirm that all datasets have been successfully merged into the three master buckets using a consistent schema.

### Create the export folder (normalized_dataset)

In [27]:
# I am creating the export directory inside 1_Datasets
EXPORT_DIR = BASE_DIR / "normalized_dataset"
EXPORT_DIR.mkdir(exist_ok=True)

print("Export directory created at:", EXPORT_DIR)

Export directory created at: /Users/jorgemartinez/Desktop/7_FullStack/Final_Project/1_Datasets/normalized_dataset


### Export the 3 normalized bucket files as CSV

In [28]:
# I am exporting the three buckets as CSV files
classification_path = EXPORT_DIR / "classification_bucket.csv"
response_path = EXPORT_DIR / "response_bucket.csv"
safety_path = EXPORT_DIR / "safety_bucket.csv"

classification_bucket.to_csv(classification_path, index=False)
response_bucket.to_csv(response_path, index=False)
safety_bucket.to_csv(safety_path, index=False)

print("Files exported:")
print("-", classification_path)
print("-", response_path)
print("-", safety_path)

Files exported:
- /Users/jorgemartinez/Desktop/7_FullStack/Final_Project/1_Datasets/normalized_dataset/classification_bucket.csv
- /Users/jorgemartinez/Desktop/7_FullStack/Final_Project/1_Datasets/normalized_dataset/response_bucket.csv
- /Users/jorgemartinez/Desktop/7_FullStack/Final_Project/1_Datasets/normalized_dataset/safety_bucket.csv
