In [1]:
#%python -m venv myenv

# Activate the virtual environment
#source myenv/bin/activate  # On Unix/Linux
# %./myenv/Scripts/activate  # On Windows

# # Install required packages
# %pip install torch torchvision

%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124 tf-keras transformers datasets evaluate scikit-learn transformers[torch] --quiet

import pandas as pd
import torch
from collections import Counter
from datasets import load_dataset
from sklearn.preprocessing import LabelEncoder 
from transformers import AutoTokenizer
from transformers import AutoConfig

print(f"\nIs Cuda Available: {torch.cuda.is_available()}")
print(f"Cuda Version: {torch.version.cuda}")



Note: you may need to restart the kernel to use updated packages.

Is Cuda Available: True
Cuda Version: 12.4


In [2]:
# Load your CSV file
df = pd.read_csv("../data/raw/reddit_mental_health_dataset.csv")

# Replace common encoding issues
df['text'] = df['text'].str.replace("â€™", "'", regex=False)
df['text'] = df['text'].str.replace("â€œ", '"', regex=False)
df['text'] = df['text'].str.replace("â€�", '"', regex=False)
df['text'] = df['text'].str.replace("â€“", "-", regex=False)
df['text'] = df['text'].str.replace("â€˜", "'", regex=False)
df['text'] = df['text'].str.replace("Ã", " ", regex=False)
df['text'] = df['text'].str.replace("&amp", "&", regex=False)
df = df[df['text'].notna()]                # Drop NaN values
df = df[df['text'].str.strip() != ""]      # Drop empty/whitespace-only strings

# Save cleaned data
df.to_csv("../data/processed/cleaned_reddit_mental_health_dataset.csv", index=False)

In [3]:
# Fine-Tuning j-hartmann/emotion-english-distilroberta-base on Custom Emotion Dataset

# Load Dataset
dataset = load_dataset("csv", data_files="../data/processed/cleaned_reddit_mental_health_dataset.csv")
split_dataset = dataset["train"].train_test_split(test_size=0.2)

Generating train split: 0 examples [00:00, ? examples/s]

In [4]:
# model_name = "j-hartmann/emotion-english-distilroberta-base"
model_name = "SchuylerH/bert-multilingual-go-emtions"

config = AutoConfig.from_pretrained(model_name)
config.num_labels = 28
id2label = config.id2label
label2id = config.label2id
config.problem_type = "single_label_classification"

label2id['anger'] = 2
label2id['optimism'] = 20
del label2id['LABEL_2']

print(id2label)
print(label2id)

{0: 'admiration', 1: 'amusement', 2: 'anger', 3: 'annoyance', 4: 'approval', 5: 'caring', 6: 'confusion', 7: 'curiosity', 8: 'desire', 9: 'disappointment', 10: 'disapproval', 11: 'disgust', 12: 'embarrassment', 13: 'excitement', 14: 'fear', 15: 'gratitude', 16: 'grief', 17: 'joy', 18: 'love', 19: 'nervousness', 20: 'optimism', 21: 'pride', 22: 'realization', 23: 'relief', 24: 'remorse', 25: 'sadness', 26: 'surprise', 27: 'neutral'}
{'admiration': 0, 'amusement': 1, 'disapproval': 10, 'disgust': 11, 'embarrassment': 12, 'excitement': 13, 'fear': 14, 'gratitude': 15, 'grief': 16, 'joy': 17, 'love': 18, 'nervousness': 19, 'anger': 2, 'pride': 21, 'realization': 22, 'relief': 23, 'remorse': 24, 'sadness': 25, 'surprise': 26, 'neutral': 27, 'annoyance': 3, 'approval': 4, 'caring': 5, 'confusion': 6, 'curiosity': 7, 'desire': 8, 'disappointment': 9, 'optimism': 20}


In [None]:
# Map Custom Labels
# Example: converting mental health tags to Negative/Neutral/Positive
def map_labels(example):
    mapping = {
        0: "nervousness",   # Stress
        1: "sadness",   # Depression
        2: "neutral",    # Bipolar (mixed, acceptable)
        3: "confusion",   # Personality disorder
        4: "nervousness"    # Anxiety
    }
    example["label"] = label2id[mapping[example["target"]]]
    return example
split_dataset = split_dataset.map(map_labels)

Map:   0%|          | 0/4485 [00:00<?, ? examples/s]

Map:   0%|          | 0/1122 [00:00<?, ? examples/s]

In [6]:
# from datasets import ClassLabel

# # Create a ClassLabel object with all 28 label names from model config
# class_labels = ClassLabel(num_classes=28, names=list(label2id.keys()))

# split_dataset = split_dataset.cast_column("target", class_labels)
# print(split_dataset["train"]['target'])

In [None]:
# Tokenize
tokenizer = AutoTokenizer.from_pretrained(model_name)
def tokenize_function(example):
    return tokenizer(example['text'], truncation=True, padding="max_length")

tokenized_dataset = split_dataset.map(tokenize_function, batched=True)

print(Counter(split_dataset["train"]["label"]))

Map:   0%|          | 0/4485 [00:00<?, ? examples/s]

Map:   0%|          | 0/1122 [00:00<?, ? examples/s]

Counter({19: 1780, 25: 943, 27: 896, 6: 866})


In [8]:
# Load Pretrained Model
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback
import evaluate

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    config=config
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# 5. Training Setup
accuracy = evaluate.load("accuracy")

# print(accuracy)

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    preds = predictions.argmax(axis=1)
    return accuracy.compute(predictions=preds, references=labels)


# os.environ["WANDB_DISABLED"] = "true"

training_args = TrainingArguments(
    output_dir="../models/custom-emotion-model/results",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,  
    per_device_eval_batch_size=16,
    learning_rate = 2e-5,
    num_train_epochs=5,
    weight_decay=0.01,
    # warmup_ratio=0.1,
    # warmup_steps=100,
    lr_scheduler_type="linear",
    fp16=True,                     # Mixed precision
    logging_dir="../models/custom-emotion-model/logs",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",  
    greater_is_better=True,
    report_to=None            
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

# 6. Train the Model
trainer.train()




  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.813232,0.713904
2,0.867300,0.67038,0.776292
3,0.867300,0.7606,0.786096
4,0.311800,0.813664,0.816399
5,0.311800,0.921647,0.81016


TrainOutput(global_step=1405, training_loss=0.46544961623874, metrics={'train_runtime': 397.7429, 'train_samples_per_second': 56.381, 'train_steps_per_second': 3.532, 'total_flos': 5901642795110400.0, 'train_loss': 0.46544961623874, 'epoch': 5.0})

In [23]:
# Test on Sample

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)

text = "I am feeling anxious"
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
outputs = model(**inputs)
predicted_label = outputs.logits.argmax().item()

print(outputs.logits)
print(outputs.logits.argmax().item())

print("Predicted Emotion:", id2label[predicted_label])

tensor([[-5.4414, -3.6797, -4.5000, -5.2109, -4.3086, -2.9062, -1.1855, -4.1406,
         -2.9023, -1.6738, -4.5508, -2.3770, -0.7803, -2.9219, -0.1962, -3.5410,
         -1.4629, -2.7109, -5.7578,  6.9883, -3.2051, -2.6602, -3.3770, -1.4453,
         -3.2852,  0.9839, -2.8906, -2.0996]], device='cuda:0',
       grad_fn=<ToCopyBackward0>)
19
Predicted Emotion: nervousness


In [None]:
# # Save the Fine-Tuned Model
# model.save_pretrained("../models/custom-emotion-model")
# tokenizer.save_pretrained("../models/custom-emotion-model")

In [25]:
# Base Model for evaluation

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F

# Load model and tokenizer  
# model_name = "j-hartmann/emotion-english-distilroberta-base"
model_name = "SchuylerH/bert-multilingual-go-emtions"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model2 = AutoModelForSequenceClassification.from_pretrained(model_name)

# Move to GPU (if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model2.to(device)

# Run prediction on input text
text = "I am stressed out of my mind! I am done with this!"
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
outputs = model2(**inputs)

# Extract predicted label
logits = outputs.logits
probs = F.softmax(logits, dim=1)
print(probs)

predicted_class = torch.argmax(probs).item()
print(predicted_class)

# Get hardcoded label names for "SchuylerH/bert-multilingual-go-emtions"
label_names = ['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 
               'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 
               'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 
               'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 
               'relief', 'remorse', 'sadness', 'surprise', 'neutral']

positive_emotions = {"admiration","amusement", "approval", "caring", "excitement", "gratitude", "joy", "love", "optimism", "pride", "relief"}
negative_emotions = {"anger", "disappointment", "disapproval", "disgust", "embarrassment", "fear", "grief", "remorse", "sadness", "annoyance"}
neutral_emotions = {"neutral", "confusion", "curiosity", "desire", "realization", "surprise", "nervousness"}

def map_emotion(label):
    if label_names[label] in positive_emotions:
        return "Positive"
    elif label_names[label] in negative_emotions:
        return "Negative"
    else:
        return "Neutral"

print("Predicted emotion:", label_names[predicted_class])
print("Predicted emotion:", map_emotion(predicted_class))

tensor([[2.5253e-03, 4.5607e-04, 5.9474e-04, 7.8180e-04, 1.1512e-02, 9.5014e-03,
         5.5871e-04, 6.6844e-04, 5.8952e-04, 3.0818e-02, 2.9249e-04, 9.6733e-03,
         6.3770e-03, 2.1248e-03, 1.9716e-02, 3.7302e-04, 4.9179e-02, 3.7416e-03,
         3.3376e-04, 1.4382e-01, 3.7564e-04, 9.6262e-03, 4.5922e-03, 3.6339e-02,
         1.5353e-03, 6.4790e-01, 1.1208e-03, 4.8769e-03]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>)
25
Predicted emotion: sadness
Predicted emotion: Negative


In [None]:
#Evaluate the selected Model

from sklearn.metrics import accuracy_score, f1_score, classification_report
import torch
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification
import numpy as np

# print(tokenized_dataset)

# Ensure correct format
tokenized_dataset.set_format(type='torch', columns=['Unnamed: 0', 'text', 'title', 'target', 'label', 'input_ids', 'token_type_ids', 'attention_mask'])

eval_dataloader = DataLoader(tokenized_dataset['test'], batch_size=16)

model2.eval()
model2.to(device)

all_preds = []
all_labels = []

with torch.no_grad():
    for batch in eval_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model2(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

all_labels_list = list(map(lambda x: id2label[x], all_labels))
all_preds_list = list(map(lambda x: id2label[x], all_preds))


# Metrics (Not Viable with Current Dataset since dataset mapping can pertain to different emotions in certain cases (Stress : Neutral, Negative); 
# Rely on the Zero-Shot Classification Model Instead [Better Training Data])

print("True Labels:")
print(all_labels_list)
print("Predicted Labels:")
print(all_preds_list)

print("Accuracy:", accuracy_score(all_labels, all_preds))
print("F1 Score (macro):", f1_score(all_labels, all_preds, average='macro'))
print("\nClassification Report:\n", classification_report(all_labels, all_preds, labels=label_names,target_names=label_names))


True Labels:
['nervousness', 'confusion', 'confusion', 'confusion', 'nervousness', 'sadness', 'nervousness', 'nervousness', 'sadness', 'nervousness', 'nervousness', 'nervousness', 'nervousness', 'sadness', 'confusion', 'nervousness', 'neutral', 'nervousness', 'nervousness', 'nervousness', 'nervousness', 'sadness', 'neutral', 'confusion', 'neutral', 'confusion', 'nervousness', 'nervousness', 'nervousness', 'nervousness', 'sadness', 'sadness', 'neutral', 'nervousness', 'nervousness', 'sadness', 'nervousness', 'sadness', 'sadness', 'confusion', 'confusion', 'neutral', 'confusion', 'nervousness', 'nervousness', 'sadness', 'nervousness', 'nervousness', 'nervousness', 'neutral', 'nervousness', 'neutral', 'neutral', 'confusion', 'nervousness', 'nervousness', 'sadness', 'nervousness', 'nervousness', 'confusion', 'nervousness', 'confusion', 'confusion', 'confusion', 'sadness', 'confusion', 'nervousness', 'neutral', 'confusion', 'confusion', 'nervousness', 'neutral', 'nervousness', 'nervousness'

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Use the Zero-Shot Classification Model: "SchuylerH/bert-multilingual-go-emtions" to identify emotions of a user for project workflow
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F

# Load model
emotion_model_name = "SchuylerH/bert-multilingual-go-emtions"
emotion_tokenizer = AutoTokenizer.from_pretrained(emotion_model_name)
emotion_model = AutoModelForSequenceClassification.from_pretrained(emotion_model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
emotion_model.to(device)

# Emotion label mapping
emotion_label_names = ['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 
               'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 
               'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 
               'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 
               'relief', 'remorse', 'sadness', 'surprise', 'neutral']

# Superclass emotion grouping
positive_emotions = {"admiration","amusement", "approval", "caring", "excitement", "gratitude", "joy", "love", "optimism", "pride", "relief"}
negative_emotions = {"anger", "disappointment", "disapproval", "disgust", "embarrassment", "fear", "grief", "remorse", "sadness", "annoyance"}
neutral_emotions  = {"neutral", "confusion", "curiosity", "desire", "realization", "surprise", "nervousness"}

# Assign indices for each emotion into corresponding group
positive_idx = [i for i, label in enumerate(emotion_label_names) if label in positive_emotions]
negative_idx = [i for i, label in enumerate(emotion_label_names) if label in negative_emotions]
neutral_idx  = [i for i, label in enumerate(emotion_label_names) if label in neutral_emotions]

In [13]:
# Duplicate normalized superclass probabilities across 28 emotion labels for their corresponding superclass emotion grouping
def expand_superclass_probs(superclass_probs, num_labels=28):
    extended_superclass_probs = torch.zeros(num_labels)

    # Normalize the probability values
    superclass_probs = F.softmax(superclass_probs, dim=0)  

    for idx in negative_idx:
        extended_superclass_probs[idx] = superclass_probs[0] / len(negative_idx)
    for idx in neutral_idx:
        extended_superclass_probs[idx] = superclass_probs[1] / len(neutral_idx)
    for idx in positive_idx:
        extended_superclass_probs[idx] = superclass_probs[2] / len(positive_idx)

    return extended_superclass_probs

In [14]:
# Apply weights to both probabilities tensors and sum them for the final probability tensor
def merge_probs(primary_probs, secondary_probs, alpha=0.7):
    return alpha * primary_probs + (1 - alpha) * secondary_probs

In [15]:
# Classify emotion
def classify_emotion(final_probs):
    top_idx = torch.argmax(final_probs).item()
    emotion = emotion_label_names[top_idx]
    if top_idx in positive_idx:
        superclass = "Positive"
    elif top_idx in negative_idx:
        superclass = "Negative"
    else:
        superclass = "Neutral"
    return superclass, emotion, final_probs[top_idx].item()

In [16]:
# Inference
def run_emotion_pipeline(text, regressor_probs):
    inputs = emotion_tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
    outputs = emotion_model(**inputs)
    logits = outputs.logits
    primary_probs = F.softmax(logits, dim=1).squeeze()  # Desired shape: [28]

    secondary_probs = expand_superclass_probs(regressor_probs).to(primary_probs) # Desired shape: [28]
    final_probs = merge_probs(primary_probs, secondary_probs)

    superclass, sub_emotion, confidence = classify_emotion(final_probs)
    return {
        "superclass": superclass,
        "sub_emotion": sub_emotion,
        "confidence": confidence,
        "top_5_emotions": [(emotion_label_names[i], final_probs[i].item()) for i in torch.topk(final_probs, 5).indices]
    }

In [17]:
# Example usage
text_input = "My day was bad today, I almost got ran over by a car"

# Secondary model probability output (Regressor Model) to be used - Example value for now...
superclass_probs = torch.tensor([0.6, 0.3, 0.1], device=device)  # Negative, Neutral, Positive
result = run_emotion_pipeline(text_input, superclass_probs)

print("Emotion Classification Result:")
print(f"  Superclass: {result['superclass']}")
print(f"  Sub-emotion: {result['sub_emotion']} ({result['confidence']:.3f})")
print("  Top 5 Emotions:")
for label, prob in result["top_5_emotions"]:
    print(f"    - {label}: {prob:.4f}")

Emotion Classification Result:
  Superclass: Negative
  Sub-emotion: disappointment (0.327)
  Top 5 Emotions:
    - disappointment: 0.3271
    - annoyance: 0.1382
    - disgust: 0.0738
    - sadness: 0.0646
    - relief: 0.0476
