In [1]:
#%python -m venv myenv

# Activate the virtual environment
#source myenv/bin/activate  # On Unix/Linux
# %./myenv/Scripts/activate  # On Windows

# # Install required packages
# %pip install torch torchvision
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124 --quiet
%pip install tf-keras transformers datasets evaluate scikit-learn transformers[torch] --quiet

import pandas as pd
import torch
from collections import Counter
from datasets import load_dataset
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer
from transformers import AutoConfig

print(f"Is Cuda Available: {torch.cuda.is_available()}")
print(f"Cuda Version: {torch.version.cuda}")

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.
Is Cuda Available: True
Cuda Version: 12.4


In [2]:
# Load your CSV file
df = pd.read_csv("../data/raw/reddit_mental_health_dataset.csv")

# Replace common encoding issues
df['text'] = df['text'].str.replace("â€™", "'", regex=False)
df['text'] = df['text'].str.replace("â€œ", '"', regex=False)
df['text'] = df['text'].str.replace("â€�", '"', regex=False)
df['text'] = df['text'].str.replace("â€“", "-", regex=False)
df['text'] = df['text'].str.replace("â€˜", "'", regex=False)
df['text'] = df['text'].str.replace("Ã", " ", regex=False)
df['text'] = df['text'].str.replace("&amp", "&", regex=False)
df = df[df['text'].notna()]                # Drop NaN values
df = df[df['text'].str.strip() != ""]      # Drop empty/whitespace-only strings

# Save cleaned data
df.to_csv("../data/processed/cleaned_reddit_mental_health_dataset.csv", index=False)

In [3]:
# Fine-Tuning j-hartmann/emotion-english-distilroberta-base on Custom Emotion Dataset

# Load Dataset
dataset = load_dataset("csv", data_files="../data/processed/cleaned_reddit_mental_health_dataset.csv")  # Replace with your file
split_dataset = dataset["train"].train_test_split(test_size=0.2)

Generating train split: 0 examples [00:00, ? examples/s]

In [4]:
# model_name = "j-hartmann/emotion-english-distilroberta-base"
model_name = "SchuylerH/bert-multilingual-go-emtions"

config = AutoConfig.from_pretrained(model_name)
config.num_labels = 28
id2label = config.id2label
label2id = config.label2id
config.problem_type = "single_label_classification"

label2id['anger'] = 2
label2id['optimism'] = 20
del label2id['LABEL_2']


print(id2label)
print(label2id)

{0: 'admiration', 1: 'amusement', 2: 'anger', 3: 'annoyance', 4: 'approval', 5: 'caring', 6: 'confusion', 7: 'curiosity', 8: 'desire', 9: 'disappointment', 10: 'disapproval', 11: 'disgust', 12: 'embarrassment', 13: 'excitement', 14: 'fear', 15: 'gratitude', 16: 'grief', 17: 'joy', 18: 'love', 19: 'nervousness', 20: 'optimism', 21: 'pride', 22: 'realization', 23: 'relief', 24: 'remorse', 25: 'sadness', 26: 'surprise', 27: 'neutral'}
{'admiration': 0, 'amusement': 1, 'disapproval': 10, 'disgust': 11, 'embarrassment': 12, 'excitement': 13, 'fear': 14, 'gratitude': 15, 'grief': 16, 'joy': 17, 'love': 18, 'nervousness': 19, 'anger': 2, 'pride': 21, 'realization': 22, 'relief': 23, 'remorse': 24, 'sadness': 25, 'surprise': 26, 'neutral': 27, 'annoyance': 3, 'approval': 4, 'caring': 5, 'confusion': 6, 'curiosity': 7, 'desire': 8, 'disappointment': 9, 'optimism': 20}


In [5]:
# 2. Map Custom Labels
# Example: converting mental health tags to happy/neutral/sad
# Uncomment and modify if necessary:
def map_labels(example):
    mapping = {
        0: "annoyance",   # Stress
        1: "sadness",   # Depression
        2: "neutral",    # Bipolar (mixed, acceptable)
        3: "confusion",   # Personality disorder
        4: "nervousness"    # Anxiety
    }
    example["label"] = label2id[mapping[example["target"]]]
    return example
split_dataset = split_dataset.map(map_labels)

Map:   0%|          | 0/4485 [00:00<?, ? examples/s]

Map:   0%|          | 0/1122 [00:00<?, ? examples/s]

In [6]:
# from datasets import ClassLabel

# # Create a ClassLabel object with all 28 label names from model config
# class_labels = ClassLabel(num_classes=28, names=list(label2id.keys()))

# split_dataset = split_dataset.cast_column("target", class_labels)
# print(split_dataset["train"]['target'])

In [7]:
# 3. Tokenize
tokenizer = AutoTokenizer.from_pretrained(model_name)
def tokenize_function(example):
    return tokenizer(example['text'], truncation=True, padding="max_length")

tokenized_dataset = split_dataset.map(tokenize_function, batched=True)

print(Counter(split_dataset["train"]["label"]))

Map:   0%|          | 0/4485 [00:00<?, ? examples/s]

Map:   0%|          | 0/1122 [00:00<?, ? examples/s]

Counter({25: 950, 19: 926, 3: 876, 27: 873, 6: 860})


In [12]:
# 4. Load Pretrained Model
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    config=config
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# 5. Training Setup
from transformers import TrainingArguments, Trainer
import evaluate

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    preds = predictions.argmax(axis=1)
    return accuracy.compute(predictions=preds, references=labels)

import os
os.environ["WANDB_DISABLED"] = "true"

training_args = TrainingArguments(
    output_dir="../models/custom-emotion-model/results",
    eval_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate = 2e-5,
    num_train_epochs=5,
    weight_decay=0.01,
    # warmup_steps=100,
    lr_scheduler_type="linear",
    fp16=True,                     # Mixed precision
    logging_dir="../models/custom-emotion-model/logs",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# 6. Train the Model
trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2856,0.90052,0.688057
2,0.7185,0.771535,0.754902
3,0.4815,0.873209,0.773619
4,0.3115,1.179132,0.784314
5,0.2048,1.289922,0.778075


TrainOutput(global_step=2805, training_loss=0.5525451578558448, metrics={'train_runtime': 531.8364, 'train_samples_per_second': 42.165, 'train_steps_per_second': 5.274, 'total_flos': 5901642795110400.0, 'train_loss': 0.5525451578558448, 'epoch': 5.0})

In [15]:
# 7. Test on Sample

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

text = "I am ok"
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
outputs = model(**inputs)
predicted_label = outputs.logits.argmax().item()

print(outputs.logits)
print(outputs.logits.argmax().item())

print("Predicted Emotion:", id2label[predicted_label])

# 8. Save the Fine-Tuned Model
model.save_pretrained("../models/custom-emotion-model")
tokenizer.save_pretrained("../models/custom-emotion-model")

tensor([[-8.1562, -7.9531, -5.2109, -2.5488, -6.8477, -5.2969, -2.9062, -6.8711,
         -8.1797, -4.6914, -6.6523, -6.4180, -6.7383, -7.9219, -8.2422, -7.9297,
         -4.8008, -6.8867, -7.2812, -1.6240, -8.4766, -7.2773, -7.6602, -7.0117,
         -5.4414,  4.7617, -7.6367, -1.4316]], device='cuda:0',
       grad_fn=<ToCopyBackward0>)
25
Predicted Emotion: sadness


('../models/custom-emotion-model\\tokenizer_config.json',
 '../models/custom-emotion-model\\special_tokens_map.json',
 '../models/custom-emotion-model\\vocab.txt',
 '../models/custom-emotion-model\\added_tokens.json',
 '../models/custom-emotion-model\\tokenizer.json')

In [10]:
# Zero-Shot Classification Model Instance

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F

# Load model and tokenizer  
# model_name = "j-hartmann/emotion-english-distilroberta-base"
model_name = "SchuylerH/bert-multilingual-go-emtions"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model2 = AutoModelForSequenceClassification.from_pretrained(model_name)

# Move to GPU (if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model2.to(device)

# Run prediction on input text
text = "Sorry!"
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
outputs = model2(**inputs)

# Extract predicted label
logits = outputs.logits
probs = F.softmax(logits, dim=1)
predicted_class = torch.argmax(probs).item()

# Get hardcoded label names for "SchuylerH/bert-multilingual-go-emtions"
label_names = ['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 
               'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 
               'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 
               'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 
               'relief', 'remorse', 'sadness', 'surprise', 'neutral']

positive_emotions = {"admiration","amusement", "approval", "caring", "excitement", "gratitude", "joy", "love", "optimism", "pride", "relief"}
negative_emotions = {"anger", "disappointment", "disapproval", "disgust", "embarrassment", "fear", "grief", "remorse", "sadness"}
neutral_emotions = {"neutral", "confusion", "curiosity", "desire", "realization", "surprise", "nervousness"}

def map_emotion(label):
    if label_names[label] in positive_emotions:
        return "POSITIVE"
    elif label_names[label] in negative_emotions:
        return "NEGATIVE"
    else:
        return "NEUTRAL"

print("Predicted emotion:", label_names[predicted_class])
print("Predicted emotion:", map_emotion(predicted_class))

Predicted emotion: remorse
Predicted emotion: NEGATIVE
