In [None]:
import torch
import pandas as pd
from datasets import load_dataset, Dataset, Value
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score

In [None]:
# --- 1.1 Load Data and Get Labels ---
# GoEmotions, "simplified" configuration: 27 emotions plus "neutral".
dataset = load_dataset("go_emotions", "simplified")

# Label names are read from the dataset's own feature metadata, in id order.
emotion_labels = dataset["train"].features["labels"].feature.names
NUM_LABELS = len(emotion_labels)  # expected to be 28

# Lookup tables in both directions (id -> name and name -> id).
id2label = dict(enumerate(emotion_labels))
label2id = {name: idx for idx, name in id2label.items()}

print(f"Total number of labels: {NUM_LABELS}")
print(f"Example labels: {emotion_labels[:5]}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

simplified/train-00000-of-00001.parquet:   0%|          | 0.00/2.77M [00:00<?, ?B/s]

simplified/validation-00000-of-00001.par(…):   0%|          | 0.00/350k [00:00<?, ?B/s]

simplified/test-00000-of-00001.parquet:   0%|          | 0.00/347k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/43410 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5426 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5427 [00:00<?, ? examples/s]

Total number of labels: 28
Example labels: ['admiration', 'amusement', 'anger', 'annoyance', 'approval']


In [None]:
# --- 1.2 Function to Convert Labels to Multi-Hot Vector ---
def convert_labels_to_multihot(batch):
    """Encode one example's label ids as a multi-hot vector.

    Args:
        batch: Mapping with a "labels" entry holding the integer class ids
            that apply to the example. The name is historical: the ids are
            used directly as indices, so this is one example, not a batch.

    Returns:
        ``{"labels": vector}`` where ``vector`` is a Python list of
        ``NUM_LABELS`` floats, 1.0 at every listed id and 0.0 elsewhere.
    """
    # Explicit integer dtype keeps an empty id list usable as an index array.
    active_ids = np.asarray(batch["labels"], dtype=np.int64)

    multihot = np.zeros(NUM_LABELS, dtype=np.float32)
    multihot[active_ids] = 1.0  # repeated ids simply set the same slot again

    return {"labels": multihot.tolist()}

In [None]:
# --- 1.3 Apply Conversion to All Splits ---

# Calling map on the loaded dataset runs the conversion over the train,
# validation and test splits, one example at a time (no batched=True).
dataset_encoded = dataset.map(convert_labels_to_multihot)

# Sanity check on the first training example. Each 'labels' entry is now a
# NUM_LABELS-long 0/1 vector; the explicit float32 cast is applied later,
# after tokenization.
print("\nExample of Multi-Hot Label Vector (first sample):")
first_train_labels = dataset_encoded["train"][0]["labels"]
print(first_train_labels)

Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]


Example of Multi-Hot Label Vector (first sample):
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]


In [None]:
# --- 2.1 Load Tokenizer ---
model_checkpoint = "distilbert-base-uncased"
tokenizer = DistilBertTokenizerFast.from_pretrained(model_checkpoint)

# Upper bound on sequence length; 512 is the DistilBERT limit.
# Longer texts (e.g. long journal entries) are truncated to this length.
MAX_LENGTH = 512

# --- 2.2 Tokenization Function ---
def tokenize_and_prepare_input(examples):
    """Tokenize a batch of texts, truncating to MAX_LENGTH tokens.

    Padding is deliberately NOT applied here. Padding every example to
    MAX_LENGTH (512) makes each short GoEmotions comment as expensive as a
    512-token document. Because the Trainer is given the tokenizer, its
    default collator pads each batch only up to that batch's longest
    sequence, which is equivalent for the model and far cheaper.

    Args:
        examples: Batch mapping with a "text" entry (list of strings), as
            passed by ``map(..., batched=True)``.

    Returns:
        The tokenizer output ("input_ids", "attention_mask") with
        variable-length, unpadded sequences.
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=MAX_LENGTH,
    )

# --- 2.3 Apply Tokenization ---
# Tokenize every split in batches. Only the raw 'text' and 'id' columns are
# dropped; the multi-hot 'labels' column is kept alongside the new
# 'input_ids' and 'attention_mask' columns.
tokenized_datasets = dataset_encoded.map(
    tokenize_and_prepare_input,
    batched=True,
    remove_columns=['text', 'id']
)

# 1. Build the target schema: copy the current features and change only the
#    'labels' column to a sequence of float32 values.
new_features = tokenized_datasets['train'].features.copy()
# The multi-label (BCE-with-logits) loss expects float targets, so the 0/1
# label vectors must be stored as floats rather than integer class ids.
new_features['labels'] = [Value('float32')]

# 2. Cast all splits to the new schema (order matters: cast before formatting).
tokenized_datasets = tokenized_datasets.cast(new_features)

# Return the listed columns as PyTorch tensors when rows are accessed.
# Note: set_format modifies tokenized_datasets in place.
tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

# Prints the column names of the train split (not the full feature schema).
print("\nTokenized Dataset Features:")
print(tokenized_datasets["train"].column_names)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/43410 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/5426 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/5427 [00:00<?, ? examples/s]


Tokenized Dataset Features:
['labels', 'input_ids', 'attention_mask']


In [None]:
# --- 3.1 Load Pre-trained Model with Classification Head ---
# The classification head (pre_classifier + classifier) is not part of the
# checkpoint and is randomly initialised on load. The Trainer only seeds the
# RNG when it is constructed, which is after this point, so seed torch here
# to make the head initialisation (and therefore the run) reproducible.
SEED = 42  # same value as the TrainingArguments default seed
torch.manual_seed(SEED)

model = DistilBertForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=NUM_LABELS,
    id2label=id2label,
    label2id=label2id,
    # Crucial for multi-label classification: selects the per-label
    # sigmoid/BCE loss instead of softmax cross-entropy.
    problem_type="multi_label_classification"
)

# --- 3.2 Verify Output Head ---
# The final linear layer must emit one raw score (logit) per label.
print("\nModel Classifier Head Output Dimension Check:")
print(f"Expected: {NUM_LABELS}, Actual: {model.classifier.out_features}")

# Note on loss: with problem_type="multi_label_classification" the model's
# own forward pass computes torch.nn.BCEWithLogitsLoss whenever labels are
# supplied; the Trainer simply optimises the loss the model returns.

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Model Classifier Head Output Dimension Check:
Expected: 28, Actual: 28


In [None]:
# --- 4.1 Define Evaluation Metrics (Crucial for Multi-Label) ---
def _stable_sigmoid(logits):
    """Element-wise logistic function that cannot overflow.

    The textbook form ``1 / (1 + exp(-x))`` overflows in ``exp`` for very
    negative ``x``. Only ``exp(-|x|)`` is evaluated here, which is at most 1.
    """
    logits = np.asarray(logits, dtype=np.float64)
    decay = np.exp(-np.abs(logits))
    return np.where(logits >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))


def compute_metrics(eval_pred):
    """Compute multi-label metrics from raw model outputs.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` are the raw scores
            before the sigmoid; ``labels`` are the multi-hot targets with the
            same shape.

    Returns:
        Dict with:
            "micro_f1": micro-averaged F1 of the predictions thresholded at
                probability 0.5.
            "roc_auc": micro-averaged ROC AUC of the probabilities, or NaN
                when it is undefined (e.g. only one class present in the
                labels).
    """
    logits, labels = eval_pred

    # 1. Logits -> per-label probabilities.
    probabilities = _stable_sigmoid(logits)

    # 2. Hard 0/1 decisions at the usual 0.5 threshold.
    predictions = (probabilities > 0.5).astype(int)

    # 3. Micro-F1 pools all (sample, label) decisions. zero_division=0 keeps
    #    the value sklearn already reports in the degenerate case, without
    #    the warning.
    micro_f1 = f1_score(labels, predictions, average="micro", zero_division=0)

    # ROC AUC is threshold-free. When sklearn cannot compute it, report NaN
    # rather than 0.0, which would read as a real (worst-case) score.
    try:
        roc_auc = roc_auc_score(labels, probabilities, average="micro")
    except ValueError:
        roc_auc = float("nan")

    return {
        "micro_f1": micro_f1,
        "roc_auc": roc_auc,
    }
# --- 4.2 Define Training Arguments ---
training_args = TrainingArguments(
    output_dir="./results_goemotions",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=500,

    # Evaluate and checkpoint once per epoch. The two strategies must match
    # for load_best_model_at_end to work. ('eval_strategy' is the current
    # name of the former 'evaluation_strategy' argument.)
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Spell out the selection rule that was previously the implicit default:
    # keep the checkpoint with the lowest validation loss.
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to="none"
)

# --- 4.3 Initialize and Run Trainer ---
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

# Runs the full fine-tuning loop. Because load_best_model_at_end=True, the
# best checkpoint's weights are loaded back into `model` when it finishes.
trainer.train()
print("\nFine-tuning complete. Best checkpoint (lowest validation loss) loaded into the model.")

Epoch,Training Loss,Validation Loss,Micro F1,Roc Auc
1,0.0921,0.086935,0.556916,0.951217
2,0.0764,0.083049,0.573749,0.958177
3,0.059,0.086018,0.5877,0.95696



Trainer successfully initialized. Ready to fine-tune.


In [None]:
# --- Save the fine-tuned model and tokenizer ---
save_path = "./emotion_model"

# Model first, then tokenizer; both write their files into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_path)

print(f"\nModel and tokenizer saved to {save_path}")


Model and tokenizer saved to ./emotion_model


In [None]:
# --- 5. Inference on a sample text ---
# Assumes 'trainer.train()' has been run, so `model` holds the fine-tuned
# (best-checkpoint) weights.

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

sample_journal_entry = "I finished the project today! I'm so proud of myself and excited for what's next, but I'm also relieved the stress is over."

# --- 5.1 Tokenize Input ---
# A single sequence has nothing to be padded against, so only truncate.
# Padding it to MAX_LENGTH would make the model process 512 positions for a
# one-sentence input.
test_input = tokenizer(
    sample_journal_entry,
    truncation=True,
    max_length=MAX_LENGTH,
    return_tensors="pt"
).to(device)

# --- 5.2 Generate Prediction (Logits) ---
model.eval()  # evaluation mode: disables dropout
with torch.no_grad():
    outputs = model(**test_input)
    # One raw score (logit) per emotion label, shape (1, NUM_LABELS).
    logits = outputs.logits

# --- 5.3 Convert Logits to Probabilities (Percentages) ---
# Sigmoid maps each logit independently to [0, 1]. The labels are not
# mutually exclusive, so the values need not sum to 1.
probabilities = torch.sigmoid(logits).squeeze().cpu().numpy()

# --- 5.4 Map Percentages to Emotion Labels ---
# Probability (e.g. 0.95) -> percentage (e.g. 95.00), keyed by emotion name.
results = {emotion: probabilities[i] * 100 for i, emotion in id2label.items()}

# Sort by percentage, highest first, and print every emotion.
sorted_results = sorted(results.items(), key=lambda item: item[1], reverse=True)

print("\n--- Emotion Detection Results (Percentages) ---")
for emotion, percent in sorted_results:
    print(f"**{emotion.capitalize()}**: {percent:.2f}%")

print("\n--- Full Multi-Label Prediction Vector ---")
print(f"Probabilities (shape {probabilities.shape}):\n{probabilities}")


--- Emotion Detection Results (Percentages) ---
**Joy**: 68.86%
**Admiration**: 24.68%
**Gratitude**: 21.68%
**Excitement**: 21.11%
**Pride**: 12.34%
**Relief**: 5.98%
**Approval**: 3.54%
**Caring**: 2.23%
**Optimism**: 2.23%
**Realization**: 1.95%
**Surprise**: 1.70%
**Annoyance**: 1.44%
**Desire**: 1.42%
**Neutral**: 1.20%
**Disappointment**: 1.16%
**Amusement**: 0.95%
**Love**: 0.87%
**Remorse**: 0.69%
**Embarrassment**: 0.60%
**Sadness**: 0.48%
**Disapproval**: 0.48%
**Anger**: 0.47%
**Grief**: 0.44%
**Curiosity**: 0.43%
**Disgust**: 0.32%
**Confusion**: 0.17%
**Nervousness**: 0.15%
**Fear**: 0.10%

--- Full Multi-Label Prediction Vector ---
Probabilities (shape (28,)):
[0.24680649 0.00950815 0.00472121 0.01435317 0.03540361 0.02234126
 0.00172118 0.00425803 0.01420947 0.01156068 0.00475852 0.0031855
 0.00601468 0.21113291 0.0009799  0.21681397 0.00440199 0.6885843
 0.00868725 0.00150388 0.02232631 0.12335347 0.01953496 0.05979723
 0.00692437 0.00482019 0.01701226 0.01202266]


In [None]:
# Shell command: recursively archive the ./emotion_model directory into emotion_model.zip.
!zip -r emotion_model.zip ./emotion_model

  adding: emotion_model/ (stored 0%)
  adding: emotion_model/tokenizer.json (deflated 71%)
  adding: emotion_model/config.json (deflated 60%)
  adding: emotion_model/vocab.txt (deflated 53%)
  adding: emotion_model/model.safetensors (deflated 8%)
  adding: emotion_model/tokenizer_config.json (deflated 75%)
  adding: emotion_model/special_tokens_map.json (deflated 42%)
