<a href="https://colab.research.google.com/github/MrAkash03/Emotion-Detection/blob/main/Emotion_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# =========================
# 1. SETUP ENVIRONMENT
# =========================

# Install required libraries
!pip install transformers datasets scikit-learn torch torchvision torchaudio transformers -q

In [None]:
# Import libraries
import pandas as pd
import numpy as np
from datasets import load_dataset
from transformers import BertTokenizer, TFBertModel
from sklearn.preprocessing import LabelEncoder
import re
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, get_linear_schedule_with_warmup
from torch.optim import AdamW
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm

In [None]:
# =========================
# 2. LOAD DATASET
# =========================
dataset = load_dataset("go_emotions")

train_data = dataset['train']
val_data = dataset['validation']
test_data = dataset['test']

print("Training size:", len(train_data))
print("Validation size:", len(val_data))
print("Test size:", len(test_data))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

simplified/train-00000-of-00001.parquet:   0%|          | 0.00/2.77M [00:00<?, ?B/s]

simplified/validation-00000-of-00001.par(…):   0%|          | 0.00/350k [00:00<?, ?B/s]

simplified/test-00000-of-00001.parquet:   0%|          | 0.00/347k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/43410 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5426 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5427 [00:00<?, ? examples/s]

Training size: 43410
Validation size: 5426
Test size: 5427


In [None]:
# =========================
# 3. DATA PREPROCESSING
# =========================

# 3.1 Convert Hugging Face dataset to pandas DataFrame
def dataset_to_df(ds):
    texts = ds['text']
    labels = ds['labels']  # 27 emotion indices
    return pd.DataFrame({'text': texts, 'labels': labels})

train_df = dataset_to_df(train_data)
val_df = dataset_to_df(val_data)
test_df = dataset_to_df(test_data)

# 3.2 Clean text function
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+", "", text)             # Remove URLs
    text = re.sub(r"@\w+", "", text)                # Remove mentions
    text = re.sub(r"[^a-zA-Z\s]", "", text)        # Remove special characters and numbers
    text = re.sub(r"\s+", " ", text).strip()       # Remove extra spaces
    return text

train_df['text'] = train_df['text'].apply(clean_text)
val_df['text'] = val_df['text'].apply(clean_text)
test_df['text'] = test_df['text'].apply(clean_text)

# 3.3 Map 27 GoEmotions labels → 10 emotions + neutral
# Example mapping (can be adjusted):
label_mapping = {
    0: 'admiration',      1: 'amusement',      2: 'anger',
    3: 'annoyance',       4: 'approval',       5: 'caring',
    6: 'confusion',       7: 'curiosity',      8: 'desire',
    9: 'disappointment',  10: 'disapproval',   11: 'disgust',
    12: 'embarrassment',  13: 'excitement',    14: 'fear',
    15: 'gratitude',      16: 'grief',         17: 'joy',
    18: 'love',           19: 'nervousness',   20: 'optimism',
    21: 'pride',          22: 'realization',   23: 'relief',
    24: 'remorse',        25: 'sadness',       26: 'surprise',
    27: 'neutral' # Added mapping for label 27
}

# Mapping to 10 emotions + neutral
emotion_map = {
    'anger': 'anger',
    'annoyance': 'anger',
    'disgust': 'disgust',
    'fear': 'fear',
    'joy': 'joy',
    'love': 'love',
    'optimism': 'optimism',
    'disappointment': 'pessimism',
    'disapproval': 'pessimism',
    'remorse': 'pessimism',
    'sadness': 'sadness',
    'surprise': 'surprise',
    # Everything else → neutral
}

def map_emotions(label_indices):
    mapped_labels = []
    for idx in label_indices:
        emo = label_mapping[idx]
        mapped_labels.append(emotion_map.get(emo, 'neutral'))
    # If multiple labels, pick the first (or you can pick random or majority)
    return mapped_labels[0]

train_df['emotion'] = train_df['labels'].apply(map_emotions)
val_df['emotion'] = val_df['labels'].apply(map_emotions)
test_df['emotion'] = test_df['labels'].apply(map_emotions)

# 3.4 Encode labels (string → integers)
le = LabelEncoder()
train_df['label'] = le.fit_transform(train_df['emotion'])
val_df['label'] = le.transform(val_df['emotion'])
test_df['label'] = le.transform(test_df['emotion'])

print("Emotion classes:", le.classes_)


Emotion classes: ['anger' 'disgust' 'fear' 'joy' 'love' 'neutral' 'optimism' 'pessimism'
 'sadness' 'surprise']


In [None]:
# =========================
# 3. MODEL PREPARATION (PyTorch)
# =========================

# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

MAX_LEN = 64

# =========================
# Tokenization Function
# =========================
def tokenize_texts(texts):
    """
    Tokenize text list and return input tensors for BERT.
    """
    return tokenizer(
        list(texts),
        max_length=MAX_LEN,
        padding='max_length',
        truncation=True,
        return_tensors='pt',           # Return PyTorch tensors
        return_token_type_ids=True     # Include segment IDs
    )

# Example tokenization
example = train_df['text'].iloc[0]
encoded_example = tokenizer(example, max_length=64, padding='max_length', truncation=True, return_tensors='pt')
print("Tokenized example:")
print(encoded_example)

# =========================
# BERT Classification Model
# =========================
class EmotionClassifier(nn.Module):
    def __init__(self, n_classes):
        super(EmotionClassifier, self).__init__()
        self.bert = bert_model
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask, token_type_ids):
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        pooled_output = outputs.pooler_output   # [batch_size, hidden_size]
        output = self.drop(pooled_output)
        return self.out(output)

# Number of emotion classes (based on preprocessed data)
n_classes = len(le.classes_)
print("Number of emotion classes:", n_classes)

# Initialize model
model = EmotionClassifier(n_classes=n_classes)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

print(f"Model loaded on device: {device}")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Tokenized example:
{'input_ids': tensor([[ 101, 2026, 8837, 2833, 2003, 2505, 1045, 2134, 2102, 2031, 2000, 5660,
         2870,  102,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}
Number of emotion classes: 10
Model loaded on 

In [None]:
# ==========================================
# 5. MODEL TRAINING & EVALUATION (Simple Ver.)
# ==========================================

# ------------------------------------------------
# 5.1 Custom Dataset
# ------------------------------------------------
class EmotionDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# ------------------------------------------------
# 5.2 Tokenize Texts
# ------------------------------------------------
def tokenize_data(texts):
    return tokenizer(
        list(texts),
        max_length=MAX_LEN,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

train_encodings = tokenize_data(train_df['text'].tolist())
val_encodings   = tokenize_data(val_df['text'].tolist())
test_encodings  = tokenize_data(test_df['text'].tolist())

train_dataset = EmotionDataset(train_encodings, train_df['label'].tolist())
val_dataset   = EmotionDataset(val_encodings, val_df['label'].tolist())
test_dataset  = EmotionDataset(test_encodings, test_df['label'].tolist())

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=16)
test_loader  = DataLoader(test_dataset, batch_size=16)

# ------------------------------------------------
# 5.3 Model Definition
# ------------------------------------------------
class BertEmotionClassifier(nn.Module):
    def __init__(self, n_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        return self.fc(self.dropout(pooled_output))

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BertEmotionClassifier(n_classes).to(device)

# ------------------------------------------------
# 5.4 Optimizer, Loss, Scheduler
# ------------------------------------------------
optimizer = AdamW(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()
epochs = 5  # keep small for quick runs
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# ------------------------------------------------
# 5.5 Training Loop
# ------------------------------------------------
def train_model(model, data_loader):
    model.train()
    total_loss, total_correct = 0, 0

    for batch in tqdm(data_loader, desc="Training"):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)

        _, preds = torch.max(outputs, dim=1)
        total_correct += torch.sum(preds == labels)
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        scheduler.step()

    return total_loss / len(data_loader), total_correct.double() / len(data_loader.dataset)

# ------------------------------------------------
# 5.6 Evaluation Loop
# ------------------------------------------------
def evaluate(model, data_loader):
    model.eval()
    preds, labels_all = [], []
    total_loss = 0

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            total_loss += loss.item()

            _, pred = torch.max(outputs, dim=1)
            preds.extend(pred.cpu().numpy())
            labels_all.extend(labels.cpu().numpy())

    acc = accuracy_score(labels_all, preds)
    return total_loss / len(data_loader), acc, preds, labels_all

# ------------------------------------------------
# 5.7 Train + Evaluate
# ------------------------------------------------
for epoch in range(epochs):
    print(f"\nEpoch {epoch+1}/{epochs}")
    train_loss, train_acc = train_model(model, train_loader)
    print(f"Train loss: {train_loss:.4f}, Train acc: {train_acc:.4f}")

    val_loss, val_acc, _, _ = evaluate(model, val_loader)
    print(f"Val loss: {val_loss:.4f}, Val acc: {val_acc:.4f}")

# ------------------------------------------------
# 5.8 Final Test Evaluation
# ------------------------------------------------
test_loss, test_acc, preds, labels_all = evaluate(model, test_loader)
print("\n===== Test Results =====")
print(f"Test loss: {test_loss:.4f}, Test accuracy: {test_acc:.4f}")

print("\nClassification Report:")
print(classification_report(labels_all, preds, target_names=le.classes_))



Epoch 1/5


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Training: 100%|██████████| 2714/2714 [08:13<00:00,  5.50it/s]


Train loss: 0.7946, Train acc: 0.7452


Evaluating: 100%|██████████| 340/340 [00:18<00:00, 18.74it/s]


Val loss: 0.6631, Val acc: 0.7682

Epoch 2/5


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Training: 100%|██████████| 2714/2714 [08:14<00:00,  5.49it/s]


Train loss: 0.5754, Train acc: 0.7973


Evaluating: 100%|██████████| 340/340 [00:18<00:00, 18.66it/s]


Val loss: 0.6596, Val acc: 0.7691

Epoch 3/5


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Training: 100%|██████████| 2714/2714 [08:14<00:00,  5.49it/s]


Train loss: 0.4219, Train acc: 0.8526


Evaluating: 100%|██████████| 340/340 [00:18<00:00, 18.72it/s]


Val loss: 0.7329, Val acc: 0.7578

Epoch 4/5


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Training: 100%|██████████| 2714/2714 [08:14<00:00,  5.49it/s]


Train loss: 0.2785, Train acc: 0.9070


Evaluating: 100%|██████████| 340/340 [00:18<00:00, 18.71it/s]


Val loss: 0.8689, Val acc: 0.7540

Epoch 5/5


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Training: 100%|██████████| 2714/2714 [08:13<00:00,  5.50it/s]


Train loss: 0.1874, Train acc: 0.9399


Evaluating: 100%|██████████| 340/340 [00:18<00:00, 18.79it/s]


Val loss: 0.9636, Val acc: 0.7431


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Evaluating: 100%|██████████| 340/340 [00:18<00:00, 18.81it/s]


===== Test Results =====
Test loss: 0.9713, Test accuracy: 0.7441

Classification Report:
              precision    recall  f1-score   support

       anger       0.51      0.50      0.51       483
     disgust       0.47      0.43      0.45        84
        fear       0.66      0.69      0.68        74
         joy       0.52      0.41      0.45       116
        love       0.67      0.77      0.72       169
     neutral       0.85      0.85      0.85      3788
    optimism       0.51      0.49      0.50       120
   pessimism       0.41      0.42      0.41       393
     sadness       0.49      0.46      0.48       108
    surprise       0.42      0.39      0.41        92

    accuracy                           0.74      5427
   macro avg       0.55      0.54      0.55      5427
weighted avg       0.74      0.74      0.74      5427






In [None]:
# ==========================================
# 6. INFERENCE / PREDICTION (Terminal Version)
# ==========================================
import torch
import torch.nn.functional as F

# Load the trained model if not already in memory
# model.load_state_dict(torch.load("best_model_state.bin"))
model.eval()  # Set model to evaluation mode

def predict_emotion(text, model, tokenizer, max_len=64):
    """
    Predict emotion for a single text input.

    Returns:
        predicted_label: integer
        predicted_emotion: string
        probabilities: softmax probabilities for all classes
    """
    # Tokenize text
    encoding = tokenizer(
        text,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # Forward pass
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        probs = F.softmax(outputs, dim=1)
        predicted_label = torch.argmax(probs, dim=1).item()

    predicted_emotion = le.classes_[predicted_label]  # Map integer back to emotion string
    return predicted_label, predicted_emotion, probs.cpu().numpy()

# ------------------------------------------------
# 6.1 Terminal Interactive Loop
# ------------------------------------------------

# Add this near the top of your script
emotion_emojis = {
    "anger": "😡",
    "disgust": "🤢",
    "fear": "😨",
    "joy": "😄",
    "love": "❤️",
    "optimism": "😊",
    "pessimism": "😞",
    "sadness": "😢",
    "surprise": "😲",
    "neutral": "😐"
}

def main():
    print("==== Terminal Emotion Detection ====")
    print("Type 'exit' or 'quit' to stop.\n")

    while True:
        text = input("Enter a sentence: ")
        if text.lower() in ['exit', 'quit']:
            print("Exiting... Goodbye!")
            break

        label, emotion, probs = predict_emotion(text, model, tokenizer)
        emoji = emotion_emojis.get(emotion, "")
        print(f"Predicted Emotion: {emotion} {emoji}")
        print(f"Probabilities: {probs}\n")

# ------------------------------------------------
# 6.2 Run interactive loop
# ------------------------------------------------
if __name__ == "__main__":
    main()

==== Terminal Emotion Detection ====
Type 'exit' or 'quit' to stop.

Enter a sentence: I love you
Predicted Emotion: love ❤️
Probabilities: [[4.3229343e-04 3.2940667e-04 6.6256733e-04 4.0809941e-03 9.8865461e-01
  4.5787133e-03 5.0043699e-04 2.5193096e-04 2.7263674e-04 2.3642225e-04]]

Enter a sentence: I am very angry today
Predicted Emotion: anger 😡
Probabilities: [[9.8692352e-01 2.8888348e-03 6.9495646e-04 2.3977448e-04 3.5147980e-04
  5.6186514e-03 6.2625523e-04 1.3296976e-03 1.0852690e-03 2.4162579e-04]]

Enter a sentence: i can't seem to cry
Predicted Emotion: sadness 😢
Probabilities: [[2.7481732e-03 6.6262501e-04 2.9559655e-03 1.8345161e-03 4.4294787e-03
  2.7633328e-02 9.9050731e-04 1.5273359e-02 9.4274116e-01 7.3083123e-04]]

Enter a sentence: hello there, how are you
Predicted Emotion: neutral 😐
Probabilities: [[9.5209318e-05 3.3591645e-05 1.6049018e-04 9.9005573e-04 8.1964146e-04
  9.9586594e-01 6.4895517e-04 1.9757730e-04 1.7526813e-04 1.0132660e-03]]

Enter a sentence: qui

In [None]:
# =========================
# 7. SAVE MODEL
# =========================

# Define the path to save the model state dictionary
model_save_path = "emotion_classifier_model.pth"

# Save the model state dictionary
torch.save(model.state_dict(), model_save_path)

# Optionally, save the tokenizer too
tokenizer.save_pretrained("bert_tokenizer")

print(f"Model state dictionary saved to {model_save_path}")

Model state dictionary saved to emotion_classifier_model.pth
