<a href="https://colab.research.google.com/github/Hassan7838/text-based-emotion-recognition/blob/main/text-based-emotion-recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Text-Based Emotion Recognition Using NLP and Deep Learning

# Essential Libraries
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

# PyTorch and Hugging Face Transformers
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Set random seeds for reproducibility
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)

from google.colab import files

# Opens Colab's upload widget; the result is keyed by uploaded file name.
uploaded = files.upload()

# --- Data Loading and Preprocessing ---
# Load and Explore Dataset
# The file is ';'-separated with no header row: column 0 is the sentence,
# column 1 is the emotion label.
csv_options = dict(sep=";", header=None, names=["text", "emotion"])
try:
    df = pd.read_csv('Emotions.csv', **csv_options)
except FileNotFoundError:
    # The upload may have been saved under another name (e.g. 'EmotionsGo.csv').
    # Fall back to the first uploaded file instead of crashing; re-raise when
    # nothing was uploaded at all, so the original error is preserved.
    if not uploaded:
        raise
    df = pd.read_csv(next(iter(uploaded)), **csv_options)
print("Dataset Shape:", df.shape)
print("\nOriginal Label Distribution:\n", df['emotion'].value_counts(), sep="")

# FIX: Clean the emotion labels by removing tabs and extra whitespace.
# `regex` is passed explicitly on every call because its default differs
# between pandas versions, which changes what '\\t' matches.
df['emotion'] = df['emotion'].str.strip()  # Remove leading/trailing whitespace
df['emotion'] = df['emotion'].str.replace('\t', '', regex=False)  # Remove tab characters
df['emotion'] = df['emotion'].str.replace('\\t', '', regex=False)  # Remove literal backslash-t sequences
df['emotion'] = df['emotion'].str.replace(' +', ' ', regex=True)  # Replace multiple spaces with single space

print("\nCleaned Label Distribution:\n", df['emotion'].value_counts(), sep="")

# Simple Text Cleaning
def clean_text(text):
    """Normalise a raw sentence before tokenization.

    Steps, in order: lowercase, drop URLs (anything starting with ``http``
    or ``www`` up to the next whitespace), drop ``@mentions``, then drop
    every character that is not an ASCII letter or whitespace. Whitespace is
    left as-is, so removed tokens may leave consecutive spaces behind.

    Args:
        text: The raw text. ``None`` and float NaN (what pandas yields for an
            empty CSV field) are treated as empty text; any other non-string
            value is converted with ``str()`` first.

    Returns:
        The cleaned, lowercase string (possibly empty).
    """
    # `text != text` is only true for NaN; guard it so a missing cell does
    # not raise AttributeError on `.lower()`.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()
    # `http\S+` already covers https links, so no separate alternative is needed.
    text = re.sub(r"http\S+|www\S+", '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text

# Run every raw sentence through clean_text and keep the result in its own column.
df['cleaned_text'] = df['text'].map(clean_text)

# Encode Labels
# Fit the encoder on the cleaned emotion names, then turn them into integer ids.
label_encoder = LabelEncoder().fit(df['emotion'])
df['encoded_label'] = label_encoder.transform(df['emotion'])
num_classes = len(label_encoder.classes_)
class_index_pairs = [(name, idx) for idx, name in enumerate(label_encoder.classes_)]
print(f"\nEncoded Classes: {class_index_pairs}")

# Train-Test Split
# 70/30 split, stratified on the encoded label, with a fixed seed.
split_options = dict(test_size=0.3, random_state=42, stratify=df['encoded_label'])
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_text'],
    df['encoded_label'],
    **split_options,
)

# --- DistilBERT Model Setup ---
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# FIX: Add padding token if it doesn't exist
if tokenizer.pad_token is None:
    # Reuse the EOS token for padding when the tokenizer defines no pad token.
    tokenizer.pad_token = tokenizer.eos_token
    print(f"Set pad_token to: {tokenizer.pad_token}")

# Tokenize the datasets
def tokenize_data(texts, labels, max_length=128):
    """Tokenize texts and bundle them with their labels as PyTorch tensors.

    Uses the module-level ``tokenizer``.

    Args:
        texts: Iterable of strings to tokenize (materialised with ``list``).
        labels: Integer class ids. Either an object exposing ``.values``
            (e.g. a pandas Series) or a plain sequence.
        max_length: Maximum token length; longer inputs are truncated.

    Returns:
        A dict with ``'input_ids'``, ``'attention_mask'`` and ``'labels'``
        tensors. ``'labels'`` is always ``torch.long``.
    """
    encodings = tokenizer(
        list(texts),
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors="pt"
    )
    label_values = labels.values if hasattr(labels, 'values') else labels
    return {
        'input_ids': encodings['input_ids'],
        'attention_mask': encodings['attention_mask'],
        # Force int64: the classification loss expects long class indices, and
        # the encoded labels can arrive as int32 on some platforms.
        'labels': torch.tensor(label_values, dtype=torch.long)
    }

# Tokenize each split separately.
train_encodings = tokenize_data(X_train, y_train)
test_encodings = tokenize_data(X_test, y_test)

# Create PyTorch datasets
# Tensor order inside each dataset item: input ids, attention mask, label.
tensor_keys = ('input_ids', 'attention_mask', 'labels')
train_dataset = TensorDataset(*(train_encodings[key] for key in tensor_keys))
test_dataset = TensorDataset(*(test_encodings[key] for key in tensor_keys))

# Create DataLoaders
# Only the training data is shuffled; the test order stays fixed.
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Load pre-trained DistilBERT
# Prefer the GPU when one is visible to PyTorch, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"\nUsing device: {device}")

# Attach a classification head sized to the label set and record both
# directions of the id <-> emotion-name mapping in the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_classes,
    id2label=dict(enumerate(label_encoder.classes_)),
    label2id={name: idx for idx, name in enumerate(label_encoder.classes_)},
)
model = model.to(device)

# FIX: Update model's padding token ID to match tokenizer
if model.config.pad_token_id is None:
    model.config.pad_token_id = tokenizer.pad_token_id

# Training setup
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

# Training loop with progress tracking
# Fine-tunes `model` on `train_loader` for 3 epochs, printing the current batch
# loss and the running accuracy every 10 batches, and a summary after each epoch.
print("\n--- Training DistilBERT ---")
model.train()
for epoch in range(3):
    # Per-epoch accumulators; reset at the start of every epoch.
    total_loss = 0
    correct_predictions = 0
    total_predictions = 0

    for batch_idx, (input_ids, attention_mask, labels) in enumerate(train_loader):
        # Clear gradients left over from the previous step before the new backward pass.
        optimizer.zero_grad()

        # Move the batch to the same device as the model.
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        # Labels are passed in so the loss can be read straight off the output.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Calculate accuracy
        # Counted on the training batches themselves, accumulated across the epoch.
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=1)
        correct_predictions += (predictions == labels).sum().item()
        total_predictions += labels.size(0)

        # Backpropagate, clip the global gradient norm to 1.0, then update the weights.
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        if batch_idx % 10 == 0:
            # `accuracy` is the running accuracy over all batches seen so far in this
            # epoch, while the printed loss is for the current batch only.
            accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
            print(f'Epoch {epoch+1}, Batch {batch_idx}, Loss: {loss.item():.4f}, Acc: {accuracy:.3f}')

    # Epoch summary: mean of the per-batch losses and accuracy over every sample seen.
    avg_loss = total_loss / len(train_loader)
    epoch_accuracy = correct_predictions / total_predictions
    print(f'Epoch {epoch+1} completed. Avg Loss: {avg_loss:.4f}, Acc: {epoch_accuracy:.3f}')

# --- Evaluation ---
print("\n--- Evaluating on Test Set ---")
model.eval()
all_predictions, all_labels = [], []

# Inference only: no gradients are tracked while walking the test batches.
with torch.no_grad():
    for input_ids, attention_mask, labels in test_loader:
        outputs = model(
            input_ids.to(device),
            attention_mask=attention_mask.to(device),
        )
        # The predicted class is the index of the largest logit in each row.
        predictions = outputs.logits.argmax(dim=1)

        all_predictions.extend(predictions.cpu().numpy())
        all_labels.extend(labels.numpy())

class_names = label_encoder.classes_

print("Classification Report:")
print(classification_report(all_labels, all_predictions, target_names=class_names))

# Confusion Matrix
# Rows are true labels, columns are predicted labels; cells show raw counts.
cm = confusion_matrix(all_labels, all_predictions)
plt.figure(figsize=(10, 8))
heatmap_options = dict(
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
sns.heatmap(cm, **heatmap_options)
plt.title('Confusion Matrix - DistilBERT')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.xticks(rotation=45)
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

# --- Prediction Function ---
def predict_emotion(text):
    """
    Classify the emotion of a single raw sentence with the fine-tuned model.

    The text is cleaned with clean_text, tokenized (truncated to 128 tokens)
    and scored by the module-level model.

    Returns a ``(label, confidence)`` tuple. ``confidence`` is the highest
    softmax probability. ``label`` is the matching emotion name, unless the
    confidence is below 0.3, in which case it is an "Unclear/Mixed (...)"
    string listing the two most probable emotions and their probabilities.
    """
    model.eval()

    # Clean first, then tokenize the single sentence into a batch of one.
    encoded = tokenizer(
        clean_text(text),
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128,
    )

    with torch.no_grad():
        logits = model(
            encoded['input_ids'].to(device),
            attention_mask=encoded['attention_mask'].to(device),
        ).logits
        probabilities = torch.softmax(logits, dim=-1)
        best_prob, best_index = probabilities.max(dim=-1)

    confidence = best_prob.item()
    class_id = best_index.item()
    predicted_emotion = label_encoder.inverse_transform([class_id])[0]

    # Adaptive confidence threshold
    if confidence < 0.3:
        # Low confidence: report the two strongest candidates instead of one label.
        top_probs, top_indices = torch.topk(probabilities[0], 2)
        top_emotions = label_encoder.inverse_transform(top_indices.cpu().numpy())
        mixed_label = f"Unclear/Mixed ({top_emotions[0]}: {top_probs[0].item():.2f}, {top_emotions[1]}: {top_probs[1].item():.2f})"
        return mixed_label, confidence

    return predicted_emotion, confidence

# --- CLI Interface ---
# Prints a banner, runs a few fixed sample sentences through predict_emotion,
# then reads sentences interactively until the user types 'quit'.
print("\n" + "="*50)
print("Text Emotion Recognition CLI (DistilBERT)")
print("Type 'quit' to exit the program.")
print("="*50)

# Test with some examples first
test_sentences = [
    "I am so happy today!",
    "This is terrible and I'm angry",
    "I feel scared and worried",
    "I love this so much"
]

print("\nTesting with sample sentences:")
for sentence in test_sentences:
    emotion, confidence = predict_emotion(sentence)
    print(f"'{sentence}' -> {emotion} (conf: {confidence:.3f})")

print("\nNow you can try your own sentences:")
while True:
    try:
        user_input = input("\nPlease enter a sentence to analyze its emotion: ").strip()
    except EOFError:
        # stdin was closed or exhausted (piped input, Ctrl-D): leave the loop
        # the same way 'quit' does instead of dying with a traceback.
        print("Goodbye!")
        break

    if user_input.lower() == 'quit':
        print("Goodbye!")
        break
    if not user_input:
        # Ignore blank lines and prompt again.
        continue

    emotion, confidence = predict_emotion(user_input)
    print(f"\nPredicted Emotion: {emotion}")
    print(f"Confidence: {confidence:.4f}")

Saving EmotionsGo.csv to EmotionsGo.csv
Dataset Shape: (5951, 2)

Original Label Distribution:
emotion
joy             1040
sadness          817
anger            419
love\t\t         416
fear\t\t         401
sadness\t\t      399
anger\t\t        396
fear             322
surprise\t\t     318
joy\t\t          313
love             266
joy\t            165
surprise\t       165
surprise         126
anger\t          104
sadness\t        101
fear\t            99
love\t            84
Name: count, dtype: int64

Cleaned Label Distribution:
emotion
joy         1518
sadness     1317
anger        919
fear         822
love         766
surprise     609
Name: count, dtype: int64

Encoded Classes: [('anger', 0), ('fear', 1), ('joy', 2), ('love', 3), ('sadness', 4), ('surprise', 5)]


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Using device: cpu

--- Training DistilBERT ---
Epoch 1, Batch 0, Loss: 1.8008, Acc: 0.000
Epoch 1, Batch 10, Loss: 1.7198, Acc: 0.233
Epoch 1, Batch 20, Loss: 1.7415, Acc: 0.244
Epoch 1, Batch 30, Loss: 1.6487, Acc: 0.264
Epoch 1, Batch 40, Loss: 1.3819, Acc: 0.297
Epoch 1, Batch 50, Loss: 1.2755, Acc: 0.343
Epoch 1, Batch 60, Loss: 1.1366, Acc: 0.399
Epoch 1, Batch 70, Loss: 1.0918, Acc: 0.444
Epoch 1, Batch 80, Loss: 0.9488, Acc: 0.477
Epoch 1, Batch 90, Loss: 0.9684, Acc: 0.508
Epoch 1, Batch 100, Loss: 0.7751, Acc: 0.541
Epoch 1, Batch 110, Loss: 0.9890, Acc: 0.567
Epoch 1, Batch 120, Loss: 0.8656, Acc: 0.587
Epoch 1, Batch 130, Loss: 0.7587, Acc: 0.606
Epoch 1, Batch 140, Loss: 0.2026, Acc: 0.624
Epoch 1, Batch 150, Loss: 0.5000, Acc: 0.639
Epoch 1, Batch 160, Loss: 0.4631, Acc: 0.656
Epoch 1, Batch 170, Loss: 0.3066, Acc: 0.668
Epoch 1, Batch 180, Loss: 0.3794, Acc: 0.681
Epoch 1, Batch 190, Loss: 0.2133, Acc: 0.690
Epoch 1, Batch 200, Loss: 0.5219, Acc: 0.700
Epoch 1, Batch 210

KeyboardInterrupt: 