### Sentiment Analysis Neural Network
##### Multi-class sentiment analysis NN using DistilBERT

In [131]:
# Import all required dependencies
import torch
import torch.nn as nn
from transformers import DistilBertTokenizer, DistilBertModel
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset

# Importing (TODO) for metrics evaluation
from sklearn.metrics import f1_score, classification_report
import numpy as np
import pandas as pd
from tqdm import tqdm

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'mps' if torch.mps.is_available() else 'cpu')
device

device(type='mps')

In [132]:
# Multi-emotion classification
class MultiLabelSentimentClassifier(nn.Module):
    def __init__(self, model_name, num_labels):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained(model_name)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        logits = self.classifier(outputs.last_hidden_state[:, 0, :])
        return logits

In [133]:
# Creating a sentiment dataset using DistilBERT
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    def __len__(self):
        return len(self.texts)
    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

## Training Loop

In [None]:
# Example: texts, labels = [...], [...]
# labels should be one-hot encoded, e.g. [0,0,1,0,0]
# tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
# dataset = SentimentDataset(texts, labels, tokenizer, max_length=128)
# loader = DataLoader(dataset, batch_size=16, shuffle=True)

def train(model, loader, optimizer, criterion, device, epochs=3):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in tqdm(loader, desc=f"Epoch {epoch+1}", colour="blue"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            optimizer.zero_grad()
            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(loader):.4f}")

## Inference

In [30]:
def predict(model, tokenizer, sentence, device, threshold=0.5):
    model.eval()
    with torch.no_grad():
        encoding = tokenizer(
            sentence,
            truncation=True,
            padding='max_length',
            max_length=128,
            return_tensors='pt'
        )
        input_ids = encoding['input_ids'].to(device)
        attention_mask = encoding['attention_mask'].to(device)
        logits = model(input_ids, attention_mask)
        probs = torch.sigmoid(logits).cpu().numpy()[0]
        return (probs >= threshold).astype(int), probs

#### DATASET INFO
Dataset from Kaggle - [*Emotion Dataset for Emotion Recognition Tasks*](https://www.kaggle.com/datasets/parulpandey/emotion-dataset?select=training.csv)

**Parameters**  
Columns - Text | Label  
Size = 16000

In [None]:
# Defining Emotions
emotions = ["sadness", "joy", "love", "anger", "fear", "UNK"]
num_labels = len(emotions)

In [None]:
# Function to encode integer labels in dataset to one-hot
# encoded labels for training/inferencing
def labels_to_multihot(label, num_classes):
    vec = [0] * num_classes
    vec[int(label)] = 1
    return vec

df = pd.read_csv("../data-prep/Base_data/training.csv")

# Extract 7680 data samples (0.48*16000) for training data set
# This creates exactly 60 batches with batch_size=128
train_df = df.sample(frac=0.48, random_state=42).reset_index(drop=True)

In [None]:
# Convert .csv into texts/labels for
texts = train_df['text'].tolist()
labels = [labels_to_multihot(l, num_labels) for l in df["label"]]

print(f"Number of training samples: {len(texts)}")
idx = 5000
print("Sample text:", texts[idx])
print("Sample  label (one-hot):", labels[idx])

Number of training samples: 7680
Sample text: i feel furious at myself for being so pathetic furious at her for various reasons
Sample  label (one-hot): [0, 0, 0, 1, 0, 0]


### Hyperparameters

In [None]:
# Batch size
batch_size = 128
# Threshold for final inferencing per logit
prob_threshold = 0.5

### Initialization

In [None]:
# Initialize tokenizer, dataset, dataloader
model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
dataset = SentimentDataset(texts, labels, tokenizer, max_length=32)
loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Initialize model, optimizer, loss
model = MultiLabelSentimentClassifier(model_name, num_labels).to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)
criterion = nn.BCEWithLogitsLoss()


### Training

In [130]:
# Train for 'n' epochs
num_epochs = 10
train(model, loader, optimizer, criterion, device, epochs=num_epochs)

Epoch 1: 100%|[34m██████████[0m| 60/60 [01:12<00:00,  1.21s/it]
Epoch 1: 100%|[34m██████████[0m| 60/60 [01:12<00:00,  1.21s/it]


Epoch 1/10, Loss: 0.4415


Epoch 2: 100%|[34m██████████[0m| 60/60 [01:33<00:00,  1.57s/it]



Epoch 2/10, Loss: 0.4091


Epoch 3: 100%|[34m██████████[0m| 60/60 [01:30<00:00,  1.50s/it]
Epoch 3: 100%|[34m██████████[0m| 60/60 [01:30<00:00,  1.50s/it]


Epoch 3/10, Loss: 0.4078


Epoch 4: 100%|[34m██████████[0m| 60/60 [01:31<00:00,  1.52s/it]
Epoch 4: 100%|[34m██████████[0m| 60/60 [01:31<00:00,  1.52s/it]


Epoch 4/10, Loss: 0.4060


Epoch 5: 100%|[34m██████████[0m| 60/60 [01:28<00:00,  1.47s/it]
Epoch 5: 100%|[34m██████████[0m| 60/60 [01:28<00:00,  1.47s/it]


Epoch 5/10, Loss: 0.3996


Epoch 6: 100%|[34m██████████[0m| 60/60 [01:34<00:00,  1.57s/it]
Epoch 6: 100%|[34m██████████[0m| 60/60 [01:34<00:00,  1.57s/it]


Epoch 6/10, Loss: 0.3830


Epoch 7: 100%|[34m██████████[0m| 60/60 [01:31<00:00,  1.53s/it]
Epoch 7: 100%|[34m██████████[0m| 60/60 [01:31<00:00,  1.53s/it]


Epoch 7/10, Loss: 0.3487


Epoch 8: 100%|[34m██████████[0m| 60/60 [01:13<00:00,  1.22s/it]
Epoch 8: 100%|[34m██████████[0m| 60/60 [01:13<00:00,  1.22s/it]


Epoch 8/10, Loss: 0.2977


Epoch 9: 100%|[34m██████████[0m| 60/60 [01:15<00:00,  1.27s/it]
Epoch 9: 100%|[34m██████████[0m| 60/60 [01:15<00:00,  1.27s/it]


Epoch 9/10, Loss: 0.2442


Epoch 10: 100%|[34m██████████[0m| 60/60 [01:11<00:00,  1.19s/it]

Epoch 10/10, Loss: 0.1978





### Inference Example

In [149]:
# Predict emotions for a new sentence
sentence = "What a lovely day"
pred, probs = predict(model, tokenizer, sentence, device, threshold=prob_threshold)
print("Sentence:", sentence)
print(f"Probabilities > {prob_threshold}: ", pred)
print("Predicted labels:", [emotions[i] for i, v in enumerate(pred) if v])
print("Probabilities:", probs)

Sentence: What a lovely day
Probabilities > 0.5:  [0 0 0 0 1 1]
Predicted labels: ['fear', 'UNK']
Probabilities: [0.4293264  0.38216183 0.4317745  0.44819912 0.5323816  0.6052199 ]


In [None]:
# Randomly select half the data for training
# Set a random seed for reproducibility if desired
train_df = df.sample(frac=0.5, random_state=42).reset_index(drop=True)

# Prepare texts and multi-hot labels from the sampled data
texts = train_df['text'].tolist()
labels = [labels_to_multihot(l, len(emotions)) for l in train_df['label']]

print(f"Number of training samples: {len(texts)}")
print("Sample text:", texts[0])
print("Sample label (multi-hot):", labels[0])