<a href="https://colab.research.google.com/github/LoukasSekoulidis/ml-exer/blob/main/sentiment-analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [35]:
from datasets import load_dataset
from transformers import AutoTokenizer, AdamW, BertForSequenceClassification, BertConfig
from torch.utils.data import DataLoader
import torch
import torch.nn.functional as F
from tqdm import tqdm
import logging

# Set up logging: INFO-level records are written to training.log with a timestamp prefix
logging.basicConfig(filename='training.log', level=logging.INFO, format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
logger = logging.getLogger(__name__)

# Seed PyTorch before the model is built: the classification head is newly
# initialised (not loaded from the checkpoint) and the training DataLoader
# shuffles, so without a fixed seed every kernel restart gives different results.
SEED = 42
torch.manual_seed(SEED)

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load tokenizer, dataset and a 6-label BERT classifier, then move the model to the device
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
data = load_dataset('dair-ai/emotion')
config = BertConfig.from_pretrained('bert-base-uncased', num_labels=6)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', config=config)
model.to(device)

# Define optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Load data: pick the three splits out of the loaded dataset
train_data = data['train']
validation_data = data['validation']
test_data = data['test']

# Tokenize data
def tokenization(data):
    """Encode the ``text`` field of ``data`` with the notebook-level ``tokenizer``.

    Each sequence is truncated or padded to exactly 128 tokens and the
    encoding is requested as PyTorch tensors.
    """
    encode_options = dict(
        padding='max_length',
        truncation=True,
        max_length=128,
        return_tensors='pt',
    )
    texts = data['text']
    return tokenizer(texts, **encode_options)

# Tokenize each split in batches (train, then validation, then test)
train_data, validation_data, test_data = (
    split.map(tokenization, batched=True)
    for split in (train_data, validation_data, test_data)
)

# Format data: expose only the model inputs and the label, as torch tensors
for _split in (train_data, validation_data, test_data):
    _split.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# Set batch size and data loader
batch_size = 64
train_dataloader = DataLoader(
    train_data,
    batch_size=batch_size,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [36]:
# Load data: take the three splits out of `data` in one unpacking assignment
train_data, validation_data, test_data = (
    data[split_name] for split_name in ('train', 'validation', 'test')
)

# Tokenize data
# NOTE(review): an earlier cell already defines `tokenization` with the same
# behaviour; consider keeping a single definition.
def tokenization(data):
    """Run the notebook-level ``tokenizer`` over ``data['text']``.

    Sequences are padded/truncated to a fixed length of 128 tokens and the
    result is requested as PyTorch tensors.
    """
    texts = data['text']
    encoded = tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=128,
        return_tensors='pt',
    )
    return encoded

# NOTE(review): these steps repeat what an earlier cell already does.
# Tokenize the three splits in batches, in the order train, validation, test
train_data, validation_data, test_data = (
    split.map(tokenization, batched=True)
    for split in (train_data, validation_data, test_data)
)

# Format data: keep only the tensors the model consumes plus the label
for _split in (train_data, validation_data, test_data):
    _split.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# Set batch size and data loader
batch_size = 64
train_dataloader = DataLoader(
    train_data,
    batch_size=batch_size,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
)

In [None]:
# Training function
from torch.cuda.amp import autocast, GradScaler

# Module-level gradient scaler shared by train_model below: it scales the loss
# before backward(), performs the optimizer step, and then updates itself.
scaler = GradScaler()

def train_model(model, train_dataloader, optimizer, epochs=3):
    """Fine-tune ``model`` on ``train_dataloader`` under autocast with the shared ``scaler``.

    Relies on the notebook-level names ``device``, ``scaler``, ``autocast``,
    ``tqdm`` and ``logger``. The running loss and accuracy of the current epoch
    are shown on the progress bar and logged after every batch; the per-epoch
    averages are logged when the epoch finishes. The model is left in training
    mode.

    Args:
        model: Called as ``model(input_ids=..., attention_mask=..., labels=...)``;
            its output must expose ``.loss`` and ``.logits``.
        train_dataloader: Iterable of batches holding ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.
        optimizer: Optimizer that updates the model's parameters.
        epochs: Number of passes over ``train_dataloader``.

    Raises:
        ValueError: If ``train_dataloader`` yields no batches.
    """
    model.train()
    for epoch in range(epochs):
        epoch_loss = 0.0
        correct_predictions = 0
        total_predictions = 0
        batches_done = 0
        progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{epochs}")

        # Count batches directly instead of deriving the count from a global
        # batch size: that derivation breaks (division by zero / wrong average)
        # whenever a batch is shorter than the global value.
        for batches_done, batch in enumerate(progress_bar, start=1):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()

            with autocast():
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )
                loss = outputs.loss

            # Scaled backward pass, optimizer step through the scaler, then scaler update.
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            epoch_loss += loss.item()
            _, preds = torch.max(outputs.logits, dim=1)
            correct_predictions += torch.sum(preds == labels).item()
            total_predictions += labels.size(0)

            running_loss = epoch_loss / batches_done
            running_accuracy = correct_predictions / total_predictions
            progress_bar.set_postfix(loss=running_loss, accuracy=running_accuracy)
            logger.info(f"Epoch {epoch+1}, Batch {batches_done}, Loss: {running_loss:.4f}, Accuracy: {running_accuracy:.4f}")

        if batches_done == 0:
            raise ValueError("train_dataloader yielded no batches")

        avg_loss = epoch_loss / batches_done
        avg_accuracy = correct_predictions / total_predictions
        logger.info(f"Epoch {epoch+1} completed. Average Loss: {avg_loss:.4f}, Average Accuracy: {avg_accuracy:.4f}")

# Fine-tune for five epochs
train_model(
    model=model,
    train_dataloader=train_dataloader,
    optimizer=optimizer,
    epochs=5,
)

Epoch 1/5: 100%|██████████| 250/250 [01:24<00:00,  2.95it/s, accuracy=0.745, loss=0.735]
Epoch 2/5: 100%|██████████| 250/250 [01:24<00:00,  2.97it/s, accuracy=0.934, loss=0.175]
Epoch 3/5: 100%|██████████| 250/250 [01:23<00:00,  2.99it/s, accuracy=0.95, loss=0.116]
Epoch 4/5: 100%|██████████| 250/250 [01:23<00:00,  3.00it/s, accuracy=0.956, loss=0.0937]
Epoch 5/5:  41%|████      | 102/250 [00:34<00:48,  3.05it/s, accuracy=0.969, loss=0.071]

In [None]:
# Evaluation loader: shuffling adds nothing when measuring metrics, and a fixed
# order keeps the batch composition identical between evaluation runs.
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True)

def evaluate_model(model, dataloader):
    """Compute the mean loss and accuracy of ``model`` over ``dataloader``.

    Puts the model in eval mode and runs without gradient tracking. Relies on
    the notebook-level ``device`` and ``tqdm``.

    Args:
        model: Called as ``model(input_ids=..., attention_mask=..., labels=...)``;
            its output must expose ``.loss`` and ``.logits``.
        dataloader: Iterable of batches holding ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.

    Returns:
        tuple: ``(avg_loss, avg_accuracy)`` where ``avg_loss`` is the loss
        averaged over samples and ``avg_accuracy`` is the fraction of samples
        whose highest-scoring class equals the label.

    Raises:
        ValueError: If ``dataloader`` yields no samples.
    """
    model.eval()
    weighted_loss_sum = 0.0
    correct_predictions = 0
    total_predictions = 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            logits = outputs.logits
            samples_in_batch = labels.size(0)

            # Assumes outputs.loss is the mean over the batch (TODO confirm for
            # the model in use). Weighting by batch size keeps a short final
            # batch from counting as much as a full one.
            weighted_loss_sum += outputs.loss.item() * samples_in_batch
            _, preds = torch.max(logits, dim=1)
            correct_predictions += torch.sum(preds == labels).item()
            total_predictions += samples_in_batch

    if total_predictions == 0:
        raise ValueError("dataloader yielded no samples to evaluate")

    avg_loss = weighted_loss_sum / total_predictions
    avg_accuracy = correct_predictions / total_predictions
    return avg_loss, avg_accuracy

def test_model(model, test_dataloader):
    """Evaluate ``model`` on ``test_dataloader`` and print the loss and accuracy."""
    print("Testing model...")
    metrics = evaluate_model(model, test_dataloader)
    test_loss, test_accuracy = metrics
    print(f"Test - Loss: {test_loss:.4f}, Accuracy: {test_accuracy:.4f}")

# Evaluate on test data
test_model(model=model, test_dataloader=test_dataloader)

In [38]:
def predict_emotion(input_text):
    """Predict the emotion of a single text with the notebook-level ``model``.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``device``. The
    model is switched to eval mode for the forward pass so dropout cannot make
    the prediction random, and its previous train/eval mode is restored
    afterwards.

    Args:
        input_text: The text to classify (a single string).

    Returns:
        tuple: ``(emotion, confidence)`` - the predicted emotion name and the
        softmax probability of that class as a float.
    """
    # Tokenize the input text (padded/truncated to 128 tokens, as PyTorch tensors)
    tokens = tokenizer(input_text, padding='max_length', truncation=True, max_length=128, return_tensors='pt')

    # Move tokens to the appropriate device
    input_ids = tokens['input_ids'].to(device)
    attention_mask = tokens['attention_mask'].to(device)

    # Inference must not run in train mode; remember the mode so it can be restored.
    was_training = model.training
    model.eval()
    try:
        # Disable gradient calculation
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    finally:
        model.train(was_training)

    # Softmax over the class dimension turns logits into probabilities
    probabilities = torch.softmax(outputs.logits, dim=1)

    # Highest probability and its class index
    top_probability, top_index = torch.max(probabilities, dim=1)
    predicted_label = top_index.item()
    confidence = top_probability.item()

    # Hard-coded id -> name mapping; presumably matches the dataset's label ids - TODO confirm
    label_to_emotion = {0: 'sadness', 1: 'joy', 2: 'love', 3: 'anger', 4: 'fear', 5: 'surprise'}
    emotion = label_to_emotion[predicted_label]

    return emotion, confidence

# Test the function with a custom string and print the input with its prediction
test_string = "i wan't to have sex with you"
predicted_emotion, confidence = predict_emotion(test_string)
print(
    f"Input Text: {test_string}",
    f"Predicted Emotion: {predicted_emotion} (Confidence: {confidence:.2f})",
    sep="\n",
)

Input Text: i wan't to have sex with you
Predicted Emotion: love (Confidence: 0.22)
