In [2]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import BertTokenizer, BertModel
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import classification_report, hamming_loss
from tqdm import tqdm
import os

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# DEVICE
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# LOAD DATA
# Raw string literal: the Windows path is full of backslashes (\C, \I, \D, \P, \E, \g)
# that a normal string literal treats as invalid escape sequences and warns about.
# The resulting path is character-for-character the same as before.
DATA_PATH = r"D:\Coding\Internship\Developers Hub Internship\PDF 2\Completed Projects\Emotion Recognition Ai\Dataset\go_emotions_dataset.csv"  # Adjust path if needed
df = pd.read_csv(DATA_PATH)

# FILTER & CLEAN
# Drop unnecessary columns
df = df.drop(columns=["id", "example_very_unclear"])

# Replace [NAME], [RELIGION], etc. with empty strings
# (non-greedy match: each bracketed placeholder is removed on its own)
df['text'] = df['text'].str.replace(r"\[.*?\]", "", regex=True)

  df = pd.read_csv("D:\Coding\Internship\Developers Hub Internship\PDF 2\Completed Projects\Emotion Recognition Ai\Dataset\go_emotions_dataset.csv")  # Adjust path if needed


Using device: cpu


In [4]:
# Emotion columns used as classification targets (extend this list if your dataset has more)
emotion_labels = [
    'admiration',
    'amusement',
    'anger',
    'annoyance',
    'approval',
    'caring',
    'confusion',
]

# Optionally discard rows in which none of the selected emotions is set,
# then renumber the index from 0
df = df.loc[df[emotion_labels].sum(axis=1).gt(0)].reset_index(drop=True)

# TOKENIZER
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [5]:
class GoEmotionsDataset(Dataset):
    """Dataset that tokenizes one text per item and pairs it with its label vector.

    Args:
        texts: Indexable collection of texts; each item is passed through ``str()``.
        labels: Indexable collection holding one label vector per text.
        tokenizer: Callable tokenizer. It is called with the text plus the
            ``add_special_tokens``, ``max_length``, ``padding``, ``truncation``,
            ``return_attention_mask`` and ``return_tensors`` keyword arguments and
            must return a mapping with ``"input_ids"`` and ``"attention_mask"``.
        max_len: Length every sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and float ``labels`` for item ``idx``."""
        text = str(self.texts[idx])
        # as_tensor with an explicit dtype converts the values; the legacy
        # torch.FloatTensor(...) constructor interprets a bare integer as a size
        # and returns uninitialised memory.
        label = torch.as_tensor(self.labels[idx], dtype=torch.float32)

        # Call the tokenizer directly (same as predict() does) instead of the
        # deprecated encode_plus method.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt"
        )

        # return_tensors="pt" adds a leading batch axis; flatten it away so the
        # DataLoader can stack items into a batch itself.
        return {
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "labels": label
        }

In [6]:
# DATA PREP
X = df["text"].tolist()
y = df[emotion_labels].values

dataset = GoEmotionsDataset(X, y, tokenizer)

# Train/Val Split (80% / 20%)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

# Seed the split so the same rows land in the validation set on every run.
# Training is resumed from checkpoints across kernel restarts; with an unseeded
# split each restart would draw a new partition and rows the model was already
# trained on would end up in the validation set.
SPLIT_SEED = 42
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Only the training loader shuffles; validation keeps a fixed order.
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=16)


In [7]:
# MODEL
class EmotionClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear layer with one output per label."""

    def __init__(self, num_labels):
        """Load ``bert-base-uncased`` and size the output layer to ``num_labels``."""
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.dropout = nn.Dropout(p=0.3)
        hidden_size = self.bert.config.hidden_size
        self.out = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return the linear layer's output for the encoder's pooled representation."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        regularized = self.dropout(encoded.pooler_output)
        return self.out(regularized)

# One output per entry in emotion_labels; build the network, then move it to the selected device.
model = EmotionClassifier(num_labels=len(emotion_labels))
model = model.to(device)


In [8]:
import os

# LOSS & OPTIMIZER
# BCEWithLogitsLoss is applied to the raw outputs of the model's linear layer.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(params=model.parameters(), lr=2e-5)

# Folder that receives the per-epoch checkpoint files (created if missing)
checkpoint_dir = "checkpoints"
os.makedirs(name=checkpoint_dir, exist_ok=True)

def save_checkpoint(epoch):
    """Write the current model and optimizer state into the checkpoint folder.

    The file is named ``checkpoint_epoch_{epoch+1}.pt`` and stores ``epoch + 1``
    under ``'epoch'`` next to both state dicts. Reads the module-level ``model``,
    ``optimizer`` and ``checkpoint_dir``.

    Args:
        epoch: Epoch index; the stored and file-name number is ``epoch + 1``.
    """
    finished = epoch + 1
    target = os.path.join(checkpoint_dir, f"checkpoint_epoch_{finished}.pt")
    state = {
        'epoch': finished,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }
    torch.save(state, target)

def load_checkpoint(path):
    """Restore the module-level ``model`` and ``optimizer`` from a checkpoint file.

    Args:
        path: Location of the checkpoint file.

    Returns:
        The ``'epoch'`` value stored in the checkpoint, or ``0`` when ``path``
        does not exist (nothing is loaded in that case).
    """
    if not os.path.exists(path):
        return 0

    # NOTE(review): torch.load unpickles the file - only load checkpoints you trust.
    state = torch.load(path, map_location=device)
    model.load_state_dict(state['model_state_dict'])
    optimizer.load_state_dict(state['optimizer_state_dict'])
    print(f"Resuming from epoch {state['epoch']}")
    return state['epoch']

In [9]:
# TRAIN FUNCTION
def train(model, loader, optimizer, criterion, epoch):
    """Run one optimisation pass over ``loader`` and save a checkpoint afterwards.

    Args:
        model: Network called as ``model(input_ids, attention_mask)``.
        loader: Iterable of batches with ``'input_ids'``, ``'attention_mask'``
            and ``'labels'`` entries.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(outputs, labels)``.
        epoch: Epoch index, shown as ``epoch+1`` in the progress bar and passed
            to ``save_checkpoint``.

    Returns:
        Sum of the per-batch losses divided by ``len(loader)``.
    """
    model.train()
    running_loss = 0
    progress = tqdm(loader, desc=f"Training Epoch {epoch+1}")
    for batch in progress:
        # Move the batch onto the same device as the model
        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        targets = batch['labels'].to(device)

        # Clear gradients left over from the previous step before the new backward pass
        optimizer.zero_grad()
        batch_loss = criterion(model(ids, mask), targets)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

    save_checkpoint(epoch)
    return running_loss / len(loader)

In [10]:
# EVALUATE FUNCTION
def evaluate(model, loader, criterion):
    """Compute the average loss over ``loader`` without updating the model.

    Args:
        model: Network called as ``model(input_ids, attention_mask)``.
        loader: Iterable of batches with ``'input_ids'``, ``'attention_mask'``
            and ``'labels'`` entries.
        criterion: Loss called as ``criterion(outputs, labels)``.

    Returns:
        Sum of the per-batch losses divided by ``len(loader)``.
    """
    model.eval()
    running_loss = 0
    # Gradient tracking is switched off for the whole pass
    with torch.no_grad():
        for batch in tqdm(loader, desc="Evaluating"):
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['labels'].to(device)

            batch_loss = criterion(model(ids, mask), targets)
            running_loss += batch_loss.item()
    return running_loss / len(loader)

In [11]:
def _latest_checkpoint_path(directory):
    """Return the path of the highest-numbered ``checkpoint_epoch_N.pt`` in ``directory``.

    Returns ``None`` when the directory does not exist or holds no such file.
    """
    prefix, suffix = "checkpoint_epoch_", ".pt"
    if not os.path.isdir(directory):
        return None
    best_number, best_path = -1, None
    for name in os.listdir(directory):
        if not (name.startswith(prefix) and name.endswith(suffix)):
            continue
        number = name[len(prefix):-len(suffix)]
        # Compare numerically so that epoch 10 ranks above epoch 9
        if number.isdecimal() and int(number) > best_number:
            best_number, best_path = int(number), os.path.join(directory, name)
    return best_path


# Resume from the most recent checkpoint. Always loading checkpoint_epoch_1.pt
# would ignore checkpoints from later epochs and restart training at epoch 1.
# With no checkpoint on disk, training starts from epoch 0.
_resume_path = _latest_checkpoint_path(checkpoint_dir)
start_epoch = load_checkpoint(_resume_path) if _resume_path is not None else 0

In [13]:
# TRAIN LOOP
# Continues at start_epoch, so epochs already covered by a loaded checkpoint are skipped.
num_epochs = 3
for epoch in range(start_epoch, num_epochs):
    train_loss = train(model, train_dataloader, optimizer, criterion, epoch)
    val_loss = evaluate(model, val_dataloader, criterion)
    print(
        f"Epoch {epoch+1}/{num_epochs}, "
        f"Train Loss: {train_loss:.4f}, "
        f"Val Loss: {val_loss:.4f}"
    )

Training Epoch 1:   0%|          | 0/3658 [00:07<?, ?it/s]


KeyboardInterrupt: 

In [1]:
# Final testing function (for predictions on new data)
def predict(model, tokenizer, text):
    """Return the sigmoid of the model's outputs for ``text`` as a NumPy array.

    Args:
        model: Network called as ``model(input_ids, attention_mask)``.
        tokenizer: Callable returning ``'input_ids'`` and ``'attention_mask'`` tensors.
        text: Input passed to the tokenizer, padded or truncated to 128 tokens.
    """
    model.eval()
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=128,
    )
    ids = encoded['input_ids'].to(device)
    mask = encoded['attention_mask'].to(device)
    # Inference only: no gradient tracking needed
    with torch.no_grad():
        logits = model(ids, mask)
        probabilities = torch.sigmoid(logits).cpu().numpy()
    return probabilities

# Example usage (requires the cells that define `model`, `tokenizer` and `device` to have run first):
example_text = "I love this so much!"
print(predict(model, tokenizer, example_text))

NameError: name 'model' is not defined