<a href="https://colab.research.google.com/github/Ishrak-DataScience/EmotionDetection/blob/main/Roberta_LLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Load all CSV files from the specified folder
folder_path = '/content/drive/MyDrive/TUD Master/LLM/track_a'
all_files = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if file.endswith('.csv')]

# Combine all CSV files into a single DataFrame
df = pd.concat((pd.read_csv(file) for file in all_files), ignore_index=True)

# Drop rows with all-NA emotion values
df = df.dropna(subset=['Anger', 'Fear', 'Joy', 'Sadness', 'Surprise'], how='all')

# Drop rows where 'text' is NaN or empty
df = df.dropna(subset=['text'])  # Remove rows where 'text' is NaN
df = df[df['text'].str.strip() != '']  # Remove rows with empty 'text'

# Extract labels and ensure they are valid
labels = df[['Anger', 'Fear', 'Joy', 'Sadness', 'Surprise']].idxmax(axis=1).factorize()[0]

# Debugging: Ensure labels are valid
num_labels = len(set(labels))  # Dynamically determine number of classes
print(f"Labels range: {labels.min()} to {labels.max()}")
assert labels.min() >= 0 and labels.max() < num_labels, "Labels are out of range!"

# Split data into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'], labels, test_size=0.2, random_state=42
)

# Define a custom dataset
class EmotionDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            max_length=self.max_len,
            add_special_tokens=True,
            return_token_type_ids=False,
            padding='max_length',  # Use 'max_length' for fixed-length padding
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'label': torch.tensor(label, dtype=torch.long)  # Ensure dtype=torch.long
        }

# Preprocess data with RoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
max_len = 128

train_dataset = EmotionDataset(train_texts.to_numpy(), train_labels, tokenizer, max_len)
val_dataset = EmotionDataset(val_texts.to_numpy(), val_labels, tokenizer, max_len)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Validate the DataLoader
for batch in train_loader:
    print("First batch:", batch)
    break

# Load RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=num_labels)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Define optimizer and loss function
optimizer = AdamW(model.parameters(), lr=2e-5)
criterion = torch.nn.CrossEntropyLoss()

# Training loop
def train_epoch(model, data_loader, criterion, optimizer, device):
    model.train()
    total_loss = 0

    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    return total_loss / len(data_loader)

# Validation loop
def eval_model(model, data_loader, device):
    model.eval()
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            predictions.extend(torch.argmax(logits, axis=1).cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    return classification_report(true_labels, predictions, target_names=['Anger', 'Fear', 'Joy', 'Sadness', 'Surprise'])

# Train the model
epochs = 10
for epoch in range(epochs):
    train_loss = train_epoch(model, train_loader, criterion, optimizer, device)
    print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {train_loss}")

    val_report = eval_model(model, val_loader, device)
    print(f"Validation Report:\n{val_report}")

# Save the model
torch.save(model.state_dict(), 'emotion_model_roberta.pth')

Labels range: 0 to 4
First batch: {'text': ['They jumped from the couch and aimed for all of the areas with sensitive skin-the underside of my arm, along my belt line, the insides of my thighs.', 'Hambalyo wadankeygiiyow.', 'Waxaaan Cabsi Ka qabaaa CBBB Bil adkaysan la afarsano Sideee Ku Adkaysan dooonanamp kistiii dheeeraaad Ka Ahayd Wale Ummmada badan ooo Stresss Ku dhacay Arki La donaaa Madaxwaynihi Away danta Qaranka Leee Hayaaa  Xogta USER tiii hore Mahan', 'አሁን ቢያለቅሱ ከየት ያምጡት ሊቢያን የመሰለች ሐገር ምእራባውያን ብትንትኗን አውጥተው ሰዉን በተኑት በጣም ያሳዝናል !', 'ኣየ ተጋሩ ዕላማ ዘይብሉ ኩናት ባዕሎም ከፊቶም ብለይቲ ዓዲ ሰላም ኣሎ ኣብ ዓድና ኢና ኢሎም ልቦም ኣውዲቆም ዝደቀሱ ናይ ሰሜን እዚ ሰራዊትን መካናይዝድን ኣጥቂዕኩሞም ቅድሚ እዚ ምምጽኡ ብኽብረት ብዓበይቲ ዓዲን ጳጳሳትን መሻይኽን ኣብ ክልልኩም መጽዮም ኣብ ብርኩኹም ተደፊኦም ኩናት ጽቡቅ ኣይኮነን ግደፉ ኢሎም ለሚኖምኹም ብጽጋብኩም ከኣ ኩናት ባህላዊ ጨወተና ኢዩ ኢልኩም ድማ እንሆ እዚ ኩሉ ካብ 100መንእሰይ ኣጥፊእኩም መወዳእታ', 'Eine Katastrophe. Wir als Menschheit versagen gerade auf der ganzen Linie', 'Hey, ich bin vor kurzem 18 geworden und suche jetzt einen Psychiater in meiner Nähe, dem ich vertra

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10, Train Loss: 1.1215113219864006
Validation Report:
              precision    recall  f1-score   support

       Anger       0.57      0.50      0.53      1132
        Fear       0.65      0.86      0.74      2623
         Joy       0.60      0.48      0.53       532
     Sadness       0.70      0.12      0.21       524
    Surprise       0.55      0.09      0.16       242

    accuracy                           0.63      5053
   macro avg       0.61      0.41      0.43      5053
weighted avg       0.62      0.63      0.59      5053



KeyboardInterrupt: 

In [8]:
# prompt: pip freeze to roberta_requirements.txt

!pip freeze > roberta_requirements.txt


In [11]:
# prompt: install what's in the file roberta_requirements.txt

!pip install -r roberta_requirements.txt


Collecting cudf-cu12@ https://pypi.nvidia.com/cudf-cu12/cudf_cu12-24.10.1-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (from -r roberta_requirements.txt (line 60))
  Using cached https://pypi.nvidia.com/cudf-cu12/cudf_cu12-24.10.1-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (24.9 MB)
Collecting en-core-web-sm@ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl#sha256=86cc141f63942d4b2c5fcee06630fd6f904788d2f0ab005cce45aadb8fb73889 (from -r roberta_requirements.txt (line 90))
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m36.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting libcudf-cu12@ https://pypi.nvidia.com/libcudf-cu12/libcudf_cu12-24.10.1-py3-none-manylinux_2_28_x86_64.whl (from -r roberta_requirements.txt