<a href="https://colab.research.google.com/github/INVISIBLE-SAM/SemEval-2025-Task-11---Track-A/blob/main/Final_submission_Part1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# E5 English

In [None]:
import torch
import torch.nn as nn
import pandas as pd
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score, accuracy_score, classification_report
import numpy as np
from tqdm import tqdm

# Custom Dataset class for multi-label emotion data
class HindiEmotionDataset(Dataset):
    """Map-style dataset pairing tokenized text with multi-label float targets.

    The class keeps its historical "Hindi" name, but nothing in it is
    language specific: it tokenizes whatever strings it is given.

    Args:
        texts: Indexable collection of raw texts (each item is passed
            through ``str``).
        labels: Indexable collection of per-text label vectors.
        tokenizer: Callable invoked as ``tokenizer(text, padding=...,
            truncation=..., max_length=..., return_tensors="pt")`` that
            returns a mapping with ``input_ids`` and ``attention_mask``.
        max_length: Length every sequence is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=150):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for item ``idx``."""
        encoded = self.tokenizer(
            str(self.texts[idx]),
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        # squeeze(0) removes only the leading batch dimension added by
        # return_tensors="pt"; a bare squeeze() would also collapse the
        # sequence dimension when max_length == 1.
        return {
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.float),
        }

class HindiEmotionModel(nn.Module):
    """Pretrained transformer encoder followed by a linear multi-label head.

    Args:
        model_name: Identifier handed to ``AutoModel.from_pretrained``.
        num_classes: Number of independent emotion labels to score.
    """

    def __init__(self, model_name, num_classes):
        super().__init__()
        self.embedding_model = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        # Size the head from the encoder's own config instead of hard-coding
        # 1024, so checkpoints with a different width work too. The fallback
        # keeps the previous behaviour if the config lacks the attribute.
        hidden_size = getattr(self.embedding_model.config, "hidden_size", 1024)
        self.fc = nn.Linear(hidden_size, num_classes)
        self.num_classes = num_classes

    def forward(self, input_ids, attention_mask):
        """Return one sigmoid probability per class for each input sequence."""
        outputs = self.embedding_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        # Use the first token of the last layer as the sequence representation.
        first_token = outputs.last_hidden_state[:, 0, :]
        logits = self.fc(self.dropout(first_token))
        return torch.sigmoid(logits)

def train_model(model, train_loader, optimizer, device, test_loader, num_epochs=15, patience=3, save_path="/content/hindi_emotion_model_best.pth"):
    """Fine-tune ``model`` with BCE loss, checkpointing on the best macro F1.

    After every epoch the model is scored with ``evaluate_model`` on
    ``test_loader``. Whenever the macro F1 beats the best value seen so far,
    the state dict is written to ``save_path``. Training stops early once
    ``patience`` consecutive epochs bring no improvement. Because the best
    score starts at 0.0, no checkpoint is written if macro F1 never exceeds 0.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``; its
            output is fed to ``nn.BCELoss`` directly, so it must already be
            probabilities.
        train_loader: Iterable of batches holding ``input_ids``,
            ``attention_mask`` and ``labels``.
        optimizer: Optimizer stepping the model's parameters.
        device: Device each batch is moved to.
        test_loader: Loader passed to ``evaluate_model`` after each epoch.
        num_epochs: Maximum number of epochs.
        patience: Consecutive non-improving epochs tolerated before stopping.
        save_path: Destination file for the best state dict.
    """
    criterion = nn.BCELoss()

    best_f1 = 0.0
    epochs_without_improvement = 0

    for epoch in range(num_epochs):
        # evaluate_model() switches the model to eval mode at the end of every
        # epoch, so training mode (and with it dropout) must be re-enabled
        # here rather than once before the loop.
        model.train()

        total_loss = 0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}", unit="batch")

        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            progress_bar.set_postfix(loss=loss.item())

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {avg_loss:.4f}")

        # Evaluate model on the held-out loader after each epoch
        print(f"Evaluating after Epoch {epoch + 1}...")
        macro_f1 = evaluate_model(model, test_loader, device)
        print(f"Macro F1-Score after Epoch {epoch + 1}: {macro_f1:.4f}")

        # Check for improvement and early stopping
        if macro_f1 > best_f1:
            best_f1 = macro_f1
            torch.save(model.state_dict(), save_path)
            print(f"New best model saved with Macro F1: {best_f1:.4f}")
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1
            print(f"No F1 improvement. Patience: {epochs_without_improvement}/{patience}")
            if epochs_without_improvement >= patience:
                print(f"Early stopping triggered at epoch {epoch + 1}")
                break

    print(f"Training complete. Best Macro F1: {best_f1:.4f}")

def evaluate_model(model, test_loader, device):
    """Return the macro-averaged F1 of ``model`` over ``test_loader``.

    The model is put into eval mode and is NOT switched back afterwards.
    Model outputs are binarized with a 0.5 threshold before scoring.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        test_loader: Iterable of batches holding ``input_ids``,
            ``attention_mask`` and ``labels``.
        device: Device each batch is moved to.
    """
    model.eval()
    gold_rows, score_rows = [], []

    with torch.no_grad():
        for batch in test_loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['labels'].to(device)

            scores = model(ids, mask)
            gold_rows.extend(targets.cpu().numpy())
            score_rows.extend(scores.cpu().numpy())

    gold = np.array(gold_rows)
    hard_predictions = (np.array(score_rows) > 0.5).astype(int)

    return f1_score(gold, hard_predictions, average='macro')

def main():
    """Train the classifier on `/content/eng.csv`, validating on `/content/eng_dev.csv`."""
    label_columns = ['anger', 'fear', 'joy', 'sadness', 'surprise']

    train_df = pd.read_csv('/content/eng.csv')
    test_df = pd.read_csv('/content/eng_dev.csv')

    train_texts = train_df['text'].tolist()
    train_labels = train_df[label_columns].values

    test_texts = test_df['text'].tolist()
    test_labels = test_df[label_columns].values

    model_name = "intfloat/multilingual-e5-large-instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    num_classes = len(label_columns)

    # Shuffle only the training data; keep the dev order fixed.
    train_loader = DataLoader(
        HindiEmotionDataset(train_texts, train_labels, tokenizer),
        batch_size=16,
        shuffle=True,
    )
    test_loader = DataLoader(
        HindiEmotionDataset(test_texts, test_labels, tokenizer),
        batch_size=16,
        shuffle=False,
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = HindiEmotionModel(model_name, num_classes).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

    train_model(
        model,
        train_loader,
        optimizer,
        device,
        test_loader,
        num_epochs=15,
        patience=4,
        save_path="/content/English_emotion_model_best.pth",
    )

# Entry point: start training when this cell/script is executed directly.
if __name__ == "__main__":
    main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Epoch 1/15: 100%|██████████| 173/173 [04:07<00:00,  1.43s/batch, loss=0.33]


Epoch 1/15, Average Loss: 0.4490
Evaluating after Epoch 1...
Macro F1-Score after Epoch 1: 0.7311
New best model saved with Macro F1: 0.7311


Epoch 2/15: 100%|██████████| 173/173 [04:03<00:00,  1.41s/batch, loss=0.211]


Epoch 2/15, Average Loss: 0.2705
Evaluating after Epoch 2...
Macro F1-Score after Epoch 2: 0.7002
No F1 improvement. Patience: 1/4


Epoch 3/15: 100%|██████████| 173/173 [04:02<00:00,  1.40s/batch, loss=0.0889]


Epoch 3/15, Average Loss: 0.1460
Evaluating after Epoch 3...
Macro F1-Score after Epoch 3: 0.7551
New best model saved with Macro F1: 0.7551


Epoch 4/15: 100%|██████████| 173/173 [04:03<00:00,  1.41s/batch, loss=0.123]


Epoch 4/15, Average Loss: 0.0750
Evaluating after Epoch 4...
Macro F1-Score after Epoch 4: 0.7446
No F1 improvement. Patience: 1/4


Epoch 5/15: 100%|██████████| 173/173 [04:03<00:00,  1.41s/batch, loss=0.039]


Epoch 5/15, Average Loss: 0.0423
Evaluating after Epoch 5...
Macro F1-Score after Epoch 5: 0.7241
No F1 improvement. Patience: 2/4


Epoch 6/15: 100%|██████████| 173/173 [04:02<00:00,  1.40s/batch, loss=0.00815]


Epoch 6/15, Average Loss: 0.0192
Evaluating after Epoch 6...
Macro F1-Score after Epoch 6: 0.7660
New best model saved with Macro F1: 0.7660


Epoch 7/15: 100%|██████████| 173/173 [04:03<00:00,  1.41s/batch, loss=0.00874]


Epoch 7/15, Average Loss: 0.0096
Evaluating after Epoch 7...
Macro F1-Score after Epoch 7: 0.7525
No F1 improvement. Patience: 1/4


Epoch 8/15: 100%|██████████| 173/173 [04:02<00:00,  1.40s/batch, loss=0.00509]


Epoch 8/15, Average Loss: 0.0059
Evaluating after Epoch 8...
Macro F1-Score after Epoch 8: 0.7447
No F1 improvement. Patience: 2/4


Epoch 9/15: 100%|██████████| 173/173 [04:02<00:00,  1.40s/batch, loss=0.00332]


Epoch 9/15, Average Loss: 0.0033
Evaluating after Epoch 9...
Macro F1-Score after Epoch 9: 0.7466
No F1 improvement. Patience: 3/4


Epoch 10/15:  12%|█▏        | 21/173 [00:30<03:42,  1.47s/batch, loss=0.00186]


KeyboardInterrupt: 

## E5 eng pred

In [None]:
import torch
import torch.nn as nn
import pandas as pd
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
import numpy as np
from tqdm import tqdm

class HindiEmotionPredictionDataset(Dataset):
    """Map-style dataset of tokenized texts for inference (no labels).

    Args:
        texts: Indexable collection of raw texts (each item is passed
            through ``str``).
        tokenizer: Callable invoked as ``tokenizer(text, padding=...,
            truncation=..., max_length=..., return_tensors="pt")`` that
            returns a mapping with ``input_ids`` and ``attention_mask``.
        max_length: Length every sequence is padded/truncated to.
    """

    def __init__(self, texts, tokenizer, max_length=150):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids`` and ``attention_mask`` for item ``idx``."""
        encoded = self.tokenizer(
            str(self.texts[idx]),
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        # squeeze(0) removes only the leading batch dimension added by
        # return_tensors="pt"; a bare squeeze() would also collapse the
        # sequence dimension when max_length == 1.
        return {
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
        }

class HindiEmotionModel(nn.Module):
    """Pretrained transformer encoder followed by a linear multi-label head.

    Args:
        model_name: Identifier handed to ``AutoModel.from_pretrained``.
        num_classes: Number of independent emotion labels to score.
    """

    def __init__(self, model_name, num_classes):
        super().__init__()
        self.embedding_model = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        # Size the head from the encoder's own config instead of hard-coding
        # 1024; parameter names are unchanged, so saved state dicts still load.
        hidden_size = getattr(self.embedding_model.config, "hidden_size", 1024)
        self.fc = nn.Linear(hidden_size, num_classes)
        self.num_classes = num_classes

    def forward(self, input_ids, attention_mask):
        """Return one sigmoid probability per class for each input sequence."""
        outputs = self.embedding_model(input_ids=input_ids, attention_mask=attention_mask)
        # Use the first token of the last layer as the sequence representation.
        first_token = outputs.last_hidden_state[:, 0, :]
        logits = self.fc(self.dropout(first_token))
        return torch.sigmoid(logits)

def predict_and_save():
    """Predict emotion labels for `/content/eng_pred.csv` and write them to CSV.

    Loads the fine-tuned weights from
    `/content/English_emotion_model_best.pth`, thresholds the model outputs
    at 0.5 and writes one row per input (``id`` followed by the emotion
    columns) to `/content/predictions.csv`.
    """
    output_path = '/content/predictions.csv'
    emotion_columns = ['Anger', 'Fear', 'Joy', 'Sadness', 'Surprise']

    # Load prediction data
    pred_df = pd.read_csv('/content/eng_pred.csv')
    ids = pred_df['id'].tolist()
    texts = pred_df['text'].tolist()

    # Model configuration
    model_name = "intfloat/multilingual-e5-large-instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Initialize model and load weights.
    # map_location lets a checkpoint saved on GPU load on a CPU-only runtime;
    # weights_only=True restricts unpickling to tensors/state-dict content.
    model = HindiEmotionModel(model_name, num_classes=5).to(device)
    state_dict = torch.load(
        '/content/English_emotion_model_best.pth',
        map_location=device,
        weights_only=True,
    )
    model.load_state_dict(state_dict)
    model.eval()

    # Create dataset and dataloader (no shuffling so rows stay aligned with ids)
    dataset = HindiEmotionPredictionDataset(texts, tokenizer)
    dataloader = DataLoader(dataset, batch_size=16, shuffle=False)

    # Make predictions
    predictions = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Making Predictions"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids, attention_mask)
            batch_preds = (outputs.cpu().numpy() > 0.5).astype(int)
            predictions.extend(batch_preds)

    # Build the output frame; inserting 'id' at position 0 already yields the
    # final column order.
    output_df = pd.DataFrame(predictions, columns=emotion_columns)
    output_df.insert(0, 'id', ids)

    # Save predictions and report the path that was actually written
    output_df.to_csv(output_path, index=False)
    print(f"Predictions saved to {output_path}")

# Entry point: run inference when this cell/script is executed directly.
if __name__ == "__main__":
    predict_and_save()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  model.load_state_dict(torch.load('/content/English_emotion_model_best.pth'))
Making Predictions: 100%|██████████| 173/173 [01:18<00:00,  2.20it/s]


Predictions saved to /content/predictions_eng.csv


# Marathi training

In [None]:
import torch
import torch.nn as nn
import pandas as pd
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score, accuracy_score, classification_report
import numpy as np
from tqdm import tqdm

# Custom Dataset class for multi-label emotion data
class HindiEmotionDataset(Dataset):
    """Map-style dataset pairing tokenized text with multi-label float targets.

    The class keeps its historical "Hindi" name, but nothing in it is
    language specific: it tokenizes whatever strings it is given.

    Args:
        texts: Indexable collection of raw texts (each item is passed
            through ``str``).
        labels: Indexable collection of per-text label vectors.
        tokenizer: Callable invoked as ``tokenizer(text, padding=...,
            truncation=..., max_length=..., return_tensors="pt")`` that
            returns a mapping with ``input_ids`` and ``attention_mask``.
        max_length: Length every sequence is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=150):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for item ``idx``."""
        encoded = self.tokenizer(
            str(self.texts[idx]),
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        # squeeze(0) removes only the leading batch dimension added by
        # return_tensors="pt"; a bare squeeze() would also collapse the
        # sequence dimension when max_length == 1.
        return {
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.float),
        }

class HindiEmotionModel(nn.Module):
    """Pretrained transformer encoder followed by a linear multi-label head.

    Args:
        model_name: Identifier handed to ``AutoModel.from_pretrained``.
        num_classes: Number of independent emotion labels to score.
    """

    def __init__(self, model_name, num_classes):
        super().__init__()
        self.embedding_model = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        # Size the head from the encoder's own config instead of hard-coding
        # 1024, so checkpoints with a different width work too. The fallback
        # keeps the previous behaviour if the config lacks the attribute.
        hidden_size = getattr(self.embedding_model.config, "hidden_size", 1024)
        self.fc = nn.Linear(hidden_size, num_classes)
        self.num_classes = num_classes

    def forward(self, input_ids, attention_mask):
        """Return one sigmoid probability per class for each input sequence."""
        outputs = self.embedding_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        # Use the first token of the last layer as the sequence representation.
        first_token = outputs.last_hidden_state[:, 0, :]
        logits = self.fc(self.dropout(first_token))
        return torch.sigmoid(logits)

def train_model(model, train_loader, optimizer, device, test_loader, num_epochs=15, patience=3, save_path="/content/hindi_emotion_model_best.pth"):
    """Fine-tune ``model`` with BCE loss, checkpointing on the best macro F1.

    After every epoch the model is scored with ``evaluate_model`` on
    ``test_loader``. Whenever the macro F1 beats the best value seen so far,
    the state dict is written to ``save_path``. Training stops early once
    ``patience`` consecutive epochs bring no improvement. Because the best
    score starts at 0.0, no checkpoint is written if macro F1 never exceeds 0.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``; its
            output is fed to ``nn.BCELoss`` directly, so it must already be
            probabilities.
        train_loader: Iterable of batches holding ``input_ids``,
            ``attention_mask`` and ``labels``.
        optimizer: Optimizer stepping the model's parameters.
        device: Device each batch is moved to.
        test_loader: Loader passed to ``evaluate_model`` after each epoch.
        num_epochs: Maximum number of epochs.
        patience: Consecutive non-improving epochs tolerated before stopping.
        save_path: Destination file for the best state dict.
    """
    criterion = nn.BCELoss()

    best_f1 = 0.0
    epochs_without_improvement = 0

    for epoch in range(num_epochs):
        # evaluate_model() switches the model to eval mode at the end of every
        # epoch, so training mode (and with it dropout) must be re-enabled
        # here rather than once before the loop.
        model.train()

        total_loss = 0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}", unit="batch")

        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            progress_bar.set_postfix(loss=loss.item())

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {avg_loss:.4f}")

        # Evaluate model on the held-out loader after each epoch
        print(f"Evaluating after Epoch {epoch + 1}...")
        macro_f1 = evaluate_model(model, test_loader, device)
        print(f"Macro F1-Score after Epoch {epoch + 1}: {macro_f1:.4f}")

        # Check for improvement and early stopping
        if macro_f1 > best_f1:
            best_f1 = macro_f1
            torch.save(model.state_dict(), save_path)
            print(f"New best model saved with Macro F1: {best_f1:.4f}")
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1
            print(f"No F1 improvement. Patience: {epochs_without_improvement}/{patience}")
            if epochs_without_improvement >= patience:
                print(f"Early stopping triggered at epoch {epoch + 1}")
                break

    print(f"Training complete. Best Macro F1: {best_f1:.4f}")

def evaluate_model(model, test_loader, device):
    """Return the macro-averaged F1 of ``model`` over ``test_loader``.

    The model is put into eval mode and is NOT switched back afterwards.
    Model outputs are binarized with a 0.5 threshold before scoring.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        test_loader: Iterable of batches holding ``input_ids``,
            ``attention_mask`` and ``labels``.
        device: Device each batch is moved to.
    """
    model.eval()
    gold_rows, score_rows = [], []

    with torch.no_grad():
        for batch in test_loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['labels'].to(device)

            scores = model(ids, mask)
            gold_rows.extend(targets.cpu().numpy())
            score_rows.extend(scores.cpu().numpy())

    gold = np.array(gold_rows)
    hard_predictions = (np.array(score_rows) > 0.5).astype(int)

    return f1_score(gold, hard_predictions, average='macro')

def main():
    """Train the classifier on `/content/mar.csv`, validating on `/content/mar_dev.csv`."""
    label_columns = ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']

    train_df = pd.read_csv('/content/mar.csv')
    test_df = pd.read_csv('/content/mar_dev.csv')

    train_texts = train_df['text'].tolist()
    train_labels = train_df[label_columns].values

    test_texts = test_df['text'].tolist()
    test_labels = test_df[label_columns].values

    model_name = "intfloat/multilingual-e5-large-instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    num_classes = len(label_columns)

    # Shuffle only the training data; keep the dev order fixed.
    train_loader = DataLoader(
        HindiEmotionDataset(train_texts, train_labels, tokenizer),
        batch_size=16,
        shuffle=True,
    )
    test_loader = DataLoader(
        HindiEmotionDataset(test_texts, test_labels, tokenizer),
        batch_size=16,
        shuffle=False,
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = HindiEmotionModel(model_name, num_classes).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

    train_model(
        model,
        train_loader,
        optimizer,
        device,
        test_loader,
        num_epochs=15,
        patience=4,
        save_path="/content/MAR_emotion_model_best.pth",
    )

# Entry point: start training when this cell/script is executed directly.
if __name__ == "__main__":
    main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Epoch 1/15: 100%|██████████| 151/151 [03:33<00:00,  1.42s/batch, loss=0.149]


Epoch 1/15, Average Loss: 0.3074
Evaluating after Epoch 1...
Macro F1-Score after Epoch 1: 0.8941
New best model saved with Macro F1: 0.8941


Epoch 2/15: 100%|██████████| 151/151 [03:24<00:00,  1.35s/batch, loss=0.13]


Epoch 2/15, Average Loss: 0.1030
Evaluating after Epoch 2...
Macro F1-Score after Epoch 2: 0.9301
New best model saved with Macro F1: 0.9301


Epoch 3/15: 100%|██████████| 151/151 [03:24<00:00,  1.35s/batch, loss=0.0305]


Epoch 3/15, Average Loss: 0.0614
Evaluating after Epoch 3...
Macro F1-Score after Epoch 3: 0.9134
No F1 improvement. Patience: 1/4


Epoch 4/15: 100%|██████████| 151/151 [03:23<00:00,  1.35s/batch, loss=0.0157]


Epoch 4/15, Average Loss: 0.0399
Evaluating after Epoch 4...
Macro F1-Score after Epoch 4: 0.9523
New best model saved with Macro F1: 0.9523


Epoch 5/15: 100%|██████████| 151/151 [03:24<00:00,  1.35s/batch, loss=0.0572]


Epoch 5/15, Average Loss: 0.0239
Evaluating after Epoch 5...
Macro F1-Score after Epoch 5: 0.9714
New best model saved with Macro F1: 0.9714


Epoch 6/15: 100%|██████████| 151/151 [03:24<00:00,  1.35s/batch, loss=0.0461]


Epoch 6/15, Average Loss: 0.0177
Evaluating after Epoch 6...
Macro F1-Score after Epoch 6: 0.9376
No F1 improvement. Patience: 1/4


Epoch 7/15:   5%|▍         | 7/151 [00:10<03:42,  1.54s/batch, loss=0.00951]


KeyboardInterrupt: 

## marathi pred

In [None]:
import torch
import torch.nn as nn
import pandas as pd
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
import numpy as np
from tqdm import tqdm

class HindiEmotionPredictionDataset(Dataset):
    """Map-style dataset of tokenized texts for inference (no labels).

    Args:
        texts: Indexable collection of raw texts (each item is passed
            through ``str``).
        tokenizer: Callable invoked as ``tokenizer(text, padding=...,
            truncation=..., max_length=..., return_tensors="pt")`` that
            returns a mapping with ``input_ids`` and ``attention_mask``.
        max_length: Length every sequence is padded/truncated to.
    """

    def __init__(self, texts, tokenizer, max_length=150):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids`` and ``attention_mask`` for item ``idx``."""
        encoded = self.tokenizer(
            str(self.texts[idx]),
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        # squeeze(0) removes only the leading batch dimension added by
        # return_tensors="pt"; a bare squeeze() would also collapse the
        # sequence dimension when max_length == 1.
        return {
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
        }

class HindiEmotionModel(nn.Module):
    """Pretrained transformer encoder followed by a linear multi-label head.

    Args:
        model_name: Identifier handed to ``AutoModel.from_pretrained``.
        num_classes: Number of independent emotion labels to score.
    """

    def __init__(self, model_name, num_classes):
        super().__init__()
        self.embedding_model = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        # Size the head from the encoder's own config instead of hard-coding
        # 1024; parameter names are unchanged, so saved state dicts still load.
        hidden_size = getattr(self.embedding_model.config, "hidden_size", 1024)
        self.fc = nn.Linear(hidden_size, num_classes)
        self.num_classes = num_classes

    def forward(self, input_ids, attention_mask):
        """Return one sigmoid probability per class for each input sequence."""
        outputs = self.embedding_model(input_ids=input_ids, attention_mask=attention_mask)
        # Use the first token of the last layer as the sequence representation.
        first_token = outputs.last_hidden_state[:, 0, :]
        logits = self.fc(self.dropout(first_token))
        return torch.sigmoid(logits)

def predict_and_save():
    """Predict emotion labels for `/content/mar_pred.csv` and write them to CSV.

    Loads the fine-tuned weights from `/content/MAR_emotion_model_best.pth`,
    thresholds the model outputs at 0.5 and writes one row per input (``id``
    followed by the emotion columns) to `/content/predictions_mar.csv`.
    """
    output_path = '/content/predictions_mar.csv'
    emotion_columns = ['Anger', 'Disgust', 'Fear', 'Joy', 'Sadness', 'Surprise']

    # Load prediction data
    pred_df = pd.read_csv('/content/mar_pred.csv')
    ids = pred_df['id'].tolist()
    texts = pred_df['text'].tolist()

    # Model configuration
    model_name = "intfloat/multilingual-e5-large-instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Initialize model and load weights.
    # map_location lets a checkpoint saved on GPU load on a CPU-only runtime;
    # weights_only=True restricts unpickling to tensors/state-dict content.
    model = HindiEmotionModel(model_name, num_classes=6).to(device)
    state_dict = torch.load(
        '/content/MAR_emotion_model_best.pth',
        map_location=device,
        weights_only=True,
    )
    model.load_state_dict(state_dict)
    model.eval()

    # Create dataset and dataloader (no shuffling so rows stay aligned with ids)
    dataset = HindiEmotionPredictionDataset(texts, tokenizer)
    dataloader = DataLoader(dataset, batch_size=16, shuffle=False)

    # Make predictions
    predictions = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Making Predictions"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids, attention_mask)
            batch_preds = (outputs.cpu().numpy() > 0.5).astype(int)
            predictions.extend(batch_preds)

    # Build the output frame; inserting 'id' at position 0 already yields the
    # final column order.
    output_df = pd.DataFrame(predictions, columns=emotion_columns)
    output_df.insert(0, 'id', ids)

    # Save predictions and report the path that was actually written
    output_df.to_csv(output_path, index=False)
    print(f"Predictions saved to {output_path}")

# Entry point: run inference when this cell/script is executed directly.
if __name__ == "__main__":
    predict_and_save()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  model.load_state_dict(torch.load('/content/MAR_emotion_model_best.pth'))
Making Predictions: 100%|██████████| 63/63 [00:26<00:00,  2.34it/s]


Predictions saved to /content/predictions.csv


# Arabic (ary) training


In [None]:
import torch
import torch.nn as nn
import pandas as pd
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score, accuracy_score, classification_report
import numpy as np
from tqdm import tqdm

# Custom Dataset class for multi-label emotion data
class HindiEmotionDataset(Dataset):
    """Map-style dataset pairing tokenized text with multi-label float targets.

    The class keeps its historical "Hindi" name, but nothing in it is
    language specific: it tokenizes whatever strings it is given.

    Args:
        texts: Indexable collection of raw texts (each item is passed
            through ``str``).
        labels: Indexable collection of per-text label vectors.
        tokenizer: Callable invoked as ``tokenizer(text, padding=...,
            truncation=..., max_length=..., return_tensors="pt")`` that
            returns a mapping with ``input_ids`` and ``attention_mask``.
        max_length: Length every sequence is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=150):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for item ``idx``."""
        encoded = self.tokenizer(
            str(self.texts[idx]),
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        # squeeze(0) removes only the leading batch dimension added by
        # return_tensors="pt"; a bare squeeze() would also collapse the
        # sequence dimension when max_length == 1.
        return {
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.float),
        }

class HindiEmotionModel(nn.Module):
    """Pretrained transformer encoder followed by a linear multi-label head.

    Args:
        model_name: Identifier handed to ``AutoModel.from_pretrained``.
        num_classes: Number of independent emotion labels to score.
    """

    def __init__(self, model_name, num_classes):
        super().__init__()
        self.embedding_model = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        # Size the head from the encoder's own config instead of hard-coding
        # 1024, so checkpoints with a different width work too. The fallback
        # keeps the previous behaviour if the config lacks the attribute.
        hidden_size = getattr(self.embedding_model.config, "hidden_size", 1024)
        self.fc = nn.Linear(hidden_size, num_classes)
        self.num_classes = num_classes

    def forward(self, input_ids, attention_mask):
        """Return one sigmoid probability per class for each input sequence."""
        outputs = self.embedding_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        # Use the first token of the last layer as the sequence representation.
        first_token = outputs.last_hidden_state[:, 0, :]
        logits = self.fc(self.dropout(first_token))
        return torch.sigmoid(logits)

def train_model(model, train_loader, optimizer, device, test_loader, num_epochs=15, patience=3, save_path="/content/hindi_emotion_model_best.pth"):
    """Fine-tune ``model`` with BCE loss, checkpointing on the best macro F1.

    After every epoch the model is scored with ``evaluate_model`` on
    ``test_loader``. Whenever the macro F1 beats the best value seen so far,
    the state dict is written to ``save_path``. Training stops early once
    ``patience`` consecutive epochs bring no improvement. Because the best
    score starts at 0.0, no checkpoint is written if macro F1 never exceeds 0.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``; its
            output is fed to ``nn.BCELoss`` directly, so it must already be
            probabilities.
        train_loader: Iterable of batches holding ``input_ids``,
            ``attention_mask`` and ``labels``.
        optimizer: Optimizer stepping the model's parameters.
        device: Device each batch is moved to.
        test_loader: Loader passed to ``evaluate_model`` after each epoch.
        num_epochs: Maximum number of epochs.
        patience: Consecutive non-improving epochs tolerated before stopping.
        save_path: Destination file for the best state dict.
    """
    criterion = nn.BCELoss()

    best_f1 = 0.0
    epochs_without_improvement = 0

    for epoch in range(num_epochs):
        # evaluate_model() switches the model to eval mode at the end of every
        # epoch, so training mode (and with it dropout) must be re-enabled
        # here rather than once before the loop.
        model.train()

        total_loss = 0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}", unit="batch")

        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            progress_bar.set_postfix(loss=loss.item())

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {avg_loss:.4f}")

        # Evaluate model on the held-out loader after each epoch
        print(f"Evaluating after Epoch {epoch + 1}...")
        macro_f1 = evaluate_model(model, test_loader, device)
        print(f"Macro F1-Score after Epoch {epoch + 1}: {macro_f1:.4f}")

        # Check for improvement and early stopping
        if macro_f1 > best_f1:
            best_f1 = macro_f1
            torch.save(model.state_dict(), save_path)
            print(f"New best model saved with Macro F1: {best_f1:.4f}")
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1
            print(f"No F1 improvement. Patience: {epochs_without_improvement}/{patience}")
            if epochs_without_improvement >= patience:
                print(f"Early stopping triggered at epoch {epoch + 1}")
                break

    print(f"Training complete. Best Macro F1: {best_f1:.4f}")

def evaluate_model(model, test_loader, device):
    """Return the macro-averaged F1 of ``model`` over ``test_loader``.

    The model is put into eval mode and is NOT switched back afterwards.
    Model outputs are binarized with a 0.5 threshold before scoring.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        test_loader: Iterable of batches holding ``input_ids``,
            ``attention_mask`` and ``labels``.
        device: Device each batch is moved to.
    """
    model.eval()
    gold_rows, score_rows = [], []

    with torch.no_grad():
        for batch in test_loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['labels'].to(device)

            scores = model(ids, mask)
            gold_rows.extend(targets.cpu().numpy())
            score_rows.extend(scores.cpu().numpy())

    gold = np.array(gold_rows)
    hard_predictions = (np.array(score_rows) > 0.5).astype(int)

    return f1_score(gold, hard_predictions, average='macro')

def main():
    """Train the classifier on `/content/ary.csv`, validating on `/content/ary_dev.csv`."""
    label_columns = ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']

    train_df = pd.read_csv('/content/ary.csv')
    test_df = pd.read_csv('/content/ary_dev.csv')

    train_texts = train_df['text'].tolist()
    train_labels = train_df[label_columns].values

    test_texts = test_df['text'].tolist()
    test_labels = test_df[label_columns].values

    model_name = "intfloat/multilingual-e5-large-instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    num_classes = len(label_columns)

    # Shuffle only the training data; keep the dev order fixed.
    train_loader = DataLoader(
        HindiEmotionDataset(train_texts, train_labels, tokenizer),
        batch_size=16,
        shuffle=True,
    )
    test_loader = DataLoader(
        HindiEmotionDataset(test_texts, test_labels, tokenizer),
        batch_size=16,
        shuffle=False,
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = HindiEmotionModel(model_name, num_classes).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

    train_model(
        model,
        train_loader,
        optimizer,
        device,
        test_loader,
        num_epochs=15,
        patience=4,
        save_path="/content/ARY_emotion_model_best.pth",
    )

# Entry point: start training when this cell/script is executed directly.
if __name__ == "__main__":
    main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Epoch 1/15: 100%|██████████| 101/101 [02:18<00:00,  1.37s/batch, loss=0.235]


Epoch 1/15, Average Loss: 0.3875
Evaluating after Epoch 1...
Macro F1-Score after Epoch 1: 0.1406
New best model saved with Macro F1: 0.1406


Epoch 2/15: 100%|██████████| 101/101 [02:14<00:00,  1.33s/batch, loss=0.226]


Epoch 2/15, Average Loss: 0.2804
Evaluating after Epoch 2...
Macro F1-Score after Epoch 2: 0.3123
New best model saved with Macro F1: 0.3123


Epoch 3/15: 100%|██████████| 101/101 [02:14<00:00,  1.33s/batch, loss=0.173]


Epoch 3/15, Average Loss: 0.1864
Evaluating after Epoch 3...
Macro F1-Score after Epoch 3: 0.5474
New best model saved with Macro F1: 0.5474


Epoch 4/15: 100%|██████████| 101/101 [02:14<00:00,  1.33s/batch, loss=0.0734]


Epoch 4/15, Average Loss: 0.1029
Evaluating after Epoch 4...
Macro F1-Score after Epoch 4: 0.5155
No F1 improvement. Patience: 1/4


Epoch 5/15: 100%|██████████| 101/101 [02:13<00:00,  1.33s/batch, loss=0.119]


Epoch 5/15, Average Loss: 0.0545
Evaluating after Epoch 5...
Macro F1-Score after Epoch 5: 0.5197
No F1 improvement. Patience: 2/4


Epoch 6/15: 100%|██████████| 101/101 [02:13<00:00,  1.33s/batch, loss=0.0133]


Epoch 6/15, Average Loss: 0.0312
Evaluating after Epoch 6...
Macro F1-Score after Epoch 6: 0.4952
No F1 improvement. Patience: 3/4


Epoch 7/15:   5%|▍         | 5/101 [00:06<02:07,  1.33s/batch, loss=0.0153]


KeyboardInterrupt: 

## arabic pred

In [None]:
import torch
import torch.nn as nn
import pandas as pd
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
import numpy as np
from tqdm import tqdm

class HindiEmotionPredictionDataset(Dataset):
    """Label-free dataset that tokenizes one text per item for inference.

    Args:
        texts: Indexable collection of input texts.
        tokenizer: Callable invoked as ``tokenizer(text, padding=...,
            truncation=..., max_length=..., return_tensors="pt")``; it is
            expected to return tensors with a leading batch axis of size 1.
        max_length: Length every sequence is padded/truncated to.
    """

    def __init__(self, texts, tokenizer, max_length=150):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids`` and ``attention_mask`` for the text at ``idx``."""
        # str() keeps non-string entries from breaking the tokenizer call.
        encoded = self.tokenizer(
            str(self.texts[idx]),
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        # Drop only the leading batch axis. A bare squeeze() would also
        # collapse the sequence axis when max_length == 1.
        return {
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
        }

class HindiEmotionModel(nn.Module):
    """Pretrained transformer encoder with a dropout + linear multi-label head.

    Args:
        model_name: Identifier handed to ``AutoModel.from_pretrained``.
        num_classes: Number of independent output labels.
    """

    def __init__(self, model_name, num_classes):
        super().__init__()
        self.embedding_model = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        # Size the head from the encoder's own config instead of hard-coding
        # 1024; fall back to 1024 when no ``hidden_size`` is exposed.
        encoder_config = getattr(self.embedding_model, "config", None)
        hidden_size = getattr(encoder_config, "hidden_size", 1024)
        self.fc = nn.Linear(hidden_size, num_classes)
        self.num_classes = num_classes

    def forward(self, input_ids, attention_mask):
        """Return per-label probabilities of shape (batch, num_classes)."""
        outputs = self.embedding_model(input_ids=input_ids, attention_mask=attention_mask)
        # The hidden state at sequence position 0 is used as the sentence vector.
        first_token = outputs.last_hidden_state[:, 0, :]
        logits = self.fc(self.dropout(first_token))
        # Sigmoid (not softmax): every label is scored independently.
        return torch.sigmoid(logits)

def predict_and_save():
    """Predict emotion labels for ``/content/ary_pred.csv`` and save them as CSV.

    Reads the ``id`` and ``text`` columns, loads the checkpoint
    ``/content/ARY_emotion_model_best.pth``, thresholds each sigmoid output at
    0.5 and writes one 0/1 column per emotion (plus ``id``) to
    ``/content/predictions_ary.csv``.
    """
    output_path = '/content/predictions_ary.csv'
    emotion_columns = ['Anger', 'Disgust', 'Fear', 'Joy', 'Sadness', 'Surprise']

    # Load prediction data
    pred_df = pd.read_csv('/content/ary_pred.csv')
    ids = pred_df['id'].tolist()
    texts = pred_df['text'].tolist()

    # Model configuration
    model_name = "intfloat/multilingual-e5-large-instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Initialize model and load weights
    model = HindiEmotionModel(model_name, num_classes=6).to(device)
    # map_location keeps a GPU-trained checkpoint loadable on the CPU fallback;
    # weights_only=True restricts unpickling to tensors and plain containers.
    state_dict = torch.load(
        '/content/ARY_emotion_model_best.pth',
        map_location=device,
        weights_only=True,
    )
    model.load_state_dict(state_dict)
    model.eval()

    # Create dataset and dataloader (shuffle=False keeps rows aligned with ids)
    dataset = HindiEmotionPredictionDataset(texts, tokenizer)
    dataloader = DataLoader(dataset, batch_size=16, shuffle=False)

    # Make predictions
    predictions = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Making Predictions"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids, attention_mask)
            batch_preds = (outputs.cpu().numpy() > 0.5).astype(int)
            predictions.extend(batch_preds)

    # One column per emotion, with the id column first
    output_df = pd.DataFrame(predictions, columns=emotion_columns)
    output_df.insert(0, 'id', ids)

    # Save predictions and report the path that was actually written
    output_df.to_csv(output_path, index=False)
    print(f"Predictions saved to {output_path}")

# Entry point: run prediction only when this code is executed directly,
# not when it is imported.
if __name__ == "__main__":
    predict_and_save()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  model.load_state_dict(torch.load('/content/ARY_emotion_model_best.pth'))
Making Predictions: 100%|██████████| 51/51 [00:20<00:00,  2.45it/s]


Predictions saved to /content/predictions.csv


# Swahili

In [None]:
import torch
import torch.nn as nn
import pandas as pd
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score, accuracy_score, classification_report
import numpy as np
from tqdm import tqdm

# Custom Dataset class for emotion data ("Hindi" in the name is a holdover; this cell loads /content/swa.csv)
class HindiEmotionDataset(Dataset):
    """Dataset pairing each text with its multi-label emotion target.

    Args:
        texts: Indexable collection of input texts.
        labels: Indexable collection of per-text label vectors.
        tokenizer: Callable invoked as ``tokenizer(text, padding=...,
            truncation=..., max_length=..., return_tensors="pt")``; it is
            expected to return tensors with a leading batch axis of size 1.
        max_length: Length every sequence is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=150):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and float ``labels`` for ``idx``."""
        # str() keeps non-string entries from breaking the tokenizer call.
        encoded = self.tokenizer(
            str(self.texts[idx]),
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        # Drop only the leading batch axis. A bare squeeze() would also
        # collapse the sequence axis when max_length == 1.
        return {
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            # Float targets, as required by the BCE loss used for training.
            'labels': torch.tensor(self.labels[idx], dtype=torch.float)
        }

class HindiEmotionModel(nn.Module):
    """Pretrained transformer encoder with a dropout + linear multi-label head.

    Args:
        model_name: Identifier handed to ``AutoModel.from_pretrained``.
        num_classes: Number of independent output labels.
    """

    def __init__(self, model_name, num_classes):
        super().__init__()
        self.embedding_model = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        # Size the head from the encoder's own config instead of hard-coding
        # 1024; fall back to 1024 when no ``hidden_size`` is exposed.
        encoder_config = getattr(self.embedding_model, "config", None)
        hidden_size = getattr(encoder_config, "hidden_size", 1024)
        self.fc = nn.Linear(hidden_size, num_classes)
        self.num_classes = num_classes

    def forward(self, input_ids, attention_mask):
        """Return per-label probabilities of shape (batch, num_classes)."""
        outputs = self.embedding_model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # The hidden state at sequence position 0 is used as the sentence vector.
        first_token = outputs.last_hidden_state[:, 0, :]
        logits = self.fc(self.dropout(first_token))
        # Sigmoid (not softmax): every label is scored independently.
        return torch.sigmoid(logits)

def train_model(model, train_loader, optimizer, device, test_loader, num_epochs=15, patience=3, save_path="/content/hindi_emotion_model_best.pth"):
    """Train ``model`` with BCE loss, checkpointing on the best macro F1.

    After every epoch the model is scored with ``evaluate_model`` on
    ``test_loader``. A strictly better macro F1 saves ``model.state_dict()``
    to ``save_path``; ``patience`` consecutive epochs without improvement stop
    training early.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``; its
            output is fed straight to ``nn.BCELoss``, so it must already be
            probabilities.
        train_loader: Iterable of batches holding ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        optimizer: Optimizer stepping the model parameters.
        device: Device the batch tensors are moved to.
        test_loader: Loader handed to ``evaluate_model`` after each epoch.
        num_epochs: Maximum number of epochs.
        patience: Non-improving epochs tolerated before stopping.
        save_path: Destination of the best checkpoint.
    """
    criterion = nn.BCELoss()

    # NOTE: with a 0.0 start and a strict ">" test, an epoch scoring exactly
    # 0.0 is never checkpointed.
    best_f1 = 0.0
    epochs_without_improvement = 0

    for epoch in range(num_epochs):
        # evaluate_model() leaves the model in eval mode, so training mode has
        # to be restored every epoch; otherwise dropout is silently disabled
        # from the second epoch on.
        model.train()

        total_loss = 0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}", unit="batch")

        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            progress_bar.set_postfix(loss=loss.item())

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {avg_loss:.4f}")

        # Evaluate model on test dataset after each epoch
        print(f"Evaluating after Epoch {epoch + 1}...")
        macro_f1 = evaluate_model(model, test_loader, device)
        print(f"Macro F1-Score after Epoch {epoch + 1}: {macro_f1:.4f}")

        # Check for improvement and early stopping
        if macro_f1 > best_f1:
            best_f1 = macro_f1
            torch.save(model.state_dict(), save_path)
            print(f"New best model saved with Macro F1: {best_f1:.4f}")
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1
            print(f"No F1 improvement. Patience: {epochs_without_improvement}/{patience}")
            if epochs_without_improvement >= patience:
                print(f"Early stopping triggered at epoch {epoch + 1}")
                break

    print(f"Training complete. Best Macro F1: {best_f1:.4f}")

def evaluate_model(model, test_loader, device):
    """Score ``model`` on ``test_loader`` and return the macro-averaged F1.

    The model is put in eval mode (and left there), its outputs are
    thresholded at 0.5, and the result is compared with the batch ``labels``.
    """
    model.eval()
    gold_rows, prob_rows = [], []

    with torch.no_grad():
        for batch in test_loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            gold = batch['labels'].to(device)

            probs = model(ids, mask)
            # Collect one row per sample on the CPU.
            gold_rows.extend(gold.cpu().numpy())
            prob_rows.extend(probs.cpu().numpy())

    # Probabilities above 0.5 count as a positive label.
    thresholded = (np.array(prob_rows) > 0.5).astype(int)
    return f1_score(np.array(gold_rows), thresholded, average='macro')

def main():
    """Fine-tune the emotion classifier on ``/content/swa.csv``.

    ``/content/swa_dev.csv`` backs the loader passed to ``train_model`` as
    ``test_loader``; the checkpoint path is
    ``/content/SWA_emotion_model_best.pth``.
    """
    emotion_columns = ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']

    train_df = pd.read_csv('/content/swa.csv')
    test_df = pd.read_csv('/content/swa_dev.csv')

    # Texts as plain lists, labels as one row of 0/1 flags per text.
    train_texts, train_labels = train_df['text'].tolist(), train_df[emotion_columns].values
    test_texts, test_labels = test_df['text'].tolist(), test_df[emotion_columns].values

    model_name = "intfloat/multilingual-e5-large-instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    num_classes = 6

    # Only the training loader is shuffled.
    train_loader = DataLoader(
        HindiEmotionDataset(train_texts, train_labels, tokenizer),
        batch_size=16,
        shuffle=True,
    )
    test_loader = DataLoader(
        HindiEmotionDataset(test_texts, test_labels, tokenizer),
        batch_size=16,
        shuffle=False,
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = HindiEmotionModel(model_name, num_classes).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

    train_model(
        model, train_loader, optimizer, device, test_loader,
        num_epochs=15,
        patience=4,
        save_path="/content/SWA_emotion_model_best.pth",
    )

# Entry point: start training only when this code is executed directly,
# not when it is imported.
if __name__ == "__main__":
    main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Epoch 1/15: 100%|██████████| 207/207 [04:42<00:00,  1.36s/batch, loss=0.317]


Epoch 1/15, Average Loss: 0.3263
Evaluating after Epoch 1...
Macro F1-Score after Epoch 1: 0.0000
No F1 improvement. Patience: 1/4


Epoch 2/15: 100%|██████████| 207/207 [04:32<00:00,  1.32s/batch, loss=0.349]


Epoch 2/15, Average Loss: 0.2933
Evaluating after Epoch 2...
Macro F1-Score after Epoch 2: 0.0421


KeyboardInterrupt: 

## Swahili pred

In [None]:
import torch
import torch.nn as nn
import pandas as pd
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
import numpy as np
from tqdm import tqdm

class HindiEmotionPredictionDataset(Dataset):
    """Label-free dataset that tokenizes one text per item for inference.

    Args:
        texts: Indexable collection of input texts.
        tokenizer: Callable invoked as ``tokenizer(text, padding=...,
            truncation=..., max_length=..., return_tensors="pt")``; it is
            expected to return tensors with a leading batch axis of size 1.
        max_length: Length every sequence is padded/truncated to.
    """

    def __init__(self, texts, tokenizer, max_length=150):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids`` and ``attention_mask`` for the text at ``idx``."""
        # str() keeps non-string entries from breaking the tokenizer call.
        encoded = self.tokenizer(
            str(self.texts[idx]),
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        # Drop only the leading batch axis. A bare squeeze() would also
        # collapse the sequence axis when max_length == 1.
        return {
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
        }

class HindiEmotionModel(nn.Module):
    """Pretrained transformer encoder with a dropout + linear multi-label head.

    Args:
        model_name: Identifier handed to ``AutoModel.from_pretrained``.
        num_classes: Number of independent output labels.
    """

    def __init__(self, model_name, num_classes):
        super().__init__()
        self.embedding_model = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        # Size the head from the encoder's own config instead of hard-coding
        # 1024; fall back to 1024 when no ``hidden_size`` is exposed.
        encoder_config = getattr(self.embedding_model, "config", None)
        hidden_size = getattr(encoder_config, "hidden_size", 1024)
        self.fc = nn.Linear(hidden_size, num_classes)
        self.num_classes = num_classes

    def forward(self, input_ids, attention_mask):
        """Return per-label probabilities of shape (batch, num_classes)."""
        outputs = self.embedding_model(input_ids=input_ids, attention_mask=attention_mask)
        # The hidden state at sequence position 0 is used as the sentence vector.
        first_token = outputs.last_hidden_state[:, 0, :]
        logits = self.fc(self.dropout(first_token))
        # Sigmoid (not softmax): every label is scored independently.
        return torch.sigmoid(logits)

def predict_and_save():
    """Predict emotion labels for ``/content/swa_pred.csv`` and save them as CSV.

    Reads the ``id`` and ``text`` columns, loads the checkpoint
    ``/content/SWA_emotion_model_best.pth``, thresholds each sigmoid output at
    0.5 and writes one 0/1 column per emotion (plus ``id``) to
    ``/content/predictions_swa.csv``.
    """
    output_path = '/content/predictions_swa.csv'
    emotion_columns = ['Anger', 'Disgust', 'Fear', 'Joy', 'Sadness', 'Surprise']

    # Load prediction data
    pred_df = pd.read_csv('/content/swa_pred.csv')
    ids = pred_df['id'].tolist()
    texts = pred_df['text'].tolist()

    # Model configuration
    model_name = "intfloat/multilingual-e5-large-instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Initialize model and load weights
    model = HindiEmotionModel(model_name, num_classes=6).to(device)
    # map_location keeps a GPU-trained checkpoint loadable on the CPU fallback;
    # weights_only=True restricts unpickling to tensors and plain containers.
    state_dict = torch.load(
        '/content/SWA_emotion_model_best.pth',
        map_location=device,
        weights_only=True,
    )
    model.load_state_dict(state_dict)
    model.eval()

    # Create dataset and dataloader (shuffle=False keeps rows aligned with ids)
    dataset = HindiEmotionPredictionDataset(texts, tokenizer)
    dataloader = DataLoader(dataset, batch_size=16, shuffle=False)

    # Make predictions
    predictions = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Making Predictions"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids, attention_mask)
            batch_preds = (outputs.cpu().numpy() > 0.5).astype(int)
            predictions.extend(batch_preds)

    # One column per emotion, with the id column first
    output_df = pd.DataFrame(predictions, columns=emotion_columns)
    output_df.insert(0, 'id', ids)

    # Save predictions and report the path that was actually written
    output_df.to_csv(output_path, index=False)
    print(f"Predictions saved to {output_path}")

# Entry point: run prediction only when this code is executed directly,
# not when it is imported.
if __name__ == "__main__":
    predict_and_save()

# ukr test

In [None]:
import torch
import torch.nn as nn
import pandas as pd
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score, accuracy_score, classification_report
import numpy as np
from tqdm import tqdm

# Custom Dataset class for emotion data ("Hindi" in the name is a holdover; this cell loads /content/ukr.csv)
class HindiEmotionDataset(Dataset):
    """Dataset pairing each text with its multi-label emotion target.

    Args:
        texts: Indexable collection of input texts.
        labels: Indexable collection of per-text label vectors.
        tokenizer: Callable invoked as ``tokenizer(text, padding=...,
            truncation=..., max_length=..., return_tensors="pt")``; it is
            expected to return tensors with a leading batch axis of size 1.
        max_length: Length every sequence is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=150):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and float ``labels`` for ``idx``."""
        # str() keeps non-string entries from breaking the tokenizer call.
        encoded = self.tokenizer(
            str(self.texts[idx]),
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        # Drop only the leading batch axis. A bare squeeze() would also
        # collapse the sequence axis when max_length == 1.
        return {
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            # Float targets, as required by the BCE loss used for training.
            'labels': torch.tensor(self.labels[idx], dtype=torch.float)
        }

class HindiEmotionModel(nn.Module):
    """Pretrained transformer encoder with a dropout + linear multi-label head.

    Args:
        model_name: Identifier handed to ``AutoModel.from_pretrained``.
        num_classes: Number of independent output labels.
    """

    def __init__(self, model_name, num_classes):
        super().__init__()
        self.embedding_model = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        # Size the head from the encoder's own config instead of hard-coding
        # 1024; fall back to 1024 when no ``hidden_size`` is exposed.
        encoder_config = getattr(self.embedding_model, "config", None)
        hidden_size = getattr(encoder_config, "hidden_size", 1024)
        self.fc = nn.Linear(hidden_size, num_classes)
        self.num_classes = num_classes

    def forward(self, input_ids, attention_mask):
        """Return per-label probabilities of shape (batch, num_classes)."""
        outputs = self.embedding_model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # The hidden state at sequence position 0 is used as the sentence vector.
        first_token = outputs.last_hidden_state[:, 0, :]
        logits = self.fc(self.dropout(first_token))
        # Sigmoid (not softmax): every label is scored independently.
        return torch.sigmoid(logits)

def train_model(model, train_loader, optimizer, device, test_loader, num_epochs=15, patience=3, save_path="/content/hindi_emotion_model_best.pth"):
    """Train ``model`` with BCE loss, checkpointing on the best macro F1.

    After every epoch the model is scored with ``evaluate_model`` on
    ``test_loader``. A strictly better macro F1 saves ``model.state_dict()``
    to ``save_path``; ``patience`` consecutive epochs without improvement stop
    training early.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``; its
            output is fed straight to ``nn.BCELoss``, so it must already be
            probabilities.
        train_loader: Iterable of batches holding ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        optimizer: Optimizer stepping the model parameters.
        device: Device the batch tensors are moved to.
        test_loader: Loader handed to ``evaluate_model`` after each epoch.
        num_epochs: Maximum number of epochs.
        patience: Non-improving epochs tolerated before stopping.
        save_path: Destination of the best checkpoint.
    """
    criterion = nn.BCELoss()

    # NOTE: with a 0.0 start and a strict ">" test, an epoch scoring exactly
    # 0.0 is never checkpointed.
    best_f1 = 0.0
    epochs_without_improvement = 0

    for epoch in range(num_epochs):
        # evaluate_model() leaves the model in eval mode, so training mode has
        # to be restored every epoch; otherwise dropout is silently disabled
        # from the second epoch on.
        model.train()

        total_loss = 0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}", unit="batch")

        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            progress_bar.set_postfix(loss=loss.item())

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {avg_loss:.4f}")

        # Evaluate model on test dataset after each epoch
        print(f"Evaluating after Epoch {epoch + 1}...")
        macro_f1 = evaluate_model(model, test_loader, device)
        print(f"Macro F1-Score after Epoch {epoch + 1}: {macro_f1:.4f}")

        # Check for improvement and early stopping
        if macro_f1 > best_f1:
            best_f1 = macro_f1
            torch.save(model.state_dict(), save_path)
            print(f"New best model saved with Macro F1: {best_f1:.4f}")
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1
            print(f"No F1 improvement. Patience: {epochs_without_improvement}/{patience}")
            if epochs_without_improvement >= patience:
                print(f"Early stopping triggered at epoch {epoch + 1}")
                break

    print(f"Training complete. Best Macro F1: {best_f1:.4f}")

def evaluate_model(model, test_loader, device):
    """Score ``model`` on ``test_loader`` and return the macro-averaged F1.

    The model is put in eval mode (and left there), its outputs are
    thresholded at 0.5, and the result is compared with the batch ``labels``.
    """
    model.eval()
    gold_rows, prob_rows = [], []

    with torch.no_grad():
        for batch in test_loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            gold = batch['labels'].to(device)

            probs = model(ids, mask)
            # Collect one row per sample on the CPU.
            gold_rows.extend(gold.cpu().numpy())
            prob_rows.extend(probs.cpu().numpy())

    # Probabilities above 0.5 count as a positive label.
    thresholded = (np.array(prob_rows) > 0.5).astype(int)
    return f1_score(np.array(gold_rows), thresholded, average='macro')

def main():
    """Fine-tune the emotion classifier on ``/content/ukr.csv``.

    ``/content/ukr_dev.csv`` backs the loader passed to ``train_model`` as
    ``test_loader``; the checkpoint path is
    ``/content/UKR_emotion_model_best.pth``.
    """
    emotion_columns = ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']

    train_df = pd.read_csv('/content/ukr.csv')
    test_df = pd.read_csv('/content/ukr_dev.csv')

    # Texts as plain lists, labels as one row of 0/1 flags per text.
    train_texts, train_labels = train_df['text'].tolist(), train_df[emotion_columns].values
    test_texts, test_labels = test_df['text'].tolist(), test_df[emotion_columns].values

    model_name = "intfloat/multilingual-e5-large-instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    num_classes = 6

    # Only the training loader is shuffled.
    train_loader = DataLoader(
        HindiEmotionDataset(train_texts, train_labels, tokenizer),
        batch_size=16,
        shuffle=True,
    )
    test_loader = DataLoader(
        HindiEmotionDataset(test_texts, test_labels, tokenizer),
        batch_size=16,
        shuffle=False,
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = HindiEmotionModel(model_name, num_classes).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

    train_model(
        model, train_loader, optimizer, device, test_loader,
        num_epochs=15,
        patience=4,
        save_path="/content/UKR_emotion_model_best.pth",
    )

# Entry point: start training only when this code is executed directly,
# not when it is imported.
if __name__ == "__main__":
    main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Epoch 1/15: 100%|██████████| 155/155 [03:43<00:00,  1.44s/batch, loss=0.243]


Epoch 1/15, Average Loss: 0.2643
Evaluating after Epoch 1...
Macro F1-Score after Epoch 1: 0.3544
New best model saved with Macro F1: 0.3544


Epoch 2/15: 100%|██████████| 155/155 [03:38<00:00,  1.41s/batch, loss=0.129]


Epoch 2/15, Average Loss: 0.1467
Evaluating after Epoch 2...
Macro F1-Score after Epoch 2: 0.4424
New best model saved with Macro F1: 0.4424


Epoch 3/15: 100%|██████████| 155/155 [03:38<00:00,  1.41s/batch, loss=0.019]


Epoch 3/15, Average Loss: 0.0837
Evaluating after Epoch 3...
Macro F1-Score after Epoch 3: 0.5615
New best model saved with Macro F1: 0.5615


Epoch 4/15: 100%|██████████| 155/155 [03:37<00:00,  1.40s/batch, loss=0.0137]


Epoch 4/15, Average Loss: 0.0397
Evaluating after Epoch 4...
Macro F1-Score after Epoch 4: 0.5340
No F1 improvement. Patience: 1/4


Epoch 5/15: 100%|██████████| 155/155 [03:37<00:00,  1.41s/batch, loss=0.0148]


Epoch 5/15, Average Loss: 0.0151
Evaluating after Epoch 5...
Macro F1-Score after Epoch 5: 0.5596
No F1 improvement. Patience: 2/4


Epoch 6/15: 100%|██████████| 155/155 [03:37<00:00,  1.41s/batch, loss=0.0221]


Epoch 6/15, Average Loss: 0.0099
Evaluating after Epoch 6...
Macro F1-Score after Epoch 6: 0.5599
No F1 improvement. Patience: 3/4


Epoch 7/15:   4%|▍         | 6/155 [00:09<03:45,  1.52s/batch, loss=0.00585]


KeyboardInterrupt: 

## ukr pred

In [None]:
import torch
import torch.nn as nn
import pandas as pd
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
import numpy as np
from tqdm import tqdm

class HindiEmotionPredictionDataset(Dataset):
    """Label-free dataset that tokenizes one text per item for inference.

    Args:
        texts: Indexable collection of input texts.
        tokenizer: Callable invoked as ``tokenizer(text, padding=...,
            truncation=..., max_length=..., return_tensors="pt")``; it is
            expected to return tensors with a leading batch axis of size 1.
        max_length: Length every sequence is padded/truncated to.
    """

    def __init__(self, texts, tokenizer, max_length=150):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids`` and ``attention_mask`` for the text at ``idx``."""
        # str() keeps non-string entries from breaking the tokenizer call.
        encoded = self.tokenizer(
            str(self.texts[idx]),
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        # Drop only the leading batch axis. A bare squeeze() would also
        # collapse the sequence axis when max_length == 1.
        return {
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
        }

class HindiEmotionModel(nn.Module):
    """Pretrained transformer encoder with a dropout + linear multi-label head.

    Args:
        model_name: Identifier handed to ``AutoModel.from_pretrained``.
        num_classes: Number of independent output labels.
    """

    def __init__(self, model_name, num_classes):
        super().__init__()
        self.embedding_model = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        # Size the head from the encoder's own config instead of hard-coding
        # 1024; fall back to 1024 when no ``hidden_size`` is exposed.
        encoder_config = getattr(self.embedding_model, "config", None)
        hidden_size = getattr(encoder_config, "hidden_size", 1024)
        self.fc = nn.Linear(hidden_size, num_classes)
        self.num_classes = num_classes

    def forward(self, input_ids, attention_mask):
        """Return per-label probabilities of shape (batch, num_classes)."""
        outputs = self.embedding_model(input_ids=input_ids, attention_mask=attention_mask)
        # The hidden state at sequence position 0 is used as the sentence vector.
        first_token = outputs.last_hidden_state[:, 0, :]
        logits = self.fc(self.dropout(first_token))
        # Sigmoid (not softmax): every label is scored independently.
        return torch.sigmoid(logits)

def predict_and_save():
    """Predict emotion labels for ``/content/ukr_pred.csv`` and save them as CSV.

    Reads the ``id`` and ``text`` columns, loads the checkpoint
    ``/content/UKR_emotion_model_best.pth``, thresholds each sigmoid output at
    0.5 and writes one 0/1 column per emotion (plus ``id``) to
    ``/content/predictions_ukr.csv``.
    """
    output_path = '/content/predictions_ukr.csv'
    emotion_columns = ['Anger', 'Disgust', 'Fear', 'Joy', 'Sadness', 'Surprise']

    # Load prediction data
    pred_df = pd.read_csv('/content/ukr_pred.csv')
    ids = pred_df['id'].tolist()
    texts = pred_df['text'].tolist()

    # Model configuration
    model_name = "intfloat/multilingual-e5-large-instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Initialize model and load weights
    model = HindiEmotionModel(model_name, num_classes=6).to(device)
    # map_location keeps a GPU-trained checkpoint loadable on the CPU fallback;
    # weights_only=True restricts unpickling to tensors and plain containers.
    state_dict = torch.load(
        '/content/UKR_emotion_model_best.pth',
        map_location=device,
        weights_only=True,
    )
    model.load_state_dict(state_dict)
    model.eval()

    # Create dataset and dataloader (shuffle=False keeps rows aligned with ids)
    dataset = HindiEmotionPredictionDataset(texts, tokenizer)
    dataloader = DataLoader(dataset, batch_size=16, shuffle=False)

    # Make predictions
    predictions = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Making Predictions"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids, attention_mask)
            batch_preds = (outputs.cpu().numpy() > 0.5).astype(int)
            predictions.extend(batch_preds)

    # One column per emotion, with the id column first
    output_df = pd.DataFrame(predictions, columns=emotion_columns)
    output_df.insert(0, 'id', ids)

    # Save predictions and report the path that was actually written
    output_df.to_csv(output_path, index=False)
    print(f"Predictions saved to {output_path}")

# Entry point: run prediction only when this code is executed directly,
# not when it is imported.
if __name__ == "__main__":
    predict_and_save()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  model.load_state_dict(torch.load('/content/UKR_emotion_model_best.pth'))
Making Predictions: 100%|██████████| 140/140 [01:04<00:00,  2.17it/s]


Predictions saved to /content/predictions.csv


# RON test

In [None]:
import torch
import torch.nn as nn
import pandas as pd
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score, accuracy_score, classification_report
import numpy as np
from tqdm import tqdm

# Custom Dataset class for emotion data ("Hindi" in the name is a holdover; this cell loads /content/ron.csv)
class HindiEmotionDataset(Dataset):
    """Dataset pairing each text with its multi-label emotion target.

    Args:
        texts: Indexable collection of input texts.
        labels: Indexable collection of per-text label vectors.
        tokenizer: Callable invoked as ``tokenizer(text, padding=...,
            truncation=..., max_length=..., return_tensors="pt")``; it is
            expected to return tensors with a leading batch axis of size 1.
        max_length: Length every sequence is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=150):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and float ``labels`` for ``idx``."""
        # str() keeps non-string entries from breaking the tokenizer call.
        encoded = self.tokenizer(
            str(self.texts[idx]),
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        # Drop only the leading batch axis. A bare squeeze() would also
        # collapse the sequence axis when max_length == 1.
        return {
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            # Float targets, as required by the BCE loss used for training.
            'labels': torch.tensor(self.labels[idx], dtype=torch.float)
        }

class HindiEmotionModel(nn.Module):
    """Pretrained transformer encoder with a dropout + linear multi-label head.

    Args:
        model_name: Identifier handed to ``AutoModel.from_pretrained``.
        num_classes: Number of independent output labels.
    """

    def __init__(self, model_name, num_classes):
        super().__init__()
        self.embedding_model = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        # Size the head from the encoder's own config instead of hard-coding
        # 1024; fall back to 1024 when no ``hidden_size`` is exposed.
        encoder_config = getattr(self.embedding_model, "config", None)
        hidden_size = getattr(encoder_config, "hidden_size", 1024)
        self.fc = nn.Linear(hidden_size, num_classes)
        self.num_classes = num_classes

    def forward(self, input_ids, attention_mask):
        """Return per-label probabilities of shape (batch, num_classes)."""
        outputs = self.embedding_model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # The hidden state at sequence position 0 is used as the sentence vector.
        first_token = outputs.last_hidden_state[:, 0, :]
        logits = self.fc(self.dropout(first_token))
        # Sigmoid (not softmax): every label is scored independently.
        return torch.sigmoid(logits)

def train_model(model, train_loader, optimizer, device, test_loader, num_epochs=15, patience=3, save_path="/content/hindi_emotion_model_best.pth"):
    """Train ``model`` with BCE loss, checkpointing on the best macro F1.

    After every epoch the model is scored with ``evaluate_model`` on
    ``test_loader``. A strictly better macro F1 saves ``model.state_dict()``
    to ``save_path``; ``patience`` consecutive epochs without improvement stop
    training early.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``; its
            output is fed straight to ``nn.BCELoss``, so it must already be
            probabilities.
        train_loader: Iterable of batches holding ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        optimizer: Optimizer stepping the model parameters.
        device: Device the batch tensors are moved to.
        test_loader: Loader handed to ``evaluate_model`` after each epoch.
        num_epochs: Maximum number of epochs.
        patience: Non-improving epochs tolerated before stopping.
        save_path: Destination of the best checkpoint.
    """
    criterion = nn.BCELoss()

    # NOTE: with a 0.0 start and a strict ">" test, an epoch scoring exactly
    # 0.0 is never checkpointed.
    best_f1 = 0.0
    epochs_without_improvement = 0

    for epoch in range(num_epochs):
        # evaluate_model() leaves the model in eval mode, so training mode has
        # to be restored every epoch; otherwise dropout is silently disabled
        # from the second epoch on.
        model.train()

        total_loss = 0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}", unit="batch")

        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            progress_bar.set_postfix(loss=loss.item())

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {avg_loss:.4f}")

        # Evaluate model on test dataset after each epoch
        print(f"Evaluating after Epoch {epoch + 1}...")
        macro_f1 = evaluate_model(model, test_loader, device)
        print(f"Macro F1-Score after Epoch {epoch + 1}: {macro_f1:.4f}")

        # Check for improvement and early stopping
        if macro_f1 > best_f1:
            best_f1 = macro_f1
            torch.save(model.state_dict(), save_path)
            print(f"New best model saved with Macro F1: {best_f1:.4f}")
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1
            print(f"No F1 improvement. Patience: {epochs_without_improvement}/{patience}")
            if epochs_without_improvement >= patience:
                print(f"Early stopping triggered at epoch {epoch + 1}")
                break

    print(f"Training complete. Best Macro F1: {best_f1:.4f}")

def evaluate_model(model, test_loader, device):
    """Score ``model`` on ``test_loader`` and return the macro-averaged F1.

    The model is put in eval mode (and left there), its outputs are
    thresholded at 0.5, and the result is compared with the batch ``labels``.
    """
    model.eval()
    gold_rows, prob_rows = [], []

    with torch.no_grad():
        for batch in test_loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            gold = batch['labels'].to(device)

            probs = model(ids, mask)
            # Collect one row per sample on the CPU.
            gold_rows.extend(gold.cpu().numpy())
            prob_rows.extend(probs.cpu().numpy())

    # Probabilities above 0.5 count as a positive label.
    thresholded = (np.array(prob_rows) > 0.5).astype(int)
    return f1_score(np.array(gold_rows), thresholded, average='macro')

def main():
    """Fine-tune the emotion classifier on the ``ron`` train/dev CSV files.

    Loads both CSVs, wraps them in datasets and data loaders, builds the model
    and an AdamW optimizer, and hands everything to ``train_model``, which
    evaluates on the dev file after every epoch.
    """
    emotion_columns = ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']

    train_frame = pd.read_csv('/content/ron.csv')
    dev_frame = pd.read_csv('/content/ron_dev.csv')

    train_texts, dev_texts = (
        frame['text'].tolist() for frame in (train_frame, dev_frame)
    )
    train_labels, dev_labels = (
        frame[emotion_columns].values for frame in (train_frame, dev_frame)
    )

    model_name = "intfloat/multilingual-e5-large-instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Training batches are shuffled; dev batches keep file order.
    train_loader = DataLoader(
        HindiEmotionDataset(train_texts, train_labels, tokenizer),
        batch_size=16,
        shuffle=True,
    )
    dev_loader = DataLoader(
        HindiEmotionDataset(dev_texts, dev_labels, tokenizer),
        batch_size=16,
        shuffle=False,
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = HindiEmotionModel(model_name, len(emotion_columns)).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

    train_model(
        model,
        train_loader,
        optimizer,
        device,
        dev_loader,
        num_epochs=15,
        patience=4,
        save_path="/content/RON_emotion_model_best.pth",
    )


if __name__ == "__main__":
    main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Epoch 1/15: 100%|██████████| 78/78 [01:49<00:00,  1.41s/batch, loss=0.382]


Epoch 1/15, Average Loss: 0.4730
Evaluating after Epoch 1...
Macro F1-Score after Epoch 1: 0.5430
New best model saved with Macro F1: 0.5430


Epoch 2/15: 100%|██████████| 78/78 [01:48<00:00,  1.39s/batch, loss=0.303]


Epoch 2/15, Average Loss: 0.3138
Evaluating after Epoch 2...
Macro F1-Score after Epoch 2: 0.6908
New best model saved with Macro F1: 0.6908


Epoch 3/15: 100%|██████████| 78/78 [01:49<00:00,  1.40s/batch, loss=0.132]


Epoch 3/15, Average Loss: 0.2183
Evaluating after Epoch 3...
Macro F1-Score after Epoch 3: 0.7676
New best model saved with Macro F1: 0.7676


Epoch 4/15: 100%|██████████| 78/78 [01:49<00:00,  1.40s/batch, loss=0.164]


Epoch 4/15, Average Loss: 0.1601
Evaluating after Epoch 4...
Macro F1-Score after Epoch 4: 0.7647
No F1 improvement. Patience: 1/4


Epoch 5/15: 100%|██████████| 78/78 [01:48<00:00,  1.39s/batch, loss=0.184]


Epoch 5/15, Average Loss: 0.1048
Evaluating after Epoch 5...
Macro F1-Score after Epoch 5: 0.7599
No F1 improvement. Patience: 2/4


Epoch 6/15: 100%|██████████| 78/78 [01:48<00:00,  1.39s/batch, loss=0.091]


Epoch 6/15, Average Loss: 0.0602
Evaluating after Epoch 6...
Macro F1-Score after Epoch 6: 0.7599
No F1 improvement. Patience: 3/4


Epoch 7/15: 100%|██████████| 78/78 [01:48<00:00,  1.39s/batch, loss=0.0317]


Epoch 7/15, Average Loss: 0.0310
Evaluating after Epoch 7...
Macro F1-Score after Epoch 7: 0.7696
New best model saved with Macro F1: 0.7696


Epoch 8/15: 100%|██████████| 78/78 [01:48<00:00,  1.39s/batch, loss=0.0106]


Epoch 8/15, Average Loss: 0.0187
Evaluating after Epoch 8...
Macro F1-Score after Epoch 8: 0.7793
New best model saved with Macro F1: 0.7793


Epoch 9/15: 100%|██████████| 78/78 [01:48<00:00,  1.39s/batch, loss=0.0281]


Epoch 9/15, Average Loss: 0.0106
Evaluating after Epoch 9...
Macro F1-Score after Epoch 9: 0.7722
No F1 improvement. Patience: 1/4


Epoch 10/15: 100%|██████████| 78/78 [01:48<00:00,  1.39s/batch, loss=0.00865]


Epoch 10/15, Average Loss: 0.0083
Evaluating after Epoch 10...
Macro F1-Score after Epoch 10: 0.7813
New best model saved with Macro F1: 0.7813


Epoch 11/15: 100%|██████████| 78/78 [01:48<00:00,  1.39s/batch, loss=0.00489]


Epoch 11/15, Average Loss: 0.0060
Evaluating after Epoch 11...
Macro F1-Score after Epoch 11: 0.7768
No F1 improvement. Patience: 1/4


Epoch 12/15:   5%|▌         | 4/78 [00:07<02:09,  1.75s/batch, loss=0.00397]


KeyboardInterrupt: 

## Ron pred

In [None]:
import torch
import torch.nn as nn
import pandas as pd
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
import numpy as np
from tqdm import tqdm

class HindiEmotionPredictionDataset(Dataset):
    """Label-free dataset that tokenizes one text per item for inference.

    Args:
        texts: Indexable collection of input texts; each item is passed
            through ``str()`` before tokenization.
        tokenizer: Callable invoked as ``tokenizer(text, padding='max_length',
            truncation=True, max_length=..., return_tensors="pt")`` that
            returns a mapping with ``'input_ids'`` and ``'attention_mask'``.
        max_length: Length every sequence is padded/truncated to.
    """

    def __init__(self, texts, tokenizer, max_length=150):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the ``input_ids`` / ``attention_mask`` tensors for item ``idx``."""
        text = str(self.texts[idx])
        inputs = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        # Drop only the leading batch axis added by return_tensors="pt".
        # A bare squeeze() would also collapse the sequence axis when
        # max_length == 1 and break batching.
        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
        }

class HindiEmotionModel(nn.Module):
    """Pretrained encoder followed by dropout and a linear multi-label head.

    Args:
        model_name: Identifier passed to ``AutoModel.from_pretrained``.
        num_classes: Number of output labels.
    """

    def __init__(self, model_name, num_classes):
        super().__init__()
        self.embedding_model = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        # Size the head from the encoder's own config instead of hard-coding
        # 1024, so the class also works with encoders of a different width.
        hidden_size = self.embedding_model.config.hidden_size
        self.fc = nn.Linear(hidden_size, num_classes)
        self.num_classes = num_classes

    def forward(self, input_ids, attention_mask):
        """Return per-label sigmoid probabilities, one row per input sequence."""
        outputs = self.embedding_model(input_ids=input_ids, attention_mask=attention_mask)
        # Use the hidden state at sequence position 0 as the sentence vector.
        cls_embedding = outputs.last_hidden_state[:, 0, :]
        x = self.dropout(cls_embedding)
        logits = self.fc(x)
        return torch.sigmoid(logits)

def predict_and_save():
    """Predict emotion labels for ``/content/ron_pred.csv`` and save them as CSV.

    Reads the ``id`` and ``text`` columns, runs the fine-tuned model stored in
    ``/content/RON_emotion_model_best.pth`` over the texts in file order,
    thresholds each output at 0.5, and writes one row per input to
    ``/content/pred_ron.csv`` with the columns
    ``id, Anger, Disgust, Fear, Joy, Sadness, Surprise``.
    """
    output_path = '/content/pred_ron.csv'

    # Load prediction data
    pred_df = pd.read_csv('/content/ron_pred.csv')
    ids = pred_df['id'].tolist()
    texts = pred_df['text'].tolist()

    # Model configuration
    model_name = "intfloat/multilingual-e5-large-instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Initialize model and load weights.
    # map_location lets a checkpoint saved on GPU load on a CPU-only runtime;
    # weights_only restricts unpickling to tensors, which is all a state_dict
    # contains.
    model = HindiEmotionModel(model_name, num_classes=6).to(device)
    state_dict = torch.load(
        '/content/RON_emotion_model_best.pth',
        map_location=device,
        weights_only=True,
    )
    model.load_state_dict(state_dict)
    model.eval()

    # Create dataset and dataloader (shuffle=False keeps rows aligned with ids)
    dataset = HindiEmotionPredictionDataset(texts, tokenizer)
    dataloader = DataLoader(dataset, batch_size=16, shuffle=False)

    # Make predictions
    predictions = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Making Predictions"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids, attention_mask)
            batch_preds = (outputs.cpu().numpy() > 0.5).astype(int)
            predictions.extend(batch_preds)

    # Create output DataFrame in input order, with the id column first
    output_df = pd.DataFrame(predictions, columns=[
        'Anger', 'Disgust', 'Fear', 'Joy', 'Sadness', 'Surprise'
    ])
    output_df.insert(0, 'id', ids)

    # Save predictions and report the path that was actually written
    output_df.to_csv(output_path, index=False)
    print(f"Predictions saved to {output_path}")

if __name__ == "__main__":
    predict_and_save()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  model.load_state_dict(torch.load('/content/RON_emotion_model_best.pth'))
Making Predictions: 100%|██████████| 70/70 [00:32<00:00,  2.15it/s]


Predictions saved to /content/predictions.csv


# Hausa test

In [None]:
import torch
import torch.nn as nn
import pandas as pd
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score, accuracy_score, classification_report
import numpy as np
from tqdm import tqdm

# Custom Dataset class for multi-label emotion data (the class name says "Hindi", but this cell trains on the Hausa files)
class HindiEmotionDataset(Dataset):
    """Dataset pairing each tokenized text with its multi-label target vector.

    Args:
        texts: Indexable collection of input texts; each item is passed
            through ``str()`` before tokenization.
        labels: Indexable collection of per-text label vectors, aligned with
            ``texts``; each is converted to a float tensor.
        tokenizer: Callable invoked as ``tokenizer(text, padding='max_length',
            truncation=True, max_length=..., return_tensors="pt")`` that
            returns a mapping with ``'input_ids'`` and ``'attention_mask'``.
        max_length: Length every sequence is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=150):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for item ``idx``."""
        text = str(self.texts[idx])
        inputs = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        # Drop only the leading batch axis added by return_tensors="pt".
        # A bare squeeze() would also collapse the sequence axis when
        # max_length == 1 and break batching.
        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.float)
        }

class HindiEmotionModel(nn.Module):
    """Pretrained encoder followed by dropout and a linear multi-label head.

    Args:
        model_name: Identifier passed to ``AutoModel.from_pretrained``.
        num_classes: Number of output labels.
    """

    def __init__(self, model_name, num_classes):
        super().__init__()
        self.embedding_model = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        # Size the head from the encoder's own config instead of hard-coding
        # 1024, so the class also works with encoders of a different width.
        hidden_size = self.embedding_model.config.hidden_size
        self.fc = nn.Linear(hidden_size, num_classes)
        self.num_classes = num_classes

    def forward(self, input_ids, attention_mask):
        """Return per-label sigmoid probabilities, one row per input sequence."""
        outputs = self.embedding_model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # Use the hidden state at sequence position 0 as the sentence vector.
        cls_embedding = outputs.last_hidden_state[:, 0, :]
        x = self.dropout(cls_embedding)
        logits = self.fc(x)
        return torch.sigmoid(logits)

def train_model(model, train_loader, optimizer, device, test_loader, num_epochs=15, patience=3, save_path="/content/hindi_emotion_model_best.pth"):
    """Train ``model`` with BCE loss, checkpointing on macro F1 with early stopping.

    After every epoch the model is scored with ``evaluate_model`` on
    ``test_loader``. When the macro F1 strictly exceeds the best value seen so
    far (starting from 0.0), the model's ``state_dict`` is written to
    ``save_path``. Training stops early once ``patience`` consecutive epochs
    bring no improvement.

    Args:
        model: Module invoked as ``model(input_ids, attention_mask)``; its
            outputs are fed to ``nn.BCELoss``, so they must be probabilities.
        train_loader: Iterable of batch dicts with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        optimizer: Optimizer over the model's parameters.
        device: Device the batch tensors are moved to.
        test_loader: Loader passed to ``evaluate_model`` after each epoch.
        num_epochs: Maximum number of epochs.
        patience: Non-improving epochs tolerated before stopping.
        save_path: Destination of the best ``state_dict``.
    """
    criterion = nn.BCELoss()

    best_f1 = 0.0
    epochs_without_improvement = 0

    for epoch in range(num_epochs):
        # Enter training mode at the start of *every* epoch. The per-epoch
        # evaluation below switches the model to eval mode, so setting the
        # mode only once before the loop would train all later epochs with
        # dropout disabled.
        model.train()

        total_loss = 0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}", unit="batch")

        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            progress_bar.set_postfix(loss=loss.item())

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {avg_loss:.4f}")

        # Evaluate model on test dataset after each epoch
        print(f"Evaluating after Epoch {epoch + 1}...")
        macro_f1 = evaluate_model(model, test_loader, device)
        print(f"Macro F1-Score after Epoch {epoch + 1}: {macro_f1:.4f}")

        # Check for improvement and early stopping
        if macro_f1 > best_f1:
            best_f1 = macro_f1
            torch.save(model.state_dict(), save_path)
            print(f"New best model saved with Macro F1: {best_f1:.4f}")
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1
            print(f"No F1 improvement. Patience: {epochs_without_improvement}/{patience}")
            if epochs_without_improvement >= patience:
                print(f"Early stopping triggered at epoch {epoch + 1}")
                break

    print(f"Training complete. Best Macro F1: {best_f1:.4f}")

def evaluate_model(model, test_loader, device):
    """Compute the macro-averaged F1 score of ``model`` over ``test_loader``.

    Every batch must be a dict holding ``'input_ids'``, ``'attention_mask'``
    and ``'labels'`` tensors. Model outputs are thresholded at 0.5 to obtain
    binary multi-label predictions.

    The model is put into eval mode for the duration of the call and is then
    returned to the mode it was in on entry. Without that, calling this
    function between training epochs would leave dropout disabled for every
    following epoch.

    Args:
        model: Module invoked as ``model(input_ids, attention_mask)``.
        test_loader: Iterable of batch dicts as described above.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        The value of ``f1_score(true, predicted, average='macro')``.
    """
    was_training = model.training
    model.eval()
    true_labels = []
    predictions = []

    try:
        with torch.no_grad():
            for batch in test_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids, attention_mask)
                true_labels.extend(labels.cpu().numpy())
                predictions.extend(outputs.cpu().numpy())
    finally:
        # Hand the model back in the caller's mode (train or eval).
        model.train(was_training)

    true_labels = np.array(true_labels)
    predictions = np.array(predictions)
    binary_predictions = (predictions > 0.5).astype(int)

    macro_f1 = f1_score(true_labels, binary_predictions, average='macro')
    return macro_f1

def main():
    """Fine-tune the emotion classifier on the ``hau`` train/dev CSV files.

    Loads both CSVs, wraps them in datasets and data loaders, builds the model
    and an AdamW optimizer, and hands everything to ``train_model``, which
    evaluates on the dev file after every epoch.
    """
    emotion_columns = ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']

    train_frame = pd.read_csv('/content/hau.csv')
    dev_frame = pd.read_csv('/content/hau_dev.csv')

    train_texts, dev_texts = (
        frame['text'].tolist() for frame in (train_frame, dev_frame)
    )
    train_labels, dev_labels = (
        frame[emotion_columns].values for frame in (train_frame, dev_frame)
    )

    model_name = "intfloat/multilingual-e5-large-instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Training batches are shuffled; dev batches keep file order.
    train_loader = DataLoader(
        HindiEmotionDataset(train_texts, train_labels, tokenizer),
        batch_size=16,
        shuffle=True,
    )
    dev_loader = DataLoader(
        HindiEmotionDataset(dev_texts, dev_labels, tokenizer),
        batch_size=16,
        shuffle=False,
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = HindiEmotionModel(model_name, len(emotion_columns)).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

    train_model(
        model,
        train_loader,
        optimizer,
        device,
        dev_loader,
        num_epochs=15,
        patience=4,
        save_path="/content/HAU_emotion_model_best.pth",
    )


if __name__ == "__main__":
    main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Epoch 1/15: 100%|██████████| 135/135 [03:02<00:00,  1.35s/batch, loss=0.201]


Epoch 1/15, Average Loss: 0.4348
Evaluating after Epoch 1...
Macro F1-Score after Epoch 1: 0.4641
New best model saved with Macro F1: 0.4641


Epoch 2/15: 100%|██████████| 135/135 [03:02<00:00,  1.35s/batch, loss=0.252]


Epoch 2/15, Average Loss: 0.2860
Evaluating after Epoch 2...
Macro F1-Score after Epoch 2: 0.6260
New best model saved with Macro F1: 0.6260


Epoch 3/15: 100%|██████████| 135/135 [03:01<00:00,  1.34s/batch, loss=0.0547]


Epoch 3/15, Average Loss: 0.1847
Evaluating after Epoch 3...
Macro F1-Score after Epoch 3: 0.6220
No F1 improvement. Patience: 1/4


Epoch 4/15: 100%|██████████| 135/135 [03:04<00:00,  1.37s/batch, loss=0.659]


Epoch 4/15, Average Loss: 0.1172
Evaluating after Epoch 4...
Macro F1-Score after Epoch 4: 0.6733
New best model saved with Macro F1: 0.6733


Epoch 5/15: 100%|██████████| 135/135 [03:01<00:00,  1.34s/batch, loss=0.0652]


Epoch 5/15, Average Loss: 0.0723
Evaluating after Epoch 5...
Macro F1-Score after Epoch 5: 0.7047
New best model saved with Macro F1: 0.7047


Epoch 6/15: 100%|██████████| 135/135 [02:59<00:00,  1.33s/batch, loss=0.247]


Epoch 6/15, Average Loss: 0.0374
Evaluating after Epoch 6...
Macro F1-Score after Epoch 6: 0.6774
No F1 improvement. Patience: 1/4


Epoch 7/15: 100%|██████████| 135/135 [03:04<00:00,  1.37s/batch, loss=0.00356]


Epoch 7/15, Average Loss: 0.0292
Evaluating after Epoch 7...
Macro F1-Score after Epoch 7: 0.6877
No F1 improvement. Patience: 2/4


Epoch 8/15: 100%|██████████| 135/135 [03:05<00:00,  1.37s/batch, loss=0.0152]


Epoch 8/15, Average Loss: 0.0132
Evaluating after Epoch 8...
Macro F1-Score after Epoch 8: 0.6968
No F1 improvement. Patience: 3/4


Epoch 9/15:  10%|▉         | 13/135 [00:19<03:01,  1.48s/batch, loss=0.00667]


KeyboardInterrupt: 

## Hausa pred

In [None]:
import torch
import torch.nn as nn
import pandas as pd
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
import numpy as np
from tqdm import tqdm

class HindiEmotionPredictionDataset(Dataset):
    """Label-free dataset that tokenizes one text per item for inference.

    Args:
        texts: Indexable collection of input texts; each item is passed
            through ``str()`` before tokenization.
        tokenizer: Callable invoked as ``tokenizer(text, padding='max_length',
            truncation=True, max_length=..., return_tensors="pt")`` that
            returns a mapping with ``'input_ids'`` and ``'attention_mask'``.
        max_length: Length every sequence is padded/truncated to.
    """

    def __init__(self, texts, tokenizer, max_length=150):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the ``input_ids`` / ``attention_mask`` tensors for item ``idx``."""
        text = str(self.texts[idx])
        inputs = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        # Drop only the leading batch axis added by return_tensors="pt".
        # A bare squeeze() would also collapse the sequence axis when
        # max_length == 1 and break batching.
        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
        }

class HindiEmotionModel(nn.Module):
    """Pretrained encoder followed by dropout and a linear multi-label head.

    Args:
        model_name: Identifier passed to ``AutoModel.from_pretrained``.
        num_classes: Number of output labels.
    """

    def __init__(self, model_name, num_classes):
        super().__init__()
        self.embedding_model = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        # Size the head from the encoder's own config instead of hard-coding
        # 1024, so the class also works with encoders of a different width.
        hidden_size = self.embedding_model.config.hidden_size
        self.fc = nn.Linear(hidden_size, num_classes)
        self.num_classes = num_classes

    def forward(self, input_ids, attention_mask):
        """Return per-label sigmoid probabilities, one row per input sequence."""
        outputs = self.embedding_model(input_ids=input_ids, attention_mask=attention_mask)
        # Use the hidden state at sequence position 0 as the sentence vector.
        cls_embedding = outputs.last_hidden_state[:, 0, :]
        x = self.dropout(cls_embedding)
        logits = self.fc(x)
        return torch.sigmoid(logits)

def predict_and_save():
    """Predict emotion labels for ``/content/hau_pred.csv`` and save them as CSV.

    Reads the ``id`` and ``text`` columns, runs the fine-tuned model stored in
    ``/content/HAU_emotion_model_best.pth`` over the texts in file order,
    thresholds each output at 0.5, and writes one row per input to
    ``/content/pred_hau.csv`` with the columns
    ``id, Anger, Disgust, Fear, Joy, Sadness, Surprise``.
    """
    output_path = '/content/pred_hau.csv'

    # Load prediction data
    pred_df = pd.read_csv('/content/hau_pred.csv')
    ids = pred_df['id'].tolist()
    texts = pred_df['text'].tolist()

    # Model configuration
    model_name = "intfloat/multilingual-e5-large-instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Initialize model and load weights.
    # map_location lets a checkpoint saved on GPU load on a CPU-only runtime;
    # weights_only restricts unpickling to tensors, which is all a state_dict
    # contains.
    model = HindiEmotionModel(model_name, num_classes=6).to(device)
    state_dict = torch.load(
        '/content/HAU_emotion_model_best.pth',
        map_location=device,
        weights_only=True,
    )
    model.load_state_dict(state_dict)
    model.eval()

    # Create dataset and dataloader (shuffle=False keeps rows aligned with ids)
    dataset = HindiEmotionPredictionDataset(texts, tokenizer)
    dataloader = DataLoader(dataset, batch_size=16, shuffle=False)

    # Make predictions
    predictions = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Making Predictions"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids, attention_mask)
            batch_preds = (outputs.cpu().numpy() > 0.5).astype(int)
            predictions.extend(batch_preds)

    # Create output DataFrame in input order, with the id column first
    output_df = pd.DataFrame(predictions, columns=[
        'Anger', 'Disgust', 'Fear', 'Joy', 'Sadness', 'Surprise'
    ])
    output_df.insert(0, 'id', ids)

    # Save predictions and report the path that was actually written
    output_df.to_csv(output_path, index=False)
    print(f"Predictions saved to {output_path}")

if __name__ == "__main__":
    predict_and_save()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  model.load_state_dict(torch.load('/content/HAU_emotion_model_best.pth'))
Making Predictions: 100%|██████████| 68/68 [00:27<00:00,  2.48it/s]


Predictions saved to /content/predictions.csv
