<a href="https://colab.research.google.com/github/INVISIBLE-SAM/SemEval-2025-Task-11---Track-A/blob/main/Final_submission_part_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# E5 large Hindi

In [None]:
import torch
import torch.nn as nn
import pandas as pd
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score, accuracy_score, classification_report
import numpy as np
from tqdm import tqdm

# Custom Dataset class for Hindi emotion data
class HindiEmotionDataset(Dataset):
    """Map-style dataset pairing each text with its multi-label emotion target.

    Args:
        texts: Indexable collection of input texts; every item is passed
            through ``str()`` before tokenization.
        labels: Indexable collection of per-sample label vectors aligned
            with ``texts``.
        tokenizer: Callable returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        max_length: Length every sample is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=150):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its tensors plus float labels."""
        text = str(self.texts[idx])
        inputs = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        # Remove only the leading (batch) dimension of the tokenizer output.
        # A bare squeeze() would also collapse the sequence dimension when
        # max_length == 1 and hand 0-d tensors to the DataLoader.
        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.float)
        }

class HindiEmotionModel(nn.Module):
    """Pretrained encoder followed by dropout and a linear multi-label head.

    Args:
        model_name: Identifier passed to ``AutoModel.from_pretrained``.
        num_classes: Number of independent emotion labels to score.
    """

    def __init__(self, model_name, num_classes):
        super().__init__()
        self.embedding_model = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        # Size the head from the encoder's own config rather than assuming a
        # 1024-dimensional model; keep the previous constant as a fallback
        # when the config does not expose ``hidden_size``.
        hidden_size = getattr(self.embedding_model.config, "hidden_size", 1024)
        self.fc = nn.Linear(hidden_size, num_classes)
        self.num_classes = num_classes

    def forward(self, input_ids, attention_mask):
        """Return per-class probabilities (sigmoid of the linear head)."""
        outputs = self.embedding_model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # The final hidden state at the first position serves as the
        # sentence representation.
        cls_embedding = outputs.last_hidden_state[:, 0, :]
        x = self.dropout(cls_embedding)
        logits = self.fc(x)
        return torch.sigmoid(logits)

def train_model(model, train_loader, optimizer, device, test_loader, num_epochs=15, patience=3, save_path="/content/hindi_emotion_model_best.pth"):
    """Train ``model`` with binary cross-entropy and early stopping on macro F1.

    After each epoch the model is scored with ``evaluate_model`` on
    ``test_loader``. The state dict is written to ``save_path`` whenever the
    macro F1 strictly exceeds the best value so far, and training stops once
    ``patience`` consecutive epochs bring no improvement.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``; its
            outputs are fed to ``nn.BCELoss``, so they must be probabilities.
        train_loader: Iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` entries.
        optimizer: Optimizer over the model's parameters.
        device: Device the batch tensors are moved to.
        test_loader: Batches used for the per-epoch evaluation.
        num_epochs: Maximum number of epochs.
        patience: Number of consecutive non-improving epochs tolerated.
        save_path: Destination of the best checkpoint.
    """
    criterion = nn.BCELoss()

    best_f1 = 0.0
    epochs_without_improvement = 0

    for epoch in range(num_epochs):
        # evaluate_model() switches the model to eval mode at the end of
        # every epoch, so training mode has to be re-enabled here; otherwise
        # dropout stays disabled from the second epoch onwards.
        model.train()

        total_loss = 0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}", unit="batch")

        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            progress_bar.set_postfix(loss=loss.item())

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {avg_loss:.4f}")

        # Evaluate model on test dataset after each epoch
        print(f"Evaluating after Epoch {epoch + 1}...")
        macro_f1 = evaluate_model(model, test_loader, device)
        print(f"Macro F1-Score after Epoch {epoch + 1}: {macro_f1:.4f}")

        # Check for improvement and early stopping
        if macro_f1 > best_f1:
            best_f1 = macro_f1
            torch.save(model.state_dict(), save_path)
            print(f"New best model saved with Macro F1: {best_f1:.4f}")
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1
            print(f"No F1 improvement. Patience: {epochs_without_improvement}/{patience}")
            if epochs_without_improvement >= patience:
                print(f"Early stopping triggered at epoch {epoch + 1}")
                break

    print(f"Training complete. Best Macro F1: {best_f1:.4f}")

def evaluate_model(model, test_loader, device):
    """Return the macro-averaged F1 of ``model`` over ``test_loader``.

    The model is put into eval mode and run without gradient tracking.
    Its outputs are binarized at a 0.5 threshold before scoring.
    """
    model.eval()
    gold_rows = []
    prob_rows = []

    with torch.no_grad():
        for batch in test_loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            gold = batch['labels'].to(device)

            probs = model(ids, mask)
            # Accumulate one row per sample on the host side.
            gold_rows.extend(gold.cpu().numpy())
            prob_rows.extend(probs.cpu().numpy())

    gold_matrix = np.array(gold_rows)
    hard_preds = (np.array(prob_rows) > 0.5).astype(int)

    return f1_score(gold_matrix, hard_preds, average='macro')

def main():
    """Fine-tune on /content/hin.csv, validating on /content/hin_dev.csv.

    ``train_model`` writes the best checkpoint (by macro F1) to
    /content/hindi_emotion_model_best.pth.
    """
    emotion_columns = ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']

    train_df = pd.read_csv('/content/hin.csv')
    test_df = pd.read_csv('/content/hin_dev.csv')

    # Texts as plain lists, labels as arrays in emotion_columns order.
    train_texts, train_labels = train_df['text'].tolist(), train_df[emotion_columns].values
    test_texts, test_labels = test_df['text'].tolist(), test_df[emotion_columns].values

    model_name = "intfloat/multilingual-e5-large-instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    train_loader = DataLoader(
        HindiEmotionDataset(train_texts, train_labels, tokenizer),
        batch_size=16,
        shuffle=True,
    )
    test_loader = DataLoader(
        HindiEmotionDataset(test_texts, test_labels, tokenizer),
        batch_size=16,
        shuffle=False,
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = HindiEmotionModel(model_name, len(emotion_columns)).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

    train_model(
        model, train_loader, optimizer, device, test_loader,
        num_epochs=15,
        patience=4,
        save_path="/content/hindi_emotion_model_best.pth",
    )


if __name__ == "__main__":
    main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Epoch 1/15: 100%|██████████| 160/160 [03:42<00:00,  1.39s/batch, loss=0.113]


Epoch 1/15, Average Loss: 0.2663
Evaluating after Epoch 1...
Macro F1-Score after Epoch 1: 0.8206
New best model saved with Macro F1: 0.8206


Epoch 2/15: 100%|██████████| 160/160 [03:33<00:00,  1.33s/batch, loss=0.0272]


Epoch 2/15, Average Loss: 0.0916
Evaluating after Epoch 2...
Macro F1-Score after Epoch 2: 0.8612
New best model saved with Macro F1: 0.8612


Epoch 3/15: 100%|██████████| 160/160 [03:32<00:00,  1.33s/batch, loss=0.063]


Epoch 3/15, Average Loss: 0.0516
Evaluating after Epoch 3...
Macro F1-Score after Epoch 3: 0.8620
New best model saved with Macro F1: 0.8620


Epoch 4/15: 100%|██████████| 160/160 [03:33<00:00,  1.33s/batch, loss=0.0257]


Epoch 4/15, Average Loss: 0.0311
Evaluating after Epoch 4...
Macro F1-Score after Epoch 4: 0.8489
No F1 improvement. Patience: 1/4


Epoch 5/15: 100%|██████████| 160/160 [03:36<00:00,  1.35s/batch, loss=0.0162]


Epoch 5/15, Average Loss: 0.0200
Evaluating after Epoch 5...
Macro F1-Score after Epoch 5: 0.8961
New best model saved with Macro F1: 0.8961


Epoch 6/15: 100%|██████████| 160/160 [03:34<00:00,  1.34s/batch, loss=0.0122]


Epoch 6/15, Average Loss: 0.0116
Evaluating after Epoch 6...
Macro F1-Score after Epoch 6: 0.8585
No F1 improvement. Patience: 1/4


Epoch 7/15:  31%|███▏      | 50/160 [01:08<02:31,  1.38s/batch, loss=0.00544]


KeyboardInterrupt: 

## Hindi Prediction

In [None]:
import torch
import torch.nn as nn
import pandas as pd
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
import numpy as np
from tqdm import tqdm

class HindiEmotionPredictionDataset(Dataset):
    """Map-style dataset of unlabeled texts for inference.

    Args:
        texts: Indexable collection of input texts; every item is passed
            through ``str()`` before tokenization.
        tokenizer: Callable returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        max_length: Length every sample is padded/truncated to.
    """

    def __init__(self, texts, tokenizer, max_length=150):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its input tensors."""
        text = str(self.texts[idx])
        inputs = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        # Remove only the leading (batch) dimension of the tokenizer output.
        # A bare squeeze() would also collapse the sequence dimension when
        # max_length == 1 and hand 0-d tensors to the DataLoader.
        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
        }

class HindiEmotionModel(nn.Module):
    """Pretrained encoder followed by dropout and a linear multi-label head.

    Args:
        model_name: Identifier passed to ``AutoModel.from_pretrained``.
        num_classes: Number of independent emotion labels to score.
    """

    def __init__(self, model_name, num_classes):
        super().__init__()
        self.embedding_model = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        # Size the head from the encoder's own config rather than assuming a
        # 1024-dimensional model; keep the previous constant as a fallback
        # when the config does not expose ``hidden_size``.
        hidden_size = getattr(self.embedding_model.config, "hidden_size", 1024)
        self.fc = nn.Linear(hidden_size, num_classes)
        self.num_classes = num_classes

    def forward(self, input_ids, attention_mask):
        """Return per-class probabilities (sigmoid of the linear head)."""
        outputs = self.embedding_model(input_ids=input_ids, attention_mask=attention_mask)
        # The final hidden state at the first position serves as the
        # sentence representation.
        cls_embedding = outputs.last_hidden_state[:, 0, :]
        x = self.dropout(cls_embedding)
        logits = self.fc(x)
        return torch.sigmoid(logits)

def predict_and_save():
    """Predict emotion labels for /content/hin_pred.csv and save them as CSV.

    Loads the checkpoint at /content/hindi_emotion_model_best.pth, binarizes
    the model outputs at a 0.5 threshold and writes one row per input
    (``id`` followed by the six label columns) to /content/predictions.csv.
    """
    output_path = '/content/predictions.csv'
    label_columns = ['Anger', 'Disgust', 'Fear', 'Joy', 'Sadness', 'Surprise']

    # Load prediction data
    pred_df = pd.read_csv('/content/hin_pred.csv')
    ids = pred_df['id'].tolist()
    texts = pred_df['text'].tolist()

    # Model configuration
    model_name = "intfloat/multilingual-e5-large-instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Initialize model and load weights.
    # map_location lets a checkpoint saved on GPU load on a CPU-only runtime;
    # weights_only=True restricts unpickling to tensor data, which is all a
    # state dict contains.
    model = HindiEmotionModel(model_name, num_classes=6).to(device)
    state_dict = torch.load(
        '/content/hindi_emotion_model_best.pth',
        map_location=device,
        weights_only=True,
    )
    model.load_state_dict(state_dict)
    model.eval()

    # Create dataset and dataloader; shuffle=False keeps rows aligned with ids
    dataset = HindiEmotionPredictionDataset(texts, tokenizer)
    dataloader = DataLoader(dataset, batch_size=16, shuffle=False)

    # Make predictions
    predictions = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Making Predictions"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids, attention_mask)
            batch_preds = (outputs.cpu().numpy() > 0.5).astype(int)
            predictions.extend(batch_preds)

    # Build the output frame; inserting at position 0 already puts ``id``
    # first, so no further column reordering is needed.
    output_df = pd.DataFrame(predictions, columns=label_columns)
    output_df.insert(0, 'id', ids)

    # Save predictions
    output_df.to_csv(output_path, index=False)
    print(f"Predictions saved to {output_path}")


if __name__ == "__main__":
    predict_and_save()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  model.load_state_dict(torch.load('/content/hindi_emotion_model_best.pth'))
Making Predictions: 100%|██████████| 64/64 [00:25<00:00,  2.55it/s]


Predictions saved to /content/predictions.csv


# E5 large Russian

In [None]:
import torch
import torch.nn as nn
import pandas as pd
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score, accuracy_score, classification_report
import numpy as np
from tqdm import tqdm

# Custom Dataset class for emotion data (class name kept from the Hindi cell; used here for Russian)
class HindiEmotionDataset(Dataset):
    """Map-style dataset pairing each text with its multi-label emotion target.

    Args:
        texts: Indexable collection of input texts; every item is passed
            through ``str()`` before tokenization.
        labels: Indexable collection of per-sample label vectors aligned
            with ``texts``.
        tokenizer: Callable returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        max_length: Length every sample is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=150):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its tensors plus float labels."""
        text = str(self.texts[idx])
        inputs = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        # Remove only the leading (batch) dimension of the tokenizer output.
        # A bare squeeze() would also collapse the sequence dimension when
        # max_length == 1 and hand 0-d tensors to the DataLoader.
        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.float)
        }

class HindiEmotionModel(nn.Module):
    """Pretrained encoder followed by dropout and a linear multi-label head.

    Args:
        model_name: Identifier passed to ``AutoModel.from_pretrained``.
        num_classes: Number of independent emotion labels to score.
    """

    def __init__(self, model_name, num_classes):
        super().__init__()
        self.embedding_model = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        # Size the head from the encoder's own config rather than assuming a
        # 1024-dimensional model; keep the previous constant as a fallback
        # when the config does not expose ``hidden_size``.
        hidden_size = getattr(self.embedding_model.config, "hidden_size", 1024)
        self.fc = nn.Linear(hidden_size, num_classes)
        self.num_classes = num_classes

    def forward(self, input_ids, attention_mask):
        """Return per-class probabilities (sigmoid of the linear head)."""
        outputs = self.embedding_model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # The final hidden state at the first position serves as the
        # sentence representation.
        cls_embedding = outputs.last_hidden_state[:, 0, :]
        x = self.dropout(cls_embedding)
        logits = self.fc(x)
        return torch.sigmoid(logits)

def train_model(model, train_loader, optimizer, device, test_loader, num_epochs=15, patience=3, save_path="/content/hindi_emotion_model_best.pth"):
    """Train ``model`` with binary cross-entropy and early stopping on macro F1.

    After each epoch the model is scored with ``evaluate_model`` on
    ``test_loader``. The state dict is written to ``save_path`` whenever the
    macro F1 strictly exceeds the best value so far, and training stops once
    ``patience`` consecutive epochs bring no improvement.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``; its
            outputs are fed to ``nn.BCELoss``, so they must be probabilities.
        train_loader: Iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` entries.
        optimizer: Optimizer over the model's parameters.
        device: Device the batch tensors are moved to.
        test_loader: Batches used for the per-epoch evaluation.
        num_epochs: Maximum number of epochs.
        patience: Number of consecutive non-improving epochs tolerated.
        save_path: Destination of the best checkpoint.
    """
    criterion = nn.BCELoss()

    best_f1 = 0.0
    epochs_without_improvement = 0

    for epoch in range(num_epochs):
        # evaluate_model() switches the model to eval mode at the end of
        # every epoch, so training mode has to be re-enabled here; otherwise
        # dropout stays disabled from the second epoch onwards.
        model.train()

        total_loss = 0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}", unit="batch")

        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            progress_bar.set_postfix(loss=loss.item())

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {avg_loss:.4f}")

        # Evaluate model on test dataset after each epoch
        print(f"Evaluating after Epoch {epoch + 1}...")
        macro_f1 = evaluate_model(model, test_loader, device)
        print(f"Macro F1-Score after Epoch {epoch + 1}: {macro_f1:.4f}")

        # Check for improvement and early stopping
        if macro_f1 > best_f1:
            best_f1 = macro_f1
            torch.save(model.state_dict(), save_path)
            print(f"New best model saved with Macro F1: {best_f1:.4f}")
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1
            print(f"No F1 improvement. Patience: {epochs_without_improvement}/{patience}")
            if epochs_without_improvement >= patience:
                print(f"Early stopping triggered at epoch {epoch + 1}")
                break

    print(f"Training complete. Best Macro F1: {best_f1:.4f}")

def evaluate_model(model, test_loader, device):
    """Score ``model`` on ``test_loader`` and return the macro F1.

    Runs in eval mode without gradient tracking. Outputs above 0.5 count
    as positive predictions.
    """
    model.eval()
    reference = []
    scores = []

    with torch.no_grad():
        for batch in test_loader:
            token_ids = batch['input_ids'].to(device)
            token_mask = batch['attention_mask'].to(device)
            targets = batch['labels'].to(device)

            batch_scores = model(token_ids, token_mask)
            # Collect per-sample rows as NumPy arrays on the CPU.
            reference.extend(targets.cpu().numpy())
            scores.extend(batch_scores.cpu().numpy())

    reference = np.array(reference)
    thresholded = (np.array(scores) > 0.5).astype(int)

    return f1_score(reference, thresholded, average='macro')

def main():
    """Fine-tune on /content/rus.csv, validating on /content/rus_dev.csv.

    ``train_model`` writes the best checkpoint (by macro F1) to
    /content/RUS_emotion_model_best.pth.
    """
    emotion_columns = ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']

    train_df = pd.read_csv('/content/rus.csv')
    test_df = pd.read_csv('/content/rus_dev.csv')

    # Texts as plain lists, labels as arrays in emotion_columns order.
    train_texts, train_labels = train_df['text'].tolist(), train_df[emotion_columns].values
    test_texts, test_labels = test_df['text'].tolist(), test_df[emotion_columns].values

    model_name = "intfloat/multilingual-e5-large-instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    train_loader = DataLoader(
        HindiEmotionDataset(train_texts, train_labels, tokenizer),
        batch_size=16,
        shuffle=True,
    )
    test_loader = DataLoader(
        HindiEmotionDataset(test_texts, test_labels, tokenizer),
        batch_size=16,
        shuffle=False,
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = HindiEmotionModel(model_name, len(emotion_columns)).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

    train_model(
        model, train_loader, optimizer, device, test_loader,
        num_epochs=15,
        patience=4,
        save_path="/content/RUS_emotion_model_best.pth",
    )


if __name__ == "__main__":
    main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Epoch 1/15: 100%|██████████| 168/168 [03:51<00:00,  1.38s/batch, loss=0.23]


Epoch 1/15, Average Loss: 0.3096
Evaluating after Epoch 1...
Macro F1-Score after Epoch 1: 0.8254
New best model saved with Macro F1: 0.8254


Epoch 2/15: 100%|██████████| 168/168 [03:41<00:00,  1.32s/batch, loss=0.0646]


Epoch 2/15, Average Loss: 0.1117
Evaluating after Epoch 2...
Macro F1-Score after Epoch 2: 0.8705
New best model saved with Macro F1: 0.8705


Epoch 3/15: 100%|██████████| 168/168 [03:41<00:00,  1.32s/batch, loss=0.0584]


Epoch 3/15, Average Loss: 0.0607
Evaluating after Epoch 3...
Macro F1-Score after Epoch 3: 0.8847
New best model saved with Macro F1: 0.8847


Epoch 4/15: 100%|██████████| 168/168 [03:41<00:00,  1.32s/batch, loss=0.0167]


Epoch 4/15, Average Loss: 0.0330
Evaluating after Epoch 4...
Macro F1-Score after Epoch 4: 0.8708
No F1 improvement. Patience: 1/4


Epoch 5/15: 100%|██████████| 168/168 [03:41<00:00,  1.32s/batch, loss=0.00989]


Epoch 5/15, Average Loss: 0.0212
Evaluating after Epoch 5...
Macro F1-Score after Epoch 5: 0.8727
No F1 improvement. Patience: 2/4


Epoch 6/15: 100%|██████████| 168/168 [03:41<00:00,  1.32s/batch, loss=0.00385]


Epoch 6/15, Average Loss: 0.0121
Evaluating after Epoch 6...
Macro F1-Score after Epoch 6: 0.8738
No F1 improvement. Patience: 3/4


Epoch 7/15:   3%|▎         | 5/168 [00:07<04:17,  1.58s/batch, loss=0.00809]


KeyboardInterrupt: 

## Russian Prediction

In [None]:
import torch
import torch.nn as nn
import pandas as pd
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
import numpy as np
from tqdm import tqdm

class HindiEmotionPredictionDataset(Dataset):
    """Map-style dataset of unlabeled texts for inference.

    Args:
        texts: Indexable collection of input texts; every item is passed
            through ``str()`` before tokenization.
        tokenizer: Callable returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        max_length: Length every sample is padded/truncated to.
    """

    def __init__(self, texts, tokenizer, max_length=150):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its input tensors."""
        text = str(self.texts[idx])
        inputs = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        # Remove only the leading (batch) dimension of the tokenizer output.
        # A bare squeeze() would also collapse the sequence dimension when
        # max_length == 1 and hand 0-d tensors to the DataLoader.
        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
        }

class HindiEmotionModel(nn.Module):
    """Pretrained encoder followed by dropout and a linear multi-label head.

    Args:
        model_name: Identifier passed to ``AutoModel.from_pretrained``.
        num_classes: Number of independent emotion labels to score.
    """

    def __init__(self, model_name, num_classes):
        super().__init__()
        self.embedding_model = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        # Size the head from the encoder's own config rather than assuming a
        # 1024-dimensional model; keep the previous constant as a fallback
        # when the config does not expose ``hidden_size``.
        hidden_size = getattr(self.embedding_model.config, "hidden_size", 1024)
        self.fc = nn.Linear(hidden_size, num_classes)
        self.num_classes = num_classes

    def forward(self, input_ids, attention_mask):
        """Return per-class probabilities (sigmoid of the linear head)."""
        outputs = self.embedding_model(input_ids=input_ids, attention_mask=attention_mask)
        # The final hidden state at the first position serves as the
        # sentence representation.
        cls_embedding = outputs.last_hidden_state[:, 0, :]
        x = self.dropout(cls_embedding)
        logits = self.fc(x)
        return torch.sigmoid(logits)

def predict_and_save():
    """Predict emotion labels for /content/rus_pred.csv and save them as CSV.

    Loads the checkpoint at /content/RUS_emotion_model_best.pth, binarizes
    the model outputs at a 0.5 threshold and writes one row per input
    (``id`` followed by the six label columns) to
    /content/predictions_rus.csv.
    """
    output_path = '/content/predictions_rus.csv'
    label_columns = ['Anger', 'Disgust', 'Fear', 'Joy', 'Sadness', 'Surprise']

    # Load prediction data
    pred_df = pd.read_csv('/content/rus_pred.csv')
    ids = pred_df['id'].tolist()
    texts = pred_df['text'].tolist()

    # Model configuration
    model_name = "intfloat/multilingual-e5-large-instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Initialize model and load weights.
    # map_location lets a checkpoint saved on GPU load on a CPU-only runtime;
    # weights_only=True restricts unpickling to tensor data, which is all a
    # state dict contains.
    model = HindiEmotionModel(model_name, num_classes=6).to(device)
    state_dict = torch.load(
        '/content/RUS_emotion_model_best.pth',
        map_location=device,
        weights_only=True,
    )
    model.load_state_dict(state_dict)
    model.eval()

    # Create dataset and dataloader; shuffle=False keeps rows aligned with ids
    dataset = HindiEmotionPredictionDataset(texts, tokenizer)
    dataloader = DataLoader(dataset, batch_size=16, shuffle=False)

    # Make predictions
    predictions = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Making Predictions"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids, attention_mask)
            batch_preds = (outputs.cpu().numpy() > 0.5).astype(int)
            predictions.extend(batch_preds)

    # Build the output frame; inserting at position 0 already puts ``id``
    # first, so no further column reordering is needed.
    output_df = pd.DataFrame(predictions, columns=label_columns)
    output_df.insert(0, 'id', ids)

    # Save predictions. The message is derived from output_path so it always
    # names the file actually written (it previously reported
    # /content/predictions.csv).
    output_df.to_csv(output_path, index=False)
    print(f"Predictions saved to {output_path}")


if __name__ == "__main__":
    predict_and_save()

  model.load_state_dict(torch.load('/content/RUS_emotion_model_best.pth'))
Making Predictions: 100%|██████████| 63/63 [00:26<00:00,  2.42it/s]


Predictions saved to /content/predictions.csv


# E5 large German

In [None]:
import torch
import torch.nn as nn
import pandas as pd
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score, accuracy_score, classification_report
import numpy as np
from tqdm import tqdm

# Custom Dataset class for emotion data (class name kept from the Hindi cell; used here for German)
class HindiEmotionDataset(Dataset):
    """Map-style dataset pairing each text with its multi-label emotion target.

    Args:
        texts: Indexable collection of input texts; every item is passed
            through ``str()`` before tokenization.
        labels: Indexable collection of per-sample label vectors aligned
            with ``texts``.
        tokenizer: Callable returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        max_length: Length every sample is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=150):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its tensors plus float labels."""
        text = str(self.texts[idx])
        inputs = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        # Remove only the leading (batch) dimension of the tokenizer output.
        # A bare squeeze() would also collapse the sequence dimension when
        # max_length == 1 and hand 0-d tensors to the DataLoader.
        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.float)
        }

class HindiEmotionModel(nn.Module):
    """Pretrained encoder followed by dropout and a linear multi-label head.

    Args:
        model_name: Identifier passed to ``AutoModel.from_pretrained``.
        num_classes: Number of independent emotion labels to score.
    """

    def __init__(self, model_name, num_classes):
        super().__init__()
        self.embedding_model = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        # Size the head from the encoder's own config rather than assuming a
        # 1024-dimensional model; keep the previous constant as a fallback
        # when the config does not expose ``hidden_size``.
        hidden_size = getattr(self.embedding_model.config, "hidden_size", 1024)
        self.fc = nn.Linear(hidden_size, num_classes)
        self.num_classes = num_classes

    def forward(self, input_ids, attention_mask):
        """Return per-class probabilities (sigmoid of the linear head)."""
        outputs = self.embedding_model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # The final hidden state at the first position serves as the
        # sentence representation.
        cls_embedding = outputs.last_hidden_state[:, 0, :]
        x = self.dropout(cls_embedding)
        logits = self.fc(x)
        return torch.sigmoid(logits)

def train_model(model, train_loader, optimizer, device, test_loader, num_epochs=15, patience=3, save_path="/content/hindi_emotion_model_best.pth"):
    """Train ``model`` with binary cross-entropy and early stopping on macro F1.

    After each epoch the model is scored with ``evaluate_model`` on
    ``test_loader``. The state dict is written to ``save_path`` whenever the
    macro F1 strictly exceeds the best value so far, and training stops once
    ``patience`` consecutive epochs bring no improvement.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``; its
            outputs are fed to ``nn.BCELoss``, so they must be probabilities.
        train_loader: Iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` entries.
        optimizer: Optimizer over the model's parameters.
        device: Device the batch tensors are moved to.
        test_loader: Batches used for the per-epoch evaluation.
        num_epochs: Maximum number of epochs.
        patience: Number of consecutive non-improving epochs tolerated.
        save_path: Destination of the best checkpoint.
    """
    criterion = nn.BCELoss()

    best_f1 = 0.0
    epochs_without_improvement = 0

    for epoch in range(num_epochs):
        # evaluate_model() switches the model to eval mode at the end of
        # every epoch, so training mode has to be re-enabled here; otherwise
        # dropout stays disabled from the second epoch onwards.
        model.train()

        total_loss = 0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}", unit="batch")

        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            progress_bar.set_postfix(loss=loss.item())

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {avg_loss:.4f}")

        # Evaluate model on test dataset after each epoch
        print(f"Evaluating after Epoch {epoch + 1}...")
        macro_f1 = evaluate_model(model, test_loader, device)
        print(f"Macro F1-Score after Epoch {epoch + 1}: {macro_f1:.4f}")

        # Check for improvement and early stopping
        if macro_f1 > best_f1:
            best_f1 = macro_f1
            torch.save(model.state_dict(), save_path)
            print(f"New best model saved with Macro F1: {best_f1:.4f}")
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1
            print(f"No F1 improvement. Patience: {epochs_without_improvement}/{patience}")
            if epochs_without_improvement >= patience:
                print(f"Early stopping triggered at epoch {epoch + 1}")
                break

    print(f"Training complete. Best Macro F1: {best_f1:.4f}")

def evaluate_model(model, test_loader, device):
    """Compute the macro F1 of ``model`` on the batches in ``test_loader``.

    The model is switched to eval mode and gradients are disabled for the
    pass. A 0.5 cut-off turns its outputs into binary predictions.
    """
    model.eval()
    expected = []
    produced = []

    with torch.no_grad():
        for batch in test_loader:
            batch_ids = batch['input_ids'].to(device)
            batch_mask = batch['attention_mask'].to(device)
            batch_labels = batch['labels'].to(device)

            batch_out = model(batch_ids, batch_mask)
            # Keep one NumPy row per sample, moved off the device.
            expected.extend(batch_labels.cpu().numpy())
            produced.extend(batch_out.cpu().numpy())

    expected = np.array(expected)
    decisions = (np.array(produced) > 0.5).astype(int)

    return f1_score(expected, decisions, average='macro')

def main():
    """Fine-tune on /content/deu.csv, validating on /content/deu_dev.csv.

    ``train_model`` writes the best checkpoint (by macro F1) to
    /content/DEU_emotion_model_best.pth.
    """
    emotion_columns = ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']

    train_df = pd.read_csv('/content/deu.csv')
    test_df = pd.read_csv('/content/deu_dev.csv')

    # Texts as plain lists, labels as arrays in emotion_columns order.
    train_texts, train_labels = train_df['text'].tolist(), train_df[emotion_columns].values
    test_texts, test_labels = test_df['text'].tolist(), test_df[emotion_columns].values

    model_name = "intfloat/multilingual-e5-large-instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    train_loader = DataLoader(
        HindiEmotionDataset(train_texts, train_labels, tokenizer),
        batch_size=16,
        shuffle=True,
    )
    test_loader = DataLoader(
        HindiEmotionDataset(test_texts, test_labels, tokenizer),
        batch_size=16,
        shuffle=False,
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = HindiEmotionModel(model_name, len(emotion_columns)).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

    train_model(
        model, train_loader, optimizer, device, test_loader,
        num_epochs=15,
        patience=4,
        save_path="/content/DEU_emotion_model_best.pth",
    )


if __name__ == "__main__":
    main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Epoch 1/15: 100%|██████████| 163/163 [03:42<00:00,  1.37s/batch, loss=0.361]


Epoch 1/15, Average Loss: 0.4001
Evaluating after Epoch 1...
Macro F1-Score after Epoch 1: 0.4226
New best model saved with Macro F1: 0.4226


Epoch 2/15: 100%|██████████| 163/163 [03:40<00:00,  1.35s/batch, loss=0.311]


Epoch 2/15, Average Loss: 0.2573
Evaluating after Epoch 2...
Macro F1-Score after Epoch 2: 0.4319
New best model saved with Macro F1: 0.4319


Epoch 3/15: 100%|██████████| 163/163 [03:41<00:00,  1.36s/batch, loss=0.0907]


Epoch 3/15, Average Loss: 0.1652
Evaluating after Epoch 3...
Macro F1-Score after Epoch 3: 0.6098
New best model saved with Macro F1: 0.6098


Epoch 4/15: 100%|██████████| 163/163 [03:41<00:00,  1.36s/batch, loss=0.0518]


Epoch 4/15, Average Loss: 0.0874
Evaluating after Epoch 4...
Macro F1-Score after Epoch 4: 0.6434
New best model saved with Macro F1: 0.6434


Epoch 5/15: 100%|██████████| 163/163 [03:40<00:00,  1.35s/batch, loss=0.0372]


Epoch 5/15, Average Loss: 0.0437
Evaluating after Epoch 5...
Macro F1-Score after Epoch 5: 0.5889
No F1 improvement. Patience: 1/4


Epoch 6/15: 100%|██████████| 163/163 [03:40<00:00,  1.35s/batch, loss=0.0132]


Epoch 6/15, Average Loss: 0.0189
Evaluating after Epoch 6...
Macro F1-Score after Epoch 6: 0.6282
No F1 improvement. Patience: 2/4


Epoch 7/15:  45%|████▍     | 73/163 [01:39<02:02,  1.36s/batch, loss=0.00842]


KeyboardInterrupt: 

## German Prediction

In [None]:
import torch
import torch.nn as nn
import pandas as pd
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
import numpy as np
from tqdm import tqdm

class HindiEmotionPredictionDataset(Dataset):
    """Map-style dataset of unlabeled texts for inference.

    Args:
        texts: Indexable collection of input texts; every item is passed
            through ``str()`` before tokenization.
        tokenizer: Callable returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        max_length: Length every sample is padded/truncated to.
    """

    def __init__(self, texts, tokenizer, max_length=150):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its input tensors."""
        text = str(self.texts[idx])
        inputs = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        # Remove only the leading (batch) dimension of the tokenizer output.
        # A bare squeeze() would also collapse the sequence dimension when
        # max_length == 1 and hand 0-d tensors to the DataLoader.
        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
        }

class HindiEmotionModel(nn.Module):
    """Pretrained encoder followed by dropout and a linear multi-label head.

    Args:
        model_name: Identifier passed to ``AutoModel.from_pretrained``.
        num_classes: Number of independent labels the head predicts.
    """

    def __init__(self, model_name, num_classes):
        super().__init__()
        self.embedding_model = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        # Size the head from the encoder's own config instead of hard-coding
        # 1024, so a different backbone does not need a code change. For the
        # 1024-dimensional encoder used in this notebook the layer shape is
        # unchanged, so previously saved state dicts still load.
        hidden_size = self.embedding_model.config.hidden_size
        self.fc = nn.Linear(hidden_size, num_classes)
        self.num_classes = num_classes

    def forward(self, input_ids, attention_mask):
        """Return per-label sigmoid probabilities of shape (batch, num_classes)."""
        outputs = self.embedding_model(input_ids=input_ids, attention_mask=attention_mask)
        # Use the hidden state at sequence position 0 as the sentence vector.
        cls_embedding = outputs.last_hidden_state[:, 0, :]
        x = self.dropout(cls_embedding)
        logits = self.fc(x)
        return torch.sigmoid(logits)

def predict_and_save():
    """Predict emotion labels for the German prediction split and write them to CSV.

    Reads ``/content/deu_pred.csv`` (columns ``id`` and ``text``), loads the
    fine-tuned weights from ``/content/DEU_emotion_model_best.pth``,
    thresholds the model's sigmoid outputs at 0.5 and writes one row per
    input to ``/content/predictions_deu.csv``.
    """
    # Single source of truth for the output location. The file was previously
    # written to ``predictions_rus.csv`` while the message reported
    # ``predictions_deu.csv``.
    output_path = '/content/predictions_deu.csv'
    label_columns = ['Anger', 'Disgust', 'Fear', 'Joy', 'Sadness', 'Surprise']

    # Load prediction data
    pred_df = pd.read_csv('/content/deu_pred.csv')
    ids = pred_df['id'].tolist()
    texts = pred_df['text'].tolist()

    # Model configuration
    model_name = "intfloat/multilingual-e5-large-instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Initialize model and load weights. ``map_location`` lets a checkpoint
    # saved on a GPU be loaded on a CPU-only machine.
    model = HindiEmotionModel(model_name, num_classes=6).to(device)
    state_dict = torch.load('/content/DEU_emotion_model_best.pth', map_location=device)
    model.load_state_dict(state_dict)
    model.eval()

    # shuffle=False keeps predictions aligned with ``ids``.
    dataset = HindiEmotionPredictionDataset(texts, tokenizer)
    dataloader = DataLoader(dataset, batch_size=16, shuffle=False)

    # Make predictions
    predictions = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Making Predictions"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids, attention_mask)
            batch_preds = (outputs.cpu().numpy() > 0.5).astype(int)
            predictions.extend(batch_preds)

    # One row per input text, with the id as the first column.
    output_df = pd.DataFrame(predictions, columns=label_columns)
    output_df.insert(0, 'id', ids)

    # Save predictions
    output_df.to_csv(output_path, index=False)
    print(f"Predictions saved to {output_path}")

# Entry point: run the German inference when this cell/script is executed directly.
if __name__ == "__main__":
    predict_and_save()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  model.load_state_dict(torch.load('/content/DEU_emotion_model_best.pth'))
Making Predictions: 100%|██████████| 163/163 [01:09<00:00,  2.33it/s]


Predictions saved to /content/predictions_deu.csv


# Amharic

In [None]:
import torch
import torch.nn as nn
import pandas as pd
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score, accuracy_score, classification_report
import numpy as np
from tqdm import tqdm

# Custom Dataset class for labelled emotion data (the class name is a holdover;
# nothing in it is specific to Hindi)
class HindiEmotionDataset(Dataset):
    """Labelled text dataset that tokenizes one sample per lookup.

    Args:
        texts: Indexable collection of raw texts; each item is passed through
            ``str()`` before tokenization.
        labels: Indexable collection of label vectors, one per text.
        tokenizer: Callable invoked as ``tokenizer(text, padding=...,
            truncation=..., max_length=..., return_tensors="pt")`` whose
            result exposes ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length passed to the tokenizer for padding/truncation.
    """

    def __init__(self, texts, labels, tokenizer, max_length=150):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return token tensors (batch axis removed) and float labels for ``idx``."""
        text = str(self.texts[idx])
        inputs = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        # Remove only axis 0 (assumes the tokenizer returns tensors shaped
        # (1, max_length) for a single text). A bare ``squeeze()`` would also
        # collapse the sequence axis when ``max_length == 1``.
        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            # Float labels, as required by the BCE loss used for training.
            'labels': torch.tensor(self.labels[idx], dtype=torch.float)
        }

class HindiEmotionModel(nn.Module):
    """Pretrained encoder followed by dropout and a linear multi-label head.

    Args:
        model_name: Identifier passed to ``AutoModel.from_pretrained``.
        num_classes: Number of independent labels the head predicts.
    """

    def __init__(self, model_name, num_classes):
        super().__init__()
        self.embedding_model = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        # Size the head from the encoder's own config instead of hard-coding
        # 1024, so a different backbone does not need a code change. For the
        # 1024-dimensional encoder used in this notebook the layer shape is
        # unchanged, so previously saved state dicts still load.
        hidden_size = self.embedding_model.config.hidden_size
        self.fc = nn.Linear(hidden_size, num_classes)
        self.num_classes = num_classes

    def forward(self, input_ids, attention_mask):
        """Return per-label sigmoid probabilities of shape (batch, num_classes)."""
        outputs = self.embedding_model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # Use the hidden state at sequence position 0 as the sentence vector.
        cls_embedding = outputs.last_hidden_state[:, 0, :]
        x = self.dropout(cls_embedding)
        logits = self.fc(x)
        return torch.sigmoid(logits)

def train_model(model, train_loader, optimizer, device, test_loader, num_epochs=15, patience=3, save_path="/content/hindi_emotion_model_best.pth"):
    """Train ``model`` with BCE loss, keeping the checkpoint with the best macro F1.

    After every epoch the model is scored with ``evaluate_model`` on
    ``test_loader``. Its ``state_dict`` is written to ``save_path`` whenever
    the macro F1 strictly improves, and training stops early once ``patience``
    consecutive epochs bring no improvement.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns probabilities (``nn.BCELoss`` is applied to its output).
        train_loader: Iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        optimizer: Optimizer over the model's parameters.
        device: Device the batches are moved to.
        test_loader: Loader passed to ``evaluate_model`` after each epoch.
        num_epochs: Maximum number of epochs.
        patience: Non-improving epochs tolerated before stopping.
        save_path: Destination of the best ``state_dict``.
    """
    criterion = nn.BCELoss()

    best_f1 = 0.0
    epochs_without_improvement = 0

    for epoch in range(num_epochs):
        # evaluate_model() switches the model to eval mode, so training mode
        # has to be re-enabled at the start of every epoch; otherwise dropout
        # is silently disabled from the second epoch onward.
        model.train()

        total_loss = 0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}", unit="batch")

        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            progress_bar.set_postfix(loss=loss.item())

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {avg_loss:.4f}")

        # Evaluate model on test dataset after each epoch
        print(f"Evaluating after Epoch {epoch + 1}...")
        macro_f1 = evaluate_model(model, test_loader, device)
        print(f"Macro F1-Score after Epoch {epoch + 1}: {macro_f1:.4f}")

        # Check for improvement and early stopping
        if macro_f1 > best_f1:
            best_f1 = macro_f1
            torch.save(model.state_dict(), save_path)
            print(f"New best model saved with Macro F1: {best_f1:.4f}")
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1
            print(f"No F1 improvement. Patience: {epochs_without_improvement}/{patience}")
            if epochs_without_improvement >= patience:
                print(f"Early stopping triggered at epoch {epoch + 1}")
                break

    print(f"Training complete. Best Macro F1: {best_f1:.4f}")

def evaluate_model(model, test_loader, device):
    """Score ``model`` on ``test_loader`` and return the macro-averaged F1.

    The model is put in eval mode (and left there), run without gradient
    tracking, and its outputs are binarized with a 0.5 threshold before
    being compared with the gold labels.
    """
    model.eval()
    gold_rows, prob_rows = [], []

    with torch.no_grad():
        for batch in test_loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            gold = batch['labels'].to(device)

            probs = model(ids, mask)
            gold_rows.extend(gold.cpu().numpy())
            prob_rows.extend(probs.cpu().numpy())

    y_true = np.array(gold_rows)
    # Strictly greater than 0.5 counts as a positive prediction.
    y_pred = (np.array(prob_rows) > 0.5).astype(int)

    return f1_score(y_true, y_pred, average='macro')

def main():
    """Fine-tune the classifier on ``/content/amh.csv`` and score it on ``/content/amh_dev.csv``.

    The best checkpoint (by macro F1 on the dev file) is written to
    ``/content/AMH_emotion_model_best.pth``.
    """
    emotion_columns = ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']

    train_df = pd.read_csv('/content/amh.csv')
    test_df = pd.read_csv('/content/amh_dev.csv')

    model_name = "intfloat/multilingual-e5-large-instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Wrap both splits; only the training loader is shuffled.
    train_loader = DataLoader(
        HindiEmotionDataset(train_df['text'].tolist(), train_df[emotion_columns].values, tokenizer),
        batch_size=16,
        shuffle=True,
    )
    test_loader = DataLoader(
        HindiEmotionDataset(test_df['text'].tolist(), test_df[emotion_columns].values, tokenizer),
        batch_size=16,
        shuffle=False,
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = HindiEmotionModel(model_name, len(emotion_columns)).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

    train_model(
        model,
        train_loader,
        optimizer,
        device,
        test_loader,
        num_epochs=15,
        patience=4,
        save_path="/content/AMH_emotion_model_best.pth",
    )

# Entry point: start the Amharic training run when this cell/script is executed directly.
if __name__ == "__main__":
    main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Epoch 1/15: 100%|██████████| 222/222 [05:27<00:00,  1.47s/batch, loss=0.312]


Epoch 1/15, Average Loss: 0.3846
Evaluating after Epoch 1...
Macro F1-Score after Epoch 1: 0.4637
New best model saved with Macro F1: 0.4637


Epoch 2/15: 100%|██████████| 222/222 [05:19<00:00,  1.44s/batch, loss=0.253]


Epoch 2/15, Average Loss: 0.2568
Evaluating after Epoch 2...
Macro F1-Score after Epoch 2: 0.5785
New best model saved with Macro F1: 0.5785


Epoch 3/15: 100%|██████████| 222/222 [05:19<00:00,  1.44s/batch, loss=0.199]


Epoch 3/15, Average Loss: 0.1818
Evaluating after Epoch 3...
Macro F1-Score after Epoch 3: 0.6449
New best model saved with Macro F1: 0.6449


Epoch 4/15: 100%|██████████| 222/222 [05:19<00:00,  1.44s/batch, loss=0.0413]


Epoch 4/15, Average Loss: 0.1095
Evaluating after Epoch 4...
Macro F1-Score after Epoch 4: 0.6446
No F1 improvement. Patience: 1/4


Epoch 5/15: 100%|██████████| 222/222 [05:19<00:00,  1.44s/batch, loss=0.0316]


Epoch 5/15, Average Loss: 0.0607
Evaluating after Epoch 5...
Macro F1-Score after Epoch 5: 0.6583
New best model saved with Macro F1: 0.6583


Epoch 6/15: 100%|██████████| 222/222 [05:19<00:00,  1.44s/batch, loss=0.0335]


Epoch 6/15, Average Loss: 0.0367
Evaluating after Epoch 6...
Macro F1-Score after Epoch 6: 0.6844
New best model saved with Macro F1: 0.6844


Epoch 7/15: 100%|██████████| 222/222 [05:19<00:00,  1.44s/batch, loss=0.0181]


Epoch 7/15, Average Loss: 0.0182
Evaluating after Epoch 7...
Macro F1-Score after Epoch 7: 0.6855
New best model saved with Macro F1: 0.6855


Epoch 8/15: 100%|██████████| 222/222 [05:19<00:00,  1.44s/batch, loss=0.0201]


Epoch 8/15, Average Loss: 0.0086
Evaluating after Epoch 8...
Macro F1-Score after Epoch 8: 0.6576
No F1 improvement. Patience: 1/4


Epoch 9/15:  16%|█▌        | 36/222 [00:52<04:31,  1.46s/batch, loss=0.00333]


KeyboardInterrupt: 

## Amharic pred

In [None]:
import torch
import torch.nn as nn
import pandas as pd
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
import numpy as np
from tqdm import tqdm

class HindiEmotionPredictionDataset(Dataset):
    """Unlabelled text dataset that tokenizes one sample per lookup.

    The class only forwards each text to the supplied tokenizer, so it is not
    tied to any particular language despite its name (kept for existing
    callers).

    Args:
        texts: Indexable collection of raw texts; each item is passed through
            ``str()`` before tokenization.
        tokenizer: Callable invoked as ``tokenizer(text, padding=...,
            truncation=..., max_length=..., return_tensors="pt")`` whose
            result exposes ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length passed to the tokenizer for padding/truncation.
    """

    def __init__(self, texts, tokenizer, max_length=150):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at ``idx`` and return its tensors without the batch axis."""
        text = str(self.texts[idx])
        inputs = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        # Remove only axis 0 (assumes the tokenizer returns tensors shaped
        # (1, max_length) for a single text). A bare ``squeeze()`` would also
        # collapse the sequence axis when ``max_length == 1`` and hand 0-d
        # tensors to the DataLoader.
        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
        }

class HindiEmotionModel(nn.Module):
    """Pretrained encoder followed by dropout and a linear multi-label head.

    Args:
        model_name: Identifier passed to ``AutoModel.from_pretrained``.
        num_classes: Number of independent labels the head predicts.
    """

    def __init__(self, model_name, num_classes):
        super().__init__()
        self.embedding_model = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        # Size the head from the encoder's own config instead of hard-coding
        # 1024, so a different backbone does not need a code change. For the
        # 1024-dimensional encoder used in this notebook the layer shape is
        # unchanged, so previously saved state dicts still load.
        hidden_size = self.embedding_model.config.hidden_size
        self.fc = nn.Linear(hidden_size, num_classes)
        self.num_classes = num_classes

    def forward(self, input_ids, attention_mask):
        """Return per-label sigmoid probabilities of shape (batch, num_classes)."""
        outputs = self.embedding_model(input_ids=input_ids, attention_mask=attention_mask)
        # Use the hidden state at sequence position 0 as the sentence vector.
        cls_embedding = outputs.last_hidden_state[:, 0, :]
        x = self.dropout(cls_embedding)
        logits = self.fc(x)
        return torch.sigmoid(logits)

def predict_and_save():
    """Predict emotion labels for the Amharic prediction split and write them to CSV.

    Reads ``/content/amh_pred.csv`` (columns ``id`` and ``text``), loads the
    fine-tuned weights from ``/content/AMH_emotion_model_best.pth``,
    thresholds the model's sigmoid outputs at 0.5 and writes one row per
    input to ``/content/predictions_amh.csv``.
    """
    # Single source of truth so the saved file and the message cannot diverge.
    output_path = '/content/predictions_amh.csv'
    label_columns = ['Anger', 'Disgust', 'Fear', 'Joy', 'Sadness', 'Surprise']

    # Load prediction data
    pred_df = pd.read_csv('/content/amh_pred.csv')
    ids = pred_df['id'].tolist()
    texts = pred_df['text'].tolist()

    # Model configuration
    model_name = "intfloat/multilingual-e5-large-instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Initialize model and load weights. ``map_location`` lets a checkpoint
    # saved on a GPU be loaded on a CPU-only machine.
    model = HindiEmotionModel(model_name, num_classes=6).to(device)
    state_dict = torch.load('/content/AMH_emotion_model_best.pth', map_location=device)
    model.load_state_dict(state_dict)
    model.eval()

    # shuffle=False keeps predictions aligned with ``ids``.
    dataset = HindiEmotionPredictionDataset(texts, tokenizer)
    dataloader = DataLoader(dataset, batch_size=16, shuffle=False)

    # Make predictions
    predictions = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Making Predictions"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids, attention_mask)
            batch_preds = (outputs.cpu().numpy() > 0.5).astype(int)
            predictions.extend(batch_preds)

    # One row per input text, with the id as the first column.
    output_df = pd.DataFrame(predictions, columns=label_columns)
    output_df.insert(0, 'id', ids)

    # Save predictions
    output_df.to_csv(output_path, index=False)
    print(f"Predictions saved to {output_path}")

# Entry point: run the Amharic inference when this cell/script is executed directly.
if __name__ == "__main__":
    predict_and_save()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  model.load_state_dict(torch.load('/content/AMH_emotion_model_best.pth'))
Making Predictions: 100%|██████████| 111/111 [00:51<00:00,  2.16it/s]


Predictions saved to /content/predictions_amh.csv


# Chinese test

In [None]:
import torch
import torch.nn as nn
import pandas as pd
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score, accuracy_score, classification_report
import numpy as np
from tqdm import tqdm

# Custom Dataset class for labelled emotion data (the class name is a holdover;
# nothing in it is specific to Hindi)
class HindiEmotionDataset(Dataset):
    """Labelled text dataset that tokenizes one sample per lookup.

    Args:
        texts: Indexable collection of raw texts; each item is passed through
            ``str()`` before tokenization.
        labels: Indexable collection of label vectors, one per text.
        tokenizer: Callable invoked as ``tokenizer(text, padding=...,
            truncation=..., max_length=..., return_tensors="pt")`` whose
            result exposes ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length passed to the tokenizer for padding/truncation.
    """

    def __init__(self, texts, labels, tokenizer, max_length=150):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return token tensors (batch axis removed) and float labels for ``idx``."""
        text = str(self.texts[idx])
        inputs = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        # Remove only axis 0 (assumes the tokenizer returns tensors shaped
        # (1, max_length) for a single text). A bare ``squeeze()`` would also
        # collapse the sequence axis when ``max_length == 1``.
        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            # Float labels, as required by the BCE loss used for training.
            'labels': torch.tensor(self.labels[idx], dtype=torch.float)
        }

class HindiEmotionModel(nn.Module):
    """Pretrained encoder followed by dropout and a linear multi-label head.

    Args:
        model_name: Identifier passed to ``AutoModel.from_pretrained``.
        num_classes: Number of independent labels the head predicts.
    """

    def __init__(self, model_name, num_classes):
        super().__init__()
        self.embedding_model = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        # Size the head from the encoder's own config instead of hard-coding
        # 1024, so a different backbone does not need a code change. For the
        # 1024-dimensional encoder used in this notebook the layer shape is
        # unchanged, so previously saved state dicts still load.
        hidden_size = self.embedding_model.config.hidden_size
        self.fc = nn.Linear(hidden_size, num_classes)
        self.num_classes = num_classes

    def forward(self, input_ids, attention_mask):
        """Return per-label sigmoid probabilities of shape (batch, num_classes)."""
        outputs = self.embedding_model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # Use the hidden state at sequence position 0 as the sentence vector.
        cls_embedding = outputs.last_hidden_state[:, 0, :]
        x = self.dropout(cls_embedding)
        logits = self.fc(x)
        return torch.sigmoid(logits)

def train_model(model, train_loader, optimizer, device, test_loader, num_epochs=15, patience=3, save_path="/content/hindi_emotion_model_best.pth"):
    """Train ``model`` with BCE loss, keeping the checkpoint with the best macro F1.

    After every epoch the model is scored with ``evaluate_model`` on
    ``test_loader``. Its ``state_dict`` is written to ``save_path`` whenever
    the macro F1 strictly improves, and training stops early once ``patience``
    consecutive epochs bring no improvement.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns probabilities (``nn.BCELoss`` is applied to its output).
        train_loader: Iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        optimizer: Optimizer over the model's parameters.
        device: Device the batches are moved to.
        test_loader: Loader passed to ``evaluate_model`` after each epoch.
        num_epochs: Maximum number of epochs.
        patience: Non-improving epochs tolerated before stopping.
        save_path: Destination of the best ``state_dict``.
    """
    criterion = nn.BCELoss()

    best_f1 = 0.0
    epochs_without_improvement = 0

    for epoch in range(num_epochs):
        # evaluate_model() switches the model to eval mode, so training mode
        # has to be re-enabled at the start of every epoch; otherwise dropout
        # is silently disabled from the second epoch onward.
        model.train()

        total_loss = 0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}", unit="batch")

        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            progress_bar.set_postfix(loss=loss.item())

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {avg_loss:.4f}")

        # Evaluate model on test dataset after each epoch
        print(f"Evaluating after Epoch {epoch + 1}...")
        macro_f1 = evaluate_model(model, test_loader, device)
        print(f"Macro F1-Score after Epoch {epoch + 1}: {macro_f1:.4f}")

        # Check for improvement and early stopping
        if macro_f1 > best_f1:
            best_f1 = macro_f1
            torch.save(model.state_dict(), save_path)
            print(f"New best model saved with Macro F1: {best_f1:.4f}")
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1
            print(f"No F1 improvement. Patience: {epochs_without_improvement}/{patience}")
            if epochs_without_improvement >= patience:
                print(f"Early stopping triggered at epoch {epoch + 1}")
                break

    print(f"Training complete. Best Macro F1: {best_f1:.4f}")

def evaluate_model(model, test_loader, device):
    """Score ``model`` on ``test_loader`` and return the macro-averaged F1.

    The model is put in eval mode (and left there), run without gradient
    tracking, and its outputs are binarized with a 0.5 threshold before
    being compared with the gold labels.
    """
    model.eval()
    gold_rows, prob_rows = [], []

    with torch.no_grad():
        for batch in test_loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            gold = batch['labels'].to(device)

            probs = model(ids, mask)
            gold_rows.extend(gold.cpu().numpy())
            prob_rows.extend(probs.cpu().numpy())

    y_true = np.array(gold_rows)
    # Strictly greater than 0.5 counts as a positive prediction.
    y_pred = (np.array(prob_rows) > 0.5).astype(int)

    return f1_score(y_true, y_pred, average='macro')

def main():
    """Fine-tune the classifier on ``/content/chn.csv`` and score it on ``/content/chn_dev.csv``.

    The best checkpoint (by macro F1 on the dev file) is written to
    ``/content/CHN_emotion_model_best.pth``.
    """
    emotion_columns = ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']

    train_df = pd.read_csv('/content/chn.csv')
    test_df = pd.read_csv('/content/chn_dev.csv')

    model_name = "intfloat/multilingual-e5-large-instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Wrap both splits; only the training loader is shuffled.
    train_loader = DataLoader(
        HindiEmotionDataset(train_df['text'].tolist(), train_df[emotion_columns].values, tokenizer),
        batch_size=16,
        shuffle=True,
    )
    test_loader = DataLoader(
        HindiEmotionDataset(test_df['text'].tolist(), test_df[emotion_columns].values, tokenizer),
        batch_size=16,
        shuffle=False,
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = HindiEmotionModel(model_name, len(emotion_columns)).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

    train_model(
        model,
        train_loader,
        optimizer,
        device,
        test_loader,
        num_epochs=15,
        patience=4,
        save_path="/content/CHN_emotion_model_best.pth",
    )

# Entry point: start the Chinese training run when this cell/script is executed directly.
if __name__ == "__main__":
    main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Epoch 1/15: 100%|██████████| 166/166 [03:53<00:00,  1.40s/batch, loss=0.277]


Epoch 1/15, Average Loss: 0.3337
Evaluating after Epoch 1...
Macro F1-Score after Epoch 1: 0.3345
New best model saved with Macro F1: 0.3345


Epoch 2/15: 100%|██████████| 166/166 [03:53<00:00,  1.41s/batch, loss=0.112]


Epoch 2/15, Average Loss: 0.2095
Evaluating after Epoch 2...
Macro F1-Score after Epoch 2: 0.4258
New best model saved with Macro F1: 0.4258


Epoch 3/15: 100%|██████████| 166/166 [03:57<00:00,  1.43s/batch, loss=0.116]


Epoch 3/15, Average Loss: 0.1389
Evaluating after Epoch 3...
Macro F1-Score after Epoch 3: 0.6119
New best model saved with Macro F1: 0.6119


Epoch 4/15: 100%|██████████| 166/166 [03:56<00:00,  1.43s/batch, loss=0.0296]


Epoch 4/15, Average Loss: 0.0790
Evaluating after Epoch 4...
Macro F1-Score after Epoch 4: 0.5607
No F1 improvement. Patience: 1/4


Epoch 5/15: 100%|██████████| 166/166 [03:56<00:00,  1.42s/batch, loss=0.0269]


Epoch 5/15, Average Loss: 0.0436
Evaluating after Epoch 5...
Macro F1-Score after Epoch 5: 0.6093
No F1 improvement. Patience: 2/4


Epoch 6/15: 100%|██████████| 166/166 [03:56<00:00,  1.42s/batch, loss=0.00665]


Epoch 6/15, Average Loss: 0.0176
Evaluating after Epoch 6...
Macro F1-Score after Epoch 6: 0.6212
New best model saved with Macro F1: 0.6212


Epoch 7/15: 100%|██████████| 166/166 [03:56<00:00,  1.43s/batch, loss=0.0157]


Epoch 7/15, Average Loss: 0.0106
Evaluating after Epoch 7...
Macro F1-Score after Epoch 7: 0.6064
No F1 improvement. Patience: 1/4


Epoch 8/15: 100%|██████████| 166/166 [03:56<00:00,  1.43s/batch, loss=0.00545]


Epoch 8/15, Average Loss: 0.0069
Evaluating after Epoch 8...
Macro F1-Score after Epoch 8: 0.6014
No F1 improvement. Patience: 2/4


Epoch 9/15:  11%|█         | 18/166 [00:27<03:43,  1.51s/batch, loss=0.014]


KeyboardInterrupt: 

## Chinese pred

In [None]:
import torch
import torch.nn as nn
import pandas as pd
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
import numpy as np
from tqdm import tqdm

class HindiEmotionPredictionDataset(Dataset):
    """Unlabelled text dataset that tokenizes one sample per lookup.

    The class only forwards each text to the supplied tokenizer, so it is not
    tied to any particular language despite its name (kept for existing
    callers).

    Args:
        texts: Indexable collection of raw texts; each item is passed through
            ``str()`` before tokenization.
        tokenizer: Callable invoked as ``tokenizer(text, padding=...,
            truncation=..., max_length=..., return_tensors="pt")`` whose
            result exposes ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length passed to the tokenizer for padding/truncation.
    """

    def __init__(self, texts, tokenizer, max_length=150):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at ``idx`` and return its tensors without the batch axis."""
        text = str(self.texts[idx])
        inputs = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        # Remove only axis 0 (assumes the tokenizer returns tensors shaped
        # (1, max_length) for a single text). A bare ``squeeze()`` would also
        # collapse the sequence axis when ``max_length == 1`` and hand 0-d
        # tensors to the DataLoader.
        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
        }

class HindiEmotionModel(nn.Module):
    """Pretrained encoder followed by dropout and a linear multi-label head.

    Args:
        model_name: Identifier passed to ``AutoModel.from_pretrained``.
        num_classes: Number of independent labels the head predicts.
    """

    def __init__(self, model_name, num_classes):
        super().__init__()
        self.embedding_model = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        # Size the head from the encoder's own config instead of hard-coding
        # 1024, so a different backbone does not need a code change. For the
        # 1024-dimensional encoder used in this notebook the layer shape is
        # unchanged, so previously saved state dicts still load.
        hidden_size = self.embedding_model.config.hidden_size
        self.fc = nn.Linear(hidden_size, num_classes)
        self.num_classes = num_classes

    def forward(self, input_ids, attention_mask):
        """Return per-label sigmoid probabilities of shape (batch, num_classes)."""
        outputs = self.embedding_model(input_ids=input_ids, attention_mask=attention_mask)
        # Use the hidden state at sequence position 0 as the sentence vector.
        cls_embedding = outputs.last_hidden_state[:, 0, :]
        x = self.dropout(cls_embedding)
        logits = self.fc(x)
        return torch.sigmoid(logits)

def predict_and_save():
    """Predict emotion labels for the Chinese prediction split and write them to CSV.

    Reads ``/content/chn_pred.csv`` (columns ``id`` and ``text``), loads the
    fine-tuned weights from ``/content/CHN_emotion_model_best.pth``,
    thresholds the model's sigmoid outputs at 0.5 and writes one row per
    input to ``/content/predictions_chn.csv``.
    """
    # Single source of truth so the saved file and the message cannot diverge.
    output_path = '/content/predictions_chn.csv'
    label_columns = ['Anger', 'Disgust', 'Fear', 'Joy', 'Sadness', 'Surprise']

    # Load prediction data
    pred_df = pd.read_csv('/content/chn_pred.csv')
    ids = pred_df['id'].tolist()
    texts = pred_df['text'].tolist()

    # Model configuration
    model_name = "intfloat/multilingual-e5-large-instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Initialize model and load weights. ``map_location`` lets a checkpoint
    # saved on a GPU be loaded on a CPU-only machine.
    model = HindiEmotionModel(model_name, num_classes=6).to(device)
    state_dict = torch.load('/content/CHN_emotion_model_best.pth', map_location=device)
    model.load_state_dict(state_dict)
    model.eval()

    # shuffle=False keeps predictions aligned with ``ids``.
    dataset = HindiEmotionPredictionDataset(texts, tokenizer)
    dataloader = DataLoader(dataset, batch_size=16, shuffle=False)

    # Make predictions
    predictions = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Making Predictions"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids, attention_mask)
            batch_preds = (outputs.cpu().numpy() > 0.5).astype(int)
            predictions.extend(batch_preds)

    # One row per input text, with the id as the first column.
    output_df = pd.DataFrame(predictions, columns=label_columns)
    output_df.insert(0, 'id', ids)

    # Save predictions
    output_df.to_csv(output_path, index=False)
    print(f"Predictions saved to {output_path}")

# Entry point: run the Chinese inference when this cell/script is executed directly.
if __name__ == "__main__":
    predict_and_save()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  model.load_state_dict(torch.load('/content/CHN_emotion_model_best.pth'))
Making Predictions: 100%|██████████| 166/166 [01:16<00:00,  2.16it/s]


Predictions saved to /content/predictions_chn.csv


# Spanish test

In [None]:
import torch
import torch.nn as nn
import pandas as pd
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score, accuracy_score, classification_report
import numpy as np
from tqdm import tqdm

# Custom Dataset class for labelled emotion data (the class name is a holdover;
# nothing in it is specific to Hindi)
class HindiEmotionDataset(Dataset):
    """Labelled text dataset that tokenizes one sample per lookup.

    Args:
        texts: Indexable collection of raw texts; each item is passed through
            ``str()`` before tokenization.
        labels: Indexable collection of label vectors, one per text.
        tokenizer: Callable invoked as ``tokenizer(text, padding=...,
            truncation=..., max_length=..., return_tensors="pt")`` whose
            result exposes ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length passed to the tokenizer for padding/truncation.
    """

    def __init__(self, texts, labels, tokenizer, max_length=150):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return token tensors (batch axis removed) and float labels for ``idx``."""
        text = str(self.texts[idx])
        inputs = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        # Remove only axis 0 (assumes the tokenizer returns tensors shaped
        # (1, max_length) for a single text). A bare ``squeeze()`` would also
        # collapse the sequence axis when ``max_length == 1``.
        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            # Float labels, as required by the BCE loss used for training.
            'labels': torch.tensor(self.labels[idx], dtype=torch.float)
        }

class HindiEmotionModel(nn.Module):
    """Pretrained encoder followed by dropout and a linear multi-label head.

    Args:
        model_name: Identifier passed to ``AutoModel.from_pretrained``.
        num_classes: Number of independent labels the head predicts.
    """

    def __init__(self, model_name, num_classes):
        super().__init__()
        self.embedding_model = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        # Size the head from the encoder's own config instead of hard-coding
        # 1024, so a different backbone does not need a code change. For the
        # 1024-dimensional encoder used in this notebook the layer shape is
        # unchanged, so previously saved state dicts still load.
        hidden_size = self.embedding_model.config.hidden_size
        self.fc = nn.Linear(hidden_size, num_classes)
        self.num_classes = num_classes

    def forward(self, input_ids, attention_mask):
        """Return per-label sigmoid probabilities of shape (batch, num_classes)."""
        outputs = self.embedding_model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # Use the hidden state at sequence position 0 as the sentence vector.
        cls_embedding = outputs.last_hidden_state[:, 0, :]
        x = self.dropout(cls_embedding)
        logits = self.fc(x)
        return torch.sigmoid(logits)

def train_model(model, train_loader, optimizer, device, test_loader, num_epochs=15, patience=3, save_path="/content/hindi_emotion_model_best.pth"):
    """Train ``model`` with BCE loss, keeping the checkpoint with the best macro F1.

    After every epoch the model is scored with ``evaluate_model`` on
    ``test_loader``. Its ``state_dict`` is written to ``save_path`` whenever
    the macro F1 strictly improves, and training stops early once ``patience``
    consecutive epochs bring no improvement.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns probabilities (``nn.BCELoss`` is applied to its output).
        train_loader: Iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        optimizer: Optimizer over the model's parameters.
        device: Device the batches are moved to.
        test_loader: Loader passed to ``evaluate_model`` after each epoch.
        num_epochs: Maximum number of epochs.
        patience: Non-improving epochs tolerated before stopping.
        save_path: Destination of the best ``state_dict``.
    """
    criterion = nn.BCELoss()

    best_f1 = 0.0
    epochs_without_improvement = 0

    for epoch in range(num_epochs):
        # evaluate_model() switches the model to eval mode, so training mode
        # has to be re-enabled at the start of every epoch; otherwise dropout
        # is silently disabled from the second epoch onward.
        model.train()

        total_loss = 0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}", unit="batch")

        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            progress_bar.set_postfix(loss=loss.item())

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {avg_loss:.4f}")

        # Evaluate model on test dataset after each epoch
        print(f"Evaluating after Epoch {epoch + 1}...")
        macro_f1 = evaluate_model(model, test_loader, device)
        print(f"Macro F1-Score after Epoch {epoch + 1}: {macro_f1:.4f}")

        # Check for improvement and early stopping
        if macro_f1 > best_f1:
            best_f1 = macro_f1
            torch.save(model.state_dict(), save_path)
            print(f"New best model saved with Macro F1: {best_f1:.4f}")
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1
            print(f"No F1 improvement. Patience: {epochs_without_improvement}/{patience}")
            if epochs_without_improvement >= patience:
                print(f"Early stopping triggered at epoch {epoch + 1}")
                break

    print(f"Training complete. Best Macro F1: {best_f1:.4f}")

def evaluate_model(model, test_loader, device):
    """Score ``model`` on ``test_loader`` and return the macro-averaged F1.

    The model is put in eval mode (and left there), run without gradient
    tracking, and its outputs are binarized with a 0.5 threshold before
    being compared with the gold labels.
    """
    model.eval()
    gold_rows, prob_rows = [], []

    with torch.no_grad():
        for batch in test_loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            gold = batch['labels'].to(device)

            probs = model(ids, mask)
            gold_rows.extend(gold.cpu().numpy())
            prob_rows.extend(probs.cpu().numpy())

    y_true = np.array(gold_rows)
    # Strictly greater than 0.5 counts as a positive prediction.
    y_pred = (np.array(prob_rows) > 0.5).astype(int)

    return f1_score(y_true, y_pred, average='macro')

def main():
    """Fine-tune the multilingual E5 emotion classifier on the Spanish data.

    Reads the training and dev CSVs, wraps them in datasets and data loaders,
    builds the model and optimizer, and hands everything to ``train_model``,
    which evaluates after every epoch and writes the best weights to
    ``/content/ESP_emotion_model_best.pth``.
    """
    # The six emotion columns used as multi-label targets.
    emotion_columns = ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']

    train_frame = pd.read_csv('/content/esp.csv')
    dev_frame = pd.read_csv('/content/esp_dev.csv')

    train_texts = train_frame['text'].tolist()
    train_targets = train_frame[emotion_columns].values
    dev_texts = dev_frame['text'].tolist()
    dev_targets = dev_frame[emotion_columns].values

    checkpoint = "intfloat/multilingual-e5-large-instruct"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    train_loader = DataLoader(
        HindiEmotionDataset(train_texts, train_targets, tokenizer),
        batch_size=16,
        shuffle=True,
    )
    # The dev set is iterated in file order.
    dev_loader = DataLoader(
        HindiEmotionDataset(dev_texts, dev_targets, tokenizer),
        batch_size=16,
        shuffle=False,
    )

    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    model = HindiEmotionModel(checkpoint, len(emotion_columns)).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

    train_model(
        model,
        train_loader,
        optimizer,
        device,
        dev_loader,
        num_epochs=15,
        patience=4,
        save_path="/content/ESP_emotion_model_best.pth",
    )

# Entry point: run the training pipeline when this code is executed directly
# rather than imported.
if __name__ == "__main__":
    main()

## Spanish prediction

In [None]:
import torch
import torch.nn as nn
import pandas as pd
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
import numpy as np
from tqdm import tqdm

class HindiEmotionPredictionDataset(Dataset):
    """Unlabelled text dataset that tokenizes one example per lookup.

    Args:
        texts: Indexable collection of raw texts; each item is passed through
            ``str()`` before tokenization.
        tokenizer: Callable invoked as ``tokenizer(text, padding='max_length',
            truncation=True, max_length=..., return_tensors="pt")`` and
            expected to return a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors that carry a leading batch dimension.
        max_length: Length every example is padded or truncated to.
    """

    def __init__(self, texts, tokenizer, max_length=150):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at ``idx`` and return its tensors without the batch dimension."""
        text = str(self.texts[idx])
        inputs = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        # squeeze(0) removes only the leading batch dimension. A bare
        # squeeze() would also collapse the sequence dimension when
        # max_length == 1, yielding a 0-d tensor that cannot be batched
        # into (batch, seq) inputs.
        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
        }

class HindiEmotionModel(nn.Module):
    """Multi-label emotion classifier: pretrained encoder plus a linear head.

    Args:
        model_name: Identifier passed to ``AutoModel.from_pretrained``.
        num_classes: Number of output labels.
    """

    def __init__(self, model_name, num_classes):
        super().__init__()
        self.embedding_model = AutoModel.from_pretrained(model_name)
        # Size the head from the encoder's own config instead of assuming a
        # 1024-wide model; 1024 stays as the fallback so the layer shape (and
        # therefore existing checkpoints) is unchanged for the large E5 model.
        hidden_size = getattr(self.embedding_model.config, "hidden_size", 1024)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(hidden_size, num_classes)
        self.num_classes = num_classes

    def forward(self, input_ids, attention_mask):
        """Return per-label sigmoid scores of shape (batch, num_classes).

        The encoder's ``last_hidden_state`` is indexed at sequence position 0
        (the first token), passed through dropout and the linear head, and
        squashed with a sigmoid.
        """
        outputs = self.embedding_model(input_ids=input_ids, attention_mask=attention_mask)
        cls_embedding = outputs.last_hidden_state[:, 0, :]
        x = self.dropout(cls_embedding)
        logits = self.fc(x)
        return torch.sigmoid(logits)

def predict_and_save(
    pred_path='/content/esp_pred.csv',
    weights_path='/content/ESP_emotion_model_best.pth',
    output_path='/content/pred_esp.csv',
):
    """Run the fine-tuned emotion model over a CSV and save binary predictions.

    Args:
        pred_path: CSV with ``id`` and ``text`` columns to predict on.
        weights_path: File holding the model ``state_dict`` to load.
        output_path: Destination CSV. It gets an ``id`` column followed by one
            0/1 column per emotion, in the same row order as the input.
    """
    # Load prediction data
    pred_df = pd.read_csv(pred_path)
    ids = pred_df['id'].tolist()
    texts = pred_df['text'].tolist()

    emotion_columns = ['Anger', 'Disgust', 'Fear', 'Joy', 'Sadness', 'Surprise']

    # Model configuration
    model_name = "intfloat/multilingual-e5-large-instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Initialize model and load weights. map_location remaps the stored
    # tensors onto the active device, so a checkpoint written on a GPU still
    # loads on a CPU-only machine.
    model = HindiEmotionModel(model_name, num_classes=len(emotion_columns)).to(device)
    model.load_state_dict(torch.load(weights_path, map_location=device))
    model.eval()

    # shuffle=False keeps predictions aligned with ``ids``.
    dataset = HindiEmotionPredictionDataset(texts, tokenizer)
    dataloader = DataLoader(dataset, batch_size=16, shuffle=False)

    # Make predictions: scores above 0.5 become 1, everything else 0.
    predictions = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Making Predictions"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids, attention_mask)
            batch_preds = (outputs.cpu().numpy() > 0.5).astype(int)
            predictions.extend(batch_preds)

    # Build the output frame; inserting ``id`` at position 0 already yields
    # the final column order, so no re-selection is needed.
    output_df = pd.DataFrame(predictions, columns=emotion_columns)
    output_df.insert(0, 'id', ids)

    # Save predictions and report the path that was actually written.
    output_df.to_csv(output_path, index=False)
    print(f"Predictions saved to {output_path}")

# Entry point: generate and save predictions when this code is executed
# directly rather than imported.
if __name__ == "__main__":
    predict_and_save()