In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import warnings
warnings.filterwarnings("ignore")

import os
import math
import gc
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torch.amp.autocast_mode import autocast
from torch.amp.grad_scaler import GradScaler
from transformers import AutoTokenizer, AutoModel, get_cosine_schedule_with_warmup
from sklearn.metrics import f1_score, precision_score, recall_score

In [None]:
roberta_tokenizer = AutoTokenizer.from_pretrained("roberta-base")
roberta_base = AutoModel.from_pretrained("roberta-base")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("using device: ", device)

using device: cuda


In [None]:
roberta_base.pooler = None
roberta_base.gradient_checkpointing_enable()

In [5]:
roberta_base.config.hidden_size

768

In [None]:
train_path = "/content/drive/MyDrive/Go-Emotions-Train.csv"
val_path = "/content/drive/MyDrive/Go-Emotions-Validation.csv"

df_train = pd.read_csv(train_path)
df_val = pd.read_csv(val_path)

In [7]:
class GoEmotions_Dataset(Dataset):
    def __init__(self, data: pd.DataFrame, tokenizer):
        self.tokenizer = tokenizer
        self.data = data
        self.max_len = 128
        self.target_cols = [str(i) for i in range(28)]

    def __len__(self):
        return(len(self.data))

    def __getitem__(self, idx):
        item = self.data.iloc[idx]
        text = str(item.text)
        encoding = self.tokenizer.encode_plus(text,
                                            add_special_tokens=True,
                                            truncation=True,
                                            return_tensors='pt',
                                            max_length=self.max_len,
                                            padding='max_length',
                                            return_attention_mask=True)

        target = torch.tensor(item[self.target_cols].values.astype('float32'))

        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "atten_mask": encoding["attention_mask"].squeeze(0),
            "hard_target": target
        }

In [8]:
data = GoEmotions_Dataset(df_train, roberta_tokenizer)

In [9]:
len(data)

43410

In [10]:
data.__getitem__(1)

{'input_ids': tensor([    0,  5975,   114,    37,   473,   160,  1003,     6,   961,    40,
           206, 36279,   519,    10,  7923, 21927,   154,    19,    82,  1386,
             9,   888,  1462,     2,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,   

In [None]:
train_dataloader = DataLoader(GoEmotions_Dataset(df_train, roberta_tokenizer), batch_size=64, num_workers=4, shuffle=True)
val_dataloader = DataLoader(GoEmotions_Dataset(df_val, roberta_tokenizer), batch_size=64, num_workers=4)

In [None]:
emotion_desc = [
    "respect and high regard for something excellent or skilled",  # admiration
    "finding something funny, lighthearted and entertaining",  # amusement
    "strong feeling of displeasure and hostility or tension",  # anger
    "mild irritation or impatience for disturbance or distraction",  # annoyance
    "favorable opinion, agreement with something",  # approval
    "showing concern, kindness for others well-being",  # caring
    "lacking understanding, unsure or uncertain",  # confusion
    "strong feeling to know or learn something",  # curiosity
    "strong feeling of wanting or wishing for something",  # desire
    "displeasure due to unfulfilled expectations",  # disappointment
    "unfavorable opinion or negative judgment",  # disapproval
    "feeling of revulsion or strong sickness due to some offensive",  # disgust
    "awkward feeling, or shame in public",  # embarrassment
    "intense enthusiasm, thrilled, energetic",  # excitement
    "acute alarm caused by perceived danger or threat",  # fear
    "thankfulness and appreciation for kindness",  # gratitude
    "deep sorrow, emotional pain caused by loss or death",  # grief
    "feeling great happiness and overwhelming delight",  # joy
    "deep affection or attraction, profound connection",  # love
    "feeling uneasy, worried or apprehensive about something",  # nervousness
    "hopefulness and confidence about something in future",  # optimism
    "satisfied with one's achievement or high standards",  # pride
    "suddenly achieving clear understanding, awareness",  # realization
    "feeling of release from anxiety, pain or distress",  # relief
    "deep regret or guilt for committed wrong doing",  # remorse
    "unhappy, sorrowful and lacking cheerfulness",  # sadness
    "an unexpected, startled feeling from something sudden",  # surprise
    "absence of strong emotion, neither positive or negative"  # neutral

]

In [13]:
emo_tokens = roberta_tokenizer(
                    emotion_desc,
                    truncation=True,
                    max_length=128,
                    padding='max_length',
                    return_tensors='pt',
                    return_attention_mask=True
                ).to(device)

## Model Architecture

In [None]:
class Encoder(nn.Module):

    def __init__(self, base_encoder):
        super().__init__()
        self.encoder = base_encoder

    def forward(self, inputs):
 
        outputs = self.encoder(**inputs, output_hidden_states=True)
        last_hidden_state = outputs.hidden_states[-1]                                                            # [B, T, H]

        atten_mask = inputs['attention_mask']                                                                    # [B, T]
 
        atten_mask = atten_mask.unsqueeze(-1).float()
        text_emb = (last_hidden_state * atten_mask).sum(dim=1) / atten_mask.sum(dim=1).clamp(min=1e-9)           # [B, H]
        text_emb = F.normalize(text_emb, p=2, dim=1)

        return text_emb

In [15]:
class Classifier(nn.Module):
    def __init__(self, input_dim=768, num_classes=28):
        super().__init__()
        self.input_dim = input_dim

        self.mlp = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.LayerNorm(512),
            nn.GELU(),
            nn.Dropout(0.25),
            nn.Linear(512, num_classes)
        )

    def forward(self, h):
        return self.mlp(h)

In [18]:
# Main Model class
class EmoAxis(nn.Module):
    def __init__(self, encoder, classifier):
        super().__init__()
        self.encoder = encoder
        self.classifier = classifier

    def forward(self, inputs: dict):
        # Encoder
        outputs = self.encoder(inputs)

        # Classifier
        logits = self.classifier(outputs)

        return outputs, logits

In [19]:
encoder = Encoder(base_encoder=roberta_base)
classifier = Classifier()

# Initialize model
model = EmoAxis(
    encoder=encoder,
    classifier=classifier
)

In [None]:
def compute_loss(text_emb, emo_emb, probs, targets, temp=0.05, eps=0.04, gamma_pos=1, gamma_neg=3):

    logits = torch.matmul(text_emb, emo_emb.T) / temp                               # [B, C]

    loss1 = F.binary_cross_entropy_with_logits(logits,targets)

    probs = probs.clamp(eps, 1 - eps)
    pos_loss = -targets * torch.log(probs) * (1 - probs) ** gamma_pos
    neg_loss = -(1 - targets) * torch.log(1 - probs) * probs ** gamma_neg

    loss2 = (pos_loss + neg_loss).mean()

    return (0.5*loss1 + 0.5*loss2)

In [None]:
def freeze_encoder_layers(encoder, freeze_upto: int=0):
    roberta_base_model = encoder.encoder

    for name, param in roberta_base_model.named_parameters():
        param.requires_grad = True

    if freeze_upto >= 0:
        for layer_idx in range(freeze_upto + 1):
            for param in roberta_base_model.encoder.layer[layer_idx].parameters():
                param.requires_grad = False

    print(f"\nFrozen encoder layers - 0 to {freeze_upto}\n\n")

In [None]:
def evaluate(model, dataloader, device, threshold=0.5):

    model.eval()
    val_loss = 0.0
    total_samples = 0

    preds_all = []
    truths_all = []

    with torch.no_grad():
        emo_emb,_ = model(inputs={"input_ids": emo_tokens["input_ids"], "attention_mask": emo_tokens["attention_mask"]})

        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['atten_mask'].to(device)
            hard_target = batch['hard_target'].to(device)

            text_emb, logits = model(inputs={"input_ids": input_ids, "attention_mask": attention_mask})
            probs = torch.sigmoid(logits)

            loss = compute_loss(text_emb, emo_emb, probs, hard_target)

            preds = (probs >= threshold).int()

            preds_all.append(preds.cpu())
            truths_all.append(hard_target.cpu().int())

            batch_size = input_ids.size(0)
            val_loss += loss.item() * batch_size
            total_samples += batch_size


    # Concatenate all batches
    preds_all = torch.cat(preds_all, dim=0).numpy()
    truths_all = torch.cat(truths_all, dim=0).numpy()

    # Compute metrics
    avg_val_loss = val_loss / total_samples
    micro_f1 = f1_score(truths_all, preds_all, average='micro', zero_division=0)
    macro_f1 = f1_score(truths_all, preds_all, average='macro', zero_division=0)

    return {
        "avg_val_loss": avg_val_loss,
        "micro_f1": micro_f1,
        "macro_f1": macro_f1
    }

## Training

In [None]:
def train(
    model: torch.nn.Module,
    train_dataloader,
    val_dataloader,
    device: torch.device,
    epochs: int = 10,
    lr_encoder: float = 2.5e-5,
    lr_classifier: float = 1.5e-4,
    weight_decay: float = 0.001,
    warmup_ratio: float = 0.1,
    gradient_accumulation_steps: int = 2,
    max_grad_norm: float = 1.0,
    use_amp: bool = True
):
    model.to(device)

    steps_per_epoch = math.ceil(len(train_dataloader) / gradient_accumulation_steps)
    total_steps = steps_per_epoch * epochs
    warmup_steps = int(total_steps * warmup_ratio)

    global_step = 0
    current_freeze_config = None

    scaler = GradScaler(enabled=(use_amp and device.type == 'cuda'))

    for epoch in range(epochs):
        model.train()
        epoch_loss = 0.0
        total_samples = 0

        # Progressive unfreezing
        if epoch < 2:
            freeze_level = 7
        elif epoch < 4:
            freeze_level = 3
        else:
            freeze_level = -1

        # Reset optimizer/scheduler only if freeze config changed
        if freeze_level != current_freeze_config:
            freeze_encoder_layers(model.encoder, freeze_upto=freeze_level)
            current_freeze_config = freeze_level

            encoder_params, other_params = [], []
            for name, p in model.named_parameters():
                if not p.requires_grad:
                    continue
                if 'encoder' in name:
                    encoder_params.append(p)
                else:
                    other_params.append(p)

            optimizer = AdamW([
                {"params": encoder_params, "lr": lr_encoder},
                {"params": other_params, "lr": lr_classifier}
            ], weight_decay=weight_decay)

            scheduler = get_cosine_schedule_with_warmup(
                optimizer,
                num_warmup_steps=warmup_steps,
                num_training_steps=total_steps
            )

        optimizer.zero_grad()

        for step, batch in tqdm(enumerate(train_dataloader), total=len(train_dataloader)):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['atten_mask'].to(device)
            hard_target = batch['hard_target'].to(device)

            batch_size = input_ids.size(0)

            with autocast(device_type='cuda', dtype=torch.float16, enabled=use_amp):
                text_emb, logits = model(inputs={"input_ids": input_ids, "attention_mask": attention_mask})
                emo_emb, _ = model(inputs={"input_ids": emo_tokens["input_ids"], "attention_mask": emo_tokens["attention_mask"]})

                probs = torch.sigmoid(logits)
                raw_loss = compute_loss(text_emb, emo_emb, probs, hard_target)

            epoch_loss += raw_loss.item() * batch_size
            total_samples += batch_size

            loss = raw_loss / gradient_accumulation_steps
            scaler.scale(loss).backward()

            if (step + 1) % gradient_accumulation_steps == 0 or (step + 1) == len(train_dataloader):
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
                scaler.step(optimizer)
                scheduler.step()
                scaler.update()
                optimizer.zero_grad()
                global_step += 1

        avg_train_loss = epoch_loss / total_samples
        print(f"> | Epoch {epoch+1}/{epochs} | Avg Train Loss: {avg_train_loss:.4f}")

        # Validation
        val_metrics = evaluate(model, val_dataloader, device)
        avg_val_loss = val_metrics.get("avg_val_loss", None)
        micro_F1 = val_metrics.get("micro_f1", -1)
        macro_F1 = val_metrics.get("macro_f1", -1)

        print(f"Validation | Avg Loss: {avg_val_loss:.4f}, Micro-F1: {micro_F1:.4f}, Macro-F1: {macro_F1:.4f}\n")

        # Save checkpoint for this epoch (weights + optimizer/scheduler/scaler + metadata)
        checkpoint = {
            "epoch": epoch + 1,
            "model_state_dict": model.state_dict(),
            "optimizer_state_dict": optimizer.state_dict(),
            "scheduler_state_dict": scheduler.state_dict(),
            "scaler_state_dict": scaler.state_dict(),
            "global_step": global_step
        }
        os.makedirs("/content/model_checkpoints", exist_ok=True)
        epoch_ckpt_path = os.path.join("/content/model_checkpoints", f"checkpoint_epoch_{epoch+1}.pt")
        torch.save(checkpoint, epoch_ckpt_path)
        print(f"Saved epoch checkpoint: {epoch_ckpt_path}\n")

        # Memory cleanup
        torch.cuda.empty_cache()
        gc.collect()

    return model

In [25]:
trained_model = train(model, train_dataloader, val_dataloader, device)


Frozen encoder layers - 0 to 7




100%|██████████| 679/679 [05:34<00:00,  2.03it/s]

> | Epoch 1/10 | Avg Train Loss: 1.9961





Validation | Avg Loss: 0.0720, Micro-F1: 0.5010, Macro-F1: 0.2995

Saved epoch checkpoint: /content/model_checkpoints/checkpoint_epoch_1.pt



100%|██████████| 679/679 [05:31<00:00,  2.05it/s]

> | Epoch 2/10 | Avg Train Loss: 0.0648





Validation | Avg Loss: 0.0579, Micro-F1: 0.5859, Macro-F1: 0.4416

Saved epoch checkpoint: /content/model_checkpoints/checkpoint_epoch_2.pt


Frozen encoder layers - 0 to 3




100%|██████████| 679/679 [05:57<00:00,  1.90it/s]

> | Epoch 3/10 | Avg Train Loss: 0.0574





Validation | Avg Loss: 0.0557, Micro-F1: 0.5947, Macro-F1: 0.4736

Saved epoch checkpoint: /content/model_checkpoints/checkpoint_epoch_3.pt



100%|██████████| 679/679 [05:57<00:00,  1.90it/s]

> | Epoch 4/10 | Avg Train Loss: 0.0539





Validation | Avg Loss: 0.0552, Micro-F1: 0.5922, Macro-F1: 0.5042

Saved epoch checkpoint: /content/model_checkpoints/checkpoint_epoch_4.pt


Frozen encoder layers - 0 to -1




100%|██████████| 679/679 [06:22<00:00,  1.77it/s]

> | Epoch 5/10 | Avg Train Loss: 0.0499





Validation | Avg Loss: 0.0533, Micro-F1: 0.6190, Macro-F1: 0.5065

Saved epoch checkpoint: /content/model_checkpoints/checkpoint_epoch_5.pt



100%|██████████| 679/679 [06:22<00:00,  1.77it/s]

> | Epoch 6/10 | Avg Train Loss: 0.0488





Validation | Avg Loss: 0.0546, Micro-F1: 0.6022, Macro-F1: 0.5199

Saved epoch checkpoint: /content/model_checkpoints/checkpoint_epoch_6.pt



100%|██████████| 679/679 [06:22<00:00,  1.77it/s]

> | Epoch 7/10 | Avg Train Loss: 0.0448





Validation | Avg Loss: 0.0539, Micro-F1: 0.6152, Macro-F1: 0.5306

Saved epoch checkpoint: /content/model_checkpoints/checkpoint_epoch_7.pt



100%|██████████| 679/679 [06:22<00:00,  1.78it/s]

> | Epoch 8/10 | Avg Train Loss: 0.0405





Validation | Avg Loss: 0.0566, Micro-F1: 0.5879, Macro-F1: 0.5234

Saved epoch checkpoint: /content/model_checkpoints/checkpoint_epoch_8.pt



100%|██████████| 679/679 [06:22<00:00,  1.78it/s]

> | Epoch 9/10 | Avg Train Loss: 0.0361





Validation | Avg Loss: 0.0576, Micro-F1: 0.5949, Macro-F1: 0.5152

Saved epoch checkpoint: /content/model_checkpoints/checkpoint_epoch_9.pt



100%|██████████| 679/679 [06:27<00:00,  1.75it/s]

> | Epoch 10/10 | Avg Train Loss: 0.0336





Validation | Avg Loss: 0.0593, Micro-F1: 0.5865, Macro-F1: 0.5198

Saved epoch checkpoint: /content/model_checkpoints/checkpoint_epoch_10.pt



# Testing

In [None]:
test_path = "/content/drive/MyDrive/Go-Emotions-Test.csv"

df_test = pd.read_csv(test_path)

In [27]:
test_dataloader = DataLoader(GoEmotions_Dataset(df_test, roberta_tokenizer), batch_size=64, num_workers=2)

In [None]:
def evaluate(model, dataloader, device, threshold=0.5):

    model.eval()

    preds_all = []
    truths_all = []

    with torch.no_grad():
        for i, batch in tqdm(enumerate(dataloader), total=len(dataloader)):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['atten_mask'].to(device)
            hard_target = batch['hard_target'].to(device)

            _,logits = model(inputs={"input_ids": input_ids, "attention_mask": attention_mask})

            probs = torch.sigmoid(logits)
            preds = (probs >= threshold).int()

            preds_all.append(preds.cpu())
            truths_all.append(hard_target.cpu().int())

    preds_all = torch.cat(preds_all, dim=0).numpy()
    truths_all = torch.cat(truths_all, dim=0).numpy()

    # Compute metrics
    micro_precision = precision_score(truths_all, preds_all, average='micro', zero_division=0)
    macro_precision = precision_score(truths_all, preds_all, average='macro', zero_division=0)

    micro_recall = recall_score(truths_all, preds_all, average='micro', zero_division=0)
    macro_recall = recall_score(truths_all, preds_all, average='macro', zero_division=0)

    micro_f1 = f1_score(truths_all, preds_all, average='micro', zero_division=0)
    macro_f1 = f1_score(truths_all, preds_all, average='macro', zero_division=0)

    print(f"\n\nMicro Precision: {micro_precision} \nMacro Precision: {macro_precision}\n")
    print(f"Micro Recall: {micro_recall} \nMacro Recall: {macro_recall}\n")
    print(f"Micro F1: {micro_f1} \nMacro F1: {macro_f1}")

In [39]:
trained_model = EmoAxis(encoder=encoder, classifier=classifier)
checkpoint = torch.load("/content/model_checkpoints/checkpoint_epoch_7.pt", map_location="cuda")
trained_model.load_state_dict(checkpoint["model_state_dict"], strict=False)
trained_model.eval()
trained_model.to(device)

EmoAxis(
  (encoder): Encoder(
    (encoder): RobertaModel(
      (embeddings): RobertaEmbeddings(
        (word_embeddings): Embedding(50265, 768, padding_idx=1)
        (position_embeddings): Embedding(514, 768, padding_idx=1)
        (token_type_embeddings): Embedding(1, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): RobertaEncoder(
        (layer): ModuleList(
          (0-11): 12 x RobertaLayer(
            (attention): RobertaAttention(
              (self): RobertaSdpaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): RobertaSelfOutput(
                (dense): Linear(in_features=768, o

In [40]:
evaluate(trained_model, test_dataloader, device)

100%|██████████| 85/85 [00:32<00:00,  2.64it/s]


Micro Precision: 0.5930382466695315
Macro Precision: 0.5704941965207051

Micro Recall: 0.6541317743719387
Macro Recall: 0.5724077340480935

Micro F1: 0.6220886551465064
Macro F1: 0.5578135211573036





In [None]:
from huggingface_hub import login, HfApi
login()

In [None]:
api = HfApi()

api.upload_file(
    path_or_fileobj="/content/model_checkpoints/checkpoint_epoch_7.pt",
    path_in_repo="EmoAxis-Go-Emotions.pt",
    repo_id="Hidden-States/roberta-base-go-emotions-pt-only",
)
