# Set-up

In [1]:
!pip install evaluate --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import numpy as np
import pandas as pd
import cv2

import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader

from datasets import Dataset
from transformers import AutoModelForSequenceClassification, AutoModelForCausalLM, DataCollatorWithPadding, AutoTokenizer
from transformers import TrainingArguments, Trainer
from evaluate import load

import os
from tqdm.auto import tqdm
import time
import random

import kagglehub
path = kagglehub.dataset_download('haoshaoyang/sst5-data')
data_path = '/kaggle/input/sst5-data/'
print(list(os.walk(data_path)))

import wandb
wandb.login(key="a505e496d38a3096fcef47944c848a5891788d98")

[('/kaggle/input/sst5-data/', ['sst'], []), ('/kaggle/input/sst5-data/sst', ['raw_data'], ['sst_dev.tsv', 'sst_test.tsv', 'README.md', 'sst_dev.txt', 'tree2tabular.py', 'sst_train.tsv', 'sst_test.txt', 'sst_train.txt']), ('/kaggle/input/sst5-data/sst/raw_data', [], ['test.txt', 'train.txt', 'dev.txt'])]


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33miq000012[0m ([33minfinitasfish[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

# Dataset prep

In [3]:
train_file = os.path.join(data_path, 'sst/sst_train.txt')
dev_file = os.path.join(data_path, 'sst/sst_dev.txt')
test_file = os.path.join(data_path, 'sst/sst_test.txt')

train_df = pd.read_csv(train_file, delimiter='\t', header=None)
dev_df = pd.read_csv(dev_file, delimiter='\t', header=None)
test_df = pd.read_csv(test_file, delimiter='\t', header=None)

print(len(train_df), len(dev_df), len(test_df))
print(train_df[1].str.len().max(), dev_df[1].str.len().max(), test_df[1].str.len().max())

8544 1101 2210
267 254 256


In [4]:
train_dataset = Dataset.from_pandas(train_df)
train_dataset = train_dataset.rename_columns({'0': 'labels', '1': 'text'})

val_dataset = Dataset.from_pandas(dev_df)
val_dataset = val_dataset.rename_columns({'0': 'labels', '1': 'text'})

test_dataset = Dataset.from_pandas(test_df)
test_dataset = test_dataset.rename_columns({'0': 'labels', '1': 'text'})

text_labels = ['__label__1', '__label__2', '__label__3', '__label__4', '__label__5']
label_dict = {k:i for i, k in enumerate(text_labels)}

In [5]:
tokenizer = AutoTokenizer.from_pretrained('cardiffnlp/twitter-roberta-base-sentiment')

config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [6]:
def preprocess_function(examples):
   return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=256)

tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_val = val_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)

tokenized_train = tokenized_train.map(lambda e: {'labels': label_dict[e['labels']]})
tokenized_val = tokenized_val.map(lambda e: {'labels': label_dict[e['labels']]})
tokenized_test = tokenized_test.map(lambda e: {'labels': label_dict[e['labels']]})

Map:   0%|          | 0/8544 [00:00<?, ? examples/s]

Map:   0%|          | 0/1101 [00:00<?, ? examples/s]

Map:   0%|          | 0/2210 [00:00<?, ? examples/s]

Map:   0%|          | 0/8544 [00:00<?, ? examples/s]

Map:   0%|          | 0/1101 [00:00<?, ? examples/s]

Map:   0%|          | 0/2210 [00:00<?, ? examples/s]

In [7]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Pretrained ransformers baseline, clf head only ('eval_f1': macro 0.4636)

In [14]:
model = AutoModelForSequenceClassification.from_pretrained('cardiffnlp/twitter-roberta-base-sentiment')

# only 3 labels
model.classifier

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

RobertaClassificationHead(
  (dense): Linear(in_features=768, out_features=768, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (out_proj): Linear(in_features=768, out_features=3, bias=True)
)

In [16]:

f1_metric = load("f1")
accuracy_metric = load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="macro")["f1"]
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]
    
    return {
        "f1": f1,
        "accuracy": accuracy,
    }

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [17]:
class RobertaSentiment(nn.Module):
    def __init__(self, model_name, num_labels):
        super().__init__()
        
        full_model = AutoModelForSequenceClassification.from_pretrained(model_name)
        self.roberta = full_model.roberta
        self.classifier = nn.Sequential(nn.Linear(768, 768, bias=True),
                                        nn.Dropout(p=0.1, inplace=False),
                                        nn.Linear(768, num_labels, bias=True))    
        self.loss_fn = nn.CrossEntropyLoss()
        del full_model
        
        # frozen backbone
        for param in self.roberta.parameters():
            param.requires_grad = False
    

    def forward(self, input_ids, attention_mask, labels=None, **kwargs):
        out = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        pool = out.last_hidden_state[:, 0, :]
        logits = self.classifier(pool)
        
        if labels is not None:
            loss = self.loss_fn(logits, labels)
            return {"loss": loss, "logits": logits}
        return {"logits": logits}
    

In [22]:
MODEL = RobertaSentiment(model_name='cardiffnlp/twitter-roberta-base-sentiment', num_labels=5)
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
LR = 2e-5
BATCH_SIZE = 64
EPOCHS = 20

MODEL.to(DEVICE);

In [23]:
training_args = TrainingArguments(
    output_dir='/kaggle/working',
    remove_unused_columns=True,
    learning_rate=LR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    save_strategy='epoch',
    save_total_limit=3,
    eval_strategy='epoch',
    logging_steps=50,
    report_to='wandb',
    run_name='stt5_twitterbert_finetune_2005',
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    greater_is_better=True,
)

trainer = Trainer(
   model=MODEL,
   args=training_args,
   train_dataset=tokenized_train,
   eval_dataset=tokenized_val,
   processing_class=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)

In [24]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,1.2429,1.203771,0.407464,0.46049
2,1.1988,1.185892,0.445293,0.462307
3,1.1836,1.181465,0.421239,0.459582
4,1.1774,1.174404,0.448004,0.474114
5,1.1875,1.166214,0.449519,0.475931
6,1.169,1.161175,0.436516,0.469573
7,1.1758,1.161925,0.451805,0.475931
8,1.1717,1.151691,0.45757,0.481381
9,1.1707,1.151176,0.45007,0.478656
10,1.1616,1.155669,0.452665,0.481381


TrainOutput(global_step=2680, training_loss=1.1695085013090674, metrics={'train_runtime': 1539.4694, 'train_samples_per_second': 110.999, 'train_steps_per_second': 1.741, 'total_flos': 0.0, 'train_loss': 1.1695085013090674, 'epoch': 20.0})

In [25]:
trainer.evaluate()

{'eval_loss': 1.1469286680221558,
 'eval_f1': 0.46364350399575144,
 'eval_accuracy': 0.485921889191644,
 'eval_runtime': 8.2652,
 'eval_samples_per_second': 133.209,
 'eval_steps_per_second': 2.178,
 'epoch': 20.0}

# Same baseline, encoder+embeds unfreezing

In [8]:
class RobertaSentimentUnfreeze(nn.Module):
    def __init__(self, model_name, num_labels, unfreeze_strat, unfreeze_epoch):
        super().__init__()
        
        full_model = AutoModelForSequenceClassification.from_pretrained(model_name)
        self.roberta = full_model.roberta
        self.classifier = nn.Sequential(nn.Linear(768, 768, bias=True),
                                        nn.Dropout(p=0.1, inplace=False),
                                        nn.Linear(768, num_labels, bias=True))    
        self.loss_fn = nn.CrossEntropyLoss()
        del full_model
        
        self.unfreeze_strat = unfreeze_strat
        self.encoder_layers = len(self.roberta.encoder.layer)
        self.is_unfreezed = False
        self.unfreeze_epoch = unfreeze_epoch
        self._freeze_all()


    def _freeze_all(self):
        for param in self.roberta.parameters():
            param.requires_grad = False

    def _unfreeze_all_after_n(self, epoch):
        if epoch > self.unfreeze_epoch:
            print('unfreezing all')
            for param in self.roberta.parameters():
                param.requires_grad = True
            self.is_unfreezed = True        

    def _unfreeze_layerwise(self, epoch):
        if epoch <= self.unfreeze_epoch:
            return

        # unfreeze 1 layer each '// n' epoch
        layers_to_unfreeze = min((epoch - self.unfreeze_epoch) // 8, self.encoder_layers)
        for i in range(self.encoder_layers - layers_to_unfreeze, self.encoder_layers):
            for param in self.roberta.encoder.layer[i].parameters():
                param.requires_grad = True
                
        print(f'unfreezing {layers_to_unfreeze} top layers')

        if layers_to_unfreeze >= self.encoder_layers:
            self.is_unfreezed = True

    def update_unfreeze_status(self, epoch):
        if self.is_unfreezed:
            return
        elif self.unfreeze_strat == 'after_n':
            self._unfreeze_all_after_n(epoch)
        elif self.unfreeze_strat == 'layerwise':
            self._unfreeze_layerwise(epoch)

    def forward(self, input_ids, attention_mask, labels=None, **kwargs):
        out = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        pool = out.last_hidden_state[:, 0, :]
        logits = self.classifier(pool)
        
        if labels is not None:
            loss = self.loss_fn(logits, labels)
            return {"loss": loss, "logits": logits}
        return {"logits": logits}
    

In [9]:

UNFREEZE_EPOCH = 3
# MODEL = RobertaSentimentUnfreeze(model_name='cardiffnlp/twitter-roberta-base-sentiment', 
#                                  num_labels=5, unfreeze_strat='after_n', unfreeze_epoch=UNFREEZE_EPOCH)

MODEL = RobertaSentimentUnfreeze(model_name='cardiffnlp/twitter-roberta-base-sentiment', 
                                 num_labels=5, unfreeze_strat='layerwise', unfreeze_epoch=UNFREEZE_EPOCH)

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
CLF_LR = 8e-5
BACKBONE_LR = 8e-5
WD = 0.01
BATCH_SIZE = 26
EPOCHS = 40

TRAIN_LOSSI = []
VAL_LOSSI = []

torch.cuda.empty_cache()
MODEL.to(DEVICE);

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [14]:
classifier_params = sum(p.numel() for p in MODEL.classifier.parameters())
roberta_params = sum(p.numel() for p in MODEL.roberta.parameters())
total_params = classifier_params + roberta_params

print(f'Model contains {total_params:,} parameters')

Model contains 124,649,477 parameters


In [8]:
def collate_fn(batch):
    return {
        'input_ids': torch.stack([torch.tensor(x['input_ids']) for x in batch]),
        'attention_mask': torch.stack([torch.tensor(x['attention_mask']) for x in batch]),
        'labels': torch.tensor([x['labels'] for x in batch])
    }

In [13]:
train_loader = DataLoader(tokenized_train, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(tokenized_val, batch_size=BATCH_SIZE, collate_fn=collate_fn)
test_loader = DataLoader(tokenized_test, batch_size=BATCH_SIZE, collate_fn=collate_fn)

In [10]:
def train_loop(model, train_loader, val_loader, epochs, clf_lr, back_lr, wd, tlossi, vlossi, tf1s, vf1s, save_fname):
    
    optimizer = AdamW([
        {'params': model.classifier.parameters(), 'lr': clf_lr},
        {'params': [p for p in model.roberta.parameters() if p.requires_grad], 'lr': back_lr}
    ], weight_decay=wd)

    load_f1 = load("f1")
    load_accuracy = load("accuracy")
    best_val_metrics_sum = -1

    for epoch in tqdm(range(epochs)):

        # basic lr scheduler
        if epoch > 5:
            clf_lr *= 0.9
            back_lr *= 0.9
        
        model.update_unfreeze_status(epoch)

        new_unfreezed_params = [p for p in model.roberta.parameters() if p.requires_grad]
        
        param_groups = [
            {'params': model.classifier.parameters(), 'lr': clf_lr},
            {'params': new_unfreezed_params, 'lr': back_lr}
        ]
        optimizer = AdamW(param_groups, weight_decay=wd)

        all_preds_train = []
        all_labels_train = []
        model.train()
        train_loss = 0
        for batch in tqdm(train_loader, desc=f'Epoch {epoch+1}'):
            inputs = batch['input_ids'].to(DEVICE)
            masks = batch['attention_mask'].to(DEVICE)
            labels = batch['labels'].to(DEVICE)
            
            optimizer.zero_grad()
            outputs = model(input_ids=inputs, attention_mask=masks, labels=labels)
            logits = outputs['logits']
            loss = outputs['loss']
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            
            optimizer.step()
            train_loss += loss.item()

            preds = logits.argmax(dim=-1).detach().cpu().numpy()
            all_preds_train.extend(preds)
            all_labels_train.extend(labels)

        epoch_train_loss = train_loss/len(train_loader)
        epoch_train_f1 = load_f1.compute(predictions=all_preds_train, references=all_labels_train, average='macro')["f1"]
        
        tlossi.append(epoch_train_loss)
        tf1s.append(epoch_train_f1)

        all_preds_val = []
        all_labels_val = []
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                inputs = batch['input_ids'].to(DEVICE)
                masks = batch['attention_mask'].to(DEVICE)
                labels = batch['labels'].to(DEVICE)
                
                outputs = model(input_ids=inputs, attention_mask=masks, labels=labels)
                logits = outputs['logits']
                loss = outputs['loss']
                val_loss += loss.item()
                
                preds = logits.argmax(dim=-1).detach().cpu().numpy()
                all_preds_val.extend(preds)
                all_labels_val.extend(labels)
                
        epoch_val_loss = val_loss/len(val_loader)
        epoch_val_f1 = load_f1.compute(predictions=all_preds_val, references=all_labels_val, average='macro')["f1"]
        epoch_val_acc = load_accuracy.compute(predictions=all_preds_val, references=all_labels_val)["accuracy"]
        vlossi.append(epoch_val_loss)
        vf1s.append(epoch_val_f1)

        val_metrics_sum = epoch_val_f1 + epoch_val_acc
        # save best model
        if val_metrics_sum > best_val_metrics_sum:
            best_val_metrics_sum = val_metrics_sum
            torch.save(model.state_dict(), f'{save_fname}.pth')
            
            
        print(f'Epoch {epoch+1} | Train Loss: {epoch_train_loss:.4f} | Train F1: {epoch_train_f1:.4f} | Val Loss: {epoch_val_loss:.4f} | Val F1: {epoch_val_f1:.4f} | Val Acc: {epoch_val_acc:.4f}')
        torch.cuda.empty_cache()
        

In [32]:
train_lossi = []
train_f1s = []
val_lossi = []
val_f1s = []

s = time.time()
train_loop(MODEL, train_loader, val_loader, EPOCHS, CLF_LR, BACKBONE_LR, WD, train_lossi, val_lossi, train_f1s, val_f1s, 'layerwise_model_weights')
e = time.time()
print(f'Training completed in {(e-s)/60:.4f} mins')

  0%|          | 0/40 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 1 | Train Loss: 1.2154 | Train F1: 0.4154 | Val Loss: 1.1609 | Val F1: 0.4478 | Val Acc: 0.4723


Epoch 2:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 2 | Train Loss: 1.1700 | Train F1: 0.4414 | Val Loss: 1.1311 | Val F1: 0.4609 | Val Acc: 0.4805


Epoch 3:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 3 | Train Loss: 1.1543 | Train F1: 0.4533 | Val Loss: 1.1424 | Val F1: 0.4225 | Val Acc: 0.4759


Epoch 4:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 4 | Train Loss: 1.1316 | Train F1: 0.4650 | Val Loss: 1.1601 | Val F1: 0.4560 | Val Acc: 0.4886
unfreezing 0 top layers


Epoch 5:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 5 | Train Loss: 1.1382 | Train F1: 0.4591 | Val Loss: 1.1373 | Val F1: 0.4813 | Val Acc: 0.4868
unfreezing 0 top layers


Epoch 6:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 6 | Train Loss: 1.1217 | Train F1: 0.4621 | Val Loss: 1.1349 | Val F1: 0.4416 | Val Acc: 0.4914
unfreezing 0 top layers


Epoch 7:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 7 | Train Loss: 1.1234 | Train F1: 0.4726 | Val Loss: 1.1405 | Val F1: 0.4719 | Val Acc: 0.4932
unfreezing 0 top layers


Epoch 8:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 8 | Train Loss: 1.1168 | Train F1: 0.4753 | Val Loss: 1.1168 | Val F1: 0.4571 | Val Acc: 0.4896
unfreezing 0 top layers


Epoch 9:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 9 | Train Loss: 1.1174 | Train F1: 0.4743 | Val Loss: 1.1266 | Val F1: 0.4771 | Val Acc: 0.5014
unfreezing 0 top layers


Epoch 10:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 10 | Train Loss: 1.1136 | Train F1: 0.4741 | Val Loss: 1.1169 | Val F1: 0.4765 | Val Acc: 0.4995
unfreezing 0 top layers


Epoch 11:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 11 | Train Loss: 1.1087 | Train F1: 0.4756 | Val Loss: 1.1185 | Val F1: 0.4888 | Val Acc: 0.5032
unfreezing 1 top layers


Epoch 12:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 12 | Train Loss: 1.1287 | Train F1: 0.4708 | Val Loss: 1.1702 | Val F1: 0.4520 | Val Acc: 0.4832
unfreezing 1 top layers


Epoch 13:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 13 | Train Loss: 1.1099 | Train F1: 0.4824 | Val Loss: 1.1264 | Val F1: 0.4496 | Val Acc: 0.4995
unfreezing 1 top layers


Epoch 14:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 14 | Train Loss: 1.0974 | Train F1: 0.4802 | Val Loss: 1.1281 | Val F1: 0.4736 | Val Acc: 0.5123
unfreezing 1 top layers


Epoch 15:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 15 | Train Loss: 1.0722 | Train F1: 0.4979 | Val Loss: 1.1056 | Val F1: 0.4407 | Val Acc: 0.4995
unfreezing 1 top layers


Epoch 16:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 16 | Train Loss: 1.0640 | Train F1: 0.5049 | Val Loss: 1.1065 | Val F1: 0.4787 | Val Acc: 0.5059
unfreezing 1 top layers


Epoch 17:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 17 | Train Loss: 1.0517 | Train F1: 0.5091 | Val Loss: 1.1220 | Val F1: 0.4917 | Val Acc: 0.4932
unfreezing 1 top layers


Epoch 18:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 18 | Train Loss: 1.0418 | Train F1: 0.5146 | Val Loss: 1.1358 | Val F1: 0.4837 | Val Acc: 0.4950
unfreezing 1 top layers


Epoch 19:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 19 | Train Loss: 1.0302 | Train F1: 0.5218 | Val Loss: 1.1097 | Val F1: 0.4986 | Val Acc: 0.5068
unfreezing 2 top layers


Epoch 20:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 20 | Train Loss: 1.0331 | Train F1: 0.5219 | Val Loss: 1.1555 | Val F1: 0.4652 | Val Acc: 0.4750
unfreezing 2 top layers


Epoch 21:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 21 | Train Loss: 1.0126 | Train F1: 0.5306 | Val Loss: 1.1174 | Val F1: 0.4944 | Val Acc: 0.5041
unfreezing 2 top layers


Epoch 22:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 22 | Train Loss: 0.9916 | Train F1: 0.5430 | Val Loss: 1.1493 | Val F1: 0.4967 | Val Acc: 0.5104
unfreezing 2 top layers


Epoch 23:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 23 | Train Loss: 0.9715 | Train F1: 0.5646 | Val Loss: 1.1320 | Val F1: 0.4980 | Val Acc: 0.5114
unfreezing 2 top layers


Epoch 24:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 24 | Train Loss: 0.9583 | Train F1: 0.5668 | Val Loss: 1.1638 | Val F1: 0.4878 | Val Acc: 0.4923
unfreezing 2 top layers


Epoch 25:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 25 | Train Loss: 0.9370 | Train F1: 0.5752 | Val Loss: 1.1474 | Val F1: 0.4930 | Val Acc: 0.5104
unfreezing 2 top layers


Epoch 26:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 26 | Train Loss: 0.9214 | Train F1: 0.5787 | Val Loss: 1.1891 | Val F1: 0.4831 | Val Acc: 0.4886
unfreezing 2 top layers


Epoch 27:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 27 | Train Loss: 0.9107 | Train F1: 0.5876 | Val Loss: 1.2334 | Val F1: 0.4738 | Val Acc: 0.4914
unfreezing 3 top layers


Epoch 28:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 28 | Train Loss: 0.9049 | Train F1: 0.5872 | Val Loss: 1.2246 | Val F1: 0.4734 | Val Acc: 0.4823
unfreezing 3 top layers


Epoch 29:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 29 | Train Loss: 0.8892 | Train F1: 0.5950 | Val Loss: 1.1933 | Val F1: 0.4861 | Val Acc: 0.4932
unfreezing 3 top layers


Epoch 30:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 30 | Train Loss: 0.8626 | Train F1: 0.6213 | Val Loss: 1.2012 | Val F1: 0.4917 | Val Acc: 0.5077
unfreezing 3 top layers


Epoch 31:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 31 | Train Loss: 0.8503 | Train F1: 0.6217 | Val Loss: 1.2395 | Val F1: 0.4904 | Val Acc: 0.5023
unfreezing 3 top layers


Epoch 32:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 32 | Train Loss: 0.8366 | Train F1: 0.6274 | Val Loss: 1.2215 | Val F1: 0.4850 | Val Acc: 0.5059
unfreezing 3 top layers


Epoch 33:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 33 | Train Loss: 0.8326 | Train F1: 0.6322 | Val Loss: 1.2187 | Val F1: 0.4911 | Val Acc: 0.5023
unfreezing 3 top layers


Epoch 34:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 34 | Train Loss: 0.8149 | Train F1: 0.6423 | Val Loss: 1.2645 | Val F1: 0.4870 | Val Acc: 0.4941
unfreezing 3 top layers


Epoch 35:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 35 | Train Loss: 0.8015 | Train F1: 0.6537 | Val Loss: 1.2553 | Val F1: 0.4919 | Val Acc: 0.4986
unfreezing 4 top layers


Epoch 36:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 36 | Train Loss: 0.7908 | Train F1: 0.6531 | Val Loss: 1.2347 | Val F1: 0.4829 | Val Acc: 0.4977
unfreezing 4 top layers


Epoch 37:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 37 | Train Loss: 0.7805 | Train F1: 0.6566 | Val Loss: 1.2598 | Val F1: 0.4897 | Val Acc: 0.4977
unfreezing 4 top layers


Epoch 38:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 38 | Train Loss: 0.7611 | Train F1: 0.6669 | Val Loss: 1.2618 | Val F1: 0.5020 | Val Acc: 0.5050
unfreezing 4 top layers


Epoch 39:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 39 | Train Loss: 0.7561 | Train F1: 0.6705 | Val Loss: 1.2639 | Val F1: 0.4911 | Val Acc: 0.5059
unfreezing 4 top layers


Epoch 40:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 40 | Train Loss: 0.7494 | Train F1: 0.6739 | Val Loss: 1.2669 | Val F1: 0.5075 | Val Acc: 0.5123
Training completed in 64.1583 mins


In [9]:
def evaluate_model(model, val_loader):
    accuracy_metric = load("accuracy")
    f1_metric = load("f1")
    
    all_preds_val = []
    all_labels_val = []
    model.eval()
    with torch.no_grad():
        for batch in val_loader:
            inputs = batch['input_ids'].to(DEVICE)
            masks = batch['attention_mask'].to(DEVICE)
            labels = batch['labels'].to(DEVICE)
                
            outputs = model(input_ids=inputs, attention_mask=masks, labels=labels)
            logits = outputs['logits']
                
            preds = logits.argmax(dim=-1).detach().cpu().numpy()
            all_preds_val.extend(preds)
            all_labels_val.extend(labels)
                
    val_f1 = f1_metric.compute(predictions=all_preds_val, references=all_labels_val, average='macro')["f1"]
    val_acc = accuracy_metric.compute(predictions=all_preds_val, references=all_labels_val)["accuracy"]
    return {
        "f1": val_f1,
        "accuracy": val_acc,
    }

In [33]:
MODEL.load_state_dict(torch.load('/kaggle/input/sst5_tuned/pytorch/sst5_full_tuned_0525/1/after_n_model_weights_0525.pth', map_location=DEVICE, weights_only=True))
print(evaluate_model(MODEL, test_loader))
MODEL.load_state_dict(torch.load('/kaggle/working/layerwise_model_weights.pth', map_location=DEVICE, weights_only=True))
print(evaluate_model(MODEL, test_loader))

{'f1': 0.561599363852188, 'accuracy': 0.5669683257918552}
{'f1': 0.5283791696600086, 'accuracy': 0.5425339366515837}


# Trained full model + prompted dataset 'f1 macro': 0.5556695031280695, 'accuracy': 0.574660633484162

In [10]:
# llm_model_name = 'HuggingFaceH4/zephyr-7b-alpha'
# llm_tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
# llm_model = AutoModelForCausalLM.from_pretrained(llm_model_name, torch_dtype=torch.float16, device_map="auto")

def generate_prompts(initial_prompt, model, tokenizer, num_variants=5):
    prompts = []
    for _ in range(num_variants):
        input_text = f"<|system|>Rephrase the following prompt for sentiment analysis, keeping the core meaning but varying the style:</s>\n<|user|>Original: {initial_prompt}\nNew:</s>\n<|assistant|>"
        inputs = tokenizer(input_text, return_tensors="pt").to("cuda")
        outputs = model.generate(**inputs, max_new_tokens=150, do_sample=True)
        new_prompt = tokenizer.decode(outputs[0], skip_special_tokens=True).split("New:")[-1].strip()
        prompts.append(new_prompt)
    return prompts

initial_prompt = '''Movie review sentiment classification task: 
From the following five options - very negative, negative, neutral, positive, or very positive (0-4) - which best describes this review? '''

# generated_prompts = generate_prompts(initial_prompt, llm_model, llm_tokenizer)

# <|assistant|>Sentiment analysis task: Identify the sentiment of a given movie review based on whether it is very negative, negative, neutral, positive, or very positive. Which option aligns best with the review's overall tone?
# selected_llm_prompt = generated_prompts[0]
# print(selected_llm_prompt)

# selected_llm_prompt = "<|assistant|>Sentiment analysis task: Identify the sentiment of a given movie review based on whether it is very negative, negative, neutral, positive, or very positive. Which option aligns best with the review's overall tone?"
selected_llm_prompt = initial_prompt

In [11]:
text_wprompt = [f'{selected_llm_prompt}\n\nReview: {text}\n\nSentiment:' for text in train_dataset['text']]
train_dataset = train_dataset.add_column("text_wprompt", text_wprompt)

text_wprompt_val = [f'{selected_llm_prompt}\n\nReview: {text}\n\nSentiment:' for text in val_dataset['text']]
val_dataset = val_dataset.add_column("text_wprompt", text_wprompt_val)

text_wprompt_test = [f'{selected_llm_prompt}\n\nReview: {text}\n\nSentiment:' for text in test_dataset['text']]
test_dataset = test_dataset.add_column("text_wprompt", text_wprompt_test)

In [12]:
def preprocess_function_wprompt(examples):
    return tokenizer(examples['text_wprompt'], truncation=True, padding='max_length', max_length=512)

tokenized_wprompt_train = train_dataset.map(preprocess_function_wprompt, batched=True)
tokenized_wprompt_val = val_dataset.map(preprocess_function_wprompt, batched=True)
tokenized_wprompt_test = test_dataset.map(preprocess_function_wprompt, batched=True)

tokenized_wprompt_train = tokenized_wprompt_train.map(lambda e: {'labels': label_dict[e['labels']]})
tokenized_wprompt_val = tokenized_wprompt_val.map(lambda e: {'labels': label_dict[e['labels']]})
tokenized_wprompt_test = tokenized_wprompt_test.map(lambda e: {'labels': label_dict[e['labels']]})

Map:   0%|          | 0/8544 [00:00<?, ? examples/s]

Map:   0%|          | 0/1101 [00:00<?, ? examples/s]

Map:   0%|          | 0/2210 [00:00<?, ? examples/s]

Map:   0%|          | 0/8544 [00:00<?, ? examples/s]

Map:   0%|          | 0/1101 [00:00<?, ? examples/s]

Map:   0%|          | 0/2210 [00:00<?, ? examples/s]

In [20]:

UNFREEZE_EPOCH = 5
MODEL = RobertaSentimentUnfreeze(model_name='cardiffnlp/twitter-roberta-base-sentiment', 
                                 num_labels=5, unfreeze_strat='after_n', unfreeze_epoch=UNFREEZE_EPOCH)
#MODEL.load_state_dict(torch.load('/kaggle/input/sst5_tuned/pytorch/default/1/after_n_model_weights.pth', weights_only=True))

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
CLF_LR = 5e-5
BACKBONE_LR = 5e-6  # very small so it won't overfit so fast
WD = 0.01
BATCH_SIZE = 26
EPOCHS = 30

TRAIN_LOSSI = []
VAL_LOSSI = []

torch.cuda.empty_cache()
MODEL.to(DEVICE);

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [13]:
def collate_fn(batch):
    return {
        'input_ids': torch.stack([torch.tensor(x['input_ids']) for x in batch]),
        'attention_mask': torch.stack([torch.tensor(x['attention_mask']) for x in batch]),
        'labels': torch.tensor([x['labels'] for x in batch])
    }

train_loader = DataLoader(tokenized_wprompt_train, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(tokenized_wprompt_val, batch_size=BATCH_SIZE, collate_fn=collate_fn)
test_loader = DataLoader(tokenized_wprompt_test, batch_size=BATCH_SIZE, collate_fn=collate_fn)

NameError: name 'BATCH_SIZE' is not defined

In [22]:
train_lossi = []
train_f1s = []
val_lossi = []
val_f1s = []

s = time.time()
train_loop(MODEL, train_loader, val_loader, EPOCHS, CLF_LR, BACKBONE_LR, WD, train_lossi, val_lossi, train_f1s, val_f1s, 'aftern_wprompt_weights')
e = time.time()
print(f'Training completed in {(e-s)/60:.4f} mins')

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

  0%|          | 0/30 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 1 | Train Loss: 1.3117 | Train F1: 0.3348 | Val Loss: 1.2409 | Val F1: 0.4247 | Val Acc: 0.4414


Epoch 2:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 2 | Train Loss: 1.2760 | Train F1: 0.3729 | Val Loss: 1.2249 | Val F1: 0.3761 | Val Acc: 0.4441


Epoch 3:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 3 | Train Loss: 1.2551 | Train F1: 0.3789 | Val Loss: 1.2003 | Val F1: 0.4072 | Val Acc: 0.4623


Epoch 4:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 4 | Train Loss: 1.2559 | Train F1: 0.3833 | Val Loss: 1.2013 | Val F1: 0.4154 | Val Acc: 0.4532


Epoch 5:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 5 | Train Loss: 1.2413 | Train F1: 0.4017 | Val Loss: 1.1897 | Val F1: 0.4172 | Val Acc: 0.4650


Epoch 6:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 6 | Train Loss: 1.2411 | Train F1: 0.3884 | Val Loss: 1.2162 | Val F1: 0.4313 | Val Acc: 0.4514
unfreezing all


Epoch 7:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 7 | Train Loss: 1.1274 | Train F1: 0.4630 | Val Loss: 1.0643 | Val F1: 0.4770 | Val Acc: 0.5186


Epoch 8:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 8 | Train Loss: 1.0119 | Train F1: 0.5228 | Val Loss: 1.1125 | Val F1: 0.5105 | Val Acc: 0.5295


Epoch 9:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 9 | Train Loss: 0.9535 | Train F1: 0.5532 | Val Loss: 1.0641 | Val F1: 0.5353 | Val Acc: 0.5477


Epoch 10:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 10 | Train Loss: 0.9016 | Train F1: 0.5864 | Val Loss: 1.0643 | Val F1: 0.5253 | Val Acc: 0.5531


Epoch 11:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 11 | Train Loss: 0.8527 | Train F1: 0.6082 | Val Loss: 1.1062 | Val F1: 0.5514 | Val Acc: 0.5631


Epoch 12:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 12 | Train Loss: 0.8044 | Train F1: 0.6301 | Val Loss: 1.1474 | Val F1: 0.5378 | Val Acc: 0.5522


Epoch 13:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 13 | Train Loss: 0.7723 | Train F1: 0.6491 | Val Loss: 1.1763 | Val F1: 0.5264 | Val Acc: 0.5377


Epoch 14:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 14 | Train Loss: 0.7431 | Train F1: 0.6652 | Val Loss: 1.1889 | Val F1: 0.5368 | Val Acc: 0.5486


Epoch 15:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 15 | Train Loss: 0.7094 | Train F1: 0.6765 | Val Loss: 1.2548 | Val F1: 0.5350 | Val Acc: 0.5513


Epoch 16:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 16 | Train Loss: 0.6757 | Train F1: 0.7070 | Val Loss: 1.2681 | Val F1: 0.5409 | Val Acc: 0.5477


Epoch 17:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 17 | Train Loss: 0.6532 | Train F1: 0.7136 | Val Loss: 1.2793 | Val F1: 0.5398 | Val Acc: 0.5495


Epoch 18:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 18 | Train Loss: 0.6279 | Train F1: 0.7255 | Val Loss: 1.3584 | Val F1: 0.5337 | Val Acc: 0.5359


Epoch 19:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 19 | Train Loss: 0.6149 | Train F1: 0.7321 | Val Loss: 1.3557 | Val F1: 0.5402 | Val Acc: 0.5441


Epoch 20:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 20 | Train Loss: 0.5859 | Train F1: 0.7478 | Val Loss: 1.4007 | Val F1: 0.5318 | Val Acc: 0.5368


Epoch 21:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 21 | Train Loss: 0.5732 | Train F1: 0.7533 | Val Loss: 1.4041 | Val F1: 0.5438 | Val Acc: 0.5468


Epoch 22:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 22 | Train Loss: 0.5553 | Train F1: 0.7653 | Val Loss: 1.4779 | Val F1: 0.5279 | Val Acc: 0.5322


Epoch 23:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 23 | Train Loss: 0.5463 | Train F1: 0.7725 | Val Loss: 1.4632 | Val F1: 0.5376 | Val Acc: 0.5441


Epoch 24:   0%|          | 0/329 [00:00<?, ?it/s]

Epoch 24 | Train Loss: 0.5224 | Train F1: 0.7806 | Val Loss: 1.4844 | Val F1: 0.5365 | Val Acc: 0.5395


Epoch 25:   0%|          | 0/329 [00:00<?, ?it/s]

KeyboardInterrupt: 

KeyboardInterrupt when overfitting

In [23]:
# test texts also have prompt

MODEL.load_state_dict(torch.load('/kaggle/working/aftern_wprompt_weights.pth', map_location=DEVICE, weights_only=True))
print(evaluate_model(MODEL, test_loader))

{'f1': 0.5556695031280695, 'accuracy': 0.5746606334841629}


# Trying triplet loss

In [15]:
train_dataset = train_dataset.map(lambda e: {'labels': label_dict[e['labels']]})
val_dataset = val_dataset.map(lambda e: {'labels': label_dict[e['labels']]})
test_dataset = test_dataset.map(lambda e: {'labels': label_dict[e['labels']]})

Map:   0%|          | 0/8544 [00:00<?, ? examples/s]

Map:   0%|          | 0/1101 [00:00<?, ? examples/s]

Map:   0%|          | 0/2210 [00:00<?, ? examples/s]

In [39]:
class RobertaSentimentUnfreezeTriplet(nn.Module):
    def __init__(self, model_name, num_labels, unfreeze_strat, unfreeze_epoch, triplet_loss_weight):
        super().__init__()
        
        full_model = AutoModelForSequenceClassification.from_pretrained(model_name)
        self.roberta = full_model.roberta
        self.classifier = nn.Sequential(nn.Linear(768, 768, bias=True),
                                        nn.Dropout(p=0.1, inplace=False),
                                        nn.Linear(768, num_labels, bias=True))
        self.triplet_loss_weight = triplet_loss_weight
        self.triplet_loss_fn = nn.TripletMarginLoss(margin=1.0, p=2)
        self.ce_loss_fn = nn.CrossEntropyLoss()
        del full_model
        
        self.unfreeze_strat = unfreeze_strat
        self.encoder_layers = len(self.roberta.encoder.layer)
        self.is_unfreezed = False
        self.unfreeze_epoch = unfreeze_epoch
        self._freeze_all()

        # cuda out of memory
        self.roberta.gradient_checkpointing_enable()
    
    def _freeze_all(self):
        for param in self.roberta.parameters():
            param.requires_grad = False

    def _unfreeze_all_after_n(self, epoch):
        if epoch > self.unfreeze_epoch:
            print('unfreezing all')
            for param in self.roberta.parameters():
                param.requires_grad = True
            self.is_unfreezed = True        

    def _unfreeze_layerwise(self, epoch):
        if epoch <= self.unfreeze_epoch:
            return

        # unfreeze 1 layer each '// n' epoch
        layers_to_unfreeze = min((epoch - self.unfreeze_epoch) // 8, self.encoder_layers)
        for i in range(self.encoder_layers - layers_to_unfreeze, self.encoder_layers):
            for param in self.roberta.encoder.layer[i].parameters():
                param.requires_grad = True
                
        print(f'unfreezing {layers_to_unfreeze} top layers')

        if layers_to_unfreeze >= self.encoder_layers:
            self.is_unfreezed = True

    def update_unfreeze_status(self, epoch):
        if self.is_unfreezed:
            return
        elif self.unfreeze_strat == 'after_n':
            self._unfreeze_all_after_n(epoch)
        elif self.unfreeze_strat == 'layerwise':
            self._unfreeze_layerwise(epoch)

    def forward(self, input_ids, attention_mask, labels=None, anchor_ids=None, positive_ids=None, negative_ids=None):
        out = self.roberta(input_ids=input_ids)
        pool = out.last_hidden_state[:, 0, :]
        logits = self.classifier(pool)

        triplet_loss = None
        if anchor_ids is not None and positive_ids is not None and negative_ids is not None:
            anchor_emb = self.roberta(anchor_ids, attention_mask=attention_mask).last_hidden_state[:, 0, :]
            positive_emb = self.roberta(positive_ids).last_hidden_state[:, 0, :]
            negative_emb = self.roberta(negative_ids).last_hidden_state[:, 0, :]
            triplet_loss = self.triplet_loss_fn(anchor_emb, positive_emb, negative_emb)
        
        if labels is not None:
            ce_loss = self.ce_loss_fn(logits, labels)
            total_loss = ce_loss + self.triplet_loss_weight * triplet_loss if triplet_loss is not None else ce_loss
            return {"loss": total_loss, "logits": logits, "triplet_loss": triplet_loss}
        return {"logits": logits}
    

In [40]:
def generate_triplets(dataset):
    triplets = []
    class_to_indices = {label: [] for label in set(dataset['labels'])}
    
    for idx, label in enumerate(dataset['labels']):
            class_to_indices[label].append(idx)
        
    for idx, (text, label) in enumerate(zip(dataset['text_wprompt'], dataset['labels'])):
        positive_idx = random.choice(class_to_indices[label])
            
        # close class for negative example
        negative_label = abs(label + random.choice([-1, 1]))
        if negative_label == 5:
            negative_label = 3
            
        negative_idx = random.choice(class_to_indices[negative_label])
            
        triplets.append({
            'anchor': text,
            'positive': dataset['text_wprompt'][positive_idx],
            'negative': dataset['text_wprompt'][negative_idx],
            'label': label
        })
        
    return Dataset.from_dict({
        'anchor': [t['anchor'] for t in triplets],
        'positive': [t['positive'] for t in triplets],
        'negative': [t['negative'] for t in triplets],
        'label': [t['label'] for t in triplets]
    })

In [17]:
train_triplets = generate_triplets(train_dataset)

In [41]:
def triplet_collate_fn(batch):
    anchor = tokenizer(
        [x['anchor'] for x in batch],
        truncation=True,
        padding='max_length',
        max_length=512,
        return_tensors='pt'
    )
    
    positive = tokenizer(
        [x['positive'] for x in batch],
        truncation=True,
        padding='max_length',
        max_length=512,
        return_tensors='pt'
    )
    
    negative = tokenizer(
        [x['negative'] for x in batch],
        truncation=True,
        padding='max_length',
        max_length=512,
        return_tensors='pt'
    )
    
    return {
        'input_ids': anchor['input_ids'],
        'attention_mask': anchor['attention_mask'],
        'anchor_ids': anchor['input_ids'],
        #'anchor_mask': anchor['attention_mask'],
        'positive_ids': positive['input_ids'],
        #'positive_mask': positive['attention_mask'],
        'negative_ids': negative['input_ids'],
        #'negative_mask': negative['attention_mask'],
        'labels': torch.tensor([x['label'] for x in batch])
    }

In [51]:
MODEL.to('cpu')
torch.cuda.empty_cache()
del MODEL

In [52]:

UNFREEZE_EPOCH = 7
TP_LOSS_WEIGHT = 0.2
MODEL = RobertaSentimentUnfreezeTriplet(model_name='cardiffnlp/twitter-roberta-base-sentiment', 
                                        num_labels=5, unfreeze_strat='after_n', unfreeze_epoch=UNFREEZE_EPOCH, triplet_loss_weight=TP_LOSS_WEIGHT)
#MODEL.load_state_dict(torch.load('/kaggle/input/sst5_tuned/pytorch/default/1/after_n_model_weights.pth', weights_only=True))

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
CLF_LR = 5e-5
BACKBONE_LR = 5e-6  # very small so it won't overfit so fast
WD = 0.01
BATCH_SIZE = 20
EPOCHS = 35

TRAIN_LOSSI = []
VAL_LOSSI = []

torch.cuda.empty_cache()
MODEL.to(DEVICE);

In [53]:
train_loader = DataLoader(train_triplets, batch_size=BATCH_SIZE, shuffle=True,collate_fn=triplet_collate_fn)
val_loader = DataLoader(tokenized_wprompt_val, batch_size=BATCH_SIZE, collate_fn=collate_fn)
test_loader = DataLoader(tokenized_wprompt_test, batch_size=BATCH_SIZE, collate_fn=collate_fn)

In [54]:
def train_loop_triplets(model, train_loader, val_loader, epochs, clf_lr, back_lr, wd, tlossi, vlossi, tf1s, vf1s, save_fname):
    
    optimizer = AdamW([
        {'params': model.classifier.parameters(), 'lr': clf_lr},
        {'params': [p for p in model.roberta.parameters() if p.requires_grad], 'lr': back_lr}
    ], weight_decay=wd)

    load_f1 = load("f1")
    load_accuracy = load("accuracy")
    best_val_metrics_sum = -1

    for epoch in tqdm(range(epochs)):

        # basic lr scheduler
        if epoch > 7:
            clf_lr *= 0.9
            back_lr *= 0.9
        
        model.update_unfreeze_status(epoch)

        new_unfreezed_params = [p for p in model.roberta.parameters() if p.requires_grad]
        
        param_groups = [
            {'params': model.classifier.parameters(), 'lr': clf_lr},
            {'params': new_unfreezed_params, 'lr': back_lr}
        ]
        optimizer = AdamW(param_groups, weight_decay=wd)

        all_preds_train = []
        all_labels_train = []
        model.train()
        train_loss = 0
        for batch in tqdm(train_loader, desc=f'Epoch {epoch+1}'):
            inputs = batch['input_ids'].to(DEVICE)
            masks = batch['attention_mask'].to(DEVICE)
            labels = batch['labels'].to(DEVICE)
            
            optimizer.zero_grad()
            
            outputs = model(
                input_ids=batch['input_ids'].to(DEVICE),
                attention_mask=batch['attention_mask'].to(DEVICE),
                labels=batch['labels'].to(DEVICE),
                anchor_ids=batch['anchor_ids'].to(DEVICE),
                positive_ids=batch['positive_ids'].to(DEVICE),
                negative_ids=batch['negative_ids'].to(DEVICE),
                #anchor_mask=batch['anchor_mask'].to(DEVICE),
                #positive_mask=batch['positive_mask'].to(DEVICE),
                #negative_mask=batch['negative_mask'].to(DEVICE)
            )
            
            logits = outputs['logits']
            loss = outputs['loss']
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            
            optimizer.step()
            train_loss += loss.item()

            preds = logits.argmax(dim=-1).detach().cpu().numpy()
            all_preds_train.extend(preds)
            all_labels_train.extend(labels)

        epoch_train_loss = train_loss/len(train_loader)
        epoch_train_f1 = load_f1.compute(predictions=all_preds_train, references=all_labels_train, average='macro')["f1"]
        
        tlossi.append(epoch_train_loss)
        tf1s.append(epoch_train_f1)

        all_preds_val = []
        all_labels_val = []
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                inputs = batch['input_ids'].to(DEVICE)
                masks = batch['attention_mask'].to(DEVICE)
                labels = batch['labels'].to(DEVICE)
                
                outputs = model(input_ids=inputs, attention_mask=masks, labels=labels)
                logits = outputs['logits']
                loss = outputs['loss']
                val_loss += loss.item()
                
                preds = logits.argmax(dim=-1).detach().cpu().numpy()
                all_preds_val.extend(preds)
                all_labels_val.extend(labels)
                
        epoch_val_loss = val_loss/len(val_loader)
        epoch_val_f1 = load_f1.compute(predictions=all_preds_val, references=all_labels_val, average='macro')["f1"]
        epoch_val_acc = load_accuracy.compute(predictions=all_preds_val, references=all_labels_val)["accuracy"]
        vlossi.append(epoch_val_loss)
        vf1s.append(epoch_val_f1)

        val_metrics_sum = epoch_val_f1 + epoch_val_acc
        # save best model
        if val_metrics_sum > best_val_metrics_sum:
            best_val_metrics_sum = val_metrics_sum
            torch.save(model.state_dict(), f'{save_fname}.pth')
            
            
        print(f'Epoch {epoch+1} | Train Loss: {epoch_train_loss:.4f} | Train F1: {epoch_train_f1:.4f} | Val Loss: {epoch_val_loss:.4f} | Val F1: {epoch_val_f1:.4f} | Val Acc: {epoch_val_acc:.4f}')
        torch.cuda.empty_cache()
        

In [None]:
train_lossi = []
train_f1s = []
val_lossi = []
val_f1s = []

s = time.time()
train_loop_triplets(MODEL, train_loader, val_loader, EPOCHS, CLF_LR, BACKBONE_LR, WD, train_lossi, val_lossi, train_f1s, val_f1s, 'aftern_wprompt_tploss_weights')
e = time.time()
print(f'Training completed in {(e-s)/60:.4f} mins')

  0%|          | 0/35 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/428 [00:00<?, ?it/s]

Epoch 1 | Train Loss: 1.7778 | Train F1: 0.3008 | Val Loss: 1.2580 | Val F1: 0.3669 | Val Acc: 0.4305


Epoch 2:   0%|          | 0/428 [00:00<?, ?it/s]



Epoch 2 | Train Loss: 1.7332 | Train F1: 0.3593 | Val Loss: 1.2298 | Val F1: 0.4333 | Val Acc: 0.4578


Epoch 3:   0%|          | 0/428 [00:00<?, ?it/s]



Epoch 3 | Train Loss: 1.7243 | Train F1: 0.3749 | Val Loss: 1.2256 | Val F1: 0.4076 | Val Acc: 0.4550


Epoch 4:   0%|          | 0/428 [00:00<?, ?it/s]



Epoch 4 | Train Loss: 1.7067 | Train F1: 0.3761 | Val Loss: 1.2221 | Val F1: 0.4486 | Val Acc: 0.4532


Epoch 5:   0%|          | 0/428 [00:00<?, ?it/s]



Epoch 5 | Train Loss: 1.7045 | Train F1: 0.3726 | Val Loss: 1.2263 | Val F1: 0.4569 | Val Acc: 0.4596


Epoch 6:   0%|          | 0/428 [00:00<?, ?it/s]



Epoch 6 | Train Loss: 1.7021 | Train F1: 0.3892 | Val Loss: 1.2118 | Val F1: 0.4475 | Val Acc: 0.4614


Epoch 7:   0%|          | 0/428 [00:00<?, ?it/s]



Epoch 7 | Train Loss: 1.6949 | Train F1: 0.3832 | Val Loss: 1.1924 | Val F1: 0.4407 | Val Acc: 0.4469


Epoch 8:   0%|          | 0/428 [00:00<?, ?it/s]



Epoch 8 | Train Loss: 1.6913 | Train F1: 0.3914 | Val Loss: 1.2071 | Val F1: 0.4523 | Val Acc: 0.4559
unfreezing all


Epoch 9:   0%|          | 0/428 [00:00<?, ?it/s]

Epoch 9 | Train Loss: 1.3516 | Train F1: 0.4612 | Val Loss: 1.0697 | Val F1: 0.4172 | Val Acc: 0.5150


Epoch 10:   0%|          | 0/428 [00:00<?, ?it/s]

Epoch 10 | Train Loss: 1.2108 | Train F1: 0.5132 | Val Loss: 1.0982 | Val F1: 0.5129 | Val Acc: 0.5132


Epoch 11:   0%|          | 0/428 [00:00<?, ?it/s]

Epoch 11 | Train Loss: 1.1192 | Train F1: 0.5519 | Val Loss: 1.0877 | Val F1: 0.5455 | Val Acc: 0.5604


Epoch 12:   0%|          | 0/428 [00:00<?, ?it/s]

Epoch 12 | Train Loss: 1.0527 | Train F1: 0.5875 | Val Loss: 1.1796 | Val F1: 0.5242 | Val Acc: 0.5395


Epoch 13:   0%|          | 0/428 [00:00<?, ?it/s]

Epoch 13 | Train Loss: 0.9937 | Train F1: 0.6115 | Val Loss: 1.1313 | Val F1: 0.5378 | Val Acc: 0.5513


Epoch 14:   0%|          | 0/428 [00:00<?, ?it/s]

Epoch 14 | Train Loss: 0.9348 | Train F1: 0.6488 | Val Loss: 1.2247 | Val F1: 0.5495 | Val Acc: 0.5486


Epoch 15:   0%|          | 0/428 [00:00<?, ?it/s]

Epoch 15 | Train Loss: 0.8796 | Train F1: 0.6658 | Val Loss: 1.1949 | Val F1: 0.5404 | Val Acc: 0.5568


Epoch 16:   0%|          | 0/428 [00:00<?, ?it/s]

Epoch 16 | Train Loss: 0.8288 | Train F1: 0.6913 | Val Loss: 1.2745 | Val F1: 0.5335 | Val Acc: 0.5304


Epoch 17:   0%|          | 0/428 [00:00<?, ?it/s]

Epoch 17 | Train Loss: 0.8008 | Train F1: 0.7042 | Val Loss: 1.3770 | Val F1: 0.5493 | Val Acc: 0.5568


Epoch 18:   0%|          | 0/428 [00:00<?, ?it/s]

Epoch 18 | Train Loss: 0.7584 | Train F1: 0.7247 | Val Loss: 1.4148 | Val F1: 0.5433 | Val Acc: 0.5477


Epoch 19:   0%|          | 0/428 [00:00<?, ?it/s]

Epoch 19 | Train Loss: 0.7257 | Train F1: 0.7433 | Val Loss: 1.4755 | Val F1: 0.5393 | Val Acc: 0.5431


Epoch 20:   0%|          | 0/428 [00:00<?, ?it/s]

Epoch 20 | Train Loss: 0.6845 | Train F1: 0.7510 | Val Loss: 1.4438 | Val F1: 0.5456 | Val Acc: 0.5495


Epoch 21:   0%|          | 0/428 [00:00<?, ?it/s]

Epoch 21 | Train Loss: 0.6605 | Train F1: 0.7628 | Val Loss: 1.5174 | Val F1: 0.5420 | Val Acc: 0.5450


Epoch 22:   0%|          | 0/428 [00:00<?, ?it/s]

Epoch 22 | Train Loss: 0.6332 | Train F1: 0.7700 | Val Loss: 1.5904 | Val F1: 0.5327 | Val Acc: 0.5341


Epoch 23:   0%|          | 0/428 [00:00<?, ?it/s]

Epoch 23 | Train Loss: 0.6216 | Train F1: 0.7812 | Val Loss: 1.6359 | Val F1: 0.5253 | Val Acc: 0.5268


Epoch 24:   0%|          | 0/428 [00:00<?, ?it/s]

In [None]:
# блин, забрали у меня гпу компьют, удалили мой чекпоинт и не дали посчитать метрики на тесте, а вернут гпу только через несколько дней..
# но судя по метрикам на обучении, качество должно быть сравнимым с версией модели без триплет лосса, может чуть лучше
# моё скромное предположение что аккураси на тесте была бы ~0.5727

# в колабе уместил в гпу attention_mask для всех инпутов в триплет лоссе, но там комьют забрали ещё быстрее)
# с масками возможно удалось бы добиться качества повыше, эх

# Paraphraze augmentations?