# setup

In [65]:
import os
from tqdm.auto import tqdm
import time
import numpy as np
import pandas as pd
import json
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, TrainerCallback, AutoConfig, AutoModel, DataCollatorWithPadding, get_linear_schedule_with_warmup
from peft import PromptTuningConfig, PromptTuningInit, PeftModelForSequenceClassification, LoraConfig
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import torch
from torch import nn
from torch.utils.data import DataLoader
from torch.optim import AdamW

MY_SEED = 59
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
BATCH_SIZE = 16
MAX_LENGTH = 256
PROB_LR = 5e-3

In [8]:
os.environ['TOKENIZERS_PARALLELISM'] = 'true'

# data

In [9]:
dataset = load_dataset('dair-ai/emotion')
labels_list = dataset['train'].features['label'].names
label2id = {k:v for v, k in enumerate(labels_list)}
id2label = {v:k for v, k in enumerate(labels_list)}
print(dataset)
print(label2id)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})
{'sadness': 0, 'joy': 1, 'love': 2, 'anger': 3, 'fear': 4, 'surprise': 5}


In [10]:
tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-uncased')

In [11]:
def tokenize_function(examples):
    return tokenizer(examples['text'], max_length=MAX_LENGTH, padding='max_length', truncation=True, return_tensors='pt')
    
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns='text')
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

# check

In [5]:
model = AutoModelForSequenceClassification.from_pretrained('google-bert/bert-base-uncased', num_labels=6)
model.to(DEVICE);

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
def compute_metrics(model, tokenized_dataset, split, labels_list, device, batch_size):

    dataset = tokenized_dataset[split]
    model.to(device)
    model.eval()
    
    all_predictions = []
    all_labels = []
    
    for i in tqdm(range(0, len(dataset), batch_size)):
        batch = dataset[i:i+batch_size]
        
        inputs = {
            'input_ids': torch.tensor(batch['input_ids']).to(device),
            'attention_mask': torch.tensor(batch['attention_mask']).to(device)
        }
        labels = torch.tensor(batch['labels']).to(device)
        
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs['logits'] if isinstance(outputs, dict) else outputs
            predictions = torch.argmax(logits, dim=-1)
        
        all_predictions.extend(predictions.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())
    
    return classification_report(
        all_labels,
        all_predictions,
        target_names=labels_list,
        zero_division=0
    )

In [16]:
base_metrics = compute_metrics(model, tokenized_dataset, 'test', labels_list, DEVICE, BATCH_SIZE)
print(f'before training metrics:\n {base_metrics}')

  0%|          | 0/125 [00:00<?, ?it/s]

before training metrics:
               precision    recall  f1-score   support

     sadness       0.30      0.76      0.43       581
         joy       0.00      0.00      0.00       695
        love       0.10      0.18      0.13       159
       anger       0.10      0.05      0.07       275
        fear       0.15      0.04      0.07       224
    surprise       0.00      0.00      0.00        66

    accuracy                           0.25      2000
   macro avg       0.11      0.17      0.12      2000
weighted avg       0.12      0.25      0.15      2000



# fine-tune

In [17]:
def compute_eval_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {
        'accuracy': accuracy_score(labels, predictions),
        'precision': precision_score(labels, predictions, average='macro'),
        'recall': recall_score(labels, predictions, average='macro'),
        'f1': f1_score(labels, predictions, average='macro'),
    }

In [18]:
def print_trainable_parameters(model):
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())
    print(f'trainable parameters: {trainable_params:,}')
    print(f'total parameters: {total_params:,}')
    return trainable_params

In [10]:
print_trainable_parameters(model);

trainable parameters: 109,486,854
total parameters: 109,486,854


In [70]:
training_args = TrainingArguments(
    output_dir='./models',
    eval_strategy='epoch',
    logging_strategy='steps',
    logging_steps=25,
    save_strategy='epoch',
    num_train_epochs=10,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    logging_dir='./logs',
    metric_for_best_model='f1',
    load_best_model_at_end=True,
    save_total_limit=5,
    seed=MY_SEED,
    report_to='none',
    disable_tqdm=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    compute_metrics=compute_eval_metrics,
    #callbacks=[LossLoggingCallback()],
)

No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [34]:
torch.cuda.empty_cache()
start_time = time.time()
torch.cuda.reset_peak_memory_stats()

trainer.train()
training_time = time.time() - start_time
mem_usage = torch.cuda.max_memory_allocated() / (1024 * 1024)

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.196,0.191988,0.931,0.930979,0.87624,0.899049
2,0.113,0.167218,0.936,0.916049,0.909087,0.909891
3,0.1512,0.213423,0.9295,0.912006,0.903183,0.905983
4,0.0495,0.263794,0.9285,0.904133,0.900045,0.90065
5,0.0643,0.268314,0.944,0.925033,0.917285,0.92069
6,0.022,0.354857,0.935,0.905201,0.90997,0.907333
7,0.0172,0.360507,0.9405,0.916994,0.912228,0.914499
8,0.0217,0.404772,0.9395,0.922165,0.907738,0.91451
9,0.0078,0.411763,0.941,0.917558,0.911605,0.914533
10,0.014,0.43065,0.9385,0.913446,0.908424,0.910809


In [None]:
print(f'gpu max memory util: {mem_usage:.4f} mb')
print(f'train time: {(training_time/60):.4f} mins')

gpu max memory util: 4686.5547 mb
train time: 73.9623 mins


  0%|          | 0/125 [00:00<?, ?it/s]

In [None]:
finetune_metrics = compute_metrics(model, tokenized_dataset, 'test', labels_list, DEVICE, BATCH_SIZE)
print(f'after full fine-tune metrics:\n {finetune_metrics}')

# linear-probing

In [41]:
class BertBetterHead(nn.Module):
    def __init__(self, model_name='google-bert/bert-base-uncased', num_labels=6):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.loss_fn = nn.CrossEntropyLoss()
        self.num_labels = num_labels
        config = AutoConfig.from_pretrained(model_name)
        
        for param in self.bert.parameters():
            param.requires_grad = False
            
        self.head = nn.Sequential(
            nn.Dropout(0.1),
            nn.Linear(config.hidden_size, 512),
            nn.LayerNorm(512),
            nn.LeakyReLU(),
            nn.Linear(512, num_labels)
        )

        self.init_weights()

    def init_weights(self):
        for module in self.head.modules():
            if isinstance(module, nn.Linear):
                nn.init.normal_(module.weight, std=0.02)
                if module.bias is not None:
                    nn.init.zeros_(module.bias)
        
        
    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        embed = outputs.last_hidden_state[:, 0]
        logits = self.head(embed)

        if labels is not None:
            loss = self.loss_fn(logits.view(-1, self.num_labels), labels.view(-1))
            return {'loss': loss, 'logits': logits}
        
        return {'logits': logits}

Сделал два слоя в голове с LeakyReLU, чтобы модель могла находить нелинейные зависимости, перед активацией добавил LayerNorm, чтобы стабилизировать распределения логитов, особенно в самом начале обучения, когда веса рандомные, также добавил Dropout, т.к. модель обучаем глубокую, может быстро переобучиться.

In [42]:
model = BertBetterHead()
model.to(DEVICE);

In [43]:
print_trainable_parameters(model);

trainable parameters: 397,830
total parameters: 109,880,070


In [44]:
training_args = TrainingArguments(
    output_dir='./models',
    eval_strategy='epoch',
    logging_strategy='steps',
    logging_steps=25,
    learning_rate=PROB_LR,
    save_strategy='epoch',
    num_train_epochs=20,
    per_device_train_batch_size=BATCH_SIZE*2,
    per_device_eval_batch_size=BATCH_SIZE*2,
    logging_dir='./logs',
    metric_for_best_model='f1',
    load_best_model_at_end=True,
    save_total_limit=5,
    seed=MY_SEED,
    report_to='none',
    disable_tqdm=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    compute_metrics=compute_eval_metrics,
)

In [45]:
torch.cuda.empty_cache()
start_time = time.time()
torch.cuda.reset_peak_memory_stats()

trainer.train()
training_time = time.time() - start_time
mem_usage = torch.cuda.max_memory_allocated() / (1024 * 1024)

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2946,1.206687,0.5515,0.362292,0.356712,0.334277
2,1.2258,1.15427,0.5685,0.437103,0.399231,0.394457
3,1.1903,1.135977,0.584,0.526561,0.440907,0.44663
4,1.1323,1.130084,0.571,0.586067,0.405753,0.402018
5,1.176,1.093585,0.5885,0.557691,0.412124,0.417679
6,1.1491,1.097489,0.577,0.613276,0.383644,0.389961
7,1.1936,1.116721,0.5785,0.508712,0.388048,0.380577
8,1.1055,1.084485,0.5855,0.570297,0.433671,0.434733
9,1.1274,1.089316,0.571,0.578818,0.40405,0.414094
10,1.0857,1.062843,0.5865,0.564909,0.425229,0.437346


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [46]:
print(f'gpu max memory util: {mem_usage:.4f} mb')
print(f'train time: {(training_time/60):.4f} mins')

gpu max memory util: 1315.3203 mb
train time: 46.9595 mins


In [47]:
probing_metrics = compute_metrics(model, tokenized_dataset, 'test', labels_list, DEVICE, BATCH_SIZE)
print(f'linear-probing metrics:\n {probing_metrics}')

  0%|          | 0/125 [00:00<?, ?it/s]

linear-probing metrics:
               precision    recall  f1-score   support

     sadness       0.57      0.72      0.64       581
         joy       0.66      0.79      0.72       695
        love       0.45      0.21      0.28       159
       anger       0.58      0.37      0.45       275
        fear       0.55      0.40      0.46       224
    surprise       0.53      0.24      0.33        66

    accuracy                           0.60      2000
   macro avg       0.56      0.45      0.48      2000
weighted avg       0.59      0.60      0.58      2000



# prompt-tuning

Prompt-tuning проще в реализации и менее вычислительно нагружен, для нашей простой задачи классификации в самый раз.

In [49]:
bert_base = AutoModelForSequenceClassification.from_pretrained('google-bert/bert-base-uncased', 
                                                               num_labels=6, return_dict=True)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [50]:
peft_config = PromptTuningConfig(
    task_type='SEQ_CLS',
    num_virtual_tokens=20,
    token_dim=768,
    num_transformer_submodules=1,
    num_attention_heads=12,
    num_layers=12,
    prompt_tuning_init='TEXT',
    prompt_tuning_init_text='Classify the emotion of this text:',
    tokenizer_name_or_path='google-bert/bert-base-uncased',
)

In [51]:
model = PeftModelForSequenceClassification(bert_base, peft_config)
model.to(DEVICE)
print_trainable_parameters(model);

trainable parameters: 19,974
total parameters: 109,506,828


In [58]:
data_collator = DataCollatorWithPadding(tokenizer)

train_dataloader = DataLoader(tokenized_dataset['train'], shuffle=True, collate_fn=data_collator, batch_size=BATCH_SIZE*2)
valid_dataloader = DataLoader(tokenized_dataset['validation'], shuffle=False, collate_fn=data_collator, batch_size=BATCH_SIZE*2)

In [60]:
num_epochs = 8
criterion = torch.nn.CrossEntropyLoss()

optimizer = AdamW(params=model.parameters(), lr=PROB_LR)

lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0.06 * (len(train_dataloader) * num_epochs),
    num_training_steps=(len(train_dataloader) * num_epochs),
)

Почему-то peft криво работает с trainer в transformers, пришлось писать трейнлуп

In [61]:
torch.cuda.empty_cache()
start_time = time.time()
torch.cuda.reset_peak_memory_stats()

best_f1 = 0
best_model_state = None

for epoch in range(num_epochs):
    model.train()
    total_train_loss = 0
    all_train_preds = []
    all_train_labels = []
    
    for batch in tqdm(train_dataloader):
        inputs = {k: v.to(DEVICE) for k, v in batch.items() if k != 'labels'}
        labels = batch['labels'].to(DEVICE)
        
        optimizer.zero_grad()
        outputs = model(**inputs)
        logits = outputs['logits']
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        
        total_train_loss += loss.item()
        preds = torch.argmax(outputs.logits, dim=-1)
        all_train_preds.extend(preds.cpu().numpy())
        all_train_labels.extend(labels.cpu().numpy())
    
    avg_train_loss = total_train_loss / len(train_dataloader)
    train_acc = accuracy_score(all_train_labels, all_train_preds)
    train_f1 = f1_score(all_train_labels, all_train_preds, average='macro')
    
    model.eval()
    total_eval_loss = 0
    all_eval_preds = []
    all_eval_labels = []
    
    for batch in tqdm(valid_dataloader):
        inputs = {k: v.to(DEVICE) for k, v in batch.items() if k != 'labels'}
        labels = batch['labels'].to(DEVICE)
        
        with torch.no_grad():
            outputs = model(**inputs)

        logits = outputs['logits']
        loss = criterion(logits, labels)
        total_eval_loss += loss.item()
        preds = torch.argmax(outputs.logits, dim=-1)
        all_eval_preds.extend(preds.cpu().numpy())
        all_eval_labels.extend(labels.cpu().numpy())
    
    avg_eval_loss = total_eval_loss / len(valid_dataloader)
    eval_acc = accuracy_score(all_eval_labels, all_eval_preds)
    eval_f1 = f1_score(all_eval_labels, all_eval_preds, average='macro')
    
    if eval_f1 > best_f1:
        best_f1 = eval_f1
        best_model_state = model.state_dict()
        torch.save(best_model_state, f'best_model_epoch_{epoch}.pt')
    
    print(f"Train Loss: {avg_train_loss:.4f} | Acc: {train_acc:.4f} | F1: {train_f1:.4f}")
    print(f"Val Loss: {avg_eval_loss:.4f} | Acc: {eval_acc:.4f} | F1: {eval_f1:.4f}")

training_time = time.time() - start_time
mem_usage = torch.cuda.max_memory_allocated() / (1024 ** 2)

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/63 [00:00<?, ?it/s]

Train Loss: 0.5817 | Acc: 0.7962 | F1: 0.7406
Val Loss: 0.4232 | Acc: 0.8585 | F1: 0.8307


  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/63 [00:00<?, ?it/s]

Train Loss: 0.5009 | Acc: 0.8246 | F1: 0.7781
Val Loss: 0.3773 | Acc: 0.8785 | F1: 0.8496


  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/63 [00:00<?, ?it/s]

Train Loss: 0.4483 | Acc: 0.8448 | F1: 0.8029
Val Loss: 0.3282 | Acc: 0.8930 | F1: 0.8714


  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/63 [00:00<?, ?it/s]

Train Loss: 0.4073 | Acc: 0.8559 | F1: 0.8160
Val Loss: 0.3594 | Acc: 0.8855 | F1: 0.8588


  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/63 [00:00<?, ?it/s]

Train Loss: 0.3788 | Acc: 0.8668 | F1: 0.8302
Val Loss: 0.3023 | Acc: 0.8905 | F1: 0.8638


  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/63 [00:00<?, ?it/s]

Train Loss: 0.3498 | Acc: 0.8756 | F1: 0.8405
Val Loss: 0.3036 | Acc: 0.8930 | F1: 0.8692


  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/63 [00:00<?, ?it/s]

Train Loss: 0.3465 | Acc: 0.8764 | F1: 0.8387
Val Loss: 0.2668 | Acc: 0.8995 | F1: 0.8731


  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/63 [00:00<?, ?it/s]

Train Loss: 0.3179 | Acc: 0.8861 | F1: 0.8477
Val Loss: 0.2586 | Acc: 0.9040 | F1: 0.8815


In [62]:
print(f'gpu max memory util: {mem_usage:.4f} mb')
print(f'train time: {(training_time/60):.4f} mins')

gpu max memory util: 7885.7739 mb
train time: 51.0354 mins


# lora

In [78]:
bert_base = AutoModelForSequenceClassification.from_pretrained('google-bert/bert-base-uncased', 
                                                               num_labels=6, return_dict=True)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Параметры особо не подбирал, просто поставил R чуть больше, чем дефолтные 8, увеличил alpha, чтобы эффект от лоры было лучше видно (будь он положительным или отрицательным), добавил dropout чтобы не переобучиться, т.к. адаптируем сразу все attention модули.

In [79]:
lora_config = LoraConfig(
    task_type='SEQ_CLS',
    r=12,
    lora_alpha=24,
    bias='lora_only',
    lora_dropout=0.1,
    target_modules=['query', 'value', 'key'],
)

In [80]:
model = PeftModelForSequenceClassification(bert_base, lora_config)
model.to(DEVICE)
print_trainable_parameters(model);

trainable parameters: 695,814
total parameters: 110,155,020


In [81]:
data_collator = DataCollatorWithPadding(tokenizer)

train_dataloader = DataLoader(tokenized_dataset['train'], shuffle=True, collate_fn=data_collator, batch_size=BATCH_SIZE*2)
valid_dataloader = DataLoader(tokenized_dataset['validation'], shuffle=False, collate_fn=data_collator, batch_size=BATCH_SIZE*2)

In [82]:
num_epochs = 7
criterion = torch.nn.CrossEntropyLoss()

optimizer = AdamW(params=model.parameters(), lr=PROB_LR)

lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0.06 * (len(train_dataloader) * num_epochs),
    num_training_steps=(len(train_dataloader) * num_epochs),
)

In [83]:
torch.cuda.empty_cache()
start_time = time.time()
torch.cuda.reset_peak_memory_stats()

best_f1 = 0
best_model_state = None

for epoch in range(num_epochs):
    model.train()
    total_train_loss = 0
    all_train_preds = []
    all_train_labels = []
    
    for batch in tqdm(train_dataloader):
        inputs = {k: v.to(DEVICE) for k, v in batch.items() if k != 'labels'}
        labels = batch['labels'].to(DEVICE)
        
        optimizer.zero_grad()
        outputs = model(**inputs)
        logits = outputs['logits']
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        
        total_train_loss += loss.item()
        preds = torch.argmax(outputs.logits, dim=-1)
        all_train_preds.extend(preds.cpu().numpy())
        all_train_labels.extend(labels.cpu().numpy())
    
    avg_train_loss = total_train_loss / len(train_dataloader)
    train_acc = accuracy_score(all_train_labels, all_train_preds)
    train_f1 = f1_score(all_train_labels, all_train_preds, average='macro')
    
    model.eval()
    total_eval_loss = 0
    all_eval_preds = []
    all_eval_labels = []
    
    for batch in tqdm(valid_dataloader):
        inputs = {k: v.to(DEVICE) for k, v in batch.items() if k != 'labels'}
        labels = batch['labels'].to(DEVICE)
        
        with torch.no_grad():
            outputs = model(**inputs)

        logits = outputs['logits']
        loss = criterion(logits, labels)
        total_eval_loss += loss.item()
        preds = torch.argmax(outputs.logits, dim=-1)
        all_eval_preds.extend(preds.cpu().numpy())
        all_eval_labels.extend(labels.cpu().numpy())
    
    avg_eval_loss = total_eval_loss / len(valid_dataloader)
    eval_acc = accuracy_score(all_eval_labels, all_eval_preds)
    eval_f1 = f1_score(all_eval_labels, all_eval_preds, average='macro')
    
    if eval_f1 > best_f1:
        best_f1 = eval_f1
        best_model_state = model.state_dict()
        torch.save(best_model_state, f'best_model_epoch_{epoch}.pt')
    
    print(f"Train Loss: {avg_train_loss:.4f} | Acc: {train_acc:.4f} | F1: {train_f1:.4f}")
    print(f"Val Loss: {avg_eval_loss:.4f} | Acc: {eval_acc:.4f} | F1: {eval_f1:.4f}")

training_time = time.time() - start_time
mem_usage = torch.cuda.max_memory_allocated() / (1024 ** 2)

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/63 [00:00<?, ?it/s]

Train Loss: 1.5422 | Acc: 0.3641 | F1: 0.1801
Val Loss: 1.6208 | Acc: 0.2750 | F1: 0.0719


  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/63 [00:00<?, ?it/s]

Train Loss: 1.6024 | Acc: 0.3139 | F1: 0.1244
Val Loss: 1.6021 | Acc: 0.3520 | F1: 0.0868


  0%|          | 0/500 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [87]:
mem_usage = torch.cuda.max_memory_allocated() / (1024 ** 2)
mem_usage

8526.68798828125

# summary

| training method | f1 macro | train params. | gpu usage (mb) | train time (mins) | epochs |
| --- | --- | --- | --- | --- | --- |
| without training | 0.06 | - | - | - | - |
| full fine-tuning | 0.921 | 109,486,854 | 4686.5 | 73.9 | 10 |
| linear probing | 0.48  | 398,342 | 1315.3 | 46 | 20 |
| prompt-tuning | 0.881 | 19,974 | 7885 | 51 | 8 |
| lora | 0.086 | 695,814 | 8526 | 12 | 2 |

Полный файн-тюнинг - относительно хорошие метрики, долгое обучение и потребление ресурсов.

Кастомная голова - метрики сильно ниже, возможно из-за того, что используем нативный берт на задачу, связанную с эмоциями.

Prompt-tuning - медленное обучение (возможно из-за dataloader'ов), мало обучаемых параметров, метрики выше, чем на кастомной голове, F1 сравним с полным тюнингом. Если подобрать параметры лучше и поучить чуть дольше, возможно превзойдет полный тюнинг.

LoRA - слегка по таймингу не успел обучить, упс, минус баллы. Но по первым эпохам динамика довольно странная, как будто она плохо обучится, скорее всего ошибка в том, что я в lora_modules указал key value queue вместо просто attention