## Notebook for models

Overall plan is to train 4 models:
- SVM on top of the herbert-large-uncased
- From scratch model enkoder-only with size similar herbert-large
- Finetunning on herbert-large with add of classification head to the model
- Finetunning herber large: contrastive learning on the meaning -> classification finetune 

### Imports & setup

In [1]:
import os
import time

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, f1_score
from sklearn.svm import SVC
from torch.utils.data import Dataset
from transformers import AutoModel, AutoTokenizer
from tqdm import tqdm
from torch.utils.data import DataLoader
from torch.optim import AdamW
import torch.nn.functional as F

In [2]:
!nvidia-smi

Wed Nov 26 19:51:21 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          On  |   00000000:87:00.0 Off |                    0 |
| N/A   30C    P0             54W /  400W |       0MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [3]:
!export HF_HOME="/net/tscratch/people/plgboksa/lab3_computional_ling_better_final/model"
!export HF_DATASETS_CACHE="/net/tscratch/people/plgboksa/lab3_computional_ling_better_final/cache"
!export HF_METRICS_CACHE="/net/tscratch/people/plgboksa/lab3_computional_ling_better_final/cache"

In [4]:
MODEL_NAME = "allegro/herbert-large-cased"
tokenizer = AutoTokenizer.from_pretrained("allegro/herbert-large-cased")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"Device: {device}")

Device: cuda


### Datasets preparation

In [5]:
class SlangClassificationDataset(Dataset):
    def __init__(self, path, tokenizer, max_len=128):
        self.df = pd.read_csv(path)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        text = row["text"]
        label = row["label"]

        encoding = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
        )

        return {
            "input_ids": encoding["input_ids"].squeeze(),
            "attention_mask": encoding["attention_mask"].squeeze(),
            "labels": torch.tensor(label, dtype=torch.long),
        }

In [6]:
train_datast = SlangClassificationDataset("prepared_data/train.csv", tokenizer)
val_datast = SlangClassificationDataset("prepared_data/val.csv", tokenizer)

print(f"Train dataset size: {len(train_datast)}")
print(f"Validation dataset size: {len(val_datast)}")

Train dataset size: 4336
Validation dataset size: 542


In [7]:
test_dataset = SlangClassificationDataset("prepared_data/test.csv", tokenizer)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [8]:
def verify_model(model, dataset, batch_size):
    model.eval()
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"]
            attention_mask = batch["attention_mask"]
            labels = batch["labels"]

            probs = model.forward(input_ids, attention_mask)
            y_pred = torch.argmax(probs, dim=1)

            all_preds.append(y_pred.cpu())
            all_labels.append(labels.cpu())

    y_pred_cpu = torch.cat(all_preds).numpy()
    y_true = torch.cat(all_labels).numpy()

    return {
        "f1": f1_score(y_true, y_pred_cpu, average="weighted"),
        "accuracy": accuracy_score(y_true, y_pred_cpu),
    }


def inference_with_timing(model, dataloader, device=None, desc="Inference"):
    all_preds = []
    batch_times = []

    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model.eval()

    for batch in tqdm(dataloader, desc=desc, unit="batch"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        start_batch = time.time()
        probs = model.forward(input_ids, attention_mask)
        y_pred = torch.argmax(probs, dim=1)
        end_batch = time.time()

        batch_times.append(end_batch - start_batch)
        all_preds.append(y_pred.cpu())

    avg_batch_time = sum(batch_times) / len(batch_times)

    y_pred_all = torch.cat(all_preds)
    return y_pred_all, avg_batch_time

### 1. HerBERTSVM

Attached SVM on top of the Herbert embeddings.

In [12]:
class HerBERTSVM(nn.Module):
    def __init__(
        self, model_name="allegro/herbert-large-cased", svm_kernel="linear", svm_C=1.0
    ):
        super().__init__()
        self.device = device

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.model.to(self.device)
        self.model.eval()

        for param in self.model.parameters():
            param.requires_grad = False

        self.clf = SVC(kernel=svm_kernel, C=svm_C, probability=True)

    def encode_texts(self, input_ids, attention_mask, batch_size=16) -> np.ndarray:
        all_embeddings = []

        for i in range(0, input_ids.size(0), batch_size):
            batch_input_ids = input_ids[i : i + batch_size].to(self.device)
            batch_attention_mask = attention_mask[i : i + batch_size].to(self.device)

            with torch.no_grad():
                outputs = self.model(
                    input_ids=batch_input_ids, attention_mask=batch_attention_mask
                )
                cls_embeddings = outputs.last_hidden_state[:, 0, :]

            all_embeddings.append(cls_embeddings.cpu().numpy())

        return np.vstack(all_embeddings)

    def predict(self, input_ids, attention_mask) -> np.ndarray:
        X = self.encode_texts(input_ids, attention_mask)
        return self.clf.predict(X)

    def predict_proba(self, input_ids, attention_mask) -> torch.Tensor:
        X = self.encode_texts(input_ids, attention_mask)
        probs = self.clf.predict_proba(X)
        return torch.tensor(probs, dtype=torch.float32)

    def forward(self, input_ids, attention_mask, batch_size=16) -> torch.Tensor:
        return self.predict_proba(input_ids, attention_mask)

In [18]:
herbert_svm = HerBERTSVM()

Some weights of the model checkpoint at allegro/herbert-large-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.sso.sso_relationship.bias', 'cls.sso.sso_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### HerBERTSVM training

In [None]:
batch_size = 16
train_loader = DataLoader(train_datast, batch_size=batch_size, shuffle=False)

all_embeddings = []
all_labels = []


print("Start training HerBERT + SVM model")
start_time = time.time()

with torch.no_grad():
    for batch in tqdm(train_loader, desc="Extracting CLS embeddings", unit="batch"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"]

        outputs = herbert_svm.model(input_ids=input_ids, attention_mask=attention_mask)
        cls_embeddings = outputs.last_hidden_state[:, 0, :]

        all_embeddings.append(cls_embeddings.cpu().numpy())
        all_labels.append(labels.cpu().numpy())


X_train = np.vstack(all_embeddings)
y_train = np.concatenate(all_labels)

print("Start SVM Fit")
herbert_svm.clf.fit(X_train, y_train)

end_time = time.time()
print(f"Training lasts for: {end_time - start_time:.2f} seconds")

Start training HerBERT + SVM model


Extracting CLS embeddings: 100%|████████████████████████████████████████████████████████████████████████████████████| 271/271 [00:23<00:00, 11.30batch/s]


Start SVM Fit
Training lasts for: 62.30 seconds


### Test SVM on evaluation set

In [21]:
svm_metrics = verify_model(herbert_svm, val_datast, batch_size=16)
print(f"SVM Validation F1 Score: {svm_metrics['f1']:.4f}")
print(f"SVM Validation Accuracy: {svm_metrics['accuracy']:.4f}")

SVM Validation F1 Score: 0.6491
SVM Validation Accuracy: 0.6605


SVM was trained for 62.30s with embedding creation.

It was able to reach accuracy of 0.6605 and f1_Score of 0.6491.

Not bad, not bad.

Let's test inference.

In [27]:
test_svm_text = "bambik z ciebie jest leszczu"

tok = herbert_svm.tokenizer(
    test_svm_text,
    padding="max_length",
    truncation=True,
    max_length=128,
    return_tensors="pt",
)
input_ids = tok["input_ids"]
attention_mask = tok["attention_mask"]

probs = herbert_svm.forward(input_ids=input_ids, attention_mask=attention_mask)
print(torch.argmax(probs, dim=1).item())

0


### Test data evaluation



In [30]:
svm_metrics_test = verify_model(herbert_svm, test_dataset, batch_size=16)
print(f"SVM Test F1 Score: {svm_metrics_test['f1']:.4f}")
print(f"SVM Test Accuracy: {svm_metrics_test['accuracy']:.4f}")

SVM Test F1 Score: 0.6198
SVM Test Accuracy: 0.6354


**inference_time** for batch of size 16

In [34]:
y_pred_all, avg_batch_time = inference_with_timing(
    herbert_svm, test_loader, device=device
)

print(f"**inference_time** for batch of size 16: {avg_batch_time:.4f} seconds")
print(
    f"Test accuracy: {accuracy_score(test_dataset.df['label'], y_pred_all.numpy()):.4f}"
)
print(
    f"Test f1: {f1_score(test_dataset.df['label'], y_pred_all.numpy(), average='weighted'):.4f}"
)

Inference: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 34/34 [00:03<00:00,  8.94batch/s]

Average inference time per batch: 0.1030 seconds
**inference_time** for batch of size 16: 0.1030 seconds
Test accuracy: 0.6354
Test f1: 0.6198





**summary for herbert_svm**

Training time: 62.30s

Average inference_time per batch of size 16: 0.1030 seconds

Test accuracy: 0.6354

Test f1: 0,6198

Let's move to the neural networks for the task



### Neural networks helper

In [9]:
class TrainingMetrics:
    def __init__(self):
        self.train_losses = []
        self.val_losses = []
        self.val_accuracies = []
        self.val_f1s = []

    def insert(self, train_loss, val_loss, val_accuracy, val_f1):
        self.train_losses.append(train_loss)
        self.val_losses.append(val_loss)
        self.val_accuracies.append(val_accuracy)
        self.val_f1s.append(val_f1)

    def to_pd(self):
        df = pd.DataFrame(
            {
                "train_loss": self.train_losses,
                "val_loss": self.val_losses,
                "val_accuracy": self.val_accuracies,
                "val_f1": self.val_f1s,
            }
        )
        return df

In [14]:
def evaluate(model: nn.Module, val_loader: DataLoader, loss_fn) -> dict:
    model.eval()
    total_loss = 0.0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids, attention_mask)
            loss = loss_fn(outputs, labels)
            total_loss += loss.item()

            y_pred = torch.argmax(outputs, dim=1)
            all_preds.append(y_pred.cpu())
            all_labels.append(labels.cpu())

    avg_loss = total_loss / len(val_loader)
    y_pred_cpu = torch.cat(all_preds).numpy()
    y_true = torch.cat(all_labels).numpy()

    accuracy = accuracy_score(y_true, y_pred_cpu)
    f1 = f1_score(y_true, y_pred_cpu, average="weighted")

    return {
        "val_loss": avg_loss,
        "val_accuracy": accuracy,
        "val_f1": f1,
    }


def train(
    model: nn.Module,
    train_dataset: Dataset,
    val_dataset: Dataset,
    optimizer: torch.optim.Optimizer = None,
    epochs: int = 3,
    batch_size: int = 16,
    lr: float = 1e-4,
    log_freq: int = 10,
):
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    if optimizer is None:
        optimizer = AdamW(model.parameters(), lr=lr, weight_decay=1e-2)
    loss_fn = nn.CrossEntropyLoss()
    metrics = TrainingMetrics()

    for epoch in range(epochs):
        model.train()
        total_train_loss = 0.0

        for batch_idx, batch in enumerate(train_loader):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()

            total_train_loss += loss.item()

            if (batch_idx + 1) % log_freq == 0:
                avg_train_loss = total_train_loss / log_freq
                total_train_loss = 0.0

                eval_metrics = evaluate(model, val_loader, loss_fn)
                print(
                    f"Batch {batch_idx + 1} - Avg Train Loss: {avg_train_loss:.4f} "
                    f"Validation Loss: {eval_metrics['val_loss']:.4f}, "
                    f"Accuracy: {eval_metrics['val_accuracy']:.4f}, "
                    f"F1 Score: {eval_metrics['val_f1']:.4f}"
                )
                metrics.insert(
                    avg_train_loss,
                    eval_metrics["val_loss"],
                    eval_metrics["val_accuracy"],
                    eval_metrics["val_f1"],
                )

        print(f"Epoch {epoch + 1} completed, training metrics recorded.")

    return metrics

### Training enkoder-only model from scratch

In [15]:
class ScratchClassifier(nn.Module):
    def __init__(
        self,
        vocab_size,
        embed_dim=1024,
        num_heads=16,
        hidden_dim=4096,
        num_layers=24,
        max_len=128,
        num_labels=3,
        dropout=0.2,
    ):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.pos_embedding = nn.Parameter(torch.randn(1, max_len, embed_dim))

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            dim_feedforward=hidden_dim,
            dropout=dropout,
            batch_first=True,
            activation="gelu",
        )
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer, num_layers=num_layers
        )
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(embed_dim, num_labels)

    def forward(self, input_ids, attention_mask=None):
        x = self.embedding(input_ids)
        x = x + self.pos_embedding[:, : x.size(1), :]

        if attention_mask is not None:
            src_key_padding_mask = ~attention_mask.bool()
        else:
            src_key_padding_mask = None

        x = self.transformer_encoder(x, src_key_padding_mask=src_key_padding_mask)

        cls_token = x[:, 0, :]
        cls_token = self.dropout(cls_token)

        logits = self.classifier(cls_token)
        return logits

In [45]:
model = ScratchClassifier(vocab_size=tokenizer.vocab_size, num_labels=3)
model.to(device)
total_params = sum(p.numel() for p in model.parameters())
print(f"Model has {total_params:,} parameters")


Model has 353,643,523 parameters


In [None]:
start_time = time.time()

training_metrics = train(
    model, train_datast, val_datast, epochs=3, batch_size=16, lr=1e-5, log_freq=10
)

end_time = time.time()
total_training_time = end_time - start_time
print(f"Total training time: {total_training_time:.2f} seconds")

Batch 10 - Avg Train Loss: 1.1797 Validation Loss: 1.0338, Accuracy: 0.5000, F1 Score: 0.3521
Batch 20 - Avg Train Loss: 0.9945 Validation Loss: 1.0645, Accuracy: 0.3930, F1 Score: 0.3478
Batch 30 - Avg Train Loss: 1.0398 Validation Loss: 1.1414, Accuracy: 0.5037, F1 Score: 0.3374
Batch 40 - Avg Train Loss: 1.0603 Validation Loss: 1.0252, Accuracy: 0.5037, F1 Score: 0.3374
Batch 50 - Avg Train Loss: 1.1085 Validation Loss: 1.0558, Accuracy: 0.4742, F1 Score: 0.3994
Batch 60 - Avg Train Loss: 1.0511 Validation Loss: 1.0368, Accuracy: 0.5037, F1 Score: 0.3374
Batch 70 - Avg Train Loss: 1.0546 Validation Loss: 1.0221, Accuracy: 0.5074, F1 Score: 0.4170
Batch 80 - Avg Train Loss: 1.0815 Validation Loss: 1.0001, Accuracy: 0.5258, F1 Score: 0.3872
Batch 90 - Avg Train Loss: 1.0435 Validation Loss: 1.0295, Accuracy: 0.3469, F1 Score: 0.2632
Batch 100 - Avg Train Loss: 1.0424 Validation Loss: 1.0174, Accuracy: 0.5037, F1 Score: 0.3374
Batch 110 - Avg Train Loss: 0.9419 Validation Loss: 1.0043,

In [48]:
training_metrics.to_pd().to_csv(
    "results/scratch_model_training_metrics.csv", index=False
)

y_pred_all, avg_batch_time = inference_with_timing(model, test_loader, device=device)

print(f"**inference_time** for batch of size 16: {avg_batch_time:.4f} seconds")
print(
    f"Test accuracy: {accuracy_score(test_dataset.df['label'], y_pred_all.numpy()):.4f}"
)
print(
    f"Test f1: {f1_score(test_dataset.df['label'], y_pred_all.numpy(), average='weighted'):.4f}"
)

Inference: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 34/34 [00:03<00:00,  9.89batch/s]


**inference_time** for batch of size 16: 0.0216 seconds
Test accuracy: 0.5230
Test f1: 0.5289


**summary for scratch**

Training time: 325.61s

Average inference_time per batch of size 16: 0.0216 seconds

Test accuracy: 0.5230

Test f1: 0.5289

In [54]:
test_text = "bambik z ciebie jest leszczu"

tok = tokenizer(
    test_text,
    padding="max_length",
    truncation=True,
    max_length=128,
    return_tensors="pt",
)
input_ids = tok["input_ids"]
attention_mask = tok["attention_mask"]

input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)
model.to(device)

probs = model(input_ids=input_ids, attention_mask=attention_mask)
print(torch.argmax(probs, dim=1).item())

0


### Finetuning model for classification

In [11]:
class FinetuneModel(nn.Module):
    def __init__(
        self,
        model_name="allegro/herbert-large-cased",
        num_labels=3,
        dropout=0.3,
    ):
        super().__init__()
        self.model = AutoModel.from_pretrained(model_name, device_map=None)
        hidden_size = self.model.config.hidden_size

        for param in self.model.parameters():
            param.requires_grad = False

        for layer_idx in range(8, 12):
            for param in self.model.encoder.layer[layer_idx].parameters():
                param.requires_grad = True

        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(hidden_size, num_labels)
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask):
        out = self.model(input_ids=input_ids, attention_mask=attention_mask)

        if isinstance(out, tuple):
            hidden_state = out[0]
        else:
            hidden_state = out.last_hidden_state

        cls = hidden_state[:, 0, :]
        x = self.dropout(cls)
        logits = self.classifier(x)

        return logits

In [12]:
model = FinetuneModel()
model.to(device)

Some weights of the model checkpoint at allegro/herbert-large-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.sso.sso_relationship.bias', 'cls.sso.sso_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


FinetuneModel(
  (model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50000, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1e-1

In [None]:
start_time = time.time()

training_metrics = train(
    model, train_datast, val_datast, epochs=3, batch_size=16, lr=1e-5, log_freq=10
)

end_time = time.time()
total_training_time = end_time - start_time
print(f"Total training time: {total_training_time:.2f} seconds")

Batch 10 - Avg Train Loss: 1.1202 Validation Loss: 1.0708, Accuracy: 0.5018, F1 Score: 0.3402
Batch 20 - Avg Train Loss: 1.0616 Validation Loss: 1.0460, Accuracy: 0.5037, F1 Score: 0.3374
Batch 30 - Avg Train Loss: 1.0164 Validation Loss: 1.0293, Accuracy: 0.5037, F1 Score: 0.3374
Batch 40 - Avg Train Loss: 1.0127 Validation Loss: 1.0220, Accuracy: 0.5037, F1 Score: 0.3374
Batch 50 - Avg Train Loss: 0.9702 Validation Loss: 1.0194, Accuracy: 0.5037, F1 Score: 0.3374
Batch 60 - Avg Train Loss: 1.0519 Validation Loss: 1.0161, Accuracy: 0.5037, F1 Score: 0.3374
Batch 70 - Avg Train Loss: 1.0277 Validation Loss: 1.0138, Accuracy: 0.5037, F1 Score: 0.3374
Batch 80 - Avg Train Loss: 0.9656 Validation Loss: 1.0109, Accuracy: 0.5037, F1 Score: 0.3374
Batch 90 - Avg Train Loss: 0.9662 Validation Loss: 1.0086, Accuracy: 0.5037, F1 Score: 0.3374
Batch 100 - Avg Train Loss: 1.0041 Validation Loss: 1.0042, Accuracy: 0.5037, F1 Score: 0.3374
Batch 110 - Avg Train Loss: 0.9892 Validation Loss: 0.9985,

In [21]:
training_metrics.to_pd().to_csv(
    "results/finetune_model_training_metrics.csv", index=False
)

y_pred_all, avg_batch_time = inference_with_timing(model, test_loader, device=device)

print(f"**inference_time** for batch of size 16: {avg_batch_time:.4f} seconds")
print(
    f"Test accuracy: {accuracy_score(test_dataset.df['label'], y_pred_all.numpy()):.4f}"
)
print(
    f"Test f1: {f1_score(test_dataset.df['label'], y_pred_all.numpy(), average='weighted'):.4f}"
)

Inference: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 34/34 [00:03<00:00, 10.12batch/s]


**inference_time** for batch of size 16: 0.0177 seconds
Test accuracy: 0.7090
Test f1: 0.7092


In [22]:
test_text = "bambik z ciebie jest leszczu"

tok = tokenizer(
    test_text,
    padding="max_length",
    truncation=True,
    max_length=128,
    return_tensors="pt",
)
input_ids = tok["input_ids"]
attention_mask = tok["attention_mask"]

input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)
model.to(device)

probs = model(input_ids=input_ids, attention_mask=attention_mask)
print(torch.argmax(probs, dim=1).item())

0


**summary on finetunning:**


Training time: 397.82s

Average inference_time per batch of size 16: 0.0177 seconds

Test accuracy: 0.7090

Test f1: 0.7092

### Additional model

Get as best model as you can.

Plan: 
- finetune bert on contrastive learning with word <-> meaning
- then train classifier

In [16]:
class SlangContrastiveDataset(Dataset):
    def __init__(self, data, tokenizer, max_len=128):
        if isinstance(data, str):
            self.df = pd.read_csv(data)
        else:
            self.df = data

        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        word = row["word"]
        meaning = row["meaning"]

        word_enc = self.tokenizer(
            word,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        meaning_enc = self.tokenizer(
            meaning,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        return {
            "word_input_ids": word_enc["input_ids"].squeeze(),
            "word_attention_mask": word_enc["attention_mask"].squeeze(),
            "meaning_input_ids": meaning_enc["input_ids"].squeeze(),
            "meaning_attention_mask": meaning_enc["attention_mask"].squeeze(),
        }

In [11]:
train_df = pd.read_csv("prepared_data/train.csv")
eval_df = pd.read_csv("prepared_data/val.csv")

merged_df = pd.concat([train_df, eval_df], ignore_index=True)

contrastive_learning_dataset = SlangContrastiveDataset(merged_df, tokenizer)

print(f"Contrastive Learning Dataset size: {len(contrastive_learning_dataset)}")

Contrastive Learning Dataset size: 4878


In [17]:
from torch.nn import functional as F


def contrastive_loss(e_word, e_meaning, temperature=0.07):
    sim = torch.matmul(e_word, e_meaning.T)
    sim = sim / temperature

    labels = torch.arange(
        sim.size(0), device=sim.device
    ).long()  # on diagonal good pairs
    loss = F.cross_entropy(sim, labels)
    return loss

In [18]:
def contrastive_loss_training(
    model,
    dataset,
    epochs=3,
    batch_size=16,
    lr=3e-6,
    accumulation_steps=4,
    log_freq=10,
    temperature=0.07,
):
    train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    optimizer = AdamW(model.parameters(), lr=lr, weight_decay=1e-2)
    metrics = TrainingMetrics()

    for epoch in range(epochs):
        model.train()
        total_train_loss = 0.0

        for batch_idx, batch in enumerate(train_loader):
            word_ids = batch["word_input_ids"].to(device)
            word_mask = batch["word_attention_mask"].to(device)
            meaning_ids = batch["meaning_input_ids"].to(device)
            meaning_mask = batch["meaning_attention_mask"].to(device)

            e_word = model(word_ids, word_mask)
            e_meaning = model(meaning_ids, meaning_mask)

            loss = contrastive_loss(e_word, e_meaning, temperature=temperature)
            loss = loss / accumulation_steps
            loss.backward()

            if (batch_idx + 1) % accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                optimizer.zero_grad()

            total_train_loss += loss.item() * accumulation_steps

            if (batch_idx + 1) % log_freq == 0:
                avg_train_loss = total_train_loss / log_freq
                total_train_loss = 0.0

                print(f"Batch {batch_idx + 1} - Avg Train Loss: {avg_train_loss:.4f} ")
                metrics.insert(avg_train_loss, "", "", "")

        if (batch_idx + 1) % accumulation_steps != 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            optimizer.zero_grad()

        print(f"Epoch {epoch + 1} completed.")

    return metrics

In [19]:
def save_contrastive_model(model, tokenizer=None, path="models"):
    os.makedirs(path, exist_ok=True)

    torch.save(model.state_dict(), f"{path}/contrastive_finetunned.pt")

    if tokenizer is not None:
        tokenizer.save_pretrained(path)

    print(f"Model saved to: {path}")


def load_contrastive_model(model_class, path="models", tokenizer_class=None):
    model = model_class()
    model.load_state_dict(
        torch.load(os.path.join(path, "contrastive_finetunned.pt"), map_location="cpu")
    )

    tokenizer = None
    if tokenizer_class is not None:
        tokenizer = tokenizer_class.from_pretrained(path)

    print(f"Model loaded from: {path}")
    if tokenizer is not None:
        print("Tokenizer loaded as well.")

    return model, tokenizer

In [20]:
class ConstrastiveFinetuneModel(nn.Module):
    def __init__(
        self,
        model_name="allegro/herbert-large-cased",
        dropout=0.15,
    ):
        super().__init__()
        self.model = AutoModel.from_pretrained(model_name)
        hidden_size = self.model.config.hidden_size

        for param in self.model.parameters():
            param.requires_grad = False

        for layer_idx in range(8, 12):
            for param in self.model.encoder.layer[layer_idx].parameters():
                param.requires_grad = True

        self.projection = nn.Sequential(nn.Linear(hidden_size, 256), nn.LayerNorm(256))

    def forward(self, input_ids, attention_mask):
        last_hidden = self.model(
            input_ids=input_ids, attention_mask=attention_mask
        ).last_hidden_state
        cls_emb = last_hidden[:, 0, :]
        projected = self.projection(cls_emb)
        normalized = F.normalize(projected, p=2, dim=1)
        return normalized

In [16]:
contrastive_model = ConstrastiveFinetuneModel()
contrastive_model.to(device)

Some weights of the model checkpoint at allegro/herbert-large-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.sso.sso_relationship.bias', 'cls.sso.sso_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


ConstrastiveFinetuneModel(
  (model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50000, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024

In [23]:
contrastive_learning_metrics = contrastive_loss_training(
    contrastive_model,
    contrastive_learning_dataset,
    epochs=6,
    batch_size=96,
    accumulation_steps=1,
    lr=5e-6,
    log_freq=5,
    temperature=0.1,
)

Batch 5 - Avg Train Loss: 4.5703 
Batch 10 - Avg Train Loss: 4.5543 
Batch 15 - Avg Train Loss: 4.5560 
Batch 20 - Avg Train Loss: 4.5730 
Batch 25 - Avg Train Loss: 4.5590 
Batch 30 - Avg Train Loss: 4.5614 
Batch 35 - Avg Train Loss: 4.5617 
Batch 40 - Avg Train Loss: 4.5646 
Batch 45 - Avg Train Loss: 4.5619 
Batch 50 - Avg Train Loss: 4.5601 
Epoch 1 completed.
Batch 5 - Avg Train Loss: 4.5697 
Batch 10 - Avg Train Loss: 4.5604 
Batch 15 - Avg Train Loss: 4.5653 
Batch 20 - Avg Train Loss: 4.5604 
Batch 25 - Avg Train Loss: 4.5570 
Batch 30 - Avg Train Loss: 4.5515 
Batch 35 - Avg Train Loss: 4.5553 
Batch 40 - Avg Train Loss: 4.5548 
Batch 45 - Avg Train Loss: 4.5455 
Batch 50 - Avg Train Loss: 4.5505 
Epoch 2 completed.
Batch 5 - Avg Train Loss: 4.5592 
Batch 10 - Avg Train Loss: 4.5496 
Batch 15 - Avg Train Loss: 4.5474 
Batch 20 - Avg Train Loss: 4.5399 
Batch 25 - Avg Train Loss: 4.5405 
Batch 30 - Avg Train Loss: 4.5551 
Batch 35 - Avg Train Loss: 4.5438 
Batch 40 - Avg Train

In [24]:
contrastive_learning_metrics.to_pd().to_csv(
    "results/contrastive_model_training_metrics.csv", index=False
)
save_contrastive_model(contrastive_model, tokenizer, path="models")

Model saved to: models


### Finetunning with contrastive

In [21]:
class ContrastiveFinetunedClassifier(nn.Module):
    def __init__(
        self,
        model_path="models",
        num_labels=3,
        dropout=0.3,
    ):
        super().__init__()
        self.model = load_contrastive_model(ConstrastiveFinetuneModel, path=model_path)[
            0
        ]
        hidden_size = 256

        for param in self.model.parameters():
            param.requires_grad = False

        for layer_idx in range(8, 12):
            for param in self.model.model.encoder.layer[layer_idx].parameters():
                param.requires_grad = True

        for param in self.model.projection.parameters():
            param.requires_grad = True

        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(hidden_size, num_labels)
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask):
        cls = self.model(input_ids=input_ids, attention_mask=attention_mask)
        x = self.dropout(cls)
        logits = self.classifier(x)
        return logits

In [23]:
constastive_calsifier = ContrastiveFinetunedClassifier()
constastive_calsifier.to(device)

Some weights of the model checkpoint at allegro/herbert-large-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.sso.sso_relationship.bias', 'cls.sso.sso_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model loaded from: models


ContrastiveFinetunedClassifier(
  (model): ConstrastiveFinetuneModel(
    (model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(50000, 1024, padding_idx=1)
        (position_embeddings): Embedding(514, 1024)
        (token_type_embeddings): Embedding(2, 1024)
        (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-23): 24 x BertLayer(
            (attention): BertAttention(
              (self): BertSdpaSelfAttention(
                (query): Linear(in_features=1024, out_features=1024, bias=True)
                (key): Linear(in_features=1024, out_features=1024, bias=True)
                (value): Linear(in_features=1024, out_features=1024, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_feat

In [24]:
import torch.nn.functional as F

start_time = time.time()

training_metrics = train(
    constastive_calsifier,
    train_datast,
    val_datast,
    epochs=3,
    batch_size=16,
    lr=1e-5,
    log_freq=10,
)

end_time = time.time()
total_training_time = end_time - start_time
print(f"Total training time: {total_training_time:.2f} seconds")

Batch 10 - Avg Train Loss: 1.1160 Validation Loss: 1.1080, Accuracy: 0.2103, F1 Score: 0.0731
Batch 20 - Avg Train Loss: 1.1066 Validation Loss: 1.1032, Accuracy: 0.2122, F1 Score: 0.0801
Batch 30 - Avg Train Loss: 1.0998 Validation Loss: 1.0991, Accuracy: 0.3044, F1 Score: 0.2783
Batch 40 - Avg Train Loss: 1.0970 Validation Loss: 1.0938, Accuracy: 0.4871, F1 Score: 0.4046
Batch 50 - Avg Train Loss: 1.0927 Validation Loss: 1.0883, Accuracy: 0.4779, F1 Score: 0.3345
Batch 60 - Avg Train Loss: 1.0797 Validation Loss: 1.0784, Accuracy: 0.5037, F1 Score: 0.3374
Batch 70 - Avg Train Loss: 1.0862 Validation Loss: 1.0718, Accuracy: 0.5037, F1 Score: 0.3374
Batch 80 - Avg Train Loss: 1.0775 Validation Loss: 1.0660, Accuracy: 0.5037, F1 Score: 0.3374
Batch 90 - Avg Train Loss: 1.0551 Validation Loss: 1.0580, Accuracy: 0.5037, F1 Score: 0.3374
Batch 100 - Avg Train Loss: 1.0496 Validation Loss: 1.0494, Accuracy: 0.5037, F1 Score: 0.3374
Batch 110 - Avg Train Loss: 1.0482 Validation Loss: 1.0426,

In [25]:
training_metrics.to_pd().to_csv(
    "results/finetune_model_contrastive_training_metrics.csv", index=False
)

y_pred_all, avg_batch_time = inference_with_timing(
    constastive_calsifier, test_loader, device=device
)

print(f"**inference_time** for batch of size 16: {avg_batch_time:.4f} seconds")
print(
    f"Test accuracy: {accuracy_score(test_dataset.df['label'], y_pred_all.numpy()):.4f}"
)
print(
    f"Test f1: {f1_score(test_dataset.df['label'], y_pred_all.numpy(), average='weighted'):.4f}"
)

Inference: 100%|██████████████████████████████████████████████████████████████████████████████████████| 34/34 [00:03<00:00, 11.05batch/s]

**inference_time** for batch of size 16: 0.0137 seconds
Test accuracy: 0.6593
Test f1: 0.6556





**summary on contrastive finetunned**

Training time: 366.93s

Average inference_time per batch of size 16: 0.0137 seconds

Test accuracy: 0.6593

Test f1: 0.6556