1. Modele baseline trenowane na master test set - (4 modele x 4 datasety = 16 ewaluacji)
2. 20 LLM-ow finetunowanych na master_large (?) - (20 modelow x 1 dataset = 20 ewaluacji)
3. 20 LLM-ow finetunowanych do detekcji samego siebie - (20 modelow x 1 dataset = 20 ewaluacji), tutaj potrzebna agregacja
4. 20 LLM-ow finetunowanych do detekcji rodziny - (20 modelow x 1 dataset = 20 ewaluacji), tutaj potrzebna agregacja

In [None]:
import os
from typing import Tuple, List, Dict
import pandas as pd
import torch
import torch.nn as nn
import os
from typing import Dict, List, Union

import torch
from sklearn.metrics import (accuracy_score, balanced_accuracy_score, f1_score,
                             precision_score, recall_score, roc_auc_score)
from torch.nn import BCEWithLogitsLoss
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer

In [11]:
class TextDataset(Dataset):
    """Map-style dataset that pairs each raw text with its label.

    No tokenization happens here: items are returned as plain Python values.
    """

    def __init__(self, texts: List[str], labels: List[int]) -> None:
        """
        texts: list of texts.
        labels: list of labels, one per text and in the same order.

        Raises:
            ValueError: if ``texts`` and ``labels`` have different lengths.
        """
        # Fail fast: a length mismatch would otherwise surface as an IndexError
        # in the middle of iteration, or go unnoticed when labels is longer.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}."
            )
        self.texts = texts
        self.labels = labels

    def __len__(self) -> int:
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx: int) -> Dict[str, Union[str, int]]:
        """Return the sample at ``idx`` as ``{"text": ..., "label": ...}``."""
        text = self.texts[idx]
        label = self.labels[idx]

        return {"text": text, "label": label}


def get_csv_paths(folder_path: str, recursive: bool = False) -> List[str]:
    """Return the paths of the ``.csv`` files found in ``folder_path``.

    folder_path: directory to scan.
    recursive: when True, also scan every subdirectory.

    Returns the matching paths (each prefixed with ``folder_path``) in sorted
    order, so the result does not depend on the filesystem's listing order.

    Raises:
        OSError: in non-recursive mode, if ``folder_path`` cannot be listed.
    """
    if recursive:
        # os.walk already separates files from directories.
        file_paths = [
            os.path.join(root, file)
            for root, _, files in os.walk(folder_path)
            for file in files
            if file.endswith(".csv")
        ]
    else:
        # os.listdir returns directories too; keep regular files only so both
        # modes agree on what counts as a CSV file.
        file_paths = [
            os.path.join(folder_path, file)
            for file in os.listdir(folder_path)
            if file.endswith(".csv")
            and os.path.isfile(os.path.join(folder_path, file))
        ]

    return sorted(file_paths)


def collate_fn(
    batch: List[Dict[str, torch.tensor]], tokenizer: AutoTokenizer
) -> Dict[str, torch.tensor]:
    """Tokenize a batch of samples and attach one label per token position.

    batch: samples carrying a "text" and a "label" entry.
    tokenizer: callable used to encode the texts; it is invoked with
        truncation enabled, padding to the longest sample and PyTorch
        tensors as output.

    Returns the tokenizer output with an extra "labels" entry: the per-sample
    label rows concatenated into one 1-D tensor, where positions whose
    attention mask is 0 hold -100 and all other positions hold the sample's
    label.
    """
    sample_texts = [sample["text"] for sample in batch]
    sample_labels = [sample["label"] for sample in batch]

    encodings = tokenizer(
        sample_texts, truncation=True, padding="longest", return_tensors="pt"
    )

    # -100 marks positions that must be ignored downstream (padding).
    ignore_value = torch.tensor(-100)
    label_rows = []
    for mask_row, sample_label in zip(encodings["attention_mask"], sample_labels):
        label_rows.append(
            torch.where(mask_row == 0, ignore_value, torch.tensor(sample_label))
        )

    # Rows are concatenated (not stacked), so the result is flat.
    encodings["labels"] = torch.cat(label_rows)
    return encodings


def evaluate(
    model: torch.nn.Module, dataloader: DataLoader, device: str, type: str
) -> Dict[str, float]:
    """Run ``model`` over ``dataloader`` and compute binary classification metrics.

    model: classifier producing one logit per token position. It is not moved
        here, so it must already be on ``device``.
    dataloader: yields batches with "input_ids", "labels" and, for the
        "finetune" type, "attention_mask".
    device: device the batch tensors are moved to.
    type: "baseline" (model called with input ids only) or "finetune"
        (model called with input ids and attention mask).

    Returns a dict with "loss" (mean BCE-with-logits loss per batch),
    "accuracy", "balanced_accuracy", "precision", "recall", "f1" (all at a
    0.5 probability threshold) and "auc". Positions labelled -100 are
    excluded from both the loss and the metrics.

    Raises:
        ValueError: if ``type`` is neither "baseline" nor "finetune".
    """
    # Validate once up front instead of on the first batch.
    if type not in ("baseline", "finetune"):
        raise ValueError("Wrong training type, should be 'baseline' or 'finetune'.")

    model.eval()
    preds, targets = [], []
    total_loss = 0.0
    loss_fn = BCEWithLogitsLoss()

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            labels = batch["labels"].to(device)

            if type == "baseline":
                outputs = model(input_ids)
            else:
                attention_mask = batch["attention_mask"].to(device)
                outputs = model(input_ids, attention_mask)

            # Flatten to one entry per token and drop ignored positions.
            # reshape (unlike view) also accepts non-contiguous tensors.
            keep = labels.reshape(-1) != -100
            labels = labels.reshape(-1)[keep].float()
            outputs = outputs.reshape(-1)[keep]

            loss = loss_fn(outputs, labels)
            total_loss += loss.item()

            # Both tensors are already 1-D. Squeezing them would produce a 0-d
            # value when exactly one position is kept, which cannot be
            # extended into a list.
            preds.extend(torch.sigmoid(outputs).cpu().tolist())
            targets.extend(labels.cpu().tolist())

    bin_preds = [1 if p >= 0.5 else 0 for p in preds]

    metrics = {
        "loss": total_loss / len(dataloader),
        "accuracy": accuracy_score(targets, bin_preds),
        "balanced_accuracy": balanced_accuracy_score(targets, bin_preds),
        "precision": precision_score(targets, bin_preds),
        "recall": recall_score(targets, bin_preds),
        "f1": f1_score(targets, bin_preds),
        "auc": roc_auc_score(targets, preds),
    }

    return metrics

import torch
import torch.nn as nn
from transformers import AutoModel


class FineTuneClassifier(nn.Module):
    """Frozen pretrained backbone with a trainable per-token linear head.

    Each token's hidden state is concatenated with the hidden state at the
    last sequence position and projected to ``num_labels`` logits.
    """

    def __init__(self, base_model_path: str, num_labels: int) -> None:
        """
        base_model_path: name or path passed to ``AutoModel.from_pretrained``.
        num_labels: number of logits produced for every token position.
        """
        super().__init__()
        self.base_model = AutoModel.from_pretrained(base_model_path)

        # The backbone stays frozen; only the classifier head has gradients.
        for param in self.base_model.parameters():
            param.requires_grad = False

        # The head sees [token_hidden ; last_position_hidden] -> 2 * hidden_size.
        self.classifier = nn.Linear(self.base_model.config.hidden_size * 2, num_labels)

    @classmethod
    def from_classifier_head(
        cls, base_model_path: str, path: str, num_labels: int
    ) -> nn.Module:
        """Build the model and load saved classifier-head weights from ``path``.

        base_model_path: name or path of the pretrained backbone.
        path: file holding the state dict of the linear head only.
        num_labels: number of logits produced for every token position.
        """
        model = cls(base_model_path, num_labels)
        # map_location lets a head saved from a GPU be loaded on a CPU-only
        # machine; weights_only avoids unpickling arbitrary objects.
        state_dict = torch.load(path, map_location="cpu", weights_only=True)
        model.classifier.load_state_dict(state_dict)
        return model

    def forward(
        self, input_ids: torch.tensor, attention_mask: torch.tensor
    ) -> torch.tensor:
        """Return per-token logits of shape (B, T, num_labels)."""
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)

        # AutoModel loads the bare backbone, whose output carries hidden states
        # (`last_hidden_state`), not LM logits; the head is sized accordingly.
        all_tokens_hidden = outputs.last_hidden_state  # (B, T, C)
        B, T, C = all_tokens_hidden.shape

        # NOTE(review): with right-padded batches, position -1 is a padding
        # position for the shorter sequences; confirm this matches training.
        last_token_hidden = all_tokens_hidden[:, -1, :]  # (B, C)
        last_token_hidden = last_token_hidden.unsqueeze(1).expand(B, T, C)

        combined_representation = torch.cat(
            (all_tokens_hidden, last_token_hidden), dim=-1
        )
        logits = self.classifier(combined_representation)
        return logits


class BaselineClassifier(nn.Module):
    """Small causal transformer trained from scratch for per-token classification.

    Token and learned position embeddings feed a stack of transformer encoder
    layers run with a causal mask. Each position's output is concatenated with
    the output at the last position and projected to ``num_labels`` logits.
    """

    def __init__(
        self,
        d_model: int,
        num_layers: int,
        nhead: int,
        max_seq_length: int,
        vocab_size: int,
        pad_token_id: int,
        num_labels: int,
    ) -> None:
        """
        d_model: embedding / hidden size.
        num_layers: number of transformer encoder layers.
        nhead: number of attention heads per layer.
        max_seq_length: number of learned position embeddings, i.e. the
            longest sequence ``forward`` accepts.
        vocab_size: size of the token embedding table.
        pad_token_id: token id treated as padding.
        num_labels: number of logits produced for every token position.
        """
        super().__init__()
        self.pad_token_id = pad_token_id
        self.token_embedding = nn.Embedding(
            vocab_size, d_model, padding_idx=pad_token_id
        )
        self.pos_embedding = nn.Embedding(max_seq_length, d_model)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=nhead, batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        # The head sees [token_output ; last_position_output] -> 2 * d_model.
        self.classifier = nn.Linear(d_model * 2, num_labels)

    def forward(self, token_ids: torch.tensor) -> torch.tensor:
        """Return per-token logits of shape (B, T, num_labels).

        token_ids: integer tensor of shape (batch_size, seq_len).

        Raises:
            ValueError: if ``seq_len`` exceeds the number of position embeddings.
        """
        _, seq_len = token_ids.shape

        # Without this check, an over-long batch fails inside the embedding
        # lookup with an opaque index error.
        max_seq_length = self.pos_embedding.num_embeddings
        if seq_len > max_seq_length:
            raise ValueError(
                f"Sequence length {seq_len} exceeds max_seq_length {max_seq_length}."
            )

        token_emb = self.token_embedding(token_ids)
        pos_ids = torch.arange(seq_len, device=token_ids.device).unsqueeze(0)
        pos_emb = self.pos_embedding(pos_ids)
        embeddings = token_emb + pos_emb

        # True above the diagonal: a position may not attend to later positions.
        causal_mask = torch.triu(
            torch.ones(seq_len, seq_len, device=token_ids.device, dtype=torch.bool),
            diagonal=1,
        )

        pad_mask = token_ids.eq(self.pad_token_id)  # shape: (batch_size, seq_len)

        output = self.transformer(
            embeddings, mask=causal_mask, src_key_padding_mask=pad_mask
        )

        B, T, C = output.shape
        all_tokens_hidden = output  # (B, T, C)
        last_token_hidden = output[:, -1, :]  # (B, C)
        last_token_hidden = last_token_hidden.unsqueeze(1).expand(B, T, C)

        combined_representation = torch.cat(
            (all_tokens_hidden, last_token_hidden), dim=-1
        )
        logits = self.classifier(combined_representation)
        return logits
# Hyperparameters of the baseline transformer, keyed by size name.
# Note that the keys "num_heads" and "max_len" are spelled differently from the
# BaselineClassifier constructor parameters `nhead` and `max_seq_length`.
BASELINE_MODELS: Dict[str, Dict[str, int]] = {
    size: {
        "d_model": d_model,
        "num_layers": num_layers,
        "num_heads": num_heads,
        "max_len": 512,  # identical for every size
    }
    for size, d_model, num_layers, num_heads in (
        ("mini", 512, 6, 8),
        ("small", 768, 12, 12),
        ("medium", 1024, 24, 16),
        ("large", 1536, 36, 24),
    )
}

In [None]:
# Relative locations of the trained checkpoints and the evaluation datasets.
CHECKPOINTS_PATH: str = "../../../checkpoints/"
DATASETS_PATH: str = "../../../data/datasets/"
BATCH_SIZE: int = 32

# NOTE(review): get_csv_paths only returns files ending in ".csv"; confirm the
# checkpoint files really use that extension.
baseline_checkpoints = get_csv_paths(f"{CHECKPOINTS_PATH}baseline/")
finetune_checkpoints = get_csv_paths(f"{CHECKPOINTS_PATH}finetune/")

In [None]:
# Load the test split and wrap its "text" / "labels" columns in a TextDataset.
df_test = pd.read_csv(f"{DATASETS_PATH}master_testset/test.csv")
test_dataset = TextDataset(
    texts=df_test["text"].tolist(),
    labels=df_test["labels"].tolist(),
)

In [None]:
# Column-oriented accumulator: every key starts with its own empty list.
results = {
    column: []
    for column in (
        "model_name",
        "train_ds_name",
        "training_type",
        "loss",
        "accuracy",
        "balanced_accuracy",
        "precision",
        "recall",
        "f1",
        "auc",
    )
}

In [None]:
# The tokenizer and the test loader are the same for every checkpoint, so they
# are built once instead of once per loop iteration.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
tokenizer.pad_token = "<|finetune_right_pad_id|>"
test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=lambda batch: collate_fn(batch, tokenizer),
)
# NOTE(review): collate_fn truncates to the tokenizer's own maximum length,
# while the baseline configs allow max_len positions; confirm no test text
# tokenizes to more than that.
device = "cuda" if torch.cuda.is_available() else "cpu"

for path in baseline_checkpoints:
    # Assumes file names look like "<prefix>_<size>_<train-ds>..." (TODO
    # confirm). Parse the file name only, so underscores in directory names
    # cannot shift the fields.
    name_parts = os.path.basename(path).split("_")
    model_size = name_parts[1]
    train_ds_name = name_parts[2]
    model_config = BASELINE_MODELS[model_size]

    # The config keys differ from the constructor's parameter names, so map
    # them explicitly.
    # NOTE(review): vocab_size and pad_token_id are taken from the tokenizer;
    # they must match the values used when the checkpoint was trained.
    model = BaselineClassifier(
        d_model=model_config["d_model"],
        num_layers=model_config["num_layers"],
        nhead=model_config["num_heads"],
        max_seq_length=model_config["max_len"],
        vocab_size=len(tokenizer),
        pad_token_id=tokenizer.pad_token_id,
        num_labels=1,
    )
    model.load_state_dict(torch.load(path, map_location=device, weights_only=True))
    # evaluate() moves the batches to `device`, so the model must be there too.
    model.to(device)
    metrics = evaluate(model, test_loader, device, "baseline")

    # save metrics