In [1]:
import os
from dotenv import load_dotenv


load_dotenv()

token = os.getenv("HF_TOKEN")

model_name = "google-bert/bert-base-uncased" #"SamLowe/roberta-base-go_emotions"


from datasets import load_from_disk, Dataset

data = load_from_disk("../../data/google-research-datasets-go_emotions")



from transformers import DataCollatorWithPadding, AutoTokenizer
from torch.utils.data import DataLoader


tokenizer = AutoTokenizer.from_pretrained(model_name)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", max_length=512, truncation=True)


def make_loader(dataset, batch_size: int = 16):
    dataset = dataset.map(tokenize_function, batched=True)
    dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

    return DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=data_collator)

unpack_label = {
    0: "admiration",
    1: "anger",
    2: "annoyance",
    3: "disappointment",
    4: "disapproval",
    5: "disgust",
    6: "excitement",
    7: "gratitude",
    8: "joy",
    9: "optimism",
    10: "sadness",
    11: "neutral"
}

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_data = make_loader(data["train"])
eval_data = make_loader(data["validation"])
test_data = make_loader(data["test"])

In [3]:
import torch


labels = torch.tensor(data["train"]["labels"], dtype=torch.int8)
print(labels)
print(labels.shape)

classes = labels.sum(dim=0)
print(classes)

pos_weight = len(labels) / classes - 1
print(pos_weight)

mn, mx = pos_weight.min(), pos_weight.max()
scale_mn, scale_mx = 1.25, 2
pos_weight = (pos_weight - mn) / (mx - mn) * (scale_mx - scale_mn) + scale_mn
print(pos_weight)

tensor([[0, 0, 0,  ..., 0, 0, 1],
        [0, 0, 0,  ..., 0, 0, 1],
        [0, 0, 0,  ..., 0, 0, 1],
        ...,
        [1, 0, 0,  ..., 0, 0, 0],
        [0, 1, 0,  ..., 0, 0, 0],
        [0, 1, 0,  ..., 0, 0, 0]], dtype=torch.int8)
torch.Size([23597, 12])
tensor([4116, 1564, 2628, 1266, 2018,  791,  843, 2618, 1689, 1575, 1375, 6085])
tensor([ 4.7330, 14.0876,  7.9791, 17.6390, 10.6933, 28.8319, 26.9917,  8.0134,
        12.9710, 13.9822, 16.1615,  2.8779])
tensor([1.3036, 1.5739, 1.3974, 1.6766, 1.4758, 2.0000, 1.9468, 1.3984, 1.5417,
        1.5709, 1.6339, 1.2500])


In [4]:
import json
from collections import defaultdict


def count_matches(preds, labels, matches):
    for pred, gt in zip(preds, labels):
        pred, gt = {i for i, j in enumerate(pred) if j}, {i for i, j in enumerate(gt) if j}
        for lab in gt & pred:
            matches[lab]["tp"] += 1
        for lab in pred - gt:
            matches[lab]["fp"] += 1
        for lab in gt - pred:
            matches[lab]["fn"] += 1


def calc_metrics(tp=0, fp=0, fn=0, tn=0):
    accuracy = (tp + tn) / (tp + fp + tn + fn) if (tp + fp + tn + fn) else 0.0
    precision = tp / (fp + tp) if (fp + tp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) else 0.0

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1-score": f1}

## Оптимизация с дополнительным смещением логитов

In [6]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers.modeling_outputs import SequenceClassifierOutput


def make_forward(model):
    forward_func = model.forward
    loss_func = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)

    def forward(**inputs):
        labels = inputs.pop("labels") if "labels" in inputs else None
        output = forward_func(**inputs)
        logits = output.logits - model.classifier.custom_levels

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)
            loss = loss_func(logits, labels.float())

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=output.hidden_states,
            attentions=output.attentions,
        )

    return forward


def make_predict(model):
    def predict(**inputs):
        logits = model(**inputs).logits
        probs = torch.sigmoid(logits)
        return [torch.where(p > 0.5)[0].tolist() for p in probs]

    return predict


def init_model(model_name, layers_to_finetune: int = 5):
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=len(unpack_label),
        ignore_mismatched_sizes=True,
    )
    model.classifier.custom_levels = torch.nn.Parameter(
        torch.randn(
            size=(1, model.num_labels),
            requires_grad=True,
            dtype=torch.float32,
        )
    )
    model.forward = make_forward(model)
    model.predict = make_predict(model)

    # 1. Замораживаем все слои модели
    for param in model.base_model.parameters():
        param.requires_grad = False

    # 2. Размораживаем последние layers_to_finetune слоёв
    for param in model.base_model.encoder.layer[-layers_to_finetune:].parameters():
        param.requires_grad = True

    # 3. Убеждаемся, что классификатор тоже обучается (если нужно)
    for param in model.classifier.parameters():
        param.requires_grad = True

    return model.to(device="cuda" if torch.cuda.is_available() else "cpu")

model = init_model(model_name)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at SamLowe/roberta-base-go_emotions and are newly initialized because the shapes did not match:
- classifier.out_proj.bias: found shape torch.Size([28]) in the checkpoint and torch.Size([12]) in the model instantiated
- classifier.out_proj.weight: found shape torch.Size([28, 768]) in the checkpoint and torch.Size([12, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
model.load_state_dict(torch.load("../../models/classifier/optimize_custom_levels/model_best/model.pt"))

## Оптимизация с нормализацией логитов

In [5]:
import torch
from torch.nn import BatchNorm1d
from transformers import AutoModelForSequenceClassification
from transformers.modeling_outputs import SequenceClassifierOutput


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def make_forward(model):
    forward_func = model.forward
    loss_func = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight.to(device))

    def forward(**inputs):
        labels = inputs.pop("labels") if "labels" in inputs else None
        output = forward_func(**inputs)
        bn_logits = model.classifier.normalizer(output.logits)

        loss = None
        if labels is not None:
            labels = labels.to(bn_logits.device)
            loss = loss_func(bn_logits, labels.float())

        return SequenceClassifierOutput(
            loss=loss,
            logits=bn_logits,
            hidden_states=output.hidden_states,
            attentions=output.attentions,
        )

    return forward


def make_predict(model):
    def predict(**inputs):
        logits = model(**inputs).logits
        probs = torch.sigmoid(logits)
        return [torch.where(p > 0.5)[0].tolist() for p in probs]

    return predict


def init_model(model_name, layers_to_finetune: int = 5):
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=len(unpack_label),
        ignore_mismatched_sizes=True,
    )
    model.classifier.normalizer = BatchNorm1d(model.num_labels)
    model.forward = make_forward(model)
    model.predict = make_predict(model)

    # 1. Замораживаем все слои модели
    for param in model.base_model.parameters():
        param.requires_grad = False

    # 2. Размораживаем последние layers_to_finetune слоёв
    for param in model.base_model.encoder.layer[-layers_to_finetune:].parameters():
        param.requires_grad = True

    # 3. Убеждаемся, что классификатор тоже обучается (если нужно)
    for param in model.classifier.parameters():
        param.requires_grad = True

    return model.to(device)

model = init_model(model_name)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at SamLowe/roberta-base-go_emotions and are newly initialized because the shapes did not match:
- classifier.out_proj.bias: found shape torch.Size([28]) in the checkpoint and torch.Size([12]) in the model instantiated
- classifier.out_proj.weight: found shape torch.Size([28, 768]) in the checkpoint and torch.Size([12, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# model.load_state_dict(torch.load("../../models/classifier/bn/model_best/model.pt"))

<All keys matched successfully>

In [None]:
import gc

from tqdm import tqdm
from torch.optim import AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau


f1_macro_best = threshold_best = 0.5
epoches = 10
batch_size = 64
optimizer = AdamW(model.parameters(), lr=1.5e-5)

scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.65, patience=0, verbose=True)

export_path = "./models/classifier/bn/"
os.makedirs(export_path, exist_ok=True)
os.makedirs(export_path + "model_best", exist_ok=True)
os.makedirs(export_path + "model_last", exist_ok=True)

thresholds = [round(v, 2) for v in torch.arange(0.5, 0.75, 0.02).tolist()]

metrics = {"train": {}, "eval": {}}
for epoch in range(epoches):
    print(f"\n{epoch=}")
    
    model.train()
    train_losses, train_metrics = [], defaultdict(lambda: defaultdict(int))
    for batch in tqdm(train_data):
        batch.to(model.device)
        optimizer.zero_grad()
        outputs = model(**batch)
        outputs.loss.backward()
        optimizer.step()

        train_losses.append(outputs.loss.item())

        count_matches(
            (torch.sigmoid(outputs.logits) > threshold_best).int().tolist(),
            batch["labels"].tolist(),
            train_metrics,
        )

        torch.cuda.empty_cache()
        gc.collect()

    metrics["train"][epoch] = {unpack_label[lab]: calc_metrics(**train_metrics[lab]) for lab in unpack_label}
    f1_macro = sum(m["f1-score"] for m in metrics["train"][epoch].values()) / len(unpack_label)
    metrics["train"][epoch]["f1_macro"] = f1_macro

    print(f"train loss={sum(train_losses) / len(train_losses):.4f}; f1_macro={f1_macro:.4f}")

    model.eval()
    eval_losses, eval_metrics = [], defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
    with torch.no_grad():
        for batch in tqdm(eval_data):
            batch.to(model.device)
            outputs = model(**batch)

            eval_losses.append(outputs.loss.item())

            for threshold in thresholds:
                count_matches(
                    (torch.sigmoid(outputs.logits) > threshold).int().tolist(),
                    batch["labels"].tolist(),
                    eval_metrics[threshold],
                )

            torch.cuda.empty_cache()
            gc.collect()

    m_vals = {}
    for threshold in thresholds:
        m_vals[threshold] = {
            unpack_label[lab]: calc_metrics(**eval_metrics[threshold][lab]) for lab in unpack_label
        }

        f1_macro = sum(m["f1-score"] for m in m_vals[threshold].values()) / len(unpack_label)

        m_vals[threshold]["f1_macro"] = f1_macro
    
        if f1_macro > f1_macro_best:
            f1_macro_best = f1_macro
            threshold_best = threshold
            torch.save(model.state_dict(), os.path.join(export_path, "model_best", "model.pt"))
    
        print(f"{threshold=}; {f1_macro=:.4f}")
    
    metrics["eval"][epoch] = m_vals

    val_loss = sum(eval_losses) / len(eval_losses)
    scheduler.step(val_loss)
    
    print(f"{val_loss=:.4f}; {f1_macro_best=:.4f}")
    
torch.save(model.state_dict(), os.path.join(export_path, "model_last", "model.pt"))

with open(os.path.join(export_path, "metrics.json"), "w", encoding="utf-8") as f:
    json.dump(metrics, f, indent=4, ensure_ascii=False)

In [None]:
import gc

from tqdm import tqdm
from torch.optim import AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau


metrics = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))

f1_macro_best = 0
threshold_best = 0.5

model.eval()
with torch.no_grad():
    for batch in tqdm(eval_data):
        batch.to(model.device)
        outputs = model(**batch)
        
        for threshold in [0.5, 0.6, 0.65, 0.7, 0.75, 0.8]:
            count_matches(
                (torch.sigmoid(outputs.logits) > threshold).int().tolist(),
                batch["labels"].tolist(),
                metrics[threshold],
            )

        torch.cuda.empty_cache()
        gc.collect()

    for threshold in [0.5, 0.6, 0.65, 0.7, 0.75, 0.8]:
        metrics[threshold] = {unpack_label[lab]: calc_metrics(**metrics[threshold][lab]) for lab in unpack_label}

        f1_macro = sum(m["f1-score"] for m in metrics[threshold].values()) / len(unpack_label)

        metrics[threshold]["f1_macro"] = f1_macro

        print(f"{f1_macro=:.4f}; {f1_macro_best=:.4f}")

        if f1_macro > f1_macro_best:
            f1_macro_best = f1_macro
            threshold_best = threshold
        
        print(f"{f1_macro_best=}; {threshold_best=}")

100%|██████████| 248/248 [03:53<00:00,  1.06it/s]

f1_macro=0.4666; f1_macro_best=0.0000
f1_macro_best=0.46661603613552205; threshold_best=0.5
f1_macro=0.5227; f1_macro_best=0.4666
f1_macro_best=0.5226749234909143; threshold_best=0.6
f1_macro=0.5351; f1_macro_best=0.5227
f1_macro_best=0.5350511492792154; threshold_best=0.65
f1_macro=0.5418; f1_macro_best=0.5351
f1_macro_best=0.5417763695171706; threshold_best=0.7
f1_macro=0.5483; f1_macro_best=0.5418
f1_macro_best=0.5482732668662416; threshold_best=0.75
f1_macro=0.5417; f1_macro_best=0.5483
f1_macro_best=0.5482732668662416; threshold_best=0.75





## Оптимизация уровней активации для каждого класса

In [5]:
import torch
import torch.nn as nn
from transformers import BertModel
from transformers.modeling_outputs import SequenceClassifierOutput

model_name = "google-bert/bert-base-uncased"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class MultilabelFocalLoss:

    def __init__(self, pos_weight=None):
        self.pos_weight = pos_weight


    def __call__(self, logits, targets, alpha=0.25, gamma=2.0, eps=1e-5):
        targets = targets.float()

        if self.pos_weight is not None:
            alphas = ((targets * (self.pos_weight - 1)) + alpha * (1 - targets)) 
        else:
            alphas = alpha
      
        probs = torch.sigmoid(logits)
        pt = probs * targets + (1 - probs) * (1 - targets)
        focal_loss = -alphas * ((1 - pt) ** gamma * torch.log(pt + eps))

        return focal_loss.sum()


class BertForMultiLabelClassification(nn.Module):

    def __init__(
            self, 
            num_labels: int, 
            pos_weight: torch.tensor, 
            device: torch.device
        ):
        super(BertForMultiLabelClassification, self).__init__()

        self.bert = BertModel.from_pretrained("bert-base-uncased").to(device)
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels, device=device)
        self.loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weight.to(device))
        self.threshold_loss_fn = MultilabelFocalLoss(pos_weight=pos_weight.to(device))
        self.threshold_levels = nn.Parameter(torch.zeros(num_labels, device=device), requires_grad=True)
        
        self.device = device


    def forward(self, **inputs) -> SequenceClassifierOutput:
        labels = inputs.pop("labels") if "labels" in inputs else None

        outputs = self.bert(**inputs)
        pooled_output = outputs.pooler_output  # [CLS] токен
        x = self.dropout(pooled_output)
        logits = self.classifier(x)  # логиты
        
        loss = None
        if labels is not None:
            labels = labels.to(self.device)
            loss = self.loss_fn(logits, labels)  # labels.float())

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    
    @property
    def thresholds(self):
        return torch.sigmoid(self.threshold_levels)  # Ограничение порогов (0; 1)


model = BertForMultiLabelClassification(
    num_labels=len(unpack_label), 
    pos_weight=pos_weight, 
    device=device,
)

In [9]:
model.load_state_dict(torch.load("./models/classifier/optimize_levels_on_test/model_epoch/model1.pt"))

<All keys matched successfully>

In [10]:
from tqdm import tqdm
from torch.optim import AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau


f1_macro_best = 0.5
epoches = 10
batch_size = 64
threshold_learn_speed = 5

optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.7, patience=0, verbose=True)

threshold_optimizer = AdamW([model.threshold_levels], lr=2e-4)
threshold_scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.7, patience=0, verbose=True)

export_path = "./models/classifier/optimize_levels_on_test/"
os.makedirs(export_path, exist_ok=True)
os.makedirs(export_path + "model_best", exist_ok=True)
os.makedirs(export_path + "model_last", exist_ok=True)
os.makedirs(export_path + "model_epoch", exist_ok=True)


try:
    metrics = {"train": {}, "eval": {}}
    for epoch in range(2, epoches):
        print(f"\n{epoch=}")
        
        model.train()

        # Оптимизация классификатора
        train_losses, train_metrics = [], defaultdict(lambda: defaultdict(int))
        for batch in tqdm(train_data):
            batch.to(model.device)

            
            optimizer.zero_grad()
            outputs = model(**batch)
            outputs.loss.backward()
            optimizer.step()

            train_losses.append(outputs.loss.item())

            count_matches(
                (torch.sigmoid(outputs.logits) > model.thresholds).int().tolist(),
                batch["labels"].tolist(),
                train_metrics,
            )

            torch.cuda.empty_cache()

        metrics["train"][epoch] = {unpack_label[lab]: calc_metrics(**train_metrics[lab]) for lab in unpack_label}
        f1_macro = sum(m["f1-score"] for m in metrics["train"][epoch].values()) / len(unpack_label)
        metrics["train"][epoch]["f1_macro"] = f1_macro

        print(f"train loss={sum(train_losses) / len(train_losses):.4f}; f1_macro={f1_macro:.4f}")

        if epoch < 2:
            torch.save(model.state_dict(), os.path.join(export_path, "model_epoch", f"model{epoch}.pt"))
        
        # Валидация
        model.eval()
        eval_losses, eval_metrics = [], defaultdict(lambda: defaultdict(int))
        with torch.no_grad():
            for batch in tqdm(eval_data):
                batch.to(model.device)
                outputs = model(**batch)

                eval_losses.append(outputs.loss.item())

                count_matches(
                    (torch.sigmoid(outputs.logits) > model.thresholds).int().tolist(),
                    batch["labels"].tolist(),
                    eval_metrics,
                )

                torch.cuda.empty_cache()
                
        metrics["eval"][epoch] = {
            unpack_label[lab]: calc_metrics(**eval_metrics[lab]) for lab in unpack_label
        }

        f1_macro = sum(m["f1-score"] for m in metrics["eval"][epoch].values()) / len(unpack_label)
        metrics["eval"][epoch]["f1_macro"] = f1_macro
        if f1_macro > f1_macro_best:
            f1_macro_best = f1_macro
            torch.save(model.state_dict(), os.path.join(export_path, "model_best", "model.pt"))

        val_loss = sum(eval_losses) / len(eval_losses)
        scheduler.step(f1_macro)
        
        print(f"{val_loss=:.4f}; {f1_macro=:.4f}; {f1_macro_best=:.4f}")

        if input() != "":
            break

finally:
    torch.save(model.state_dict(), os.path.join(export_path, "model_last", "model.pt"))

    with open(os.path.join(export_path, "metrics.json"), "w", encoding="utf-8") as f:
        json.dump(metrics, f, indent=4, ensure_ascii=False)


epoch=2


  0%|          | 0/1475 [00:00<?, ?it/s]

100%|██████████| 1475/1475 [10:44<00:00,  2.29it/s]


train loss=0.1462; f1_macro=0.7300


100%|██████████| 185/185 [00:22<00:00,  8.37it/s]


val_loss=0.2069; f1_macro=0.6058; f1_macro_best=0.6058

epoch=3


100%|██████████| 1475/1475 [10:44<00:00,  2.29it/s]


train loss=0.1116; f1_macro=0.7995


100%|██████████| 185/185 [00:22<00:00,  8.40it/s]


val_loss=0.2167; f1_macro=0.6188; f1_macro_best=0.6188


In [11]:
try:
    for epoch in range(epoches):        
        print(f"\n{epoch=}")
        
        # Оптимизация трешхолдов
        threshold_losses = []
        for batch in tqdm(train_data):
            batch.to(model.device)
            
            model.eval()
            with torch.no_grad():
                outputs = model(**batch)

            model.train()
            threshold_optimizer.zero_grad()
            model_probs = torch.sigmoid(outputs.logits)
            threshold_logits = (model_probs - model.thresholds) * threshold_learn_speed
            threshold_loss = model.threshold_loss_fn(threshold_logits, batch["labels"])
            threshold_loss.backward()
            threshold_optimizer.step()

            threshold_losses.append(threshold_loss.item())

            torch.cuda.empty_cache()
        
        threshold_scheduler.step(sum(threshold_losses) / len(threshold_losses))
        print(f"threshold loss={sum(threshold_losses) / len(threshold_losses):.4f}")
        print(f"{model.thresholds=}")

        # Валидация
        model.eval()
        eval_losses, eval_metrics = [], defaultdict(lambda: defaultdict(int))
        with torch.no_grad():
            for batch in tqdm(eval_data):
                batch.to(model.device)
                outputs = model(**batch)

                eval_losses.append(outputs.loss.item())

                count_matches(
                    (torch.sigmoid(outputs.logits) > model.thresholds).int().tolist(),
                    batch["labels"].tolist(),
                    eval_metrics,
                )

                torch.cuda.empty_cache()
                
        metrics["eval"][epoch] = {
            unpack_label[lab]: calc_metrics(**eval_metrics[lab]) for lab in unpack_label
        }

        f1_macro = sum(m["f1-score"] for m in metrics["eval"][epoch].values()) / len(unpack_label)
        metrics["eval"][epoch]["f1_macro"] = f1_macro
        if f1_macro > f1_macro_best:
            f1_macro_best = f1_macro
            torch.save(model.state_dict(), os.path.join(export_path, "model_best", "model.pt"))

        val_loss = sum(eval_losses) / len(eval_losses)
        scheduler.step(f1_macro)
        
        print(f"{val_loss=:.4f}; {f1_macro=:.4f}; {f1_macro_best=:.4f}")

        if input() != "":
            break

finally:
    torch.save(model.state_dict(), os.path.join(export_path, "model_last", "model.pt"))

    with open(os.path.join(export_path, "metrics.json"), "w", encoding="utf-8") as f:
        json.dump(metrics, f, indent=4, ensure_ascii=False)


epoch=0


100%|██████████| 1475/1475 [02:57<00:00,  8.31it/s]


threshold loss=1.4135
model.thresholds=tensor([0.5015, 0.4916, 0.4393, 0.4306, 0.4606, 0.4673, 0.4807, 0.4628, 0.4648,
        0.4765, 0.4885, 0.4448], device='cuda:0', grad_fn=<SigmoidBackward0>)


100%|██████████| 185/185 [00:22<00:00,  8.35it/s]


val_loss=0.2170; f1_macro=0.6214; f1_macro_best=0.6214

epoch=1


100%|██████████| 1475/1475 [02:57<00:00,  8.31it/s]


threshold loss=1.3756
model.thresholds=tensor([0.5076, 0.4982, 0.4176, 0.4104, 0.4524, 0.4599, 0.4742, 0.4488, 0.4620,
        0.4854, 0.4991, 0.4200], device='cuda:0', grad_fn=<SigmoidBackward0>)


100%|██████████| 185/185 [00:22<00:00,  8.39it/s]


val_loss=0.2170; f1_macro=0.6223; f1_macro_best=0.6223

epoch=2


100%|██████████| 1475/1475 [02:57<00:00,  8.31it/s]


threshold loss=1.3493
model.thresholds=tensor([0.5127, 0.5039, 0.4008, 0.3931, 0.4453, 0.4524, 0.4683, 0.4362, 0.4597,
        0.4936, 0.5085, 0.3990], device='cuda:0', grad_fn=<SigmoidBackward0>)


100%|██████████| 185/185 [00:22<00:00,  8.38it/s]


val_loss=0.2168; f1_macro=0.6238; f1_macro_best=0.6238

epoch=3


100%|██████████| 1475/1475 [02:57<00:00,  8.31it/s]


threshold loss=1.3318
model.thresholds=tensor([0.5169, 0.5087, 0.3887, 0.3789, 0.4399, 0.4460, 0.4631, 0.4260, 0.4579,
        0.5000, 0.5171, 0.3828], device='cuda:0', grad_fn=<SigmoidBackward0>)


100%|██████████| 185/185 [00:22<00:00,  8.37it/s]


val_loss=0.2168; f1_macro=0.6243; f1_macro_best=0.6243

epoch=4


100%|██████████| 1475/1475 [02:57<00:00,  8.31it/s]


threshold loss=1.3204
model.thresholds=tensor([0.5205, 0.5126, 0.3795, 0.3673, 0.4348, 0.4404, 0.4587, 0.4179, 0.4562,
        0.5058, 0.5244, 0.3698], device='cuda:0', grad_fn=<SigmoidBackward0>)


100%|██████████| 185/185 [00:22<00:00,  8.38it/s]


val_loss=0.2166; f1_macro=0.6246; f1_macro_best=0.6246

epoch=5


  3%|▎         | 38/1475 [00:04<02:56,  8.16it/s]


KeyboardInterrupt: 

## ...

In [5]:
import torch
import torch.nn as nn
from transformers import BertModel
from transformers.modeling_outputs import SequenceClassifierOutput

model_name = "google-bert/bert-base-uncased"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class MultilabelFocalLoss(nn.Module):

    def __init__(self, pos_weight=None):
        super().__init__()
        self.pos_weight = pos_weight


    def forward(self, logits, targets, alpha=0.5, gamma=2.0, eps=1e-5):
        targets = targets.float()

        if self.pos_weight is not None:
            alphas = ((targets * (self.pos_weight - 1)) + alpha * (1 - targets)) 
        else:
            alphas = alpha
      
        probs = torch.sigmoid(logits)
        pt = probs * targets + (1 - probs) * (1 - targets)
        focal_loss = -alphas * ((1 - pt) ** gamma * torch.log(pt + eps))

        return focal_loss.mean()


class BertForMultiLabelClassification(nn.Module):

    def __init__(
            self, 
            num_labels: int, 
            pos_weight: torch.Tensor, 
            device: torch.device
        ):
        super(BertForMultiLabelClassification, self).__init__()

        self.bert = BertModel.from_pretrained("bert-base-uncased").to(device)
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels, device=device)
        self.loss_fn = MultilabelFocalLoss(pos_weight=pos_weight.to(device))
        self.threshold_levels = nn.Parameter(torch.zeros(num_labels, device=device), requires_grad=True)
        
        self.device = device


    def forward(self, sharphess_coef: float = 5., **inputs) -> SequenceClassifierOutput:
        labels = inputs.pop("labels") if "labels" in inputs else None

        outputs = self.bert(**inputs)
        pooled_output = outputs.pooler_output  # [CLS] токен
        x = self.dropout(pooled_output)
        logits = self.classifier(x)  # логиты
        cut_logits = (torch.sigmoid(logits) - self.thresholds) * sharphess_coef
        
        loss = None
        if labels is not None:
            labels = labels.to(self.device)
            loss = self.loss_fn(cut_logits, labels)  # labels.float())

        return SequenceClassifierOutput(
            loss=loss,
            logits=cut_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    
    @property
    def thresholds(self):
        return torch.sigmoid(self.threshold_levels)  # Ограничение порогов (0; 1)


model = BertForMultiLabelClassification(
    num_labels=len(unpack_label), 
    pos_weight=pos_weight, 
    device=device,
)

In [6]:
from tqdm import tqdm
from torch.optim import AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau


f1_macro_best = 0.0
epoches = 10
batch_size = 64

optimizer = AdamW(model.parameters(), lr=3e-5, weight_decay=0.01)
scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.2, patience=0, verbose=True)

export_path = "./models/classifier/optimize_thresholds/"
os.makedirs(export_path, exist_ok=True)
os.makedirs(export_path + "model_best", exist_ok=True)
os.makedirs(export_path + "model_last", exist_ok=True)
os.makedirs(export_path + "model_epoch", exist_ok=True)


try:
    metrics = {"train": {}, "eval": {}}
    for epoch in range(epoches):
        print(f"\n{epoch=}")
        
        model.train()

        # Оптимизация классификатора
        train_losses, train_metrics = [], defaultdict(lambda: defaultdict(int))
        for batch in tqdm(train_data):
            batch.to(model.device)

            
            optimizer.zero_grad()
            outputs = model(**batch)
            outputs.loss.backward()
            optimizer.step()

            train_losses.append(outputs.loss.item())

            count_matches(
                (torch.sigmoid(outputs.logits) > 0.5).int().tolist(),
                batch["labels"].tolist(),
                train_metrics,
            )

            torch.cuda.empty_cache()

        metrics["train"][epoch] = {unpack_label[lab]: calc_metrics(**train_metrics[lab]) for lab in unpack_label}
        f1_macro = sum(m["f1-score"] for m in metrics["train"][epoch].values()) / len(unpack_label)
        metrics["train"][epoch]["f1_macro"] = f1_macro

        print(f"train loss={sum(train_losses) / len(train_losses):.4f}; f1_macro={f1_macro:.4f}")
        print(f"{model.thresholds=}")

        if epoch < 2:
            torch.save(model.state_dict(), os.path.join(export_path, "model_epoch", f"model{epoch}.pt"))
        
        # Валидация
        model.eval()
        eval_losses, eval_metrics = [], defaultdict(lambda: defaultdict(int))
        with torch.no_grad():
            for batch in tqdm(eval_data):
                batch.to(model.device)
                outputs = model(**batch)

                eval_losses.append(outputs.loss.item())

                count_matches(
                    (torch.sigmoid(outputs.logits) > 0.5).int().tolist(),
                    batch["labels"].tolist(),
                    eval_metrics,
                )

                torch.cuda.empty_cache()
                
        metrics["eval"][epoch] = {
            unpack_label[lab]: calc_metrics(**eval_metrics[lab]) for lab in unpack_label
        }

        f1_macro = sum(m["f1-score"] for m in metrics["eval"][epoch].values()) / len(unpack_label)
        metrics["eval"][epoch]["f1_macro"] = f1_macro
        if f1_macro > f1_macro_best:
            f1_macro_best = f1_macro
            torch.save(model.state_dict(), os.path.join(export_path, "model_best", "model.pt"))

        val_loss = sum(eval_losses) / len(eval_losses)
        scheduler.step(f1_macro)
        
        print(f"{val_loss=:.4f}; {f1_macro=:.4f}; {f1_macro_best=:.4f}")

        if input() != "":
            break


finally:
    torch.save(model.state_dict(), os.path.join(export_path, "model_last", "model.pt"))

    with open(os.path.join(export_path, "metrics.json"), "w", encoding="utf-8") as f:
        json.dump(metrics, f, indent=4, ensure_ascii=False)


epoch=0


  0%|          | 0/1475 [00:00<?, ?it/s]

100%|██████████| 1475/1475 [10:42<00:00,  2.30it/s]


train loss=0.0254; f1_macro=0.4526
model.thresholds=tensor([0.5009, 0.5007, 0.5005, 0.5005, 0.5006, 0.5007, 0.5007, 0.5012, 0.5008,
        0.5005, 0.5006, 0.5004], device='cuda:0', grad_fn=<SigmoidBackward0>)


100%|██████████| 185/185 [00:22<00:00,  8.32it/s]


val_loss=0.0199; f1_macro=0.5559; f1_macro_best=0.5559

epoch=1


100%|██████████| 1475/1475 [10:42<00:00,  2.30it/s]


train loss=0.0184; f1_macro=0.6279
model.thresholds=tensor([0.5017, 0.5013, 0.5007, 0.5009, 0.5010, 0.5012, 0.5012, 0.5022, 0.5014,
        0.5010, 0.5014, 0.5006], device='cuda:0', grad_fn=<SigmoidBackward0>)


100%|██████████| 185/185 [00:22<00:00,  8.37it/s]


val_loss=0.0200; f1_macro=0.5906; f1_macro_best=0.5906
