In [8]:
import pandas as pd
import numpy as np
import torch 
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from tokenizers import Tokenizer

import pytorch_lightning as pl

from IPython.display import display
from typing import List, Dict, Any

In [54]:
class DataFrameDataset(Dataset):
    """Map-style dataset that tokenizes one text column of a DataFrame up front.

    Every row is tokenized once in ``__init__`` and the resulting tensors are
    kept on the instance, so ``__getitem__`` is a plain tensor index.

    Args:
        tokenizer: Callable invoked as ``tokenizer(texts, padding=...,
            max_length=..., truncation=True, return_tensors="pt")``. Its result
            must be indexable with ``"input_ids"`` and ``"attention_mask"``
            (e.g. the object returned by ``AutoTokenizer.from_pretrained``).
            Annotated as ``Any`` because it is used as a callable.
        df: Frame holding the text column and all label columns.
        label_columns: Names of the target columns; their values are cast to
            ``float32``.
        max_length: Forwarded to the tokenizer as ``max_length``.
        padding: Forwarded to the tokenizer as ``padding``.
        text_column: Name of the column holding the raw text. Defaults to
            ``"comments"``, the column this dataset originally hard-coded.
    """

    def __init__(self,
                 tokenizer: Any,
                 df: pd.DataFrame,
                 label_columns: List[str],
                 max_length: int = 256,
                 padding: str = "max_length",
                 text_column: str = "comments") -> None:
        super().__init__()
        texts = df[text_column].to_list()
        inputs = tokenizer(texts, padding=padding, max_length=max_length, truncation=True, return_tensors="pt")
        self.input_ids = inputs["input_ids"]
        self.attention_masks = inputs["attention_mask"]
        self.label_columns = label_columns
        # Targets are stored as float32, one column per entry of label_columns.
        self.labels = torch.from_numpy(df[label_columns].values.astype(np.float32))

    def __len__(self) -> int:
        """Return the number of tokenized rows."""
        return self.input_ids.shape[0]

    def __getitem__(self, index: Any) -> tuple:
        """Return ``(input_ids, attention_mask, labels)`` for ``index``.

        The result is a tuple (not a dict), in the order the training and
        validation steps unpack it.
        """
        return self.input_ids[index], self.attention_masks[index], self.labels[index]

    def dataloader(self, **kwargs) -> DataLoader:
        """Wrap this dataset in a ``DataLoader``, forwarding ``kwargs`` unchanged."""
        return DataLoader(self, **kwargs)

In [11]:
# Hugging Face checkpoint name; the same name is later passed to the classification module.
huggingface_model_name = "beomi/kcbert-base"
# Load the tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(huggingface_model_name)

In [55]:
# Tokenization and batching settings shared by the training and validation loaders.
max_length = 64
batch_size = 16
# Columns of the CSV files that are used as targets.
label_columns = ["hate_hate", "hate_none", "hate_offensive"]

# Training loader: shuffle=True reshuffles the samples every epoch so batches
# are not always drawn in the order the rows appear in the CSV file.
train_dl = DataFrameDataset(
    tokenizer,
    pd.read_csv("data/khs_train.csv"),
    label_columns=label_columns,
    max_length=max_length,
).dataloader(batch_size=batch_size, shuffle=True)

# Validation loader: order is kept fixed (no shuffling).
val_dl = DataFrameDataset(
    tokenizer,
    pd.read_csv("data/khs_dev.csv"),
    label_columns=label_columns,
    max_length=max_length,
).dataloader(batch_size=batch_size)

                                                                   

In [105]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchmetrics.functional as tm
from sklearn.metrics import multilabel_confusion_matrix


def join_step_outputs(outputs):
    """Merge a list of per-step output dicts into one dict of tensors.

    The key set is taken from the first element of ``outputs``. For each key
    the per-step tensors are combined based on the first step's tensor:
    zero-dim tensors are stacked into a 1-D tensor, anything else is
    concatenated along dim 0.
    """
    def _merge(tensors):
        # Scalars cannot be concatenated, so they are stacked instead.
        if tensors[0].dim() == 0:
            return torch.stack(tensors)
        return torch.cat(tensors, dim=0)

    return {
        key: _merge([step[key] for step in outputs])
        for key in outputs[0].keys()
    }

# https://stackoverflow.com/questions/61524717/pytorch-how-to-find-accuracy-for-multi-label-classification
def multilabel_accuracy(logits: torch.Tensor, labels: torch.Tensor, threshold: float = 0.5) -> torch.Tensor:
    """Compute accuracy separately for every label column.

    Args:
        logits: 2-D tensor of per-sample, per-label scores. The threshold is
            applied to these values directly; no sigmoid is applied here, so
            pass probabilities if ``threshold`` is meant as a probability
            cut-off.
        labels: Tensor of targets compared element-wise against the
            thresholded scores.
        threshold: Scores ``>= threshold`` count as a positive prediction.

    Returns:
        1-D tensor with one accuracy value per column of ``logits``.
    """
    # Unpacking into two names also rejects inputs that are not 2-D.
    num_samples, _ = logits.shape
    predictions = (logits >= threshold).int()
    return (predictions == labels).sum(dim=0) / num_samples



class TextClassificationModule(pl.LightningModule):
    """LightningModule around a Hugging Face sequence-classification model.

    ``forward`` returns sigmoid-activated outputs, which are scored with
    ``nn.BCELoss`` against the batch labels. At the end of every training and
    validation epoch the mean loss and one accuracy per label are logged.

    Args:
        huggingface_model_name: Checkpoint name passed to
            ``AutoModelForSequenceClassification.from_pretrained``.
        num_labels: Number of output units requested from the model.
        labels: Label names, used only to build the logged metric names.
        lr: Learning rate for the Adam optimizer.
    """

    def __init__(self, huggingface_model_name, num_labels, labels, lr=5e-4):
        super().__init__()
        # Makes the constructor arguments available as self.hparams (lr is read from there).
        self.save_hyperparameters()
        self.model = AutoModelForSequenceClassification.from_pretrained(huggingface_model_name, num_labels=num_labels)
        # BCELoss expects probabilities, which is why forward() applies a sigmoid.
        self.loss_fn = nn.BCELoss()
        self.labels = labels

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters with the configured learning rate."""
        return optim.Adam(self.parameters(), lr=self.hparams.lr)

    def forward(self, input_ids, attention_mask=None):
        """Run the wrapped model and return the sigmoid of its logits."""
        raw_logits = self.model(input_ids, attention_mask=attention_mask).logits
        return raw_logits.sigmoid()

    def _shared_step(self, batch):
        """Compute loss and predictions for one batch (used by train and val steps)."""
        ids, masks, labels = batch
        probs = self(ids, masks)
        loss = self.loss_fn(probs, labels)
        # The predictions are only collected for epoch-end metrics. Detach them
        # so the per-step outputs gathered over an epoch do not keep every
        # batch's autograd graph alive. "loss" stays attached for backprop.
        # The key is still called "logits" although the values are sigmoid outputs.
        return {"loss": loss, "logits": probs.detach(), "labels": labels}

    def _shared_epoch_end(self, outputs, stage, loss_on_prog_bar):
        """Log the mean loss and per-label accuracy for one finished epoch."""
        outputs = join_step_outputs(outputs)
        self.log(f"{stage}_epoch_loss", outputs["loss"].mean(), prog_bar=loss_on_prog_bar)

        acc = multilabel_accuracy(outputs["logits"], outputs["labels"])
        for label, label_acc in zip(self.labels, acc):
            self.log(f"{stage}_{label}_acc", label_acc, prog_bar=True)

    def training_step(self, batch, batch_idx):
        """Return the loss, detached predictions and labels for a training batch."""
        return self._shared_step(batch)

    def training_epoch_end(self, outputs):
        """Log ``train_epoch_loss`` and ``train_<label>_acc`` for the epoch."""
        self._shared_epoch_end(outputs, "train", loss_on_prog_bar=False)

    def validation_step(self, batch, batch_idx):
        """Return the loss, predictions and labels for a validation batch."""
        return self._shared_step(batch)

    def validation_epoch_end(self, outputs):
        """Log ``val_epoch_loss`` and ``val_<label>_acc`` for the epoch."""
        self._shared_epoch_end(outputs, "val", loss_on_prog_bar=True)



In [106]:
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
            
# save_dir = "./ckpt/"
# checkpoint_callback = ModelCheckpoint(
#     monitor="val_epoch_loss",
#     dirpath=save_dir,
#     filename=f"{config.name}" + "-{epoch:02d}-{val_epoch_loss:.4f}",
#     mode="min",
# )

# Build the module with one output unit per label column.
module = TextClassificationModule(
    huggingface_model_name=huggingface_model_name,
    num_labels=len(label_columns),
    labels=label_columns,
    lr=5e-4,
)

# Early stopping watches the validation loss logged as "val_epoch_loss".
callbacks = [EarlyStopping(monitor="val_epoch_loss", mode="min")]

# gpus is 1 when CUDA is available and 0 otherwise.
trainer = pl.Trainer(
    max_epochs=1000,
    gpus=int(torch.cuda.is_available()),
    callbacks=callbacks,
)
trainer.fit(module, train_dl, val_dl)

Some weights of the model checkpoint at beomi/kcbert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initiali

                                                                           

  rank_zero_warn(


Epoch 0: 100%|██████████| 524/524 [23:29<00:00,  2.69s/it, loss=0.608, v_num=18] val_epoch_loss=0.747, val_hate_hate_acc=0.741, val_hate_none_acc=0.340, val_hate_offensive_acc=0.599, train_hate_hate_acc=0.757, train_hate_none_acc=0.540, train_hate_offensive_acc=0.677]  
Epoch 0:  11%|█         | 57/524 [21:44<2:58:07, 22.89s/it, loss=0.632, v_num=19]
Epoch 0: 100%|██████████| 49/49 [20:51<00:00, 25.55s/it, loss=0.652, v_num=21]
Epoch 0: 100%|██████████| 4/4 [21:30<00:00, 322.57s/it, loss=0.737, v_num=20]
Epoch 0: 100%|██████████| 49/49 [18:45<00:00, 22.97s/it, loss=0.645, v_num=22]
Epoch 0: 100%|██████████| 49/49 [16:58<00:00, 20.78s/it, loss=0.641, v_num=23]
Epoch 0: 100%|██████████| 49/49 [16:00<00:00, 19.61s/it, loss=0.646, v_num=24]
Epoch 0: 100%|██████████| 49/49 [12:59<00:00, 15.91s/it, loss=0.647, v_num=25]
Epoch 0: 100%|██████████| 49/49 [12:27<00:00, 15.25s/it, loss=0.642, v_num=26]
Epoch 0: 100%|██████████| 49/49 [08:14<00:00, 10.09s/it, loss=0.648, v_num=28]
Epoch 0: 100%|██