In [10]:
import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning import Callback

import pandas as pd

from transformers import BertTokenizer, BertForSequenceClassification
from huggingface_hub import PyTorchModelHubMixin

import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam

import wandb

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import numpy as np

# BERT Classifier

Our BERT classifier for the Data Chatbot.

**Relevant Resources**

- https://docs.wandb.ai/guides/integrations/lightning#logger-arguments
- https://pytorch-lightning.readthedocs.io/en/0.9.0/hyperparameters.html

## Define Dataset & DataLoader

In [11]:
class ClassifierDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.labels = dataframe.label
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split()) # Removes any extra whitespace

        # https://huggingface.co/docs/transformers/v4.34.0/en/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.__call__
        inputs = self.tokenizer(
            text,
            None,
            add_special_tokens=True, # Add '[CLS]' and '[SEP]', default True
            max_length=self.max_len, # Maximum length to use by one of the truncation/padding parameters
            padding='max_length', # Pad to a maximum length specified with the argument max_length
            truncation=True, # Truncate to a maximum length specified with the argument max_length
        )
        ids = inputs['input_ids'] # Indices of input sequence tokens in the vocabulary
        mask = inputs['attention_mask'] # Mask to avoid performing attention on padding token indices
        
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'labels': torch.tensor(self.labels[index], dtype=torch.long)
        }
    

# Load data and return DataLoader
def get_dataloader(df, tokenizer, max_len=None, batch_size=32, shuffle=True, nobatch=False):
    """
    Loads data into a PyTorch DataLoader object.

    Parameters:
    - df (pd.DataFrame): The data frame containing the text and labels.
    - tokenizer (Tokenizer): The tokenizer to be used.
    - max_len (int, optional): The maximum length for the tokenized sequences. Defaults to None (model's limitation).
    - batch_size (int, optional): The size of each batch. Defaults to 32.
    - shuffle (bool, optional): Whether to shuffle the data. Defaults to True.
    - nobatch (bool, optional): Whether to disable batching. If True, batch_size will be set to the length of df. Defaults to False.

    Returns:
    - DataLoader: A PyTorch DataLoader object containing the tokenized data.

    Notes:
    - The label mapping {'harm': 0, 'question': 1, 'concern': 2} is applied to the labels in df.
    """
    label_mapping = {'harm': 0, 'question': 1, 'concern': 2}
    df['label'] = df['label'].map(label_mapping)
    dataset = ClassifierDataset(df, tokenizer, max_len)

    # Handle nobatch
    batch_size = batch_size if not nobatch else df.__len__()
    print(f"DataLoader | No Batch: {nobatch}; Batch Size: {batch_size}")

    # Create DataLoader
    params = {'batch_size': batch_size, 'shuffle': shuffle, 'num_workers': 0}
    data_loader = DataLoader(dataset, **params)
    return data_loader

## Define Model

In [16]:
class BERTClassifier(pl.LightningModule, PyTorchModelHubMixin):
    def __init__(self, hparams):
        super(BERTClassifier, self).__init__()

        # Save hyperparameters
        self.hparams.update(hparams)
        self.__configure_from_hyperparams()

        self.model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=3)
    
    def forward(self, ids, mask):
        output = self.model(ids, attention_mask=mask)
        return output.logits

    def training_step(self, batch, batch_nb):
        return self.__step(batch, batch_nb, 'train')

    def validation_step(self, batch, batch_nb):
        return self.__step(batch, batch_nb, 'val')

    def configure_optimizers(self):
        return Adam(self.parameters(), lr=1e-5)

    def save_pretrained(self, *args, **kwargs):
        # Forward all arguments to the inner Bert model's save_pretrained method
        self.model.save_pretrained(*args, **kwargs)

    def push_to_hub(self, *args, **kwargs):
        # Forward all arguments to the inner Bert model's push_to_hub method
        self.model.push_to_hub(*args, **kwargs)

    def __configure_from_hyperparams(self):
        # Set N/A hyperparameters to default values
        self.max_len = self.hparams.get("max_len", 100)
        self.batch_size =  self.hparams.get("batch_size", 32)

    def __step(self, batch, batch_idx, stage):
        preds, loss, accuracy, f1 = self.__get_preds_loss_accuracy(batch)
        
        self.log(
            f'{stage}/accuracy',
            accuracy,
            on_step=True,
            on_epoch=True,
            prog_bar=True,
            logger=True,
        )
        self.log(
            f'{stage}/f1',
            accuracy,
            on_step=True,
            on_epoch=True,
            prog_bar=True,
            logger=True,
        )
        self.log(f'{stage}/loss', loss)

        return loss

    def __get_preds_loss_accuracy(self, batch):
        # Helper function to get predictions and loss
        ids = batch['ids']
        mask = batch['mask']
        labels = batch['labels']
        
        preds = self(ids, mask)
        loss = torch.nn.CrossEntropyLoss()(preds, labels)

        # Calculate accuracy
        _, predicted = torch.max(preds, 1)
        correct = (predicted == labels).sum().item()
        total = labels.size(0)
        accuracy = correct / total

        # Calculate F1 score
        labels_cpu = labels.cpu().numpy()
        predicted_cpu = predicted.cpu().numpy()
        f1 = f1_score(labels_cpu, predicted_cpu, average='macro')

        return preds, loss, accuracy, f1

## Helper Functions

In [26]:
class ConfusionMatrixLogger(Callback):
    def __init__(self):
        super().__init__()
        self.preds = []
        self.targets = []

    def on_validation_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
        ids = batch['ids']
        mask = batch['mask']
        labels = batch['labels']
        preds = pl_module(ids, mask)
        ground_truth_ids = labels.flatten().cpu().numpy()

        self.preds.extend(preds.cpu().numpy())
        self.targets.extend(ground_truth_ids)

    def on_validation_epoch_end(self, trainer, pl_module):
        all_labels = ['harm', 'question', 'concern']

        # Log the confusion matrix
        wandb.log({"confusion_matrix": wandb.plot.confusion_matrix(
            preds=np.argmax(np.array(self.preds), axis=1), y_true=self.targets, class_names=all_labels)
        })

        # Log the ROC curve
        probabilites = torch.nn.functional.softmax(torch.tensor(self.preds), dim=1)
        print(probabilites)
        wandb.log({"roc" : wandb.plot.roc_curve(
            y_true=self.targets, y_probas=probabilites, labels=all_labels, classes_to_plot=None)
        })

        # Clear for the next epoch
        self.preds = []
        self.targets = []

In [14]:
def finetune_bert(run_config, train_dataloader, val_dataloader):
    # WandB initialization 
    wandb.login()

    # Initialize model
    model = BERTClassifier(hparams=run_config)

    # Initialize WandbLogger
    wandb_logger = WandbLogger(entity='yvokeller', project='data-chatbot') # log_model='all'
    wandb_logger.experiment.config.update(run_config)

    # Create an instance of the ConfusionMatrixLogger class
    confusion_matrix_logger = ConfusionMatrixLogger()

    # Initialize Trainer
    trainer = pl.Trainer(
        max_epochs=run_config.get('epochs'), 
        logger=wandb_logger,
        callbacks=[confusion_matrix_logger],
        log_every_n_steps=1, 
        enable_progress_bar=True,
    )

    # Train the model
    trainer.fit(model, train_dataloader, val_dataloader)

    # Close WandB logger
    wandb.finish()

    return model

## Fine-tune BERT Model

### POC with Simple Demo Dataset

In [8]:
# WandB initialization 
wandb.login()

# Config
run_config = {
    'epochs': 5,
    'max_len': 100,
    'batch_size': 32
}

# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Load training and validation data
train_df = pd.read_csv('data/train.csv')
train_dataloader = get_dataloader(train_df, tokenizer, max_len=run_config.get('max_len'), batch_size=run_config.get('batch_size'))

val_df = pd.read_csv('data/val.csv')
val_dataloader = get_dataloader(val_df, tokenizer, run_config.get('max_len'), batch_size=run_config.get('batch_size'), shuffle=False, nobatch=True)

# Finetune BERT
model = finetune_bert(run_config, train_dataloader, val_dataloader)



DataLoader | No Batch: False; Batch Size: 32
DataLoader | No Batch: True; Batch Size: 3


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  rank_zero_warn(
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name  | Type                          | Params
--------------------------------------------------------
0 | model | BertForSequenceClassification | 177 M 
--------------------------------------------------------
177 M     Trainable params
0         Non-trainable params
177 M     Total params
711.423   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=5` reached.




VBox(children=(Label(value='0.006 MB of 0.006 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▁▃▃▃▅▅▅▆▆▆███
train/accuracy_epoch,██▁▁▁
train/accuracy_step,██▁▁▁
train/f1_epoch,██▁▁▁
train/f1_step,██▁▁▁
train/loss,█▆▅▄▁
trainer/global_step,▁▁▁▁▁▁▃▃▃▃▃▃▅▅▅▅▅▅▆▆▆▆▆▆██████
val/accuracy_epoch,▁▁▁▁▁
val/accuracy_step,▁▁▁▁▁
val/f1_epoch,▁▁▁▁▁

0,1
epoch,4.0
train/accuracy_epoch,0.4
train/accuracy_step,0.4
train/f1_epoch,0.4
train/f1_step,0.4
train/loss,0.92182
trainer/global_step,4.0
val/accuracy_epoch,0.33333
val/accuracy_step,0.33333
val/f1_epoch,0.33333


### Overfit on a small sample

In [27]:
from sklearn.model_selection import train_test_split

# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Config
run_config = {
    'epochs': 1,
    'max_len': None,
    'batch_size': 16
}

# Load training and validation data
train_df = pd.read_parquet('../data/train.parquet')
train_df, _ = train_test_split(train_df, train_size=64, random_state=42, stratify=train_df['label'])
train_df = train_df.reset_index(drop=True)
print('train', train_df.shape)
train_dataloader = get_dataloader(train_df, tokenizer, max_len=run_config.get('max_len'), batch_size=run_config.get('batch_size'))

val_df = pd.read_parquet('../data/test.parquet')
val_df, _ = train_test_split(val_df, train_size=64, random_state=42, stratify=val_df['label'])
val_df = val_df.reset_index(drop=True)
print('val', val_df.shape)
val_dataloader = get_dataloader(val_df, tokenizer, run_config.get('max_len'), batch_size=run_config.get('batch_size'), shuffle=False, nobatch=True)

# Fine-tune BERT
model = finetune_bert(run_config, train_dataloader, val_dataloader)

train (64, 2)
DataLoader | No Batch: False; Batch Size: 16
val (64, 2)
DataLoader | No Batch: True; Batch Size: 64


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name  | Type                          | Params
--------------------------------------------------------
0 | model | BertForSequenceClassification | 177 M 
--------------------------------------------------------
177 M     Trainable params
0         Non-trainable params
177 M     Total params
711.423   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


tensor([[0.3524, 0.3486, 0.2990],
        [0.3298, 0.3867, 0.2834],
        [0.3669, 0.3367, 0.2964],
        [0.3714, 0.3438, 0.2848],
        [0.3378, 0.3696, 0.2926],
        [0.3333, 0.3819, 0.2848],
        [0.3603, 0.3460, 0.2937],
        [0.3535, 0.3500, 0.2964],
        [0.3456, 0.3515, 0.3029],
        [0.3623, 0.3531, 0.2845],
        [0.3641, 0.3497, 0.2863],
        [0.3705, 0.3405, 0.2889],
        [0.3323, 0.3704, 0.2973],
        [0.3550, 0.3541, 0.2909],
        [0.3448, 0.3396, 0.3155],
        [0.3526, 0.3588, 0.2887],
        [0.3756, 0.3485, 0.2759],
        [0.3261, 0.3806, 0.2933],
        [0.3753, 0.3517, 0.2730],
        [0.3193, 0.3810, 0.2997],
        [0.3493, 0.3600, 0.2907],
        [0.3349, 0.3731, 0.2920],
        [0.3226, 0.4022, 0.2751],
        [0.3533, 0.3544, 0.2923],
        [0.3232, 0.3815, 0.2953],
        [0.3342, 0.3754, 0.2904],
        [0.3951, 0.3297, 0.2752],
        [0.3593, 0.3471, 0.2935],
        [0.3807, 0.3387, 0.2806],
        [0.387

  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

tensor([[0.3617, 0.3104, 0.3279],
        [0.2693, 0.3285, 0.4021],
        [0.3356, 0.3541, 0.3102],
        [0.3719, 0.3228, 0.3053],
        [0.3119, 0.3483, 0.3398],
        [0.2767, 0.3271, 0.3962],
        [0.3219, 0.3098, 0.3683],
        [0.3131, 0.3732, 0.3137],
        [0.3110, 0.3352, 0.3538],
        [0.3703, 0.3199, 0.3099],
        [0.3647, 0.3312, 0.3040],
        [0.3100, 0.3802, 0.3098],
        [0.2806, 0.3277, 0.3917],
        [0.3550, 0.3289, 0.3161],
        [0.3375, 0.3117, 0.3507],
        [0.3109, 0.3806, 0.3085],
        [0.3172, 0.3887, 0.2942],
        [0.2759, 0.3372, 0.3870],
        [0.3485, 0.3591, 0.2924],
        [0.2696, 0.3431, 0.3873],
        [0.3229, 0.3406, 0.3364],
        [0.2820, 0.3420, 0.3760],
        [0.2718, 0.3544, 0.3738],
        [0.3691, 0.3192, 0.3118],
        [0.2673, 0.3298, 0.4029],
        [0.2946, 0.3359, 0.3695],
        [0.3349, 0.3802, 0.2849],
        [0.3357, 0.3244, 0.3400],
        [0.3278, 0.3745, 0.2977],
        [0.326

`Trainer.fit` stopped: `max_epochs=1` reached.


VBox(children=(Label(value='0.008 MB of 0.009 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.938481…



0,1
epoch,▁▁▁▁▁▁
train/accuracy_epoch,▁
train/accuracy_step,▁▅▃█
train/f1_epoch,▁
train/f1_step,▁▅▃█
train/loss,█▇▄▁
trainer/global_step,▁▃▆█▁▁▁██
val/accuracy_epoch,▁
val/accuracy_step,▁
val/f1_epoch,▁

0,1
epoch,0.0
train/accuracy_epoch,0.48438
train/accuracy_step,0.6875
train/f1_epoch,0.48438
train/f1_step,0.6875
train/loss,1.03053
trainer/global_step,3.0
val/accuracy_epoch,0.90625
val/accuracy_step,0.90625
val/f1_epoch,0.90625


### Train on the full dataset

In [7]:
# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Config
run_config = {
    'epochs': 6,
    'max_len': None,
    'batch_size': 32
}

# Load training and validation data
train_df = pd.read_parquet('../data/train.parquet')
train_df = train_df.reset_index(drop=True)
print('train', train_df.shape)
train_dataloader = get_dataloader(train_df, tokenizer, max_len=run_config.get('max_len'), batch_size=run_config.get('batch_size'))

val_df = pd.read_parquet('../data/test.parquet')
val_df = val_df.reset_index(drop=True)
print('val', val_df.shape)
val_dataloader = get_dataloader(val_df, tokenizer, run_config.get('max_len'), batch_size=run_config.get('batch_size'), shuffle=False, nobatch=True)

model = finetune_bert(run_config, train_dataloader, val_dataloader)

train (1074, 2)
DataLoader | No Batch: False; Batch Size: 32
val (264, 2)
DataLoader | No Batch: True; Batch Size: 264


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011132615278216285, max=1.0…

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name  | Type                          | Params
--------------------------------------------------------
0 | model | BertForSequenceClassification | 177 M 
--------------------------------------------------------
177 M     Trainable params
0         Non-trainable params
177 M     Total params
711.423   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=6` reached.




0,1
epoch,▁▁▁▁▁▁▁▂▂▂▂▂▂▂▄▄▄▄▄▄▅▅▅▅▅▅▅▇▇▇▇▇▇███████
train/accuracy,▁▇████
train/loss,█▇▇▅▄▄▃▃▂▂▂▂▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
val/accuracy,▁▇█▇█▇
val/loss,█▂▁▂▁▂

0,1
epoch,5.0
train/accuracy,0.99907
train/loss,0.00827
trainer/global_step,203.0
val/accuracy,0.98485
val/loss,0.06679


**Results**

The model achieved a validation accuracy of 97.8% after training. We are concerned that it overfits on the question mark for classifying questions, which we need to investigate further.

# Model Persistence

## Persist model locally

For evaluation purposes, we want to persist the model locally.

In [34]:
model.save_pretrained('bert-classifier/model')
tokenizer.save_pretrained('bert-classifier/tokenizer')

('bert-classifier/tokenizer/tokenizer_config.json',
 'bert-classifier/tokenizer/special_tokens_map.json',
 'bert-classifier/tokenizer/vocab.txt',
 'bert-classifier/tokenizer/added_tokens.json')

In [35]:
# Load model locally
loaded_model = BertForSequenceClassification.from_pretrained('bert-classifier/model', num_labels=3)
loaded_tokenizer = BertTokenizer.from_pretrained('bert-classifier/tokenizer')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Persist model on HuggingFace

Our BERT classifier for the Data Chatbot is published here: https://huggingface.co/nlpchallenges/Text-Classification/tree/main

Before pushing to the hub for the first time, make sure to run `huggingface-cli login` in your terminal, and paste the API token from your HuggingFace profile.

In [20]:
model.push_to_hub("nlpchallenges/Text-Classification")
tokenizer.push_to_hub("nlpchallenges/Text-Classification")

pytorch_model.bin:   0%|          | 0.00/711M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/nlpchallenges/Text-Classification/commit/eb0bf2fb07564a5156f0f1c507e670a896368011', commit_message='Upload tokenizer', commit_description='', oid='eb0bf2fb07564a5156f0f1c507e670a896368011', pr_url=None, pr_revision=None, pr_num=None)

In [21]:
loaded_tokenizer = BertTokenizer.from_pretrained('nlpchallenges/Text-Classification')
loaded_model = BertForSequenceClassification.from_pretrained('nlpchallenges/Text-Classification')

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.45k [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/711M [00:00<?, ?B/s]

# Inference

In [22]:
# Inference
def classify_text(user_input, model, tokenizer, max_len=100):
    # Tokenize the user input
    inputs = tokenizer(
        user_input,
        None,
        add_special_tokens=True, # Add '[CLS]' and '[SEP]', default True
        max_length=max_len, # Maximum length to use by one of the truncation/padding parameters
        padding='max_length', # Pad to a maximum length specified with the argument max_length
        truncation=True, # Truncate to a maximum length specified with the argument max_length
    )

    ids = torch.tensor(inputs['input_ids'], dtype=torch.long).unsqueeze(0) # Indices of input sequence tokens in the vocabulary
    mask = torch.tensor(inputs['attention_mask'], dtype=torch.long).unsqueeze(0) # Mask to avoid performing attention on padding token indices
    
    
    # Get model output
    model.eval()
    with torch.no_grad():
        output = model(ids, attention_mask=mask)
    
    # Get predicted label index
    _, predicted_idx = torch.max(output.logits, 1)
    
    # Map index to label
    label_mapping = {0: 'harm', 1: 'question', 2: 'concern'}
    return label_mapping[predicted_idx.item()]


In [25]:
classify_text("Kannst du diese Frage beantworten?", loaded_model, loaded_tokenizer, max_len=None)

'question'