In [10]:
import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning import Callback

import pandas as pd

from transformers import BertTokenizer, BertForSequenceClassification
from huggingface_hub import PyTorchModelHubMixin

import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam

import wandb

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import numpy as np

# BERT Classifier

Our BERT classifier for the Data Chatbot.

**Relevant Resources**

- https://docs.wandb.ai/guides/integrations/lightning#logger-arguments
- https://pytorch-lightning.readthedocs.io/en/0.9.0/hyperparameters.html

## Define Dataset & DataLoader

In [11]:
class ClassifierDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.labels = dataframe.label
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split()) # Removes any extra whitespace

        # https://huggingface.co/docs/transformers/v4.34.0/en/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.__call__
        inputs = self.tokenizer(
            text,
            None,
            add_special_tokens=True, # Add '[CLS]' and '[SEP]', default True
            max_length=self.max_len, # Maximum length to use by one of the truncation/padding parameters
            padding='max_length', # Pad to a maximum length specified with the argument max_length
            truncation=True, # Truncate to a maximum length specified with the argument max_length
        )
        ids = inputs['input_ids'] # Indices of input sequence tokens in the vocabulary
        mask = inputs['attention_mask'] # Mask to avoid performing attention on padding token indices
        
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'labels': torch.tensor(self.labels[index], dtype=torch.long)
        }
    

# Load data and return DataLoader
def get_dataloader(df, tokenizer, max_len=None, batch_size=32, shuffle=True, nobatch=False):
    """
    Loads data into a PyTorch DataLoader object.

    Parameters:
    - df (pd.DataFrame): The data frame containing the text and labels.
    - tokenizer (Tokenizer): The tokenizer to be used.
    - max_len (int, optional): The maximum length for the tokenized sequences. Defaults to None (model's limitation).
    - batch_size (int, optional): The size of each batch. Defaults to 32.
    - shuffle (bool, optional): Whether to shuffle the data. Defaults to True.
    - nobatch (bool, optional): Whether to disable batching. If True, batch_size will be set to the length of df. Defaults to False.

    Returns:
    - DataLoader: A PyTorch DataLoader object containing the tokenized data.

    Notes:
    - The label mapping {'harm': 0, 'question': 1, 'concern': 2} is applied to the labels in df.
    """
    label_mapping = {'harm': 0, 'question': 1, 'concern': 2}
    df['label'] = df['label'].map(label_mapping)
    dataset = ClassifierDataset(df, tokenizer, max_len)

    # Handle nobatch
    batch_size = batch_size if not nobatch else df.__len__()
    print(f"DataLoader | No Batch: {nobatch}; Batch Size: {batch_size}")

    # Create DataLoader
    params = {'batch_size': batch_size, 'shuffle': shuffle, 'num_workers': 0}
    data_loader = DataLoader(dataset, **params)
    return data_loader

## Define Model

In [16]:
class BERTClassifier(pl.LightningModule, PyTorchModelHubMixin):
    def __init__(self, hparams):
        super(BERTClassifier, self).__init__()

        # Save hyperparameters
        self.hparams.update(hparams)
        self.__configure_from_hyperparams()

        self.model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=3)
    
    def forward(self, ids, mask):
        output = self.model(ids, attention_mask=mask)
        return output.logits

    def training_step(self, batch, batch_nb):
        return self.__step(batch, batch_nb, 'train')

    def validation_step(self, batch, batch_nb):
        return self.__step(batch, batch_nb, 'val')

    def configure_optimizers(self):
        return Adam(self.parameters(), lr=1e-5)

    def save_pretrained(self, *args, **kwargs):
        # Forward all arguments to the inner Bert model's save_pretrained method
        self.model.save_pretrained(*args, **kwargs)

    def push_to_hub(self, *args, **kwargs):
        # Forward all arguments to the inner Bert model's push_to_hub method
        self.model.push_to_hub(*args, **kwargs)

    def __configure_from_hyperparams(self):
        # Set N/A hyperparameters to default values
        self.max_len = self.hparams.get("max_len", 100)
        self.batch_size =  self.hparams.get("batch_size", 32)

    def __step(self, batch, batch_idx, stage):
        preds, loss, accuracy, f1 = self.__get_preds_loss_accuracy(batch)
        
        self.log(
            f'{stage}/accuracy',
            accuracy,
            on_step=True,
            on_epoch=True,
            prog_bar=True,
            logger=True,
        )
        self.log(
            f'{stage}/f1',
            f1,
            on_step=True,
            on_epoch=True,
            prog_bar=True,
            logger=True,
        )
        self.log(f'{stage}/loss', loss)

        return loss

    def __get_preds_loss_accuracy(self, batch):
        # Helper function to get predictions and loss
        ids = batch['ids']
        mask = batch['mask']
        labels = batch['labels']
        
        preds = self(ids, mask)
        loss = torch.nn.CrossEntropyLoss()(preds, labels)

        # Calculate accuracy
        _, predicted = torch.max(preds, 1)
        correct = (predicted == labels).sum().item()
        total = labels.size(0)
        accuracy = correct / total

        # Calculate F1 score
        labels_cpu = labels.cpu().numpy()
        predicted_cpu = predicted.cpu().numpy()
        f1 = f1_score(labels_cpu, predicted_cpu, average='macro')

        return preds, loss, accuracy, f1

## Helper Functions

In [26]:
class ConfusionMatrixLogger(Callback):
    def __init__(self):
        super().__init__()
        self.preds = []
        self.targets = []

    def on_validation_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
        ids = batch['ids']
        mask = batch['mask']
        labels = batch['labels']
        preds = pl_module(ids, mask)
        ground_truth_ids = labels.flatten().cpu().numpy()

        self.preds.extend(preds.cpu().numpy())
        self.targets.extend(ground_truth_ids)

    def on_validation_epoch_end(self, trainer, pl_module):
        all_labels = ['harm', 'question', 'concern']

        # Log the confusion matrix
        wandb.log({"confusion_matrix": wandb.plot.confusion_matrix(
            preds=np.argmax(np.array(self.preds), axis=1), y_true=self.targets, class_names=all_labels)
        })

        # Log the ROC curve
        probabilites = torch.nn.functional.softmax(torch.tensor(self.preds), dim=1)
        print(probabilites)
        wandb.log({"roc" : wandb.plot.roc_curve(
            y_true=self.targets, y_probas=probabilites, labels=all_labels, classes_to_plot=None)
        })

        # Clear for the next epoch
        self.preds = []
        self.targets = []

In [14]:
def finetune_bert(run_config, train_dataloader, val_dataloader):
    # WandB initialization 
    wandb.login()

    # Initialize model
    model = BERTClassifier(hparams=run_config)

    # Initialize WandbLogger
    wandb_logger = WandbLogger(entity='yvokeller', project='data-chatbot') # log_model='all'
    wandb_logger.experiment.config.update(run_config)

    # Create an instance of the ConfusionMatrixLogger class
    confusion_matrix_logger = ConfusionMatrixLogger()

    # Initialize Trainer
    trainer = pl.Trainer(
        max_epochs=run_config.get('epochs'), 
        logger=wandb_logger,
        callbacks=[confusion_matrix_logger],
        log_every_n_steps=1, 
        enable_progress_bar=True,
    )

    # Train the model
    trainer.fit(model, train_dataloader, val_dataloader)

    # Close WandB logger
    wandb.finish()

    return model

## Fine-tune BERT Model

### POC with Simple Demo Dataset

In [8]:
# WandB initialization 
wandb.login()

# Config
run_config = {
    'epochs': 5,
    'max_len': 100,
    'batch_size': 32
}

# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Load training and validation data
train_df = pd.read_csv('data/train.csv')
train_dataloader = get_dataloader(train_df, tokenizer, max_len=run_config.get('max_len'), batch_size=run_config.get('batch_size'))

val_df = pd.read_csv('data/val.csv')
val_dataloader = get_dataloader(val_df, tokenizer, run_config.get('max_len'), batch_size=run_config.get('batch_size'), shuffle=False, nobatch=True)

# Finetune BERT
model = finetune_bert(run_config, train_dataloader, val_dataloader)



DataLoader | No Batch: False; Batch Size: 32
DataLoader | No Batch: True; Batch Size: 3


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  rank_zero_warn(
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name  | Type                          | Params
--------------------------------------------------------
0 | model | BertForSequenceClassification | 177 M 
--------------------------------------------------------
177 M     Trainable params
0         Non-trainable params
177 M     Total params
711.423   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=5` reached.




VBox(children=(Label(value='0.006 MB of 0.006 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▁▃▃▃▅▅▅▆▆▆███
train/accuracy_epoch,██▁▁▁
train/accuracy_step,██▁▁▁
train/f1_epoch,██▁▁▁
train/f1_step,██▁▁▁
train/loss,█▆▅▄▁
trainer/global_step,▁▁▁▁▁▁▃▃▃▃▃▃▅▅▅▅▅▅▆▆▆▆▆▆██████
val/accuracy_epoch,▁▁▁▁▁
val/accuracy_step,▁▁▁▁▁
val/f1_epoch,▁▁▁▁▁

0,1
epoch,4.0
train/accuracy_epoch,0.4
train/accuracy_step,0.4
train/f1_epoch,0.4
train/f1_step,0.4
train/loss,0.92182
trainer/global_step,4.0
val/accuracy_epoch,0.33333
val/accuracy_step,0.33333
val/f1_epoch,0.33333


### Overfit on a small sample

In [29]:
from sklearn.model_selection import train_test_split

# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Config
run_config = {
    'epochs': 10,
    'max_len': None,
    'batch_size': 16
}

# Load training and validation data
train_df = pd.read_parquet('../data/train.parquet')
train_df, _ = train_test_split(train_df, train_size=64, random_state=42, stratify=train_df['label'])
train_df = train_df.reset_index(drop=True)
print('train', train_df.shape)
train_dataloader = get_dataloader(train_df, tokenizer, max_len=run_config.get('max_len'), batch_size=run_config.get('batch_size'))

val_df = pd.read_parquet('../data/test.parquet')
val_df, _ = train_test_split(val_df, train_size=64, random_state=42, stratify=val_df['label'])
val_df = val_df.reset_index(drop=True)
print('val', val_df.shape)
val_dataloader = get_dataloader(val_df, tokenizer, run_config.get('max_len'), batch_size=run_config.get('batch_size'), shuffle=False, nobatch=True)

# Fine-tune BERT
model = finetune_bert(run_config, train_dataloader, val_dataloader)

train (64, 2)
DataLoader | No Batch: False; Batch Size: 16
val (64, 2)
DataLoader | No Batch: True; Batch Size: 64


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name  | Type                          | Params
--------------------------------------------------------
0 | model | BertForSequenceClassification | 177 M 
--------------------------------------------------------
177 M     Trainable params
0         Non-trainable params
177 M     Total params
711.423   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.
Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.


tensor([[0.2829, 0.4150, 0.3021],
        [0.2615, 0.4346, 0.3039],
        [0.2804, 0.4186, 0.3010],
        [0.2493, 0.4691, 0.2816],
        [0.2803, 0.4108, 0.3089],
        [0.2742, 0.4362, 0.2896],
        [0.2442, 0.4593, 0.2965],
        [0.2814, 0.4083, 0.3103],
        [0.2695, 0.4215, 0.3089],
        [0.2784, 0.4222, 0.2994],
        [0.2697, 0.4309, 0.2994],
        [0.2644, 0.4446, 0.2910],
        [0.2559, 0.4373, 0.3069],
        [0.2569, 0.4530, 0.2901],
        [0.2545, 0.4522, 0.2932],
        [0.2775, 0.4073, 0.3152],
        [0.2669, 0.4440, 0.2891],
        [0.2664, 0.4324, 0.3012],
        [0.2858, 0.4220, 0.2923],
        [0.2734, 0.4374, 0.2892],
        [0.2737, 0.4481, 0.2782],
        [0.2687, 0.4430, 0.2883],
        [0.2737, 0.4104, 0.3159],
        [0.2857, 0.4193, 0.2950],
        [0.2696, 0.4331, 0.2974],
        [0.2783, 0.4325, 0.2891],
        [0.2581, 0.4601, 0.2818],
        [0.2756, 0.4235, 0.3009],
        [0.2600, 0.4397, 0.3002],
        [0.257

  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.
Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.


tensor([[0.3421, 0.3239, 0.3340],
        [0.2976, 0.2749, 0.4275],
        [0.2625, 0.4304, 0.3071],
        [0.2740, 0.4196, 0.3064],
        [0.3103, 0.3158, 0.3739],
        [0.3204, 0.2866, 0.3930],
        [0.2617, 0.3770, 0.3613],
        [0.2382, 0.4637, 0.2981],
        [0.2890, 0.3381, 0.3728],
        [0.3136, 0.3413, 0.3450],
        [0.2953, 0.3777, 0.3271],
        [0.2099, 0.5159, 0.2742],
        [0.3051, 0.2612, 0.4337],
        [0.2947, 0.3606, 0.3447],
        [0.2644, 0.4154, 0.3202],
        [0.2425, 0.4438, 0.3137],
        [0.2277, 0.4841, 0.2882],
        [0.2812, 0.3162, 0.4026],
        [0.2747, 0.4133, 0.3120],
        [0.2781, 0.3506, 0.3713],
        [0.2932, 0.3714, 0.3354],
        [0.3175, 0.2840, 0.3985],
        [0.2964, 0.3042, 0.3994],
        [0.3530, 0.3249, 0.3221],
        [0.3088, 0.2685, 0.4227],
        [0.2991, 0.2980, 0.4029],
        [0.2094, 0.5312, 0.2594],
        [0.3086, 0.3275, 0.3639],
        [0.2101, 0.5077, 0.2822],
        [0.232

Validation: 0it [00:00, ?it/s]

Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.
Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.


tensor([[0.4665, 0.2218, 0.3116],
        [0.3802, 0.1629, 0.4569],
        [0.2383, 0.4972, 0.2645],
        [0.3539, 0.3349, 0.3112],
        [0.3989, 0.1937, 0.4074],
        [0.4103, 0.1784, 0.4114],
        [0.3026, 0.2805, 0.4170],
        [0.2014, 0.5667, 0.2319],
        [0.4566, 0.1407, 0.4027],
        [0.4468, 0.2231, 0.3300],
        [0.3798, 0.3226, 0.2977],
        [0.1867, 0.5800, 0.2333],
        [0.4398, 0.1290, 0.4312],
        [0.4581, 0.1948, 0.3471],
        [0.3038, 0.3821, 0.3141],
        [0.2202, 0.5114, 0.2684],
        [0.1935, 0.5696, 0.2369],
        [0.3579, 0.1922, 0.4499],
        [0.2535, 0.4800, 0.2665],
        [0.3205, 0.2549, 0.4246],
        [0.3497, 0.2921, 0.3582],
        [0.4525, 0.1375, 0.4100],
        [0.3813, 0.2001, 0.4186],
        [0.4840, 0.2286, 0.2874],
        [0.4367, 0.1374, 0.4259],
        [0.4002, 0.1599, 0.4399],
        [0.1859, 0.5953, 0.2188],
        [0.3863, 0.2146, 0.3991],
        [0.1816, 0.5929, 0.2255],
        [0.200

Validation: 0it [00:00, ?it/s]

Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.
Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.


tensor([[0.5353, 0.1706, 0.2940],
        [0.3918, 0.1237, 0.4845],
        [0.1881, 0.6110, 0.2009],
        [0.4364, 0.2654, 0.2982],
        [0.4589, 0.1321, 0.4090],
        [0.4111, 0.1336, 0.4554],
        [0.2969, 0.2444, 0.4587],
        [0.1648, 0.6522, 0.1829],
        [0.4052, 0.2185, 0.3762],
        [0.5357, 0.1747, 0.2896],
        [0.4287, 0.3262, 0.2451],
        [0.1549, 0.6613, 0.1838],
        [0.4585, 0.1128, 0.4288],
        [0.4564, 0.2226, 0.3210],
        [0.3405, 0.3501, 0.3095],
        [0.1784, 0.6180, 0.2036],
        [0.1591, 0.6564, 0.1845],
        [0.3628, 0.1333, 0.5039],
        [0.2130, 0.5715, 0.2154],
        [0.3746, 0.1554, 0.4700],
        [0.4480, 0.1785, 0.3735],
        [0.4092, 0.1399, 0.4509],
        [0.3833, 0.1512, 0.4655],
        [0.5731, 0.1796, 0.2473],
        [0.4528, 0.1175, 0.4297],
        [0.4234, 0.1238, 0.4528],
        [0.1548, 0.6676, 0.1776],
        [0.4603, 0.1374, 0.4023],
        [0.1511, 0.6727, 0.1762],
        [0.162

Validation: 0it [00:00, ?it/s]

Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.
Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.
wandb: Network error (ConnectionError), entering retry loop.
wandb: ERROR Dropped streaming file chunk (see wandb/debug-internal.log)
wandb: ERROR Dropped streaming file chunk (see wandb/debug-internal.log)
wandb: ERROR Dropped streaming file chunk (see wandb/debug-internal.log)


### Train on the full dataset

In [28]:
# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Config
run_config = {
    'epochs': 6,
    'max_len': None,
    'batch_size': 32
}

# Load training and validation data
train_df = pd.read_parquet('../data/train.parquet')
train_df = train_df.reset_index(drop=True)
print('train', train_df.shape)
train_dataloader = get_dataloader(train_df, tokenizer, max_len=run_config.get('max_len'), batch_size=run_config.get('batch_size'))

val_df = pd.read_parquet('../data/test.parquet')
val_df = val_df.reset_index(drop=True)
print('val', val_df.shape)
val_dataloader = get_dataloader(val_df, tokenizer, run_config.get('max_len'), batch_size=run_config.get('batch_size'), shuffle=False, nobatch=True)

model = finetune_bert(run_config, train_dataloader, val_dataloader)

train (1074, 2)
DataLoader | No Batch: False; Batch Size: 32
val (264, 2)
DataLoader | No Batch: True; Batch Size: 264


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name  | Type                          | Params
--------------------------------------------------------
0 | model | BertForSequenceClassification | 177 M 
--------------------------------------------------------
177 M     Trainable params
0         Non-trainable params
177 M     Total params
711.423   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


tensor([[0.2754, 0.3049, 0.4197],
        [0.2776, 0.3188, 0.4036],
        [0.2711, 0.3209, 0.4079],
        [0.2778, 0.3133, 0.4089],
        [0.2741, 0.3072, 0.4186],
        [0.2453, 0.3437, 0.4109],
        [0.2589, 0.3094, 0.4317],
        [0.2679, 0.3197, 0.4124],
        [0.2659, 0.3073, 0.4267],
        [0.2669, 0.3281, 0.4050],
        [0.2657, 0.3050, 0.4293],
        [0.2763, 0.3103, 0.4133],
        [0.2561, 0.3140, 0.4299],
        [0.2600, 0.3124, 0.4276],
        [0.2828, 0.3166, 0.4006],
        [0.2676, 0.3259, 0.4065],
        [0.2492, 0.3166, 0.4342],
        [0.2571, 0.3171, 0.4259],
        [0.2411, 0.3000, 0.4589],
        [0.2370, 0.3216, 0.4414],
        [0.2672, 0.3276, 0.4052],
        [0.2333, 0.3315, 0.4352],
        [0.2557, 0.3224, 0.4219],
        [0.2410, 0.3262, 0.4329],
        [0.2856, 0.2990, 0.4154],
        [0.2499, 0.3166, 0.4335],
        [0.2840, 0.3283, 0.3877],
        [0.2816, 0.3011, 0.4172],
        [0.2552, 0.3155, 0.4293],
        [0.284

  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

tensor([[0.0354, 0.9350, 0.0296],
        [0.0322, 0.9369, 0.0309],
        [0.0322, 0.9386, 0.0292],
        [0.0338, 0.9331, 0.0331],
        [0.0360, 0.9354, 0.0286],
        [0.0314, 0.9361, 0.0326],
        [0.0313, 0.9375, 0.0312],
        [0.0339, 0.9355, 0.0306],
        [0.0325, 0.9384, 0.0291],
        [0.0341, 0.9341, 0.0318],
        [0.0323, 0.9338, 0.0339],
        [0.0340, 0.9340, 0.0320],
        [0.0322, 0.9357, 0.0322],
        [0.0321, 0.9361, 0.0318],
        [0.0333, 0.9367, 0.0301],
        [0.0360, 0.9367, 0.0273],
        [0.0321, 0.9368, 0.0311],
        [0.0359, 0.9352, 0.0289],
        [0.0406, 0.9282, 0.0312],
        [0.0318, 0.9357, 0.0325],
        [0.0298, 0.9336, 0.0367],
        [0.0327, 0.9332, 0.0341],
        [0.0317, 0.9336, 0.0347],
        [0.0320, 0.9375, 0.0305],
        [0.0327, 0.9373, 0.0300],
        [0.0317, 0.9372, 0.0311],
        [0.0320, 0.9354, 0.0326],
        [0.0330, 0.9370, 0.0300],
        [0.0333, 0.9350, 0.0318],
        [0.032

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")




VBox(children=(Label(value='0.024 MB of 0.024 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁███████
train/accuracy_epoch,▁
train/accuracy_step,▁▃▅▅▅▆▇▆▇▇▇▇▇██▇▆█▇█▇▇▇▇▇█▇▇██████▇██▇██
train/f1_epoch,▁
train/f1_step,▁▃▅▅▅▆▇▆▇▇▇▇▇██▇▆█▇█▇▇▇▇▇█▇▇██████▇██▇██
train/loss,█▇▇▇▇▇▆▆▆▆▅▅▅▄▄▄▄▃▃▃▃▃▃▃▃▂▂▂▂▂▁▂▁▁▂▁▁▂▂▁
trainer/global_step,▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▁▁▁▇▇▇▇▇▇██
val/accuracy_epoch,▁
val/accuracy_step,▁
val/f1_epoch,▁

0,1
epoch,1.0
train/accuracy_epoch,0.85382
train/accuracy_step,1.0
train/f1_epoch,0.85382
train/f1_step,1.0
train/loss,0.10217
trainer/global_step,41.0
val/accuracy_epoch,0.96591
val/accuracy_step,0.96591
val/f1_epoch,0.96591


**Results**

The model achieved a validation accuracy of 97.8% after training. We are concerned that it overfits on the question mark for classifying questions, which we need to investigate further.

# Model Persistence

## Persist model locally

For evaluation purposes, we want to persist the model locally.

In [34]:
model.save_pretrained('bert-classifier/model')
tokenizer.save_pretrained('bert-classifier/tokenizer')

('bert-classifier/tokenizer/tokenizer_config.json',
 'bert-classifier/tokenizer/special_tokens_map.json',
 'bert-classifier/tokenizer/vocab.txt',
 'bert-classifier/tokenizer/added_tokens.json')

In [35]:
# Load model locally
loaded_model = BertForSequenceClassification.from_pretrained('bert-classifier/model', num_labels=3)
loaded_tokenizer = BertTokenizer.from_pretrained('bert-classifier/tokenizer')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Persist model on HuggingFace

Our BERT classifier for the Data Chatbot is published here: https://huggingface.co/nlpchallenges/Text-Classification/tree/main

Before pushing to the hub for the first time, make sure to run `huggingface-cli login` in your terminal, and paste the API token from your HuggingFace profile.

In [20]:
model.push_to_hub("nlpchallenges/Text-Classification")
tokenizer.push_to_hub("nlpchallenges/Text-Classification")

pytorch_model.bin:   0%|          | 0.00/711M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/nlpchallenges/Text-Classification/commit/eb0bf2fb07564a5156f0f1c507e670a896368011', commit_message='Upload tokenizer', commit_description='', oid='eb0bf2fb07564a5156f0f1c507e670a896368011', pr_url=None, pr_revision=None, pr_num=None)

In [21]:
loaded_tokenizer = BertTokenizer.from_pretrained('nlpchallenges/Text-Classification')
loaded_model = BertForSequenceClassification.from_pretrained('nlpchallenges/Text-Classification')

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.45k [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/711M [00:00<?, ?B/s]

# Inference

In [22]:
# Inference
def classify_text(user_input, model, tokenizer, max_len=100):
    # Tokenize the user input
    inputs = tokenizer(
        user_input,
        None,
        add_special_tokens=True, # Add '[CLS]' and '[SEP]', default True
        max_length=max_len, # Maximum length to use by one of the truncation/padding parameters
        padding='max_length', # Pad to a maximum length specified with the argument max_length
        truncation=True, # Truncate to a maximum length specified with the argument max_length
    )

    ids = torch.tensor(inputs['input_ids'], dtype=torch.long).unsqueeze(0) # Indices of input sequence tokens in the vocabulary
    mask = torch.tensor(inputs['attention_mask'], dtype=torch.long).unsqueeze(0) # Mask to avoid performing attention on padding token indices
    
    
    # Get model output
    model.eval()
    with torch.no_grad():
        output = model(ids, attention_mask=mask)
    
    # Get predicted label index
    _, predicted_idx = torch.max(output.logits, 1)
    
    # Map index to label
    label_mapping = {0: 'harm', 1: 'question', 2: 'concern'}
    return label_mapping[predicted_idx.item()]


In [25]:
classify_text("Kannst du diese Frage beantworten?", loaded_model, loaded_tokenizer, max_len=None)

'question'