In [55]:
!pip install -U accelerate
!pip install -U transformers
!pip install datasets



## Imports

In [56]:
import pandas as pd
from datasets import load_dataset

import re
from sklearn.preprocessing import MultiLabelBinarizer

import torch
from transformers import DistilBertTokenizer, AutoTokenizer
from transformers import DistilBertForSequenceClassification, AutoModelForSequenceClassification
from torch.utils.data import Dataset

from transformers import TrainingArguments, Trainer

import numpy as np
from sklearn.metrics import roc_auc_score, f1_score, hamming_loss
from transformers import EvalPrediction

## Loading the Dataset

In [57]:
dataset = load_dataset("lex_glue", "ecthr_b")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [58]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 9000
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 1000
    })
})

## Preprocessing

Transforming the data such that each sample has a 'text' which is a string and 'labels' which is a list of integers.

In [59]:
def clean_text(data):
    # data['text'] is a list of strings
    concatenated_text = ' '.join(data['text'])
    cleaned_text = re.sub(r"[^a-zA-Z0-9]", " ", concatenated_text)
    cleaned_text = cleaned_text.lower()
    return {
        'text': cleaned_text,
        'labels': data['labels']
    }

In [60]:
dataset = dataset.map(clean_text)

In [61]:
train_df = pd.DataFrame(dataset['train'])
val_df = pd.DataFrame(dataset['validation'])
test_df = pd.DataFrame(dataset['test'])

## Label Encoder

Since we have labels as list of integers, we transform them using MultiLabelBinarizer, to get a binary list denoting whether the class is present or not.

In [62]:
multilabel = MultiLabelBinarizer()

train_labels = multilabel.fit_transform(train_df['labels']).astype('float32')
val_labels = multilabel.fit_transform(val_df['labels']).astype('float32')

train_texts = train_df['text'].tolist()
val_texts = val_df['text'].tolist()

In [63]:
train_labels[1]

array([0., 0., 0., 1., 0., 0., 0., 0., 1., 1.], dtype=float32)

## Model

* Due to compute limitations and time constraints, I have chosen to use DistilBERT with fine-tuning, since it is lighter version of BERT and provides a good balance between performance and resource usage.

In [64]:
checkpoint = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)
model = DistilBertForSequenceClassification.from_pretrained(checkpoint, num_labels=len(train_labels[0]),
                                                            problem_type="multi_label_classification")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


* Transforming the dataset for using it with `Trainer` library from `transformers` module

In [65]:
class DatasetForTrainer(Dataset):
  def __init__(self, texts, labels, tokenizer, max_len=128):
    self.texts = texts
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    return len(self.texts)

  def __getitem__(self, idx):
    text = str(self.texts[idx])
    label = torch.tensor(self.labels[idx])

    encoding = self.tokenizer(text, truncation=True, padding="max_length", max_length=self.max_len, return_tensors='pt')

    return {
        'input_ids': encoding['input_ids'].flatten(),
        'attention_mask': encoding['attention_mask'].flatten(),
        'labels': label
    }

In [66]:
train_dataset = DatasetForTrainer(train_texts, train_labels, tokenizer)
val_dataset = DatasetForTrainer(val_texts, val_labels, tokenizer)

* Defining functions for evaluation to be used with `Trainer`

In [67]:
def multi_labels_metrics(predictions, labels, threshold=0.3):
  sigmoid = torch.nn.Sigmoid()
  probs = sigmoid(torch.Tensor(predictions))

  y_pred = np.zeros(probs.shape)
  y_pred[np.where(probs>=threshold)] = 1
  y_true = labels

  f1 = f1_score(y_true, y_pred, average = 'macro')
  roc_auc = roc_auc_score(y_true, y_pred, average = 'macro')
  hamming = hamming_loss(y_true, y_pred)

  metrics = {
      "roc_auc": roc_auc,
      "hamming_loss": hamming,
      "f1": f1
  }

  return metrics

def compute_metrics(p:EvalPrediction):
  preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions

  result = multi_labels_metrics(predictions=preds,
                                labels=p.label_ids)

  return result

* Training Arguments:

In [68]:
args = TrainingArguments(
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    output_dir = './results',
    num_train_epochs=2,
    save_steps=1000,
    save_total_limit=2
)

trainer = Trainer(model=model,
                  args=args,
                  train_dataset=train_dataset,
                  eval_dataset = val_dataset,
                  compute_metrics=compute_metrics)

## Training

In [46]:
trainer.train()

Step,Training Loss
500,0.2842
1000,0.2333
1500,0.2146
2000,0.2023
2500,0.1844
3000,0.1677
3500,0.1686
4000,0.1705
4500,0.1579


TrainOutput(global_step=4500, training_loss=0.19817021009657118, metrics={'train_runtime': 1019.387, 'train_samples_per_second': 17.658, 'train_steps_per_second': 4.414, 'total_flos': 596188339200000.0, 'train_loss': 0.19817021009657118, 'epoch': 2.0})

## Evaluation

In [47]:
trainer.evaluate()

{'eval_loss': 0.23042698204517365,
 'eval_roc_auc': 0.7426915591606724,
 'eval_hamming_loss': 0.1008,
 'eval_f1': 0.5783277534725749,
 'eval_runtime': 48.6429,
 'eval_samples_per_second': 20.558,
 'eval_steps_per_second': 5.14,
 'epoch': 2.0}

## Saving the model weights and the tokenizer

In [48]:
trainer.save_model("distilbert-finetuned-ecthrb-multi-label")

In [49]:
import pickle
with open("multi-label-binarizer.pkl", "wb") as f:
  pickle.dump(multilabel, f)

In [50]:
!zip -r distilbert.zip "/content/distilbert-finetuned-ecthrb-multi-label"

  adding: content/distilbert-finetuned-ecthrb-multi-label/ (stored 0%)
  adding: content/distilbert-finetuned-ecthrb-multi-label/training_args.bin (deflated 51%)
  adding: content/distilbert-finetuned-ecthrb-multi-label/config.json (deflated 56%)
  adding: content/distilbert-finetuned-ecthrb-multi-label/model.safetensors (deflated 8%)
