In [None]:
!pip install transformers



In [None]:
from transformers import pipeline

# Text-classification pipeline built from the BERT checkpoint; it is benchmarked
# below under the default "BERT baseline" label.
bert_ckpt = "transformersbook/bert-base-uncased-finetuned-clinc"
pipe = pipeline(task="text-classification", model=bert_ckpt)

In [None]:
# Example utterance; it is also the text used for latency measurements.
query = (
    "Hey, I'd like to rent a vehicle from Nov 1st to Nov 15th in\n"
    "Paris and I need a 15 passenger van"
)
pipe(query)

[{'label': 'car_rental', 'score': 0.5490034222602844}]

In [None]:
import tempfile
from pathlib import Path
from time import perf_counter

import numpy as np
import torch

In [None]:
class PerformanceBenchmark:
  """Collect size, latency, and accuracy figures for a classification pipeline.

  Args:
    pipeline: Callable that takes a text and returns a list of dicts with a
      "label" key; must expose the underlying module as `.model`.
    dataset: Iterable of examples with "text" and "intent" keys.
    optim_type: Name under which the metrics are stored by `run_benchmark`.
  """

  # Text used by `time_pipeline` when the caller does not supply one.
  default_query = ("Hey, I'd like to rent a vehicle from Nov 1st to Nov 15th in\n"
                   "Paris and I need a 15 passenger van")

  def __init__(self, pipeline, dataset, optim_type="BERT baseline"):
    self.pipeline = pipeline
    self.dataset = dataset
    self.optim_type = optim_type

  def compute_accuracy(self):
    """Run the pipeline over every example and report accuracy.

    Relies on two notebook-level names being defined before the call:
    `intents` (used to map predicted label strings to ids via `str2int`) and
    `accuracy_score` (a metric object exposing `compute`).

    Returns:
      The dict returned by `accuracy_score.compute`; its "accuracy" entry is
      printed.
    """
    preds, labels = [], []
    for example in self.dataset:
      pred = self.pipeline(example["text"])[0]["label"]
      label = example["intent"]
      preds.append(intents.str2int(pred))
      labels.append(label)
    accuracy = accuracy_score.compute(predictions=preds, references=labels)
    print(f"Accuracy on test set - {accuracy['accuracy']:.3f}")
    return accuracy

  def compute_size(self):
    """Serialize the model's state dict and report the file size in megabytes.

    The state dict is written into a private temporary directory so that an
    existing `model.pt` in the working directory is never overwritten or
    deleted, and the temporary file is removed even if saving fails.

    Returns:
      Dict with a single "size_mb" entry.
    """
    state_dict = self.pipeline.model.state_dict()
    with tempfile.TemporaryDirectory() as tmp_dir:
      tmp_path = Path(tmp_dir) / "model.pt"
      torch.save(state_dict, tmp_path)
      # Calculate size in megabytes
      size_mb = tmp_path.stat().st_size / (1024 * 1024)
    print(f"Model size (MB) - {size_mb:.2f}")
    return {"size_mb": size_mb}

  def time_pipeline(self, query=None, num_warmup=10, num_runs=100):
    """Measure the wall-clock latency of single-query pipeline calls.

    Args:
      query: Text to send to the pipeline; defaults to `default_query`.
      num_warmup: Number of untimed calls made before measuring.
      num_runs: Number of timed calls; must be at least 1.

    Returns:
      Dict with "time_avg_ms" and "time_std_ms" (mean and standard deviation
      of the per-call latency, in milliseconds).

    Raises:
      ValueError: If `num_runs` is smaller than 1.
    """
    if num_runs < 1:
      raise ValueError("num_runs must be at least 1")
    if query is None:
      query = self.default_query
    latencies = []
    # Warmup
    for _ in range(num_warmup):
      _ = self.pipeline(query)
    # Timed run
    for _ in range(num_runs):
      start_time = perf_counter()
      _ = self.pipeline(query)
      latency = perf_counter() - start_time
      latencies.append(latency)
    # Compute run statistics
    time_avg_ms = 1000 * np.mean(latencies)
    time_std_ms = 1000 * np.std(latencies)
    # The backslash is escaped explicitly; the printed text is still "+\-".
    print(f"Average latency (ms) - {time_avg_ms:.2f} +\\- {time_std_ms:.2f}")
    return {"time_avg_ms": time_avg_ms, "time_std_ms": time_std_ms}

  def run_benchmark(self):
    """Run size, latency, and accuracy measurements in that order.

    Returns:
      Dict mapping `optim_type` to the merged results of the three steps.
    """
    metrics = {}
    metrics[self.optim_type] = self.compute_size()
    metrics[self.optim_type].update(self.time_pipeline())
    metrics[self.optim_type].update(self.compute_accuracy())
    return metrics

In [None]:
!pip install datasets



In [None]:
from datasets import load_dataset

# Load the "plus" configuration of the "clinc_oos" dataset.
clinc = load_dataset(path="clinc_oos", name="plus")

In [None]:
# Inspect the test example at index 42 to see the raw record layout.
sample = clinc["test"][42]
sample

{'text': 'transfer $100 from my checking to saving account', 'intent': 133}

In [None]:
# `intents` is the "intent" feature of the test split; it is used here (and by
# PerformanceBenchmark.compute_accuracy) to convert between label ids and names.
intents = clinc["test"].features["intent"]
intents.int2str(sample["intent"])

'transfer'

In [None]:
from datasets import load_metric
# NOTE(review): the warning fragment echoed in this cell's output suggests that
# `load_metric` is deprecated — confirm against the installed `datasets` version.
accuracy_score = load_metric("accuracy")

  accuracy_score = load_metric("accuracy")


In [None]:
# Show the (name, tensor) entry at position 42 of the model's state dict.
list(pipe.model.state_dict().items())[42]

('bert.encoder.layer.2.attention.self.value.bias',
 tensor([-2.7834e-02,  4.9434e-02,  8.3551e-02,  4.1092e-02,  6.0157e-01,
          1.1774e-01, -5.2112e-02, -6.5143e-02, -2.9358e-02, -4.2250e-02,
          7.9177e-02,  8.0409e-02,  2.9921e-03,  1.7816e-01, -5.0480e-02,
         -1.5634e-01, -2.1707e-02,  1.4381e-02,  2.5132e-02, -2.4110e-02,
         -1.9183e-01, -7.8657e-02,  5.0709e-02,  3.3632e-02, -3.1946e-02,
          1.1616e-01,  9.2720e-02, -1.1787e-01,  2.3233e-01, -1.2678e-02,
         -1.3138e-01, -4.0024e-02,  7.4823e-02, -5.4148e-02, -1.5184e-01,
         -7.4407e-02,  1.1559e-01,  8.2729e-02, -1.3787e-01,  8.3528e-02,
          1.2154e-01,  1.6880e-02, -5.6629e-02, -3.9295e-02,  5.3725e-02,
          6.8602e-02, -1.1294e-01,  4.4001e-02, -2.5884e-01,  1.6767e-01,
          1.8316e-01,  5.6272e-02, -3.6874e-02, -2.7938e-02, -9.3204e-02,
         -7.5239e-03,  4.1141e-02, -1.1542e-02, -9.9749e-02, -3.0910e-02,
          4.1398e-02, -4.4389e-02, -2.6279e-02,  7.2100e-02, 

In [None]:
# Benchmark the pipeline on the test split; results are stored under the
# default "BERT baseline" key.
pb = PerformanceBenchmark(pipeline=pipe, dataset=clinc["test"])
perf_metrics = pb.run_benchmark()

Model size (MB) - 418.15
Average latency (ms) - 216.50 +\- 96.05
Accuracy on test set - 0.867


## Teacher-Student Model

In [None]:
from transformers import TrainingArguments
class DistillationTrainingArguments(TrainingArguments):
  """TrainingArguments extended with two knowledge-distillation settings.

  Args:
    alpha: Stored as `self.alpha`; DistillationTrainer uses it to weight the
      student's cross-entropy loss against the distillation loss.
    temperature: Stored as `self.temperature`; DistillationTrainer divides the
      logits by it before the softmax.

  All other positional and keyword arguments are forwarded to
  `TrainingArguments`.
  """

  def __init__(self, *args, alpha=0.5, temperature=2.0, **kwargs):
    # Let the base class consume every standard argument first.
    super().__init__(*args, **kwargs)
    self.temperature = temperature
    self.alpha = alpha

In [None]:
import torch.nn as nn
import torch.nn.functional as F
from transformers import Trainer
class DistillationTrainer(Trainer):
  """Trainer whose loss mixes student cross-entropy with a distillation term.

  The loss is `alpha * CE + (1 - alpha) * T**2 * KL`, where the KL divergence
  is computed between the temperature-softened student and teacher
  distributions, and `alpha` / `T` are read from `self.args.alpha` and
  `self.args.temperature`.

  Args:
    teacher_model: Model called with the same inputs as the student to produce
      the target logits. It is put into eval mode when provided.
  """

  def __init__(self, *args, teacher_model=None, **kwargs):
    super().__init__(*args, **kwargs)
    self.teacher_model = teacher_model
    if self.teacher_model is not None:
      # The teacher only provides targets; disable dropout and similar layers.
      self.teacher_model.eval()

  def compute_loss(self, model, inputs, return_outputs=False,
                   num_items_in_batch=None):
    """Compute the weighted sum of cross-entropy and distillation losses.

    Args:
      model: The student model being trained.
      inputs: Keyword arguments passed to both student and teacher.
      return_outputs: When true, also return the student's outputs.
      num_items_in_batch: Accepted for compatibility with Trainer versions
        that pass this keyword; it is not used here.

    Returns:
      The loss, or `(loss, student_outputs)` when `return_outputs` is true.

    Raises:
      ValueError: If no teacher model was supplied.
    """
    if self.teacher_model is None:
      raise ValueError("DistillationTrainer requires a teacher_model")
    outputs_stu = model(**inputs)
    # Extract cross-entropy loss and logits from student
    loss_ce = outputs_stu.loss
    logits_stu = outputs_stu.logits
    # Extract logits from teacher; no gradients are needed for the targets
    with torch.no_grad():
      outputs_tea = self.teacher_model(**inputs)
      logits_tea = outputs_tea.logits
    # Soften probabilities and compute distillation loss
    temperature = self.args.temperature
    loss_kd = temperature ** 2 * F.kl_div(
        F.log_softmax(logits_stu / temperature, dim=-1),
        F.softmax(logits_tea / temperature, dim=-1),
        reduction="batchmean")
    # Return weighted student loss
    loss = self.args.alpha * loss_ce + (1. - self.args.alpha) * loss_kd
    return (loss, outputs_stu) if return_outputs else loss

In [None]:
from transformers import AutoTokenizer

student_ckpt = "distilbert-base-uncased"
student_tokenizer = AutoTokenizer.from_pretrained(student_ckpt)


def tokenize_text(batch):
  """Tokenize the "text" entries of a batch with truncation enabled."""
  return student_tokenizer(batch["text"], truncation=True)


# Tokenize the dataset in batches, drop the raw "text" column, and rename the
# "intent" column to "labels".
clinc_enc = (
    clinc
    .map(tokenize_text, batched=True, remove_columns=["text"])
    .rename_column("intent", "labels")
)

Map:   0%|          | 0/3100 [00:00<?, ? examples/s]

In [None]:
def compute_metrics(pred):
  """Return the accuracy of the arg-max predictions against the labels.

  `pred` is unpacked into (predictions, labels); the predicted class is the
  arg-max along axis 1. Uses the notebook-level `accuracy_score` metric.
  """
  scores, references = pred
  predicted_ids = np.argmax(scores, axis=1)
  return accuracy_score.compute(predictions=predicted_ids,
                                references=references)

In [None]:
!pip install accelerate -U



In [None]:
batch_size = 48
finetuned_ckpt = "distilbert-base-uncased-finetuned-clinc"

# With alpha=1 the distillation term is weighted by (1 - alpha) = 0, so this
# run trains on the cross-entropy loss alone.
student_training_args = DistillationTrainingArguments(
    output_dir=finetuned_ckpt,
    evaluation_strategy="epoch",
    num_train_epochs=5,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    alpha=1,
    weight_decay=0.01,
    push_to_hub=False,
)

In [None]:
# Reuse the label mappings stored in the config of the model behind `pipe`.
id2label, label2id = pipe.model.config.id2label, pipe.model.config.label2id

In [None]:
from transformers import AutoConfig

# Student configuration with the same number of classes and label mappings.
num_labels = intents.num_classes
student_config = AutoConfig.from_pretrained(
    student_ckpt,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

In [None]:
import torch
from transformers import AutoModelForSequenceClassification

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def student_init():
  """Load the student checkpoint with `student_config` and move it to `device`."""
  student = AutoModelForSequenceClassification.from_pretrained(
      student_ckpt, config=student_config)
  return student.to(device)

In [None]:
# Load the teacher classifier and place it on the same device as the student.
teacher_ckpt = "transformersbook/bert-base-uncased-finetuned-clinc"
teacher_model = AutoModelForSequenceClassification.from_pretrained(
    teacher_ckpt, num_labels=num_labels)
teacher_model = teacher_model.to(device)

In [None]:
# Train the student with the arguments defined above, evaluating on the
# validation split.
distilbert_trainer = DistillationTrainer(
    model_init=student_init,
    teacher_model=teacher_model,
    args=student_training_args,
    train_dataset=clinc_enc["train"],
    eval_dataset=clinc_enc["validation"],
    compute_metrics=compute_metrics,
    tokenizer=student_tokenizer,
)
distilbert_trainer.train()

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,3.276314,0.728387
2,3.782500,1.862485,0.836452
3,3.782500,1.15131,0.898387
4,1.685900,0.854017,0.913548
5,0.898400,0.772481,0.916452


TrainOutput(global_step=1590, training_loss=2.0453168521137357, metrics={'train_runtime': 280.4184, 'train_samples_per_second': 271.915, 'train_steps_per_second': 5.67, 'total_flos': 413013830824140.0, 'train_loss': 2.0453168521137357, 'epoch': 5.0})

## Finding good hyperparameters

In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-3.4.0-py3-none-any.whl (409 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m409.6/409.6 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.12.1-py3-none-any.whl (226 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.8/226.8 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.2.4 alembic-1.12.1 colorlog-6.7.0 optuna-3.4.0


In [25]:
import optuna

def hp_space(trial):
  """Sample one hyperparameter configuration from `trial`.

  Draws an integer number of epochs in [5, 10], a float `alpha` in [0, 1],
  and an integer `temperature` in [2, 20], in that order.
  """
  epochs = trial.suggest_int("num_train_epochs", 5, 10)
  alpha = trial.suggest_float("alpha", 0, 1)
  temperature = trial.suggest_int("temperature", 2, 20)
  return {
      "num_train_epochs": epochs,
      "alpha": alpha,
      "temperature": temperature,
  }

# Run five trials over `hp_space`, maximizing the search objective.
best_run = distilbert_trainer.hyperparameter_search(
    hp_space=hp_space,
    n_trials=5,
    direction="maximize",
)

[I 2023-11-01 14:03:00,322] A new study created in memory with name: no-name-88a94e2a-448e-4444-a88f-3db3ee1941fe
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-unc

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.197273,0.576452
2,0.312500,0.098732,0.829677
3,0.312500,0.067657,0.883548
4,0.113400,0.053121,0.898065
5,0.075300,0.045708,0.909677
6,0.075300,0.042083,0.911613
7,0.062400,0.04062,0.912903


[I 2023-11-01 14:09:42,731] Trial 0 finished with value: 0.9129032258064517 and parameters: {'num_train_epochs': 7, 'alpha': 0.8538964273872496, 'temperature': 18}. Best is trial 0 with value: 0.9129032258064517.
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Distil

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.26131,0.633226
2,0.406500,0.121596,0.83
3,0.406500,0.078098,0.884839
4,0.140300,0.059943,0.899032
5,0.089100,0.052137,0.90871
6,0.089100,0.049537,0.909677


[I 2023-11-01 14:15:26,461] Trial 1 finished with value: 0.9096774193548387 and parameters: {'num_train_epochs': 6, 'alpha': 0.7950535033597458, 'temperature': 4}. Best is trial 0 with value: 0.9129032258064517.
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilB

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.204677,0.598065
2,0.326100,0.09847,0.84129
3,0.326100,0.065167,0.893548
4,0.113400,0.049098,0.902903
5,0.071700,0.040793,0.914516
6,0.071700,0.035848,0.916774
7,0.056500,0.032822,0.918387
8,0.049400,0.031226,0.921613
9,0.049400,0.03062,0.92


[I 2023-11-01 14:24:05,392] Trial 2 finished with value: 0.92 and parameters: {'num_train_epochs': 9, 'alpha': 0.5168671040931857, 'temperature': 10}. Best is trial 2 with value: 0.92.
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassificatio

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.200267,0.587097
2,0.318500,0.098234,0.837097
3,0.318500,0.066124,0.889032
4,0.113000,0.050764,0.900968
5,0.073000,0.042783,0.912581
6,0.073000,0.038308,0.913871
7,0.058800,0.03569,0.915161
8,0.052800,0.03479,0.915484


[I 2023-11-01 14:32:12,203] Trial 3 finished with value: 0.915483870967742 and parameters: {'num_train_epochs': 8, 'alpha': 0.2680931095389487, 'temperature': 13}. Best is trial 2 with value: 0.92.
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequence

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.193742,0.584194
2,0.309500,0.094732,0.834194
3,0.309500,0.063827,0.89129
4,0.109000,0.048616,0.901613
5,0.069900,0.040654,0.913226
6,0.069900,0.035858,0.916774


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.193742,0.584194
2,0.309500,0.094732,0.834194
3,0.309500,0.063827,0.89129
4,0.109000,0.048616,0.901613
5,0.069900,0.040654,0.913226
6,0.069900,0.035858,0.916774
7,0.055500,0.032926,0.916774
8,0.048700,0.031333,0.919677
9,0.048700,0.030737,0.919677


[I 2023-11-01 14:41:22,458] Trial 4 finished with value: 0.9196774193548387 and parameters: {'num_train_epochs': 9, 'alpha': 0.30490462195943946, 'temperature': 19}. Best is trial 2 with value: 0.92.


In [26]:
# Show the best run returned by the hyperparameter search.
print(best_run)

BestRun(run_id='2', objective=0.92, hyperparameters={'num_train_epochs': 9, 'alpha': 0.5168671040931857, 'temperature': 10}, run_summary=None)


In [27]:
# Copy the best run's hyperparameters onto the existing training arguments.
for hp_name, hp_value in best_run.hyperparameters.items():
  setattr(student_training_args, hp_name, hp_value)

# Store the distilled model under its own output directory.
distilled_ckpt = "distilbert-base-uncased-distilled-clinc"
student_training_args.output_dir = distilled_ckpt

# Train a fresh student with the updated arguments.
distil_trainer = DistillationTrainer(
    model_init=student_init,
    teacher_model=teacher_model,
    args=student_training_args,
    train_dataset=clinc_enc["train"],
    eval_dataset=clinc_enc["validation"],
    compute_metrics=compute_metrics,
    tokenizer=student_tokenizer,
)

distil_trainer.train()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'classifier.bias', 'pre_classifier.we

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.778902,0.739355
2,2.101600,0.945662,0.868065
3,2.101600,0.549973,0.918387
4,0.848200,0.378855,0.93871
5,0.384900,0.299396,0.941935
6,0.384900,0.262399,0.945484
7,0.236200,0.246662,0.945806
8,0.183800,0.235547,0.945806
9,0.183800,0.233583,0.945806


TrainOutput(global_step=2862, training_loss=0.6771755138533171, metrics={'train_runtime': 556.7262, 'train_samples_per_second': 246.531, 'train_steps_per_second': 5.141, 'total_flos': 743498049913920.0, 'train_loss': 0.6771755138533171, 'epoch': 9.0})

## Quantization

In [28]:
from torch.quantization import quantize_dynamic

model_ckpt = "transformersbook/distilbert-base-uncased-distilled-clinc"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
# Load the model on the CPU, then dynamically quantize its nn.Linear modules
# to qint8.
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt)
model = model.to("cpu")
model_quantized = quantize_dynamic(model, {nn.Linear}, dtype=torch.qint8)

Downloading (…)okenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/8.21k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [29]:
# Benchmark the distilled checkpoint and add its metrics under "Distillation".
distilled_ckpt = "transformersbook/distilbert-base-uncased-distilled-clinc"
pipe = pipeline(task="text-classification", model=distilled_ckpt)
optim_type = "Distillation"
pb = PerformanceBenchmark(pipeline=pipe, dataset=clinc["test"],
                          optim_type=optim_type)
perf_metrics.update(pb.run_benchmark())

Model size (MB) - 255.88
Average latency (ms) - 67.81 +\- 15.51
Accuracy on test set - 0.868


In [30]:
# plot_metrics(perf_metrics, optim_type)

In [31]:
# Benchmark the quantized model and add its metrics under
# "Distillation + quantization".
pipe = pipeline(
    task="text-classification",
    model=model_quantized,
    tokenizer=tokenizer,
)
optim_type = "Distillation + quantization"
pb = PerformanceBenchmark(pipeline=pipe, dataset=clinc["test"],
                          optim_type=optim_type)
perf_metrics.update(pb.run_benchmark())

Model size (MB) - 132.39
Average latency (ms) - 28.39 +\- 1.64
Accuracy on test set - 0.876
