# DistilBERT base

In [None]:
#install transformers datasets evaluate
!pip install transformers datasets evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m48.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m43.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from

In [None]:
# Load the train / validation / test splits from local CSV files.
from datasets import load_dataset

split_files = {
    "train": "df_train.csv",
    "val": "df_valid.csv",
    "test": "df_test.csv",
}
dataset = load_dataset("csv", data_files=split_files)



  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
from transformers import AutoTokenizer

# Tokenizer that belongs to the cased DistilBERT checkpoint fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="distilbert-base-cased"
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
# Tokenize the raw text so it can be fed to the model.
def preprocess_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Every sequence is truncated or padded to exactly 128 tokens, and the
    tokenizer is asked to return PyTorch tensors.

    Args:
        examples: Mapping with a ``"text"`` entry, as passed in by
            ``dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for ``examples["text"]``.
    """
    texts = examples["text"]
    encoding = tokenizer(
        texts,
        padding="max_length",
        max_length=128,
        truncation=True,
        return_tensors="pt",
    )
    return encoding

In [None]:
# Label mappings in both directions: class index <-> class name.
id2label = dict(enumerate(("phrase", "passage", "multi")))
label2id = {name: idx for idx, name in id2label.items()}

In [None]:
import torch
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Use the GPU when one is available and fall back to the CPU otherwise, so this
# cell does not raise on a machine without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the pretrained cased DistilBERT weights with a 3-class sequence
# classification head, and attach the label mappings to the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-cased", num_labels=3, id2label=id2label, label2id=label2id
).to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier

In [None]:
# Tokenize every split; with batched=True the preprocess function receives
# a batch of rows per call instead of a single row.
tokenized_data = dataset.map(function=preprocess_function, batched=True)

Map:   0%|          | 0/2560 [00:00<?, ? examples/s]

Map:   0%|          | 0/640 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

In [None]:
#import the metrics
import evaluate
import numpy as np

accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Compute macro-averaged F1 and accuracy for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair.

    Returns:
        A dict with the keys ``"f1"`` and ``"accuracy"``.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted_ids = np.argmax(logits, axis=-1)

    f1_result = f1_metric.compute(
        predictions=predicted_ids, references=labels, average="macro"
    )
    accuracy_result = accuracy_metric.compute(
        predictions=predicted_ids, references=labels
    )

    return {"f1": f1_result["f1"], "accuracy": accuracy_result["accuracy"]}

In [None]:
# Hugging Face collator that pads the examples of a batch using the tokenizer.
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer)

### DistilBERT base training



In [None]:
# Release the GPU memory cached by PyTorch before training starts.
import torch

torch.cuda.empty_cache()

In [None]:
# Hyperparameters and checkpointing policy for fine-tuning.
training_config = dict(
    output_dir="output5",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    # Evaluate and save a checkpoint once per epoch, then reload the best
    # checkpoint when training finishes.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)
training_args = TrainingArguments(**training_config)

# Train on the "train" split and evaluate on the "val" split.
train_split = tokenized_data["train"]
val_split = tokenized_data["val"]

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=val_split,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Start fine-tuning; the returned TrainOutput is shown as the cell result.
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,No log,0.939682,0.505188,0.553125
2,No log,0.847531,0.586386,0.625
3,No log,0.854914,0.60252,0.642188
4,0.848900,0.904398,0.603775,0.632812
5,0.848900,0.938929,0.609635,0.642188


TrainOutput(global_step=800, training_loss=0.695194091796875, metrics={'train_runtime': 173.8211, 'train_samples_per_second': 73.639, 'train_steps_per_second': 4.602, 'total_flos': 423903235276800.0, 'train_loss': 0.695194091796875, 'epoch': 5.0})

### DistilBERT base inference


In [None]:
import torch

# Load the fine-tuned model from a checkpoint written during training.
# NOTE(review): the step number is hard-coded and matches the final global step
# of the recorded run (800); it is tied to this dataset size and epoch count.
# Confirm this is the checkpoint intended for the test evaluation.
model_path = "output5/checkpoint-800"

# Use the GPU when one is available and fall back to the CPU otherwise, so this
# cell does not raise on a machine without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

model = AutoModelForSequenceClassification.from_pretrained(
    model_path, num_labels=3, id2label=id2label, label2id=label2id
).to(device)

# A Trainer with default arguments is used only to run prediction on the test split.
test_trainer = Trainer(model)

raw_pred, labels, metrics = test_trainer.predict(tokenized_data["test"])

# Predicted class = index of the largest logit (same axis as in compute_metrics).
y_pred = np.argmax(raw_pred, axis=-1)

In [None]:
# Macro-averaged F1 of the model on the test split.
test_f1_result = f1_metric.compute(
    predictions=y_pred, references=labels, average="macro"
)
test_f1 = test_f1_result["f1"]
test_f1

0.6136566019433247

In [None]:
# Accuracy of the model on the test split.
test_accuracy_result = accuracy_metric.compute(predictions=y_pred, references=labels)
test_accuracy = test_accuracy_result["accuracy"]
test_accuracy

0.62625

References:

https://huggingface.co/distilbert-base-cased

https://huggingface.co/docs/transformers/tasks/sequence_classification


> @article{Sanh2019DistilBERTAD,\
  title={DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter},\
  author={Victor Sanh and Lysandre Debut and Julien Chaumond and Thomas Wolf},\
  journal={ArXiv},\
  year={2019},\
  volume={abs/1910.01108}
}




