# Finetune a DistilBERT model on the WNUT 17 dataset

In [None]:
!pip install -qqq transformers datasets evaluate seqeval accelerate bitsandbytes loralib peft

# Load WNUT17 Dataset from the Datasets library

In [None]:
from datasets import load_dataset

wnut = load_dataset("wnut_17")

print(wnut)

In [None]:
# Look at the data
print(wnut['train'][0])

In [None]:
# Each number in ner_tags column represents an entity. We can convert the numbers to names to get labels
label_list = wnut["train"].features["ner_tags"].feature.names

label_list

The letter that prefixes each ner_tag indicates the token position of the entity:

B- indicates the beginning of an entity.
I- indicates a token is contained inside the same entity (for example, the State token is a part of an entity like Empire State Building).
O (the letter "O", for "outside") indicates the token doesn’t correspond to any entity.

In [None]:
from transformers import AutoTokenizer

checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
# Inspect one training instance before and after tokenization
import pprint

example = wnut["train"][0]
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])

# Printed in order: the raw example, its word list, the full tokenizer output,
# the input ids alone, and the sub-word tokens those ids decode to.
for shown in (
    example,
    example["tokens"],
    tokenized_input,
    tokenized_input["input_ids"],
    tokens,
):
  pprint.pprint(shown)

# Make a PEFT Model

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from peft import get_peft_model, LoraConfig, TaskType


# Create a config corresponding to the PEFT method
# LoRA configuration for token classification.
# The adapters are attached to DistilBERT's attention query/value projections
# (`q_lin` / `v_lin`, the names shown by `print(model)`), so the encoder itself
# is adapted. Targeting only "classifier" would put the low-rank update on the
# freshly initialised head and leave the encoder untouched.
# NOTE(review): with TaskType.TOKEN_CLS, PEFT is documented to keep the
# classification head trainable on its own - confirm with
# `model.print_trainable_parameters()`.
peft_config = LoraConfig(
    task_type=TaskType.TOKEN_CLS,
    target_modules=["q_lin", "v_lin"],
    inference_mode=False,  # we are going to train, not only run inference
    r=8,                   # rank of the low-rank update matrices
    lora_alpha=32,         # scaling factor applied to the LoRA update
    lora_dropout=0.1,
)

In [None]:
# Before training, build the id <-> label maps the model config expects.
# The tag names are listed once, in id order; both dictionaries are derived
# from that single list so they cannot drift apart.
id2label = dict(enumerate([
    "O",
    "B-corporation",
    "I-corporation",
    "B-creative-work",
    "I-creative-work",
    "B-group",
    "I-group",
    "B-location",
    "I-location",
    "B-person",
    "I-person",
    "B-product",
    "I-product",
]))
label2id = {tag: tag_id for tag_id, tag in id2label.items()}

In [None]:
# Load the base token-classification model, then wrap it with the PEFT adapter
base_model = AutoModelForTokenClassification.from_pretrained(
    checkpoint,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)
model = get_peft_model(base_model, peft_config)



In [None]:
print(model)

In [None]:
model.print_trainable_parameters()

# Preprocessing

The dataset consists of tweets, which need to be preprocessed before training. The preprocessing steps are:

1. Mapping all tokens to their corresponding word with the `word_ids` method

2. Assigning the label `-100` to the special tokens `[CLS]` and `[SEP]` so they're ignored by the PyTorch loss function

3. Labeling only the first token of a given word, and assigning `-100` to the other subtokens of the **same** word

In [None]:
# Define preprocess function
def tokenize_and_align_labels(examples):
  """Tokenize a batch of pre-split sentences and align NER tags to sub-tokens.

  Args:
    examples: a batch with a "tokens" column (lists of words) and a
      "ner_tags" column (one tag id per word).

  Returns:
    The output of the module-level `tokenizer` with an added "labels" entry:
    one list per sentence holding the word's tag id on the first sub-token of
    each word and -100 everywhere else.
  """
  encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

  aligned = []
  for batch_idx, word_tags in enumerate(examples["ner_tags"]):
    last_word = None
    row = []
    for word in encoded.word_ids(batch_index=batch_idx):
      # A position carries a real label only when it maps to a word (word id
      # is not None) and that word differs from the previous position's word,
      # i.e. it is the word's first sub-token.
      starts_new_word = word is not None and word != last_word
      row.append(word_tags[word] if starts_new_word else -100)
      last_word = word
    aligned.append(row)

  encoded["labels"] = aligned
  return encoded

In [None]:
# Apply preprocessing to every instance in the dataset
tokenized_wnut = wnut.map(tokenize_and_align_labels, batched = True,)

In [None]:
# Create a batch of examples, with dynamic padding. Use the appropriate collator function
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer = tokenizer)

# Evaluate
We need to monitor a metric during training to see how well the model is doing. We use the `evaluate` library to load the `seqeval` metric, which produces precision, recall, F1 score and accuracy.

With the evaluation metric loaded, we must define a function that takes the model predictions and labels and computes the metric. This is usually called the `compute_metrics` function.

In [None]:
import evaluate

seqeval = evaluate.load("seqeval")

In [None]:
import numpy as np

#labels = [label_list[i] for i in example[f"ner_tags"]]

def compute_metrics(p):
  """Turn raw predictions into seqeval precision / recall / F1 / accuracy.

  Args:
    p: a pair `(predictions, labels)`; the predicted class is taken as the
      argmax of `predictions` over axis 2.

  Returns:
    A dict with the keys "precision", "recall", "f1" and "accuracy", read
    from the overall scores reported by `seqeval`.
  """
  scores_per_class, gold = p
  predicted_ids = np.argmax(scores_per_class, axis=2)

  true_predictions = []
  true_labels = []
  for predicted_row, gold_row in zip(predicted_ids, gold):
    # Drop every position whose gold label is -100; the rest are mapped from
    # ids to tag names through `label_list`.
    kept = [
        (predicted_id, gold_id)
        for predicted_id, gold_id in zip(predicted_row, gold_row)
        if gold_id != -100
    ]
    true_predictions.append([label_list[predicted_id] for predicted_id, _ in kept])
    true_labels.append([label_list[gold_id] for _, gold_id in kept])

  results = seqeval.compute(predictions=true_predictions, references=true_labels)

  summary = {}
  summary["precision"] = results["overall_precision"]
  summary["recall"] = results["overall_recall"]
  summary["f1"] = results["overall_f1"]
  summary["accuracy"] = results["overall_accuracy"]
  return summary

# Train using the Trainer API

The main training steps are:

1. Define the training hyperparameters with the `TrainingArguments` class. At the end of each epoch, the Trainer will compute the evaluation metrics and save a training checkpoint.

2. Pass the training arguments to a Trainer function alongside the **model**, **dataset**, **tokenizer**, **data collator** and **compute metrics**

3. Call train() to finetune the model

In [None]:
from transformers import TrainingArguments, Trainer


In [None]:
# Hyperparameters: evaluate and checkpoint once per epoch, and reload the best
# checkpoint when training finishes.
training_args = TrainingArguments(
    output_dir="token_classification_wnut",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

# Per-epoch evaluation runs on the validation split. Because
# `load_best_model_at_end=True` picks a checkpoint from these evaluations,
# using the test split here would select the model on the test data.
# The held-out test split can still be scored after training with
# `trainer.evaluate(tokenized_wnut["test"])`.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_wnut["train"],
    eval_dataset=tokenized_wnut["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
trainer.save_model("token_classification_wnut_model")

# Inference



In [None]:
from peft import PeftModel, PeftConfig

# Directory written by `trainer.save_model(...)`
peft_model_id = "token_classification_wnut_model"
config = PeftConfig.from_pretrained(peft_model_id)

# Rebuild the base model from the checkpoint name recorded in the saved
# adapter config, so the adapter is always paired with the base model it was
# trained on rather than with whatever `checkpoint` currently holds.
model = AutoModelForTokenClassification.from_pretrained(
    config.base_model_name_or_path,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)
# Attach the trained adapter weights on top of the base model
model = PeftModel.from_pretrained(model, peft_model_id)

In [None]:
import torch

# Use the GPU when one is available; otherwise fall back to the CPU so the
# inference cells still run on a machine without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
model.eval()

In [None]:
text = "Michael Faraday was a scientist who lived in England in the 19th century."

In [None]:
# Raw PyTorch approach
from transformers import AutoTokenizer
import torch

## Tokenize the input text and move the tensors to the model's device
tokenizer = AutoTokenizer.from_pretrained("token_classification_wnut_model")
inputs = tokenizer(text, return_tensors="pt")
inputs = inputs.to(device)

## Forward pass without gradient tracking; keep only the logits
with torch.no_grad():
  outputs = model(**inputs)
logits = outputs.logits

In [None]:
# Pick the highest-scoring class id per token, then map the ids of the first
# (and only) sequence in the batch to their label names.
predictions = logits.argmax(dim=2)
predicted_token_class = [
    model.config.id2label[class_id] for class_id in predictions[0].tolist()
]
predicted_token_class