## Fine-Tuning a Pretrained Model With `Trainer`


#### Preparing the foundations

In [None]:
# Load the tokenizer and the dataset used for training.
from transformers import AutoTokenizer, DataCollatorWithPadding
from datasets import load_dataset

# Name of the pretrained checkpoint shared by the tokenizer and the model.
checkpoint = "bert-base-uncased"

# Fetch the GLUE/MRPC dataset and the tokenizer that matches the checkpoint.
dataset = load_dataset(path="glue", name="mrpc")
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
# Preprocessing the data
def tokenize(example):
    """Tokenize an MRPC example (or batch of examples) as a sentence pair.

    Args:
        example: A mapping with "sentence1" and "sentence2" entries. When
            used with ``Dataset.map(..., batched=True)`` each entry is a
            list of strings rather than a single string.

    Returns:
        The encoding produced by the module-level ``tokenizer`` for the
        (sentence1, sentence2) pairs, with truncation enabled.
    """
    # MRPC is a paraphrase task: the second segment must be "sentence2".
    # Encoding "sentence1" against itself would hide the second sentence
    # from the model entirely.
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

# Apply `tokenize` to every split, processing the examples in batches.
tokenized_dataset = dataset.map(tokenize, batched=True)

# Collator that pads each batch using the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Prepare the Hyperparameters for training and evaluation
from transformers import TrainingArguments, AutoModelForSequenceClassification

# Default hyperparameters; only the output directory is specified.
training_args = TrainingArguments(output_dir="test-trainer-1")

# Sequence-classification model with a two-label head, built from the checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)

#### Training

In [None]:
from transformers import Trainer

# Wire the model, hyperparameters, data splits, collator and tokenizer together.
# NOTE(review): the `tokenizer` keyword may be renamed in newer transformers
# releases -- confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# Run the fine-tuning loop.
trainer.train()

#### Evaluation

In [None]:
import numpy as np
import evaluate

# Load the GLUE/MRPC metric used to score the predictions.
metric = evaluate.load("glue", "mrpc")

# Run inference on the validation split; `predictions.predictions` holds the logits.
predictions = trainer.predict(tokenized_dataset["validation"])
# Pick the highest-scoring class along the last axis.
preds = np.argmax(predictions.predictions, axis=-1)

# Left as a bare expression so the notebook displays the resulting scores.
metric.compute(references=predictions.label_ids, predictions=preds)

In [None]:
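# Alternatively, pass a `compute_metrics` function to the Trainer so the
# metrics are computed at every evaluation during training.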
def compute_metrics(eval_preds):
    """Score a (logits, labels) pair with the GLUE/MRPC metric.

    Args:
        eval_preds: A two-item iterable unpacked as ``(logits, labels)``.

    Returns:
        Whatever ``metric.compute`` returns for the argmax of the logits
        (taken over the last axis) compared against the labels.
    """
    glue_metric = evaluate.load("glue", "mrpc")
    logits, labels = eval_preds
    return glue_metric.compute(
        predictions=np.argmax(logits, axis=-1),
        references=labels,
    )

# Same setup as before, but evaluate at the end of every epoch.
# NOTE(review): `evaluation_strategy` and the Trainer's `tokenizer` keyword may
# be renamed in newer transformers releases -- confirm against the installed
# version.
training_args = TrainingArguments(
    output_dir="test-trainer-2",
    evaluation_strategy="epoch",
)

# Start again from the pretrained checkpoint with a new model instance.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Run the fine-tuning loop; metrics come from `compute_metrics`.
trainer.train()