## Intro
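In this notebook we fine-tune the `bert-base-uncased` checkpoint on MRPC (the Microsoft Research Paraphrase Corpus, part of the GLUE benchmark), a sentence-pair task in which the model decides whether two sentences are paraphrases of each other.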

## Load Dataset

In [1]:
!pip install datasets

from datasets import load_dataset

raw_datasets = load_dataset("glue", "mrpc")
raw_datasets



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

## Tokenizer

In [2]:
from transformers import AutoTokenizer, DataCollatorWithPadding

# Checkpoint identifier; the training cell reuses it to load the matching model.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Tokenize the ``sentence1``/``sentence2`` columns of ``example`` as a pair.

    Truncation is enabled. No padding is requested here; a
    ``DataCollatorWithPadding`` is created in this cell for use at batch time.
    """
    first_sentences = example["sentence1"]
    second_sentences = example["sentence2"]
    return tokenizer(first_sentences, second_sentences, truncation=True)


# The collator only needs the tokenizer, so it can be built before the map.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Apply the tokenizer to every split, in batched mode.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

## Training

In [6]:
!pip install evaluate
import evaluate
import numpy as np
from transformers import TrainingArguments, AutoModelForSequenceClassification, Trainer

# Training configuration: outputs go to "test-trainer", experiment reporting is
# set to "none", and evaluation is scheduled once per epoch.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version before changing it.
training_args = TrainingArguments(
    output_dir="test-trainer",
    evaluation_strategy="epoch",
    report_to="none",
)

# Sequence-classification model built from the same checkpoint as the
# tokenizer, configured with two output labels.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)

# Evaluation metrics
# Load the GLUE/MRPC metric once, up front: loading it inside compute_metrics
# would repeat the lookup on every evaluation pass.
mrpc_metric = evaluate.load("glue", "mrpc")


def compute_metrics(eval_preds):
    """Score one evaluation pass with the GLUE/MRPC metric.

    Args:
        eval_preds: A ``(logits, labels)`` pair, unpacked in that order.

    Returns:
        Whatever ``mrpc_metric.compute`` returns for the predicted class ids
        and the reference labels (the training log reports accuracy and F1).
    """
    logits, labels = eval_preds
    # The predicted class is the index of the largest logit along the last axis.
    predictions = np.argmax(logits, axis=-1)
    return mrpc_metric.compute(predictions=predictions, references=labels)

# Trainer: wires together the model, its configuration, both dataset splits,
# the padding collator and the metric callback.
# NOTE(review): the captured output shows a warning raised at this call; newer
# transformers releases appear to deprecate `tokenizer=` in favour of
# `processing_class=` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

# Run fine-tuning; the bare last expression displays the returned TrainOutput.
trainer.train()



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.382747,0.838235,0.887372
2,0.528400,0.517034,0.855392,0.898799
3,0.291500,0.63515,0.865196,0.906303


TrainOutput(global_step=1377, training_loss=0.3427144218893612, metrics={'train_runtime': 225.0496, 'train_samples_per_second': 48.896, 'train_steps_per_second': 6.119, 'total_flos': 405114969714960.0, 'train_loss': 0.3427144218893612, 'epoch': 3.0})

## Predict

In [25]:
import torch
from torch.nn.functional import softmax

# Single definition of the save location, used for both saving and reloading.
output_dir = "test-trainer1"

# Persist the fine-tuned model together with the tokenizer files.
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

sentence1 = "This man went to the shop."
sentence2 = "The man visited the store today."

# Reload both artefacts from disk so the prediction below exercises exactly
# what was saved. Note that this rebinds `tokenizer` and `model`.
tokenizer = AutoTokenizer.from_pretrained(output_dir)
model = AutoModelForSequenceClassification.from_pretrained(output_dir)

inputs = tokenizer(sentence1, sentence2, return_tensors="pt")

# Inference only: disable gradient tracking so no autograd graph is built
# (the printed tensor therefore no longer carries a grad_fn).
with torch.no_grad():
    outputs = model(**inputs)

# Convert the logits into per-class probabilities.
print(softmax(outputs.logits, dim=-1))

tensor([[0.0077, 0.9923]], grad_fn=<SoftmaxBackward0>)


In [26]:
import torch  # this cell needs the torch namespace for no_grad()

# A second sentence pair, scored with the model and tokenizer already in scope.
sentence1 = "His pet went missing."
sentence2 = "His love of pets endures until this day."

inputs = tokenizer(sentence1, sentence2, return_tensors="pt")

# Inference only: disable gradient tracking so no autograd graph is built
# (the printed tensor therefore no longer carries a grad_fn).
with torch.no_grad():
    outputs = model(**inputs)

# Convert the logits into per-class probabilities.
print(softmax(outputs.logits, dim=-1))

tensor([[0.9980, 0.0020]], grad_fn=<SoftmaxBackward0>)
