<a href="https://colab.research.google.com/github/JoyeBright/transformers-course/blob/main/NINE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Writing our training loop in PyTorch

In [30]:
!pip install datasets evaluate transformers[sentencespice]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [31]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

In [32]:
# Name of the pretrained checkpoint; the same name is reused further down to load the model.
checkpoint = "bert-base-uncased"
# Load the tokenizer that belongs to this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [33]:
# Load the MRPC configuration of the GLUE benchmark (all available splits).
raw_datasets = load_dataset("glue", "mrpc")



  0%|          | 0/3 [00:00<?, ?it/s]

In [34]:
def tokenize_function(example):
    """Tokenize the two sentences of ``example`` together as one sentence pair.

    ``example`` is indexed with the keys ``"sentence1"`` and ``"sentence2"``.
    Both values are handed to the notebook-level ``tokenizer`` with
    ``truncation=True`` and the tokenizer's result is returned unchanged.
    """
    first, second = example["sentence1"], example["sentence2"]
    return tokenizer(first, second, truncation=True)

In [35]:
# Show the tokenizer's configuration (vocabulary size, maximum length, special tokens, ...).
print(tokenizer)

PreTrainedTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})


In [36]:
# Display the splits of the raw dataset together with their columns and row counts.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [37]:
# Tokenize every split. With batched=True, tokenize_function is called on
# batches of examples rather than on one example at a time.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)



  0%|          | 0/1 [00:00<?, ?ba/s]



In [38]:
# Display the splits again: the tokenizer's output columns have been added to the original ones.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [39]:
# Collator that uses the tokenizer to pad the examples of a batch when the batch is assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [40]:
# Display the collator's settings (padding strategy, tensor type that is returned, ...).
data_collator

DataCollatorWithPadding(tokenizer=PreTrainedTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}), padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='pt')

In [41]:
# We need to remove the columns corresponding to values the model does not expect (sentence1, sentence2 and idx)

In [42]:
# Drop the columns the model does not accept. Only columns that are still
# present are removed, so running this cell a second time is a no-op instead
# of raising because the columns are already gone.
columns_to_drop = [
    name
    for name in ("sentence1", "sentence2", "idx")
    if name in tokenized_datasets["train"].column_names
]
if columns_to_drop:
    tokenized_datasets = tokenized_datasets.remove_columns(columns_to_drop)

In [43]:
# Display the splits once more to confirm which columns are left after the removal.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [44]:
# Rename the column label to labels (because the model expects the argument to be named labels).

In [45]:
# The model expects the target column to be called "labels". Rename only while
# "label" is still present so that running this cell a second time does not raise.
if "label" in tokenized_datasets["train"].column_names:
    tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

In [46]:
# Set the format of the datasets so they return PyTorch tensors instead of lists.

In [47]:
# Change the output format so that reading from the datasets yields PyTorch tensors.
# The return value is not used; the datasets object itself is updated.
tokenized_datasets.set_format("torch")

In [48]:
# Spot-check the new format: the labels from index 3660 onwards of the training split.
tokenized_datasets['train']['labels'][3660:]

tensor([1, 1, 1, 1, 0, 1, 1, 0])

In [49]:
# List the columns that remain in the training split.
tokenized_datasets["train"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [50]:
# Our model only accepts: ["attention_mask", "input_ids", "labels", "token_type_ids"]

### Now that this is done, we can easily define our dataloaders:

In [51]:
from torch.utils.data import DataLoader

In [52]:
# Display the training split that is handed to the DataLoader below.
tokenized_datasets["train"]

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 3668
})

In [53]:
# Training batches: shuffled, 8 examples each, assembled by the padding collator.
trainer_dataloader = DataLoader(
    dataset=tokenized_datasets["train"],
    batch_size=8,
    shuffle=True,
    collate_fn=data_collator,
)

In [54]:
# Validation batches: 8 examples each, no shuffling requested, same padding collator.
eval_dataloader = DataLoader(
    dataset=tokenized_datasets["validation"],
    batch_size=8,
    collate_fn=data_collator,
)

In [55]:
# Inspect a batch to check that there are no mistakes

In [56]:
# Take a single batch from the training dataloader and show the shape of every tensor in it.
batch = next(iter(trainer_dataloader))
{name: tensor.shape for name, tensor in batch.items()}

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'labels': torch.Size([8]),
 'input_ids': torch.Size([8, 88]),
 'token_type_ids': torch.Size([8, 88]),
 'attention_mask': torch.Size([8, 88])}

In [57]:
# Data pre-processing finished, let's turn to the model.

In [58]:
from transformers import AutoModelForSequenceClassification

In [59]:
# Load the pretrained checkpoint into a sequence-classification model with two labels.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [61]:
# Sanity check: pass the inspected batch to the model, one keyword argument per batch entry.
outputs = model(**batch)

In [65]:
# All 🤗 Transformers models will return the loss when labels are provided, and we also get the logits (two for each input in our batch, so a tensor of size 8 x 2).

In [64]:
# Print the loss of the batch and the shape of the logits.
print(outputs.loss, outputs.logits.shape)

tensor(1.0055, grad_fn=<NllLossBackward0>) torch.Size([8, 2])


In [66]:
# for our training loop we would need two more things: an optimizer and a learning rate scheduler

In [71]:
from torch.optim import AdamW

In [72]:
# AdamW over all parameters of the model, starting from a learning rate of 5e-5.
optimizer = AdamW(model.parameters(), lr=5e-5)

In [73]:
#  a linear decay from the maximum value (5e-5) to 0

In [78]:
# to define this we need to know the number of training steps we will take, which is the number of epochs we want to run multiplied by the number of training batches – the length of our training dataloader

In [75]:
from transformers import get_scheduler

In [81]:
# The scheduler needs the total number of optimisation steps:
# one step per training batch, for every epoch.
num_epochs = 3
num_training_steps = len(trainer_dataloader) * num_epochs

# "linear" schedule attached to the optimizer, with no warm-up steps.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

1377


In [82]:
# using GPU

In [83]:
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # put the model on the selected device
device

device(type='cuda')

In [84]:
# Ready to train models ...

In [86]:
from tqdm.auto import tqdm

# Progress bar sized to the total number of optimisation steps over all epochs.
progress_bar = tqdm(range(num_training_steps))

# Put the model in training mode before the loop starts.
model.train()
for epoch in range(num_epochs):
    for batch in trainer_dataloader:
        # Move every tensor of the batch to the device the model was moved to.
        batch = {k: v.to(device) for k, v in batch.items()}
        # The batch includes "labels", so the model output carries the loss.
        outputs = model(**batch)
        loss = outputs.loss
        # Back-propagate the loss to compute the gradients.
        loss.backward()

        # Order matters: apply the gradients, advance the learning-rate
        # schedule, then clear the gradients before the next batch.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        # One tick per processed batch.
        progress_bar.update(1)

  0%|          | 0/1377 [00:00<?, ?it/s]

In [87]:
#evaluation loop

In [88]:
import evaluate

In [90]:
# Score the model on the validation dataloader with the GLUE/MRPC metric.
metric = evaluate.load("glue", "mrpc")
model.eval()  # switch the model to evaluation mode

# Gradients are not needed for evaluation, so the whole loop runs without them.
with torch.no_grad():
    for batch in eval_dataloader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)

        # The predicted class is the index of the largest logit of each example.
        logits = outputs.logits
        predictions = logits.argmax(dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

{'accuracy': 0.8627450980392157, 'f1': 0.9027777777777778}