# A full training

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

Inspect the dataset without downloading it:

In [1]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [2]:
from datasets import load_dataset_builder

dataset_builder = load_dataset_builder("glue", "mrpc")

print(f"Complete info object: {dataset_builder.info}")
print(f"Description: {dataset_builder.info.description}")
print(f"Features: {dataset_builder.info.features}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

Complete info object: DatasetInfo(description='', citation='', homepage='', license='', features={'sentence1': Value('string'), 'sentence2': Value('string'), 'label': ClassLabel(names=['not_equivalent', 'equivalent']), 'idx': Value('int32')}, post_processed=None, supervised_keys=None, builder_name='parquet', dataset_name='glue', config_name='mrpc', version=0.0.0, splits={'train': SplitInfo(name='train', num_bytes=943843, num_examples=3668, shard_lengths=None, dataset_name=None), 'validation': SplitInfo(name='validation', num_bytes=105879, num_examples=408, shard_lengths=None, dataset_name=None), 'test': SplitInfo(name='test', num_bytes=442410, num_examples=1725, shard_lengths=None, dataset_name=None)}, download_checksums=None, download_size=1033400, post_processing_size=None, dataset_size=1492132, size_in_bytes=None)
Description: 
Features: {'sentence1': Value('string'), 'sentence2': Value('string'), 'label': ClassLabel(names=['not_equivalent', 'equivalent']), 'idx': Value('int32')}


Load the dataset:

In [3]:
from datasets import DatasetDict, load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

raw_datasets: DatasetDict = load_dataset("glue", "mrpc")

mrpc/train-00000-of-00001.parquet:   0%|          | 0.00/649k [00:00<?, ?B/s]

mrpc/validation-00000-of-00001.parquet:   0%|          | 0.00/75.7k [00:00<?, ?B/s]

mrpc/test-00000-of-00001.parquet:   0%|          | 0.00/308k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

In [4]:
print("Data before tokenization:")
example_data = next(iter(raw_datasets["train"]))
print(raw_datasets["train"][0].keys())
print(f"Sentence1: {raw_datasets['train'][0]['sentence1']}")
print(f"Sentence2: {raw_datasets['train'][0]['sentence2']}")
print(f"Label: {raw_datasets['train'][0]['label']}")
print(f"Index: {raw_datasets['train'][0]['idx']}")

Data before tokenization:
dict_keys(['sentence1', 'sentence2', 'label', 'idx'])
Sentence1: Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .
Sentence2: Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .
Label: 1
Index: 0


In [5]:
# Name of the pretrained model
checkpoint = "bert-base-uncased"

# Download pretrained tokenizer for the bert-base-uncased model
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True, padding=True)


# Tokenize the dataset in batches
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

In [6]:
print("Data after tokenization:")
print(tokenized_datasets["train"].column_names)

# The original sentences (untouched)
print(f"Sentence1: {tokenized_datasets['train'][0]['sentence1']}")
print(f"Sentence2: {tokenized_datasets['train'][0]['sentence2']}")

# Is paraphrase (1) or not (0)
print(f"Label: {tokenized_datasets['train'][0]['label']}")

# Just the ID of the dataset (not important for training)
print(f"Index: {tokenized_datasets['train'][0]['idx']}")

# ID of word at index in BERT vocabular (e.g. 102 is [SEP])
print(f"Input IDs: {tokenized_datasets['train'][0]['input_ids']}")

# Show what word belongs to sentence1 (0) and to sentence2 (1)
print(f"Token Type IDs: {tokenized_datasets['train'][0]['token_type_ids']}")

# Information about what index is word (1) and what is padding (0)
print(f"Attention Mask: {tokenized_datasets['train'][0]['attention_mask']}")

Data after tokenization:
['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask']
Sentence1: Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .
Sentence2: Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .
Label: 1
Index: 0
Input IDs: [101, 2572, 3217, 5831, 5496, 2010, 2567, 1010, 3183, 2002, 2170, 1000, 1996, 7409, 1000, 1010, 1997, 9969, 4487, 23809, 3436, 2010, 3350, 1012, 102, 7727, 2000, 2032, 2004, 2069, 1000, 1996, 7409, 1000, 1010, 2572, 3217, 5831, 5496, 2010, 2567, 1997, 9969, 4487, 23809, 3436, 2010, 3350, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Token Type IDs: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [7]:
# Remove unnecessary columns
tokenized_datasets = tokenized_datasets.remove_columns(["sentence1", "sentence2", "idx"])
# Huggingface trainer API expects labels column
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
# Return torch.tensors instead of lists
tokenized_datasets.set_format("torch")

print(tokenized_datasets["train"].column_names)

['labels', 'input_ids', 'token_type_ids', 'attention_mask']


In [8]:
from torch.utils.data import DataLoader

# Create dataloader for training and validation
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    batch_size=8,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"],
    batch_size=8,
    collate_fn=data_collator,
)

In [9]:
batch = next(iter(train_dataloader))
print({k: v.shape for k, v in batch.items()})

{'labels': torch.Size([8]), 'input_ids': torch.Size([8, 100]), 'token_type_ids': torch.Size([8, 100]), 'attention_mask': torch.Size([8, 100])}


In [10]:
from transformers import AutoModelForSequenceClassification

# Load pretrained BERT Model from Huggingface Hub and set classification head output to 2 classes
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Test if preprocessing was correctly and model can handel the data
outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)

tensor(0.5686, grad_fn=<NllLossBackward0>) torch.Size([8, 2])


In [12]:
from torch.optim import AdamW

# Use adamW as optimizer (similar to Adam but fixed weight decay problem) (also default of Huggingface Trainer API)
optimizer = AdamW(model.parameters(), lr=5e-5)

In [13]:
from transformers import get_scheduler

num_epochs = 3

# 1 step = training 1 batch
num_training_steps = num_epochs * len(train_dataloader)

# Regulates how the learning rate is adjusted during training
lr_scheduler = get_scheduler(
    "linear",  # Learn rate decreases linearly during training
    optimizer=optimizer,  # Use previously defined AdamW optimizer
    num_warmup_steps=0,  # Increases learning rate in defined steps from 0 to lr (used so that initial weight updates are not too big and destroy the model)
    num_training_steps=num_training_steps,
)

print(num_training_steps)

1377


In [14]:
import torch


# Use fastest available device
def get_device() -> torch.device:
    if torch.cuda.is_available():
        return torch.device("cuda")
    if hasattr(torch, "mps") and torch.mps.is_available():
        return torch.device("mps")
    return torch.device("cpu")


device = get_device()
model.to(device)
device

device(type='cuda')

In [15]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for _ in range(num_epochs):
    for batch in train_dataloader:
        # Move all values of the batch to device (input_ids, attention_mask, token_type_ids, labels)
        batch_on_device = {k: v.to(device) for k, v in batch.items()}
        # Unpack data to hand over to model. Similar to this:
        # model(
        # input_ids=batch_on_device["input_ids"],  # noqa: ERA001
        # attention_mask=batch_on_device["attention_mask"],  # noqa: ERA001
        # token_type_ids=batch_on_device["token_type_ids"],  # noqa: ERA001
        # labels=batch_on_device["labels"])

        outputs = model(**batch_on_device)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/1377 [00:00<?, ?it/s]

In [16]:
import evaluate

metric = evaluate.load("glue", "mrpc")
print(metric.info.inputs_description)

Downloading builder script: 0.00B [00:00, ?B/s]


Compute GLUE evaluation metric associated to each GLUE dataset.
Args:
    predictions: list of predictions to score.
        Each translation should be tokenized into a list of tokens.
    references: list of lists of references for each translation.
        Each reference should be tokenized into a list of tokens.
Returns: depending on the GLUE subset, one or several of:
    "accuracy": Accuracy
    "f1": F1 score
    "pearson": Pearson Correlation
    "spearmanr": Spearman Correlation
    "matthews_correlation": Matthew Correlation
Examples:

    >>> glue_metric = evaluate.load('glue', 'sst2')  # 'sst2' or any of ["mnli", "mnli_mismatched", "mnli_matched", "qnli", "rte", "wnli", "hans"]
    >>> references = [0, 1]
    >>> predictions = [0, 1]
    >>> results = glue_metric.compute(predictions=predictions, references=references)
    >>> print(results)
    {'accuracy': 1.0}

    >>> glue_metric = evaluate.load('glue', 'mrpc')  # 'mrpc' or 'qqp'
    >>> references = [0, 1]
    >>> predi

In [17]:
model.eval()
for batch in eval_dataloader:
    batch_on_device = {k: v.to(device) for k, v in batch.items()}

    # Disable gradient descent for evaluation
    with torch.no_grad():
        outputs = model(**batch_on_device)

    logits = outputs.logits
    # Predict label with highest probability
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch_on_device["labels"])

metric.compute()

{'accuracy': 0.6838235294117647, 'f1': 0.8122270742358079}

In [18]:
def training_function():
  from accelerate import Accelerator
  from torch.optim import AdamW
  from transformers import AutoModelForSequenceClassification, get_scheduler
  from tqdm import tqdm

  # Use accelerator for faster distributed training on multiple GPUs/TPUs
  accelerator = Accelerator()

  model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
  optimizer = AdamW(model.parameters(), lr=3e-5)

  # Wraps all important parts of the training process
  train_dl, eval_dl, model, optimizer = accelerator.prepare(
      train_dataloader, eval_dataloader, model, optimizer,
  )

  # Similar to normal training
  num_epochs = 3
  num_training_steps = num_epochs * len(train_dl)
  lr_scheduler = get_scheduler(
      "linear",
      optimizer=optimizer,
      num_warmup_steps=0,
      num_training_steps=num_training_steps,
  )

  progress_bar = tqdm(range(num_training_steps))

  model.train()
  for epoch in range(num_epochs):
      for batch in train_dl:
          # No to_device needed because accelerator takes care of this
          outputs = model(**batch)
          loss = outputs.loss
          accelerator.backward(loss) # Another difference

          optimizer.step()
          lr_scheduler.step()
          optimizer.zero_grad()
          progress_bar.update(1)

In [19]:
from accelerate import notebook_launcher

notebook_launcher(training_function)

Launching training on one GPU.


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

  0%|          | 0/1377 [00:00<?, ?it/s][A
  0%|          | 1/1377 [00:00<02:38,  8.69it/s][A
  0%|          | 2/1377 [00:00<02:37,  8.72it/s][A
  0%|          | 3/1377 [00:00<02:59,  7.64it/s][A
  0%|          | 4/1377 [00:00<03:18,  6.91it/s][A
  0%|          | 5/1377 [00:00<03:27,  6.61it/s][A
  0%|          | 6/1377 [00:00<03:32,  6.44it/s][A
  1%|          | 7/1377 [00:01<03:41,  6.18it/s][A
  1%|          | 8/1377 [00:01<03:41,  6.17it/s][A
  1%|          | 9/1377 [00:01<03:41,  6.16it/s][A
  1%|          | 10/1377 [00:01<03:41,  6.16it/s][A
  1%|          | 11/1377 [00:01<03:43,  6.12it/s][A
  1%|          | 12/1377 [00:01<03:44,  6.09it/s][A
  1%|          | 13/1377 [00:02<03: