In [1]:
!pip install -q --upgrade transformers torch torchvision torchaudio
!pip install -q tokenizers==0.13.3 evaluate
!pip install -q bitsandbytes transformers accelerate gradio thread6

[0m

In [2]:
# Everything we've done in the last few chapters
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

# Name of the pretrained checkpoint; the tokenizer here and the model later are both loaded from it
checkpoint = "bert-base-uncased"

# The MRPC task of the GLUE benchmark (train / validation / test splits)
raw_datasets = load_dataset("glue", "mrpc")

# Tokenizer that matches the checkpoint
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    """Tokenize the 'sentence1' / 'sentence2' columns of `example` as sentence pairs.

    Sequences are truncated (truncation=True) but not padded here; padding is
    left to the data collator. Relies on the module-level `tokenizer`.
    """
    first_sentences = example["sentence1"]
    second_sentences = example["sentence2"]
    return tokenizer(first_sentences, second_sentences, truncation=True)

# 'map()' applies a given function (here 'tokenize_function') to the elements of every split of the dataset.
# With 'batched=True' the function receives batches of examples rather than individual examples. This is more
# efficient for tokenization, because many tokenizers (including Hugging Face's) process several sequences at
# once faster than they process each sequence individually.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

# A data collator is a function or callable that takes a batch of samples and prepares it as model input
# during training or evaluation.
# 'DataCollatorWithPadding' is a data collator designed for tokenized text: it pads the sequences of each batch
# to the length of the longest sequence in that batch, using the tokenizer's padding token, so that all inputs
# in the batch have the same size.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Downloading builder script:   0%|          | 0.00/7.78k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/4.47k [00:00<?, ?B/s]

Downloading and preparing dataset glue/mrpc (download: 1.43 MiB, generated: 1.43 MiB, post-processed: Unknown size, total: 2.85 MiB) to /root/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data: 0.00B [00:00, ?B/s]

Downloading data: 0.00B [00:00, ?B/s]

Downloading data: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

Dataset glue downloaded and prepared to /root/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

### Truncation: The purpose of truncation is to ensure that sequences do not exceed a certain length, typically the maximum length that a model can handle. If a sequence is longer than this maximum, it gets cut off (or truncated) to fit. This is especially important for models like BERT, which have a fixed maximum input size (e.g., 512 tokens).
### Padding: Padding deals with sequences that are shorter than the maximum length or the longest sequence in a batch. In order to process a batch of sequences simultaneously, all sequences in that batch need to have the same length. Padding adds extra tokens (usually zeros or a special padding token) to the end of shorter sequences to ensure that all sequences in the batch have the same length.

#### Before we write our training loop, we will need to define a few objects.
#### The first ones will be the dataloaders we will use to iterate over batches.
   * Before we do this we'll need to apply a bit of postprocessing to our tokenized_datasets to take care of some things that the 'Trainer' did for us automatically, like:
       * Remove columns corresponding to values the model does not expect (like the sentence1 and sentence2 columns)
       * Rename the column 'label' to 'labels' (because the model expects the argument to be named 'labels')
       * set the format of the datasets so they return PyTorch tensors instead of lists

In [9]:
# 'tokenized_datasets' has one method for each of the post-processing steps listed above
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(["sentence1", "sentence2", "idx"])  # drop the columns the model does not expect
    .rename_column("label", "labels")  # the model expects the argument to be named 'labels'
)
tokenized_datasets.set_format("torch")  # return PyTorch tensors instead of lists

# Check which columns are left
tokenized_datasets["train"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

## Post-processing
   * Reasoning behind steps in the above code
   
### Tokenized Versions Replace Original Text
   * Once you've tokenized sentence1 and sentence2, you've essentially converted them into a format that the model can understand and process (i.e., sequences of token IDs). These tokenized versions are what the model will actually use during training. The original text columns (sentence1 and sentence2) are no longer needed because the model doesn't operate directly on raw text.
   
* While sentence1 and sentence2 were the data we initially used, by the time we're ready to fine-tune, they've been transformed into a format suitable for the model (token IDs). The original text columns are then superfluous and can be safely dropped to streamline the dataset.

####  The original sentence1 and sentence2 text data are transformed into several new columns that are suitable for input into transformer models.
   * input_ids: This column contains the tokenized version of your text. Each text sequence is converted into a sequence of token IDs based on the tokenizer's vocabulary. This is the primary input to the model.
   * token_type_ids: For models like BERT that can handle pair-wise sentence tasks (e.g., question-answering, sentence-pair classification), this column indicates which tokens belong to sentence1 and which belong to sentence2. Typically, tokens from sentence1 might be marked with 0 and tokens from sentence2 with 1. This helps the model distinguish between the two sentences when they are concatenated together.
   * attention_mask: This column indicates which tokens are actual content versus which ones are padding. A value of 1 typically indicates a real token, while a value of 0 indicates a padding token. The attention mask ensures that the model doesn't pay attention to padding tokens during training or inference.
   * labels: This column contains the labels for your training data, which the model will use as the "ground truth" during supervised training. It was renamed from label to labels in your code.

### Now that the above is done, we can define our "DataLoaders"

In [11]:
from torch.utils.data import DataLoader

# Training dataloader: shuffled, batches of 8, each batch padded by the data collator
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    batch_size=8,
    shuffle=True,
    collate_fn=data_collator,
)

# Evaluation dataloader: same batch size and collator, no shuffling
eval_dataloader = DataLoader(
    tokenized_datasets["validation"],
    batch_size=8,
    collate_fn=data_collator,
)

### Data Loaders

- A DataLoader is a utility provided by PyTorch in the torch.utils.data module. It's used to efficiently load and batch your data during training and evaluation. Here's why they're important:
    - **Batching**: Neural networks are typically trained using batches of data rather than one sample at a time. Batching allows for more efficient and parallelized processing, especially on GPUs. DataLoader automates the process of fetching batches of data.
    - **Shuffling**: For training, it's often beneficial to shuffle the data to ensure that the model doesn't learn any unintended patterns from the order of the data. Shuffling can help improve model generalization.
    - **Parallel Loading**: DataLoader can use multiple worker processes to load data in parallel, which can significantly speed up data loading, especially when the data loading process is I/O bound.
    - **Memory Efficiency**: Instead of loading the entire dataset into memory, DataLoader loads data on-the-fly in batches, which is more memory-efficient, especially for large datasets.
    
### The Above code

```python
train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=8, collate_fn=data_collator
)

eval_dataloader = DataLoader(
    tokenized_datasets["validation"], batch_size=8, collate_fn=data_collator
)

```
* **tokenized_datasets["train"]**: This is the training portion of your tokenized dataset.
* **shuffle=True**: This ensures that the training data is shuffled before batching.
* **batch_size=8**: This specifies that each batch should contain 8 samples.
* **collate_fn=data_collator**: The collate_fn is a function that takes a list of samples and merges them into a batch. Here, you're using the data_collator you defined earlier (which handles padding) as the collate function.

### How the DataLoaders interact with our data
- The tokenized_datasets you created earlier contains the tokenized versions of your data, ready to be fed into a model. However, to efficiently feed this data into a model during training and evaluation, you need to batch it, possibly shuffle it, and handle any last-minute processing (like padding). That's where the DataLoader comes in.
- The DataLoader will fetch batches of data from tokenized_datasets, and for each batch, it will use the data_collator to ensure that the sequences in the batch are padded appropriately.

### To quickly check for any mistakes in the data processing, we can inspect a batch like this:

In [14]:
# Pull a single batch from the training dataloader and show the shape of every tensor in it
batch = next(iter(train_dataloader))
{name: tensor.shape for name, tensor in batch.items()}

# Note: the actual shape will differ since we set 'shuffle=True'

{'labels': torch.Size([8]),
 'input_ids': torch.Size([8, 72]),
 'token_type_ids': torch.Size([8, 72]),
 'attention_mask': torch.Size([8, 72])}

In [15]:
# The data preprocessing step is finished;
# next we instantiate the model
from transformers import AutoModelForSequenceClassification

# Load the pretrained checkpoint with a sequence-classification head for two labels
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# To make sure everything will go smoothly during training, we pass the batch inspected above to the model.
# The batch contains 'labels', so the output carries a loss in addition to the logits.
outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)

tensor(0.7956, grad_fn=<NllLossBackward0>) torch.Size([8, 2])


### The above code
 - This code snippet is essentially a "sanity check" to ensure that everything is set up correctly before proceeding to full-scale training. By forwarding a batch through the model and examining the outputs:
```python
outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)
```
- **model(*\*batch)**:
    * This line forwards a batch of data through the model. The `**batch` syntax is Python's way of unpacking a dictionary into keyword arguments. Here, `batch` is a dictionary containing the tokenized inputs for the model: `labels`, `input_ids`, `token_type_ids` and `attention_mask`.
    * The model, which is an instance of AutoModelForSequenceClassification, is designed for tasks like sentence pair classification. When you forward data through it, it returns a structure containing various outputs.

- **outputs.loss**:
    * This is the model's computed loss for the given batch. If the batch contains labels, the model will compute the loss by comparing its predictions (logits) to the ground truth labels. This loss is what you'd typically optimize during training using an optimizer.
    
- **outputs.logits.shape**:
    * logits are the raw, unnormalized scores output by the model for each class. For a binary classification task (since num_labels=2), there will be two logits for each input example: one for each class.
    * By printing the shape of the logits, you're likely checking the dimensions of the output to ensure they match your expectations. For a batch size of 8 and 2 classes, you'd expect the shape to be [8, 2].
    
### Output Explanation:

`tensor(0.7956, grad_fn=<NllLossBackward0>) torch.Size([8, 2])`
* **`tensor(0.7956, grad_fn=<NllLossBackward0>)`**: This is the loss value for the batch. It's a single scalar value, and the `grad_fn` part indicates that the tensor is attached to the computation graph, so gradients can be computed from it during backpropagation.
* **torch.Size([8, 2])**: This indicates that the logits tensor has a shape of 8x2. As explained, this means you have 8 samples in the batch and 2 scores (logits) for each sample.

In [18]:
# We're almost ready to write our training loop! We're just missing the optimizer and the learning rate scheduler.
# Since we are trying to replicate by hand what the 'Trainer' was doing, we will use the same defaults.
# The optimizer used by the Trainer is 'AdamW', which is the same as Adam, but with a twist for weight decay regularization.
# 'transformers.AdamW' is deprecated in favour of the PyTorch implementation and is no longer available in recent
# transformers releases, so the optimizer is imported from torch instead.
from torch.optim import AdamW

# weight_decay=0.0 keeps the default of the old 'transformers.AdamW' (no weight decay); torch's own default is 0.01.
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.0)



### Weight Decay Regularization
- Weight decay is a form of regularization used in neural network training. Regularization is a technique to prevent overfitting by adding some form of penalty to the loss function. Overfitting occurs when a model performs very well on the training data but poorly on unseen data, indicating that it has become too complex and has started to memorize the training data rather than generalizing from it.

- The idea behind weight decay is to add a penalty to the loss that is proportional to the size of the model's weights. By doing this, the training process is discouraged from setting large values to the weights, leading to a simpler and more regularized model.

$$
L_{\text{new}} = L + \frac{\lambda}{2} \sum_{i} w_i^2
$$


* $L$ - is the original loss.

* $λ$ - is the weight decay coefficient, determining the strength of the regularization.

* $w_i$ - are the model's weights.

#### The AdamW optimizer is a variant of the Adam optimizer that implements "decoupled weight decay": the decay is applied directly to the weights at each update step, instead of being added to the loss as an L2 penalty (as in the formula above), which is how weight decay is usually combined with the original Adam. This makes AdamW particularly suitable for tasks where weight decay has been found beneficial, such as fine-tuning transformer models.

In [26]:
# Finally, the learning rate scheduler used by default is just a 'linear decay' from the maximum value (5e-5) to 0.
# To define it properly we need the number of training steps we will take: the number of epochs we want to run
# multiplied by the number of training batches (the length of our training dataloader).
# The 'Trainer' uses 3 epochs by default, so we'll use that as well.

from transformers import get_scheduler

# An "epoch" is one complete pass through the entire training dataset, so with 3 epochs the model is trained
# on the entire training dataset three times.
num_epochs = 3

# len(train_dataloader) is the number of batches in the training set (for example, 1000 training samples with a
# batch size of 8 give 125 batches). A "training step" is a single update of the model's weights, which happens
# once per batch, so epochs * batches is the total number of training steps.
num_training_steps = num_epochs * len(train_dataloader)

# The learning rate scheduler adjusts the learning rate during training:
#   - "linear"            gradually decreases the learning rate from its initial value to zero
#   - optimizer           the 'AdamW' optimizer created in the previous code cell
#   - num_warmup_steps    number of steps over which the learning rate is first increased linearly;
#                         0 means there is no warm-up phase and the decrease starts immediately
#   - num_training_steps  total number of steps over which the learning rate is decreased to zero
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

print(num_training_steps)

1377


### The Training Loop
- One last thing: we want to use the GPU if we have access to one (on a CPU, training can take hours to days, versus minutes to hours on a GPU, depending on the data).
- To do this we define a device

In [22]:
import torch

# Prefer the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to that device, then display the device as the cell output
model.to(device)
device

device(type='cuda')

In [27]:
# A 'tqdm' progress bar over the total number of training steps lets us follow the run
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move every tensor of the batch to the device selected earlier
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # Forward pass, then backpropagate the loss
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # Update the weights, advance the learning rate schedule, then clear the gradients for the next batch
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

# This loop has no reporting, so it tells us nothing about how the model fares. We need an evaluation loop for that.

  0%|          | 0/1377 [00:00<?, ?it/s]

In [29]:
# We've already seen the 'metric.compute()' method. Metrics can also accumulate batches for us with the
# 'add_batch' method as we go over the prediction loop; once all the batches have been accumulated,
# 'metric.compute()' gives the final result.

import evaluate

metric = evaluate.load("glue", "mrpc")

model.eval()
for batch in eval_dataloader:
    batch = {name: tensor.to(device) for name, tensor in batch.items()}

    # No gradients are needed for evaluation
    with torch.no_grad():
        outputs = model(**batch)

    # The predicted class is the index of the largest logit
    logits = outputs.logits
    predictions = logits.argmax(dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()
# Results will vary slightly because of randomness in the model head initialization and the data shuffling, but they should be in the same ballpark

Downloading builder script:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

{'accuracy': 0.7769607843137255, 'f1': 0.8460236886632826}

In [None]:
# Exercise

# Modify the previous training loop to fine-tune your model on the SST-2 dataset.

# I'll come back to it 

In [None]:
# Lastly, we talked about the 'Accelerate' library, which enables distributed training on multiple GPUs or TPUs.
# The training code is wrapped in 'training_function' so that it can be handed to 'notebook_launcher'
# (or called at the bottom of a script).
# Note: running this cell only defines the function; it does not start a training run.
from accelerate import Accelerator
from torch.optim import AdamW
from tqdm.auto import tqdm
from transformers import AutoModelForSequenceClassification, get_scheduler


def training_function():
    """Fine-tune `checkpoint` on `train_dataloader` with Accelerate.

    Reads `checkpoint`, `train_dataloader` and `eval_dataloader` from the notebook's
    global namespace, so the data preparation cells must have been run first.
    """
    accelerator = Accelerator()

    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
    # torch.optim.AdamW replaces the deprecated 'transformers.AdamW';
    # weight_decay=0.0 keeps that class's default (torch's own default is 0.01).
    optimizer = AdamW(model.parameters(), lr=3e-5, weight_decay=0.0)

    # Hand the dataloaders, model and optimizer to Accelerate; there is no manual '.to(device)' in this version
    train_dl, eval_dl, model, optimizer = accelerator.prepare(
        train_dataloader, eval_dataloader, model, optimizer
    )

    num_epochs = 3
    # Use the length of the prepared dataloader to size the schedule
    num_training_steps = num_epochs * len(train_dl)
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )

    progress_bar = tqdm(range(num_training_steps))

    model.train()
    for epoch in range(num_epochs):
        for batch in train_dl:
            outputs = model(**batch)
            loss = outputs.loss
            # 'accelerator.backward(loss)' takes the place of 'loss.backward()'
            accelerator.backward(loss)

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

In [None]:
# The Accelerate training code above is meant to be saved as a 'train.py' script.
# To run it from a terminal, first enter the command 'accelerate config': it prompts you with a few questions
# and dumps your answers into a configuration file. Then start the training with 'accelerate launch train.py', which uses that file.

# If you want to run it in a notebook like this one, use this:
# Note: Don't run
# NOTE(review): 'training_function' must be a function wrapping the Accelerate training code of the previous cell.
from accelerate import notebook_launcher

notebook_launcher(training_function)