## Ways to optimize memory footprint and improve performance

In [20]:
import sys
!{sys.executable} -m pip install -q transformers datasets accelerate nvidia-ml-py3 bitsandbytes

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.2/102.2 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import numpy as np
from datasets import Dataset


seq_len, dataset_size = 512, 512
dummy_data = {
    "input_ids": np.random.randint(100, 30000, (dataset_size, seq_len)),
    "labels": np.random.randint(0, 1, (dataset_size)),
}
ds = Dataset.from_dict(dummy_data)
ds.set_format("pt")

In [3]:
ds.shape

(512, 2)

In [4]:
from pynvml import *


def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")


def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()

In [5]:
print_gpu_utilization()

GPU memory occupied: 258 MB.


In [6]:
import torch

torch.ones((1, 1)).to("cuda")
print_gpu_utilization()

GPU memory occupied: 362 MB.


In [None]:
from transformers import AutoModelForSequenceClassification


model = AutoModelForSequenceClassification.from_pretrained("bert-large-uncased").to("cuda")
print_gpu_utilization()

In [8]:
default_args = {
    "output_dir": "tmp",
    "evaluation_strategy": "steps",
    "num_train_epochs": 1,
    "log_level": "error",
    "report_to": "none",
}

In [9]:
from transformers import TrainingArguments, Trainer, logging

logging.set_verbosity_error()

training_args = TrainingArguments(per_device_train_batch_size=4, **default_args)
trainer = Trainer(model=model, args=training_args, train_dataset=ds)
result = trainer.train()
print_summary(result)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


{'train_runtime': 185.4077, 'train_samples_per_second': 2.761, 'train_steps_per_second': 0.69, 'train_loss': 0.023786330595612526, 'epoch': 1.0}
Time: 185.41
Samples/second: 2.76
GPU memory occupied: 11538 MB.


A simple trick to effectively train larger batch size is gradient accumulation. We are not limited by GPU size but only our requirements.
## Gradient Accumulation

* instead of calculating the gradients for the whole batch at once to do it in smaller steps.
* The way we do that is to calculate the gradients iteratively in smaller batches by doing a forward and backward pass through the model and accumulating the gradients in the process.
* When enough gradients are accumulated we run the model’s optimization step.
* We can see that the memory footprint was dramatically reduced at the cost of being only slightly slower than the vanilla run.

In [10]:
training_args = TrainingArguments(per_device_train_batch_size=1, gradient_accumulation_steps=4, **default_args)

trainer = Trainer(model=model, args=training_args, train_dataset=ds)
result = trainer.train()
print_summary(result)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


{'train_runtime': 204.5286, 'train_samples_per_second': 2.503, 'train_steps_per_second': 0.626, 'train_loss': 0.009834381751716137, 'epoch': 1.0}
Time: 204.53
Samples/second: 2.50
GPU memory occupied: 7222 MB.


In [14]:
## If we want to use GPU to its limit we can increase the batch_size and also enable gradient checkpointing to 16

training_args = TrainingArguments(per_device_train_batch_size=4,
                                  gradient_accumulation_steps=16,
                                  **default_args
                                  )

trainer = Trainer(model=model, args=training_args, train_dataset=ds)
result = trainer.train()
print_summary(result)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


{'train_runtime': 172.3912, 'train_samples_per_second': 2.97, 'train_steps_per_second': 0.046, 'train_loss': 1.492358569521457e-05, 'epoch': 1.0}
Time: 172.39
Samples/second: 2.97
GPU memory occupied: 12362 MB.


## Gradient Checkpointing

* In order to compute the gradients during the backward pass all activations from the forward pass are normally saved. This can create a big memory overhead.
* We can see that this saved some more memory but at the same time training became a bit slower. 
* A general rule of thumb is that gradient checkpointing slows down training by about 20%

In [15]:
training_args = TrainingArguments(
    per_device_train_batch_size=1, gradient_accumulation_steps=4, gradient_checkpointing=True, **default_args
)

trainer = Trainer(model=model, args=training_args, train_dataset=ds)
result = trainer.train()
print_summary(result)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


{'train_runtime': 270.1824, 'train_samples_per_second': 1.895, 'train_steps_per_second': 0.474, 'train_loss': 3.655438973737546e-08, 'epoch': 1.0}
Time: 270.18
Samples/second: 1.90
GPU memory occupied: 6842 MB.


## FP16 Training (Mixed precision Training)
* The main advantage comes from saving the activations in half (16-bit) precision.
* Gradients calculated in half but converted back to float32.
* Just fp16 has 2 copies of model saved so hardly saves any memory.
* Also there is some overhead extra computation so not that great on time. 
* But add it with other methods and it performs good.

In [16]:
training_args = TrainingArguments(per_device_train_batch_size=4, fp16=True, **default_args)

trainer = Trainer(model=model, args=training_args, train_dataset=ds)
result = trainer.train()
print_summary(result)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


{'train_runtime': 92.4081, 'train_samples_per_second': 5.541, 'train_steps_per_second': 1.385, 'train_loss': 0.0, 'epoch': 1.0}
Time: 92.41
Samples/second: 5.54
GPU memory occupied: 6826 MB.


In [17]:
training_args = TrainingArguments(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    fp16=True,
    **default_args,
)

trainer = Trainer(model=model, args=training_args, train_dataset=ds)
result = trainer.train()
print_summary(result)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


{'train_runtime': 116.9351, 'train_samples_per_second': 4.378, 'train_steps_per_second': 1.095, 'train_loss': 0.0, 'epoch': 1.0}
Time: 116.94
Samples/second: 4.38
GPU memory occupied: 9416 MB.


## Optimizers
* Use optimizers like adaFactor which saved aggregated gradient and not rolling gradient and save some space.
* With everything we see a 3x memory reduction.
* One downside of Adafactor is that in some instances convergence can be slower than Adam’s.
* But as Adafactor is slow we can use 8-bit Adam as an alternate.

In [18]:
training_args = TrainingArguments(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    fp16=True,
    optim="adafactor",
    **default_args,
)

trainer = Trainer(model=model, args=training_args, train_dataset=ds)
result = trainer.train()
print_summary(result)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


{'train_runtime': 130.4237, 'train_samples_per_second': 3.926, 'train_steps_per_second': 0.981, 'train_loss': 0.0, 'epoch': 1.0}
Time: 130.42
Samples/second: 3.93
GPU memory occupied: 6460 MB.


## 8-bit Adam
* It stores the rolling average but quantizes it i.e less precision and dequantizes it only for the optimization
* install the **bitsandbytes** library that implements the 8-bit Adam optimizer.
* Group parameters into group that weight decay and that doesn't.
* Usually, biases and layer norm parameters are not weight decayed.

In [22]:
import bitsandbytes as bnb
from torch import nn
from transformers.trainer_pt_utils import get_parameter_names

training_args = TrainingArguments(per_device_train_batch_size=4, **default_args)

decay_parameters = get_parameter_names(model, [nn.LayerNorm])
decay_parameters = [name for name in decay_parameters if "bias" not in name]
print(len(decay_parameters), decay_parameters)

## Add weight_decay to all parameters which can decay. Other params will have decay 0.0
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if n in decay_parameters],
        "weight_decay": training_args.weight_decay,
    },
    {
        "params": [p for n, p in model.named_parameters() if n not in decay_parameters],
        "weight_decay": 0.0,
    },
]

## Copy Beta values from training args
optimizer_kwargs = {
    "betas": (training_args.adam_beta1, training_args.adam_beta2),
    "eps": training_args.adam_epsilon,
}

## Copy LR values from Training Args
optimizer_kwargs["lr"] = training_args.learning_rate
adam_bnb_optim = bnb.optim.Adam8bit(
    optimizer_grouped_parameters,
    betas=(training_args.adam_beta1, training_args.adam_beta2),
    eps=training_args.adam_epsilon,
    lr=training_args.learning_rate,
)

149 ['bert.embeddings.word_embeddings.weight', 'bert.embeddings.position_embeddings.weight', 'bert.embeddings.token_type_embeddings.weight', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.attention.output.dense.weight', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.output.dense.weight', 'bert.encoder.layer.1.attention.self.query.weight', 'bert.encoder.layer.1.attention.self.key.weight', 'bert.encoder.layer.1.attention.self.value.weight', 'bert.encoder.layer.1.attention.output.dense.weight', 'bert.encoder.layer.1.intermediate.dense.weight', 'bert.encoder.layer.1.output.dense.weight', 'bert.encoder.layer.2.attention.self.query.weight', 'bert.encoder.layer.2.attention.self.key.weight', 'bert.encoder.layer.2.attention.self.value.weight', 'bert.encoder.layer.2.attention.output.dense.weight', 'bert.encoder.layer.2.intermediate.dense.weight

In [23]:
training_args = TrainingArguments(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    fp16=True,
    **default_args,
)

trainer = Trainer(model=model, args=training_args, train_dataset=ds, optimizers=(adam_bnb_optim, None))
result = trainer.train()
print_summary(result)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


{'train_runtime': 112.4754, 'train_samples_per_second': 4.552, 'train_steps_per_second': 1.138, 'train_loss': 0.0, 'epoch': 1.0}
Time: 112.48
Samples/second: 4.55
GPU memory occupied: 4426 MB.


## Accelerate
*

In [24]:
training_args = TrainingArguments(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    fp16=True,
    **default_args,
)

In [27]:
from accelerate import Accelerator
from torch.utils.data.dataloader import DataLoader

dataloader = DataLoader(ds, batch_size=training_args.per_device_train_batch_size)

if training_args.gradient_checkpointing:
    model.gradient_checkpointing_enable()

## Define accelerator to use mixed precision
accelerator = Accelerator(mixed_precision="fp16")
model, optimizer, dataloader = accelerator.prepare(model, adam_bnb_optim, dataloader)

model.train()
for step, batch in enumerate(dataloader, start=1):
    loss = model(**batch).loss
    loss = loss / training_args.gradient_accumulation_steps
    accelerator.backward(loss)
    if step % training_args.gradient_accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()

print_gpu_utilization()

GPU memory occupied: 4612 MB.
