## Gradient Checkpointing

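* Gradient checkpointing is a memory-optimization technique used in deep learning to reduce memory consumption during backpropagation.
* Instead of keeping every intermediate activation from the forward pass, it stores activations only at selected checkpoints and recomputes the others when they are needed in the backward pass.
* This recomputation introduces computational overhead: the technique trades extra compute time for lower memory use.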

In [2]:
import torch
from torch.utils.checkpoint import checkpoint

# Define a custom model with checkpointed layers
class CustomModel(torch.nn.Module):
    """Two-layer network (512 -> 256 -> 128) with a checkpointed first layer.

    The activations produced inside ``layer1`` are not kept for the backward
    pass; they are recomputed when gradients are needed.
    """

    def __init__(self):
        super().__init__()
        self.layer1 = torch.nn.Linear(512, 256)
        self.layer2 = torch.nn.Linear(256, 128)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the forward pass.

        Args:
            x: Input whose last dimension is 512.

        Returns:
            Output of ``layer2``, with last dimension 128.
        """
        # Checkpoint the computation in layer1.
        # use_reentrant=False is passed explicitly: the legacy reentrant
        # variant yields no gradients for layer1's parameters when `x` itself
        # does not require grad (the usual case for a first layer fed raw
        # data), and recent PyTorch releases require the argument to be set.
        x = checkpoint(self.layer1, x, use_reentrant=False)
        x = torch.relu(x)
        x = self.layer2(x)
        return x

# Instantiate the checkpointed model (this cell only builds it; no training happens here)
model = CustomModel()

In [None]:
from transformers import TrainingArguments, Trainer
# Trainer run that combines gradient checkpointing with gradient accumulation:
# one sample per device per forward pass, gradients accumulated over 4 passes.
# `default_args` and `ds` are expected to be defined elsewhere in the notebook.
# NOTE(review): `gradient_checkpointing=True` presumably needs `model` to be a
# transformers model that supports checkpointing - confirm which `model` is in
# scope when this cell runs.
training_args = TrainingArguments(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    **default_args,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds,
)
result = trainer.train()

### Gradient Accumulation

In [None]:
from transformers import TrainingArguments, Trainer
# Trainer run using gradient accumulation only: one sample per device per
# forward pass, with gradients accumulated over 4 passes.
# `default_args` and `ds` are expected to be defined elsewhere in the notebook.
training_args = TrainingArguments(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    **default_args,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds,
)
result = trainer.train()


In [None]:
import torch
from torch import nn, optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from torch.cuda.amp import GradScaler, autocast

In [None]:
# Model architecture: a small fully connected classifier
class MyModel(nn.Module):
    """Flatten the input, then apply Linear(784->128)-ReLU-Linear(128->64)-ReLU-Linear(64->10)."""

    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        # Layers are listed in application order; their positions become the
        # indices inside the Sequential container.
        stack_layers = [
            nn.Linear(784, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 10),
        ]
        self.linear_relu_stack = nn.Sequential(*stack_layers)

    def forward(self, x):
        """Return the 10 raw (un-normalized) output scores for each sample in ``x``."""
        return self.linear_relu_stack(self.flatten(x))

In [None]:
# Dataset: the MNIST training split, downloaded into ./data when not already there.
# Each image is converted to a tensor and then normalized with mean 0.5 / std 0.5.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,)),
    ]
)
train_data = datasets.MNIST(
    root='./data',
    train=True,
    transform=transform,
    download=True,
)
# Shuffled mini-batches of 64 samples.
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)

In [None]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
print(device)

# Build the model on the selected device, together with its Adam optimizer
# and the gradient scaler used for mixed-precision training.
model = MyModel().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
scaler = GradScaler()

In [None]:
# Mixed-precision training with gradient accumulation: gradients from
# `iters_to_accumulate` consecutive mini-batches are summed before each
# optimizer step.
scaler = GradScaler()
iters_to_accumulate = 4
num_batches = len(train_loader)

# Start from clean gradients. After this, gradients are cleared only right
# after an optimizer step; zeroing them on every iteration would discard the
# contributions of the earlier mini-batches in the window.
optimizer.zero_grad()

for epoch in range(5):
    for batch_idx, (data, target) in enumerate(train_loader):
        # Move data to the selected device
        data, target = data.to(device), target.to(device)

        with autocast():
            output = model(data)
            loss = nn.functional.cross_entropy(output, target)
            # Normalize the loss so the accumulated gradient is the mean over
            # the window rather than the sum
            loss = loss / iters_to_accumulate

        # Accumulates scaled gradients.
        scaler.scale(loss).backward()

        # Update the weights at the end of every accumulation window, and also
        # on the last batch of the epoch so that a trailing partial window is
        # applied instead of leaking into the next epoch. A partial window is
        # still divided by `iters_to_accumulate`, so its step is smaller.
        window_complete = (batch_idx + 1) % iters_to_accumulate == 0
        last_batch = (batch_idx + 1) == num_batches
        if window_complete or last_batch:
            # may unscale_ here if desired
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()