# TAKEN FROM https://huggingface.co/docs/transformers/accelerate

In [1]:
!pip install --upgrade --quiet transformers datasets evaluate soundfile librosa  


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m59.3 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m35.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m260.7/260.7 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25h

Distributed training with 🤗 Accelerate
As models get bigger, parallelism has emerged as a strategy for training larger models on limited hardware and accelerating training speed by several orders of magnitude. At Hugging Face, we created the 🤗 Accelerate library to help users easily train a 🤗 Transformers model on any type of distributed setup, whether it is multiple GPUs on one machine or multiple GPUs across several machines. In this tutorial, learn how to customize your native PyTorch training loop to enable training in a distributed environment.

In [2]:
!pip install --upgrade --quiet accelerate

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m345.1/345.1 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25h

In [3]:
from accelerate import Accelerator

# Instantiate the Accelerator with its default settings.
# (The MNIST setup cell further down creates a fresh instance under the same name.)
accelerator = Accelerator()

# Prepare to accelerate
The next step is to pass all the relevant training objects to the prepare method. This includes your training and evaluation DataLoaders, a model and an optimizer:

In [14]:
import torch
from torch.utils.data import DataLoader, random_split
from torchvision.datasets import MNIST
from torchvision.transforms import ToTensor
import torch.nn as nn
import torch.optim as optim
from accelerate import Accelerator
from torch.optim.lr_scheduler import StepLR
from tqdm.auto import tqdm

# Custom collate function to convert each batch from a tuple to a dictionary
def custom_collate(batch):
    """Collate ``(image, label)`` samples into a keyword-style batch.

    Args:
        batch: sequence of 2-item samples; the first item is a tensor and the
            second is a value accepted by ``torch.tensor``.

    Returns:
        dict with ``"x"`` (the sample tensors stacked along a new leading
        dimension) and ``"labels"`` (a tensor built from the second items).
    """
    # Transpose the list of pairs into one tuple of tensors and one of labels.
    pixels, targets = zip(*batch)
    return {"x": torch.stack(pixels), "labels": torch.tensor(targets)}

# Initialize Accelerator (replaces any instance created in an earlier cell)
accelerator = Accelerator()

# Download MNIST into ./data (if not already present) and load it;
# ToTensor() converts each image to a tensor when a sample is fetched.
dataset = MNIST(root="./data", download=True, transform=ToTensor())

# Split the dataset into training (80%) and evaluation (20%) sets.
# eval_size is computed as the remainder so the two sizes always sum to len(dataset).
# No generator/seed is passed to random_split, so the split depends on the global RNG state.
train_size = int(0.8 * len(dataset))
eval_size = len(dataset) - train_size
train_dataset, eval_dataset = random_split(dataset, [train_size, eval_size])

# Create dataloaders with the custom collate function so every batch is a dict
# with keys "x" and "labels"; only the training loader shuffles.
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=custom_collate)
eval_dataloader = DataLoader(eval_dataset, batch_size=64, shuffle=False, collate_fn=custom_collate)

# Define a simple class to wrap the output so that you can use .loss in your training loop
class ModelOutput:
    """Plain holder exposing a model's ``loss`` and ``logits`` as attributes."""

    def __init__(self, loss, logits):
        # Stored as given; no validation or conversion is performed.
        self.loss, self.logits = loss, logits

# Define a simple neural network model for MNIST classification
class SimpleNet(nn.Module):
    """Single linear layer classifier over flattened 28x28 inputs (10 classes).

    ``forward`` returns a ``ModelOutput``; its ``loss`` is the cross-entropy
    against ``labels`` when labels are supplied and ``None`` otherwise.
    """

    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        # MNIST images are 28x28 (784 features) and there are 10 classes.
        self.linear = nn.Linear(28 * 28, 10)
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, x, labels=None):
        """Compute logits for ``x`` and, if ``labels`` is given, the loss."""
        logits = self.linear(self.flatten(x))
        if labels is None:
            return ModelOutput(None, logits)
        return ModelOutput(self.loss_fn(logits, labels), logits)

# Instantiate the MNIST classifier defined above.
model = SimpleNet()

# Define optimizer and learning rate scheduler.
# StepLR multiplies the learning rate by `gamma` every `step_size` calls to
# lr_scheduler.step(); with step_size=1 that is a 5% decay on every call.
# NOTE(review): this schedule only makes sense if step() is called once per epoch,
# not once per batch - confirm against the training loop.
optimizer = optim.Adam(model.parameters(), lr=0.001)
lr_scheduler = StepLR(optimizer, step_size=1, gamma=0.95)

In [15]:
# Hand the dataloaders, model and optimizer to Accelerate and rebind the names to
# the objects it returns (same order as passed in).
# lr_scheduler is not passed to prepare() here.
train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(
    train_dataloader, eval_dataloader, model, optimizer
)

In [18]:
# One tick per training batch; the literal 10 must match num_epochs in the training loop below.
progress_bar = tqdm(total=len(train_dataloader) * 10)  # batches per epoch * 10 epochs


  0%|          | 0/7500 [00:00<?, ?it/s]

In [19]:


# Training loop that uses model(**batch)
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    for batch in train_dataloader:
        # The custom collate returns a dict with keys "x" and "labels", which
        # match the keyword arguments of SimpleNet.forward, so it can be
        # unpacked directly into the model call.
        outputs = model(**batch)
        loss = outputs.loss

        # Accelerate's replacement for loss.backward().
        accelerator.backward(loss)
        optimizer.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # StepLR(step_size=1, gamma=0.95) multiplies the LR by 0.95 on every call,
    # so it is stepped once per epoch. Stepping it per batch would shrink the
    # LR by 0.95 ** len(train_dataloader) within a single epoch, i.e. to ~0.
    lr_scheduler.step()

    print(f"Epoch {epoch+1}/{num_epochs} completed")

Epoch 1/10 completed
Epoch 2/10 completed
Epoch 3/10 completed
Epoch 4/10 completed
Epoch 5/10 completed
Epoch 6/10 completed
Epoch 7/10 completed
Epoch 8/10 completed
Epoch 9/10 completed
Epoch 10/10 completed


In [29]:
def custom_collate(batch):
    """Collate ``(tensor, label)`` samples into a transformer-style batch dict.

    Args:
        batch: sequence of samples; ``sample[0]`` is a tensor and ``sample[1]``
            an integer-like label.

    Returns:
        dict with
        * ``"input_ids"``: the per-sample tensors (leading singleton dimension
          dropped when present) stacked and cast to ``torch.long``;
        * ``"attention_mask"``: an all-ones ``torch.long`` tensor sized by the
          first two dimensions of ``input_ids``;
        * ``"labels"``: a ``torch.long`` tensor of the sample labels.
    """

    def _drop_leading_singleton(t):
        # e.g. shape [1, seq_length] -> [seq_length]; anything else is left alone.
        return t.squeeze(0) if (t.ndim > 1 and t.shape[0] == 1) else t

    rows = [_drop_leading_singleton(sample[0]) for sample in batch]
    input_ids = torch.stack(rows).long()

    # Every position is marked as attended to (no padding information is available here).
    mask_shape = (input_ids.size(0), input_ids.size(1))
    attention_mask = torch.ones(mask_shape, dtype=torch.long)

    targets = [sample[1] for sample in batch]
    labels = torch.tensor(targets, dtype=torch.long)

    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}


# Rebuild both dataloaders so they use the custom_collate defined just above
# (batches become dicts with "input_ids", "attention_mask" and "labels").
# NOTE(review): train_dataset / eval_dataset are still the MNIST splits created earlier,
# so sample[0] is an image tensor rather than token ids - confirm the intended dataset.
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=custom_collate)
eval_dataloader = DataLoader(eval_dataset, batch_size=64, shuffle=False, collate_fn=custom_collate)


In [None]:
from accelerate import Accelerator
from transformers import AutoModelForSequenceClassification, get_scheduler

accelerator = Accelerator()

checkpoint = "distilbert-base-uncased"
num_labels = 2  # Binary classification
# NOTE(review): the dataloaders prepared below were built from the MNIST splits
# (10 classes per the SimpleNet comment); confirm the dataset and num_labels
# agree before running this cell.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels)

# Use PyTorch's AdamW: the copy that used to live in `transformers` is deprecated
# (and no longer importable from recent releases, which `pip install --upgrade` pulls in).
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)

# Accelerate picks the device and moves the model and batches itself inside
# prepare(), so there is no manual model.to(...). `device` stays defined for
# any code that still refers to it.
device = accelerator.device

train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(
    train_dataloader, eval_dataloader, model, optimizer
)

num_epochs = 3
# Computed after prepare() so it reflects the length of the prepared dataloader.
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Batches are dicts whose keys are passed as keyword arguments to the model.
        outputs = model(**batch)
        loss = outputs.loss
        # Accelerate's replacement for loss.backward().
        accelerator.backward(loss)

        optimizer.step()
        # The linear schedule is defined over num_training_steps, so it advances once per batch.
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)