# 1. Dataset and Model

## 1.1. Random Dataset
This randomly generate the input and output data with given sizes and also the number of samples.

It uses `torch.manual_seed` to generate same data for consistensy on different experiments

A data loader then uses this dataset and create the data batches with given batch size.

In [90]:
import torch
from torch.utils.data import Dataset, DataLoader

class RandomTensorDataset(Dataset):
  def __init__(self, num_samples, in_shape, out_shape):
    self.num_samples = num_samples
    torch.manual_seed(12345)
    self.data = [(torch.randn(in_shape), torch.randn(out_shape)) for _ in range(num_samples)]

  def __len__(self):
    return self.num_samples

  def __getitem__(self, idx):
    return self.data[idx]

input_size  = 6
output_size = 2

# dataset construction
num_samples = 64
dataset = RandomTensorDataset(
  num_samples=num_samples,
  in_shape=input_size,
  out_shape=output_size
  )

batch_size  = 32 # two batches
dataloader = DataLoader(
  dataset,
  batch_size=batch_size,
  pin_memory=True,
  shuffle=False
  )

## 1.2. A Simple 2-Layer MLP model

In [91]:
import torch
import torch.nn as nn
import torch.optim as optim

class MLP(nn.Module):
  def __init__(self, in_feature, hidden_units, out_feature):
    super().__init__()
    torch.manual_seed(12345)
    self.hidden_layer = nn.Linear(in_feature, hidden_units)
    self.output_layer = nn.Linear(hidden_units, out_feature)

  def forward(self, x):
    x = self.hidden_layer(x)
    x = self.output_layer(x)
    return x

device = 'cuda' if torch.cuda.is_available() else 'cpu' # Using single GPU (GPU 0) if available otherwise CPU

# model construction
layer_1_units = input_size
layer_2_units = 4
layer_3_units = output_size
model = MLP(
  in_feature=layer_1_units,
  hidden_units=layer_2_units,
  out_feature=layer_3_units
  ).to(device)

loss_fn = nn.MSELoss()
optimizer = optim.SGD(model.parameters(),lr=0.01)

# 2. Run One Epoch On A Single Device
The single device will train the model on the both data batches. Later we simulate the DDP behaviour.

In [92]:
# One iteration using PyTorch
print(f'Using {device} For One Iteration of Forward and Backward Passes Using PyTorch')
train_loss = 0
for x, y in dataloader:
  x = x.to(device)
  y = y.to(device)

  # Forward Pass
  out = model(x)

  # Calculate loss
  loss = loss_fn(out, y)
  train_loss += loss

  # Zero grad
  optimizer.zero_grad(set_to_none=True)

  # Backward Pass
  loss.backward()

  # Update Model
  optimizer.step()

print(f'epoch 1 | {train_loss=}')

Using cuda For One Iteration of Forward and Backward Passes Using PyTorch
epoch 1 | train_loss=tensor(2.1911, device='cuda:0', grad_fn=<AddBackward0>)


# 3. Simulating DDP on Two Devices
* Divide the dataset into two separate parts (one batch each)
* Create two instances of the model, one for each device
* Train each model replica on the separate data batches
* All-reduce the gradient
* Update the model replicas.




## 3.1 Simulate Dividing the Dataset in DDP 
Each device gets half of the dataset (one batch each in this case).

In [93]:
iterator = iter(dataloader)
##### Device 0 (GPU 0)
x_0, y_0 = next(iterator)

##### Device 1 (GPU 1)
x_1, y_1 = next(iterator)

print(f'x_0.shape: {x_0.shape}, y_0.shape: {y_0.shape}\nx_1.shape: {x_1.shape}, y_1.shape: {y_1.shape}')

x_0.shape: torch.Size([32, 6]), y_0.shape: torch.Size([32, 2])
x_1.shape: torch.Size([32, 6]), y_1.shape: torch.Size([32, 2])


## 3.2. Copy The Model For Each Device

In [81]:
####################################### Device 0 (GPU 0)
ddp_model_0 = MLP(
  in_feature=layer_1_units,
  hidden_units=layer_2_units,
  out_feature=layer_3_units
  ).to(device)
optimizer_0 = optim.SGD(ddp_model_0.parameters(),lr=0.01)

####################################### Device 1 (GPU 1)
ddp_model_1 = MLP(
  in_feature=layer_1_units,
  hidden_units=layer_2_units,
  out_feature=layer_3_units
  ).to(device)
optimizer_1 = optim.SGD(ddp_model_1.parameters(),lr=0.01)

## 3.3. Simulate the DDP Forward and Backward Computation

In [82]:
####################################### Device 0 (GPU 0)
x_0 = x_0.to(device)
y_0 = y_0.to(device)

# Forward Pass
out_0 = ddp_model_0(x_0)

# Calculate loss
loss_0 = loss_fn(out_0, y_0)

# Zero grad
optimizer_0.zero_grad(set_to_none=True)

# Backward Pass
loss_0.backward() # local gradients


####################################### Device 1 (GPU 1)
x_1 = x_1.to(device)
y_1 = y_1.to(device)

# Forward Pass
out_1 = ddp_model_1(x_1)

# Calculate loss
loss_1 = loss_fn(out_1, y_1)

# Zero grad
optimizer_1.zero_grad(set_to_none=True)

# Backward Pass
loss_1.backward() # local gradients

# 3.4. Simulate DDP Gradients All-Reduce

In [83]:
####################################### Device 0 (GPU 0)
# hidden layer parameters
W1_0 = ddp_model_0.hidden_layer.weight
b1_0 = ddp_model_0.hidden_layer.bias

# output layer parameters
W2_0 = ddp_model_0.output_layer.weight
b2_0 = ddp_model_0.output_layer.bias

####################################### Device 1 (GPU 1)
# hidden layer parameters
W1_1 = ddp_model_1.hidden_layer.weight
b1_1 = ddp_model_1.hidden_layer.bias

# output layer parameters
W2_1 = ddp_model_1.output_layer.weight
b2_1 = ddp_model_1.output_layer.bias

###################################### All-Reduce Gradients
W1_0.grad = (W1_0.grad + W1_1.grad)/2
W1_1.grad = W1_0.grad
b1_0.grad = (b1_0.grad + b1_1.grad)/2
b1_1.grad = b1_0.grad
W2_0.grad = (W2_0.grad + W2_1.grad)/2
W2_1.grad = W2_0.grad
b2_0.grad = (b2_0.grad + b2_1.grad)/2
b2_1.grad = b2_0.grad

## 3.5. Update The Model Replicas on Each Device
Update each model replicas and compare them to make sure model replicas are consistent across the two devices

In [84]:
# Update each model replicas
optimizer_0.step()
optimizer_1.step()

In [96]:
# Function to compare two tensors
def cmp(s, t1, t2):
  ex = torch.all(t1 == t2).item()
  app = torch.allclose(t1, t2)
  maxdiff = (t1 - t2).abs().max().item()
  print(f'{s:15s} | exact: {str(ex):5s} | approximate: {str(app):5s} | maxdiff: {maxdiff}')

cmp('W1', W1_0, W1_1)
cmp('b1', b1_0, b1_1)
cmp('W2', W2_0, W2_1)
cmp('b2', b2_0, b2_1)

W1              | exact: True  | approximate: True  | maxdiff: 0.0
b1              | exact: True  | approximate: True  | maxdiff: 0.0
W2              | exact: True  | approximate: True  | maxdiff: 0.0
b2              | exact: True  | approximate: True  | maxdiff: 0.0
