In [1]:
import os
import torch
import torch.nn as nn
import torchvision
from torch.utils.data import DataLoader
from torchvision.datasets import MNIST
from torchvision import transforms


from typing import List, Tuple, Dict, Any, Union
from collections import OrderedDict

from resnet import build_resnet, resnet50, resnet100
from gpt_2 import build_GPT2, gpt2_small

DATA_DIR = './data'
CKPT_DIR = './checkpoint'

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from torchgpipe import GPipe as tGPipe
from MyGPipe import GPipe as SelfGPipe
from CleanParallel.parallel import GPipe as CleanGPipe

In [3]:
def split_module(
      module: torch.nn.Module, 
      partition_sizes: List[int], 
    ) -> nn.ModuleList:
    print('-' * 80)
    print('Splitting the module in MyGPipe...')
    layers = OrderedDict()
    partitions = []

    i = 0
    for name, layer in module.named_children():
        layers[name] = layer
        if len(layers) == partition_sizes[i]:
            # partitions.append(nn.Sequential(layers).to(devices[i]))
            partitions.append(nn.Sequential(layers))
            # print_mem_usage(devices[i])

            layers.clear()
            i += 1
    
    return torch.nn.ModuleList(partitions)

In [2]:
model = gpt2_small()
for k, v in model.named_children():
    print(k, v)

embedding Embedding(50257, 768)
pos_encoding PositionalEncoding()
blocks_0 GPT2Block(
  (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (attn): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
  )
  (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (mlp): Sequential(
    (0): Linear(in_features=768, out_features=3072, bias=True)
    (1): GELU(approximate='none')
    (2): Dropout(p=0.1, inplace=False)
    (3): Linear(in_features=3072, out_features=768, bias=True)
    (4): Dropout(p=0.1, inplace=False)
  )
)
blocks_1 GPT2Block(
  (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (attn): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
  )
  (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (mlp): Sequential(
    (0): Linear(in_features=768, out_features=3072, bias=True)
    (1): GELU(approximate='none')

In [6]:
model = resnet50()
for k, v in model.named_children():
    print(k, v)

conv1 Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
bn1 BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
relu ReLU()
maxpool MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
layer1_0 ResidualBlock(
  (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
layer1_1 ResidualBlock(
  (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (bn2)

In [3]:
print(len(model))

15


In [7]:
import torch.nn.functional as F
from torch.optim import SGD

x = torch.randn(1, 3, 224, 224)
target = torch.tensor([1])
with torch.no_grad():
    for name, layer in model.named_children():
        x = layer(x)
        print(name, '\t' , x.shape)
    optimizer = SGD(model.parameters(), lr=0.1)
    loss = F.cross_entropy(x, target)
print(f"Loss: {loss}")

conv1 	 torch.Size([1, 64, 112, 112])
bn1 	 torch.Size([1, 64, 112, 112])
relu 	 torch.Size([1, 64, 112, 112])
maxpool 	 torch.Size([1, 64, 56, 56])
layer1_0 	 torch.Size([1, 64, 56, 56])
layer1_1 	 torch.Size([1, 64, 56, 56])
layer1_2 	 torch.Size([1, 64, 56, 56])
layer2_0 	 torch.Size([1, 128, 28, 28])
layer2_1 	 torch.Size([1, 128, 28, 28])


layer2_2 	 torch.Size([1, 128, 28, 28])
layer2_3 	 torch.Size([1, 128, 28, 28])
layer3_0 	 torch.Size([1, 256, 14, 14])
layer3_1 	 torch.Size([1, 256, 14, 14])
layer3_2 	 torch.Size([1, 256, 14, 14])
layer3_3 	 torch.Size([1, 256, 14, 14])
layer3_4 	 torch.Size([1, 256, 14, 14])
layer3_5 	 torch.Size([1, 256, 14, 14])
layer4_0 	 torch.Size([1, 512, 7, 7])
layer4_1 	 torch.Size([1, 512, 7, 7])
layer4_2 	 torch.Size([1, 512, 7, 7])
avgpool 	 torch.Size([1, 512, 1, 1])
flat 	 torch.Size([1, 512])
fc 	 torch.Size([1, 10])
Loss: 1.5949032306671143


In [8]:
def build_train_stuffs(model: nn.Module, batch_size: int):
    # 1. define network
    # 2. define dataloader
    trainset = torchvision.datasets.CIFAR10(
        root=DATA_DIR,
        train=True,
        download=True,
        transform=transforms.Compose(
            [
                transforms.RandomCrop(32, padding=4),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                transforms.Normalize(
                    (0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)
                ),
            ]
        ),
    )
    train_loader = torch.utils.data.DataLoader(
        trainset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=4,
        pin_memory=True,
    )

    # 3. define loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(
        model.parameters(),
        lr=0.01,
        momentum=0.9,
        weight_decay=0.0001,
        nesterov=True,
    )

    return train_loader, criterion, optimizer
train_loader, criterion, optimizer = build_train_stuffs(model, 128)

Files already downloaded and verified


In [10]:
num_epoches = 1
batch_size = 128
num_samples = 10
ckpt_save_path = os.path.join(CKPT_DIR, 'resnet_model.bin')
# model = pipeline_model(model, gpipe_type=gpipe_type)

print("Profiling...")
model.train()
# trace_dir = os.path.join(TRACE_DIR, gpipe_type)
for ep in range(1, num_epoches + 1):
    train_loss = correct = total = 0

    for idx, (inputs, targets) in enumerate(train_loader):
        if idx == num_samples:
            break
        # inputs, targets = inputs.cuda(), targets.cuda() if torch.cuda.is_available() else inputs, targets

        outputs = inputs
        for layer in model:
            outputs = layer(outputs)

        loss = criterion(outputs, targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()

        if (idx + 1) % 50 == 0 or (idx + 1) == len(train_loader):
            print(
                "   == step: [{:3}/{}] [{}/{}] | loss: {:.3f} | acc: {:6.3f}%".format(
                    idx + 1,
                    len(train_loader),
                    ep,
                    num_epoches,
                    train_loss / (idx + 1),
                    100.0 * correct / total,
                )
            )
    # save checkpoint
    torch.save(model.state_dict(), ckpt_save_path)
    

Profiling...


RuntimeError: Parent directory ./checkpoint does not exist.