In [1]:
'''ResNet in PyTorch.
For Pre-activation ResNet, see 'preact_resnet.py'.
Reference:
[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
    Deep Residual Learning for Image Recognition. arXiv:1512.03385
'''
import torch
import torch.nn as nn
import torch.nn.functional as F


class BasicBlock(nn.Module):
    """Residual block made of two 3x3 convolutions.

    Main path: conv3x3 -> BN -> ReLU -> conv3x3 -> BN. The result is added to
    the shortcut branch and passed through a final ReLU.

    Args:
        in_planes: channel count of the incoming feature map.
        planes: channel count produced by both convolutions.
        stride: stride of the first convolution and, when a projection is
            needed, of the 1x1 shortcut convolution.
    """

    # The block emits ``expansion * planes`` channels.
    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super(BasicBlock, self).__init__()
        out_planes = self.expansion * planes

        # Main path.
        self.conv1 = nn.Conv2d(
            in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False
        )
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(
            planes, planes, kernel_size=3, stride=1, padding=1, bias=False
        )
        self.bn2 = nn.BatchNorm2d(planes)

        # Shortcut: identity (empty Sequential) unless the spatial size or the
        # channel count changes, in which case a 1x1 conv + BN projects x.
        needs_projection = stride != 1 or in_planes != out_planes
        if needs_projection:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_planes),
            )
        else:
            self.shortcut = nn.Sequential()

    def forward(self, x):
        """Return ``relu(main_path(x) + shortcut(x))``."""
        hidden = self.conv1(x)
        hidden = F.relu(self.bn1(hidden))
        hidden = self.conv2(hidden)
        hidden = self.bn2(hidden)
        hidden += self.shortcut(x)
        return F.relu(hidden)


class Bottleneck(nn.Module):
    """Three-layer bottleneck residual block (1x1 -> 3x3 -> 1x1).

    The first 1x1 convolution maps to ``planes`` channels, the 3x3 convolution
    carries the stride, and the last 1x1 convolution expands to
    ``expansion * planes`` channels. The result is added to the shortcut
    branch and passed through a final ReLU.

    Args:
        in_planes: channel count of the incoming feature map.
        planes: channel count inside the bottleneck.
        stride: stride of the 3x3 convolution and, when a projection is
            needed, of the 1x1 shortcut convolution.
    """

    # The block emits ``expansion * planes`` channels.
    expansion = 4

    def __init__(self, in_planes, planes, stride=1):
        super(Bottleneck, self).__init__()
        out_planes = self.expansion * planes

        # Main path.
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(
            planes, planes, kernel_size=3, stride=stride, padding=1, bias=False
        )
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, out_planes, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(out_planes)

        # Shortcut: identity (empty Sequential) unless the spatial size or the
        # channel count changes, in which case a 1x1 conv + BN projects x.
        needs_projection = stride != 1 or in_planes != out_planes
        if needs_projection:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_planes),
            )
        else:
            self.shortcut = nn.Sequential()

    def forward(self, x):
        """Return ``relu(main_path(x) + shortcut(x))``."""
        hidden = F.relu(self.bn1(self.conv1(x)))
        hidden = F.relu(self.bn2(self.conv2(hidden)))
        hidden = self.conv3(hidden)
        hidden = self.bn3(hidden)
        hidden += self.shortcut(x)
        return F.relu(hidden)


class ResNet(nn.Module):
    """ResNet with a 3x3 stem (no initial max-pool), sized for small images.

    The network is a 3x3 stem convolution, four stages of residual blocks
    (64, 128, 256, 512 base planes; stages 2-4 start with a stride-2 block),
    global average pooling and a linear classifier.

    Args:
        block: residual block class. It must expose an ``expansion`` class
            attribute and be constructible as ``block(in_planes, planes, stride)``.
        num_blocks: sequence of four ints, the number of blocks per stage.
        num_classes: size of the classifier output.
    """

    def __init__(self, block, num_blocks, num_classes=10):
        super(ResNet, self).__init__()
        # Running channel count fed to the next block; updated by _make_layer.
        self.in_planes = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.linear = nn.Linear(512*block.expansion, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        """Build one stage: the first block uses ``stride``, the rest stride 1."""
        strides = [stride] + [1]*(num_blocks-1)
        layers = []
        # Separate loop name so the ``stride`` argument is not shadowed.
        for block_stride in strides:
            layers.append(block(self.in_planes, planes, block_stride))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*layers)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``."""
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        # Global average pooling. For 32x32 inputs the final map is 4x4, so
        # this equals the former fixed ``avg_pool2d(out, 4)``; for other input
        # sizes it still yields one value per channel instead of producing a
        # feature vector that does not fit ``self.linear``.
        out = F.adaptive_avg_pool2d(out, 1)
        out = out.flatten(1)
        out = self.linear(out)
        return out


def ResNet18(num_classes=10):
    """Build a ResNet-18: ``BasicBlock`` stages of depth [2, 2, 2, 2].

    Args:
        num_classes: size of the classifier output (default 10).
    """
    return ResNet(BasicBlock, [2,2,2,2], num_classes=num_classes)

def ResNet34(num_classes=10):
    """Build a ResNet-34: ``BasicBlock`` stages of depth [3, 4, 6, 3].

    Args:
        num_classes: size of the classifier output (default 10).
    """
    return ResNet(BasicBlock, [3,4,6,3], num_classes=num_classes)

def ResNet50(num_classes=10):
    """Build a ResNet-50: ``Bottleneck`` stages of depth [3, 4, 6, 3].

    Args:
        num_classes: size of the classifier output (default 10).
    """
    return ResNet(Bottleneck, [3,4,6,3], num_classes=num_classes)

def ResNet101(num_classes=10):
    """Build a ResNet-101: ``Bottleneck`` stages of depth [3, 4, 23, 3].

    Args:
        num_classes: size of the classifier output (default 10).
    """
    return ResNet(Bottleneck, [3,4,23,3], num_classes=num_classes)

def ResNet152(num_classes=10):
    """Build a ResNet-152: ``Bottleneck`` stages of depth [3, 8, 36, 3].

    Args:
        num_classes: size of the classifier output (default 10).
    """
    return ResNet(Bottleneck, [3,8,36,3], num_classes=num_classes)

In [8]:
import torch
import torchvision
import torchvision.transforms as transforms

# Drop cached CUDA allocations, then pick the GPU when present, else the CPU.
torch.cuda.empty_cache()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Mini-batch size shared by the training and the test loader.
BATCH_SIZE = 32

# Training pipeline: random augmentation, tensor conversion, then per-channel
# normalisation.
# NOTE(review): vertical flips are unusual for natural-image datasets such as
# CIFAR-10 — confirm RandomVerticalFlip is intended.
transform_train = transforms.Compose(
    [
        transforms.RandomCrop(32, padding=4),
        transforms.RandomVerticalFlip(),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ]
)

# Evaluation pipeline: no augmentation, same normalisation constants.
transform_test = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ]
)

# Training split (shuffled on every pass).
trainset = torchvision.datasets.CIFAR10(
    root='./data',
    train=True,
    download=True,
    transform=transform_train,
)
trainloader = torch.utils.data.DataLoader(
    trainset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=2,
    pin_memory=True,
)

# Test split (fixed order).
testset = torchvision.datasets.CIFAR10(
    root='./data',
    train=False,
    download=True,
    transform=transform_test,
)
testloader = torch.utils.data.DataLoader(
    testset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=2,
    pin_memory=True,
)

# Human-readable class names, one per label.
classes = (
    'plane', 'car', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
)

Files already downloaded and verified
Files already downloaded and verified


In [3]:
def test(epoch, net, criterion):
    device      = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    start = time.time()
    global best_acc
    net.eval()
    test_loss = 0
    correct = 0
    total = 0
    
    losses = []
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(testloader):
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = net(inputs)
            loss = criterion(outputs, targets)

            test_loss += loss.item()
            losses.append(loss.item())
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
        stop = time.time()
        print('[Time: %4.3fs][epoch %3d] Loss: %.3f | Acc: %.3f%% (%d/%d)'%(stop - start, epoch, test_loss/(batch_idx+1), 100.*correct/total, correct, total))
    
    # Return the accuracy
    return 100.*correct/total

In [10]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
import math
import time

class Runtime_Info(object):
    """Plain record of the results and timing of one training run.

    Instances order by accuracy, best first: ``a < b`` holds when
    ``a.accuracy > b.accuracy``, so ``sorted(runs)[0]`` is the most accurate
    run. Equality and hashing are left as the default identity semantics.

    Attributes are stored exactly as passed to the constructor:
        acc_progression, all_losses, runtime, pre_time, pro_time, com_time,
        rounds, accuracy, network.
    """

    def __init__(self, acc_progression, all_losses, runtime, pre_time, pro_time, com_time, rounds, accuracy, network):
        self.acc_progression  = acc_progression
        self.all_losses       = all_losses
        self.runtime          = runtime
        self.pre_time         = pre_time
        self.pro_time         = pro_time
        self.com_time         = com_time
        self.rounds           = rounds
        self.accuracy         = accuracy
        self.network          = network

    def __cmp__(self, other):
        """Three-way comparison in descending accuracy order.

        Python 3 never calls ``__cmp__`` and numbers no longer provide it, so
        the result is computed directly. Returns -1, 0 or 1, or ``None`` when
        ``other`` has no ``accuracy`` attribute.
        """
        if hasattr(other, 'accuracy'):
            return (other.accuracy > self.accuracy) - (other.accuracy < self.accuracy)
        return None

    # Rich comparisons: the ordering Python 3 actually uses for sorted()/min()/max().
    # Each is inverted on purpose so that higher accuracy sorts first.
    def __lt__(self, other):
        if not hasattr(other, 'accuracy'):
            return NotImplemented
        return self.accuracy > other.accuracy

    def __gt__(self, other):
        if not hasattr(other, 'accuracy'):
            return NotImplemented
        return self.accuracy < other.accuracy

    def __le__(self, other):
        if not hasattr(other, 'accuracy'):
            return NotImplemented
        return self.accuracy >= other.accuracy

    def __ge__(self, other):
        if not hasattr(other, 'accuracy'):
            return NotImplemented
        return self.accuracy <= other.accuracy

def Run_Test(Batch_Size, Learning_Rate, Batch_Loops, max_epochs, target_accuracy=90, max_minutes=120):
    """Train a fresh ResNet18 on ``trainloader``, evaluating after every epoch.

    Training stops after ``max_epochs`` epochs, or earlier once the test
    accuracy exceeds ``target_accuracy`` or the wall-clock budget is spent.

    Args:
        Batch_Size: currently unused; the batch size is whatever the
            notebook-level ``trainloader`` was built with.
        Learning_Rate: learning rate for SGD (momentum 0.9, weight decay 5e-4).
        Batch_Loops: number of consecutive optimisation steps taken on each
            batch while it is resident on the device. Must be >= 1.
        max_epochs: upper bound on the number of epochs.
        target_accuracy: test accuracy (percent) above which training stops.
        max_minutes: wall-clock budget in minutes, checked after each epoch.

    Returns:
        Runtime_Info holding the per-epoch accuracies, the per-batch losses,
        the elapsed wall-clock time in seconds, the summed preprocessing /
        processing / remaining ("comm.") times, the number of completed
        epochs, the best accuracy seen, and the final ``net.state_dict()``.

    Raises:
        ValueError: if ``Batch_Loops`` is smaller than 1.
    """
    if Batch_Loops < 1:
        raise ValueError("Batch_Loops must be at least 1")

    torch.cuda.empty_cache()

    # Create the network and loss on the selected device. ``.to(device)``
    # rather than ``.cuda()`` keeps the CPU fallback usable.
    device      = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    net         = ResNet18().to(device)

    # Criterion and optimizer
    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = optim.SGD(net.parameters(), lr=Learning_Rate, momentum=0.9, weight_decay=5e-4)

    # Per-epoch timers (reset after every epoch) and recorded histories
    preprocessing_time = 0.
    processing_time    = 0.
    all_losses         = []
    acc_progression    = []

    # Event / epoch timestamps; zero so the first header line prints 0.000s
    stop      = 0
    beginning = 0
    end       = 0

    # Totals over the whole run
    pre_time  = 0
    pro_time  = 0
    com_time  = 0
    rounds    = 0

    # Full training loop; ``runtime`` is the start timestamp of the run
    runtime   = time.time()
    for epoch in range(max_epochs):
        # Header reports the duration of the previous epoch
        print("[Epoch %d duration: %3.3fs][Total duration: %3.3fs]\n"% (epoch, float(end - beginning), float(time.time() - runtime)))
        beginning = time.time()

        # test() leaves the network in eval mode; switch back so BatchNorm
        # uses batch statistics and updates its running estimates.
        net.train()

        for local_batch, local_labels in trainloader:
            # Host-to-device transfer is timed as "preprocessing"
            start = time.time()
            inputs, labels = local_batch.to(device), local_labels.to(device)
            stop  = time.time()
            preprocessing_time += stop - start
            start = time.time()

            # Take Batch_Loops optimisation steps on this batch before
            # evicting it. Gradients are cleared before every step so repeated
            # steps do not accumulate stale gradients.
            for _ in range(Batch_Loops):
                optimizer.zero_grad()
                outputs = net(inputs)
                loss    = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

            del inputs, labels

            # Record the loss of the last step for visualization of training
            all_losses.append(loss.item())

            stop = time.time()
            processing_time += stop - start

        end       = time.time()
        # Whatever the epoch spent outside transfer and compute (data loading etc.)
        comm_time = (end - beginning) - processing_time - preprocessing_time
        print('[Time: %4.3fs][epoch %3d] compute time %.3f, comm. overhead %.3f, epoch tot. time %3.3f' %
                (stop - beginning, epoch + 1, processing_time, comm_time, end - beginning) )
        accuracy       = test(epoch, net, criterion)

        # Keep record of the total portion of runtime each section requires
        pro_time += processing_time
        com_time += comm_time
        pre_time += preprocessing_time
        rounds   += 1
        preprocessing_time  = 0
        processing_time     = 0

        # Record accuracy for visualization of accuracy progression
        acc_progression.append(accuracy)
        if accuracy > target_accuracy or float(time.time() - runtime) > (60 * max_minutes):
            break

    # Calculate the final runtime
    total_time = time.time() - runtime
    # No epochs ran (max_epochs == 0): report 0.0 instead of failing in max()
    accuracy   = max(acc_progression) if acc_progression else 0.0
    print("Final accuracy over %d Epochs: %3.3f, Elapsed time: %3.3f"%(rounds, accuracy, total_time))
    # Store the elapsed time, not the start timestamp, as the run's runtime
    return Runtime_Info(acc_progression, all_losses, total_time, pre_time, pro_time, com_time, rounds, accuracy, net.state_dict())

In [11]:
# Baseline run: a single optimisation step per batch, at most 150 epochs.
Run_Test(
    Batch_Size=32,
    Learning_Rate=.001,
    Batch_Loops=1,
    max_epochs=150,
)

[Epoch 0 duration: 0.000s][Total duration: 0.000s]

[Time: 87.241s][epoch   1] compute time 84.398, comm. overhead 2.632, epoch tot. time 87.241
[Time: 5.521s][epoch   0] Loss: 1.418 | Acc: 50.020% (5002/10000)
[Epoch 1 duration: 87.241s][Total duration: 92.765s]

[Time: 86.999s][epoch   2] compute time 84.112, comm. overhead 2.660, epoch tot. time 86.999
[Time: 5.529s][epoch   1] Loss: 1.207 | Acc: 56.530% (5653/10000)
[Epoch 2 duration: 86.999s][Total duration: 185.296s]

[Time: 86.694s][epoch   3] compute time 83.813, comm. overhead 2.670, epoch tot. time 86.694
[Time: 5.519s][epoch   2] Loss: 1.096 | Acc: 60.990% (6099/10000)
[Epoch 3 duration: 86.694s][Total duration: 277.512s]

[Time: 87.106s][epoch   4] compute time 84.222, comm. overhead 2.659, epoch tot. time 87.106
[Time: 5.507s][epoch   3] Loss: 1.045 | Acc: 61.170% (6117/10000)
[Epoch 4 duration: 87.106s][Total duration: 370.129s]

[Time: 86.541s][epoch   5] compute time 83.667, comm. overhead 2.664, epoch tot. time 86.541


KeyboardInterrupt: 