In [1]:
import os
import torch
import json
import copy
import numpy as np
from torchvision import datasets, transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import logging
import random
import datetime as dt

In [2]:
# Layer layouts for the supported VGG variants, keyed by variant name and consumed by
# _make_layers(): an integer entry is the output-channel count of a 3x3 convolution
# stage, and 'M' marks a 2x2 max-pooling layer. Every layout contains five 'M' entries
# and ends with 512 channels.
_cfg = {
    'VGG11': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
    'VGG13': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
    'VGG16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'],
    'VGG19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'],
}


def _make_layers(cfg):
    """
    Build the convolutional feature extractor described by ``cfg``.

    cfg (iterable): layer layout; an integer adds a Conv2d(3x3, stride 1, padding 1)
        -> BatchNorm2d -> ReLU stage with that many output channels, and the
        string 'M' adds a 2x2 max-pool with stride 2.

    Returns an ``nn.Sequential`` holding the modules in layout order. The first
    convolution expects 3 input channels.
    """
    modules = []
    prev_channels = 3
    for entry in cfg:
        if entry != 'M':
            # Conv stage: padding=1 with a 3x3 kernel keeps the spatial size unchanged.
            modules.extend([
                nn.Conv2d(prev_channels, entry, kernel_size=3, stride=1, padding=1, bias=True),
                nn.BatchNorm2d(entry),
                nn.ReLU(inplace=True),
            ])
            prev_channels = entry
            continue
        # 'M' entry: halve the spatial resolution.
        modules.append(nn.MaxPool2d(kernel_size=2, stride=2))
    return nn.Sequential(*modules)


class _VGG(nn.Module):
    """
    VGG-style classifier for 3x32x32 inputs and 10 output classes.

    ``name`` selects a layout from ``_cfg``; an unknown name raises ``KeyError``.
    """

    def __init__(self, name='VGG11'):
        super().__init__()
        # The attribute names `layers` and `fc1` become the state_dict key prefixes.
        self.layers = _make_layers(_cfg[name])
        # Each layout in _cfg has five 2x2 max-pools and ends with 512 channels, so a
        # 32x32 input is reduced to 512 x 1 x 1, i.e. 512 flattened features.
        self.fc1 = nn.Linear(in_features=512, out_features=10)

    def forward(self, x):
        """Return the 10 class scores (logits) for a batch of images ``x``."""
        features = self.layers(x)
        # Collapse everything except the batch dimension before the linear layer.
        flat = features.view(features.size(0), -1)
        return self.fc1(flat)


def VGG11():
    """Create a freshly initialised ``_VGG`` network using the 'VGG11' layout."""
    model = _VGG(name='VGG11')
    return model


In [3]:
# from google.colab import files
# files.upload()

# import model as mdl

In [4]:
# Cap the number of threads PyTorch uses for intra-op parallelism on the CPU.
torch.set_num_threads(4)
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
# (Assign device = "cpu" instead to force CPU execution.)
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [5]:
# Mini-batch size on a single node; main() passes it to both the train and test DataLoaders.
batch_size = 256

In [6]:
def train_model(model, train_loader, optimizer, criterion, epoch):
    """
    Train ``model`` for one full pass over ``train_loader`` and save a checkpoint.

    model (torch.nn.Module): The model to train; it is switched to training mode here
    train_loader (pytorch data loader): Training data loader yielding (data, target) batches
    optimizer (optimizer.*): An instance of some sort of optimizer, usually SGD
    criterion (nn.CrossEntropyLoss) : Loss function used to train the network
    epoch (int): Current (zero-based) epoch number, used only for log messages

    Side effects: prints the mean loss every 20 mini-batches, prints the epoch
    duration, and overwrites ``checkpoint.pt`` with the model's state_dict.
    Batches are moved to the module-level ``device``. Returns None.
    """
    # test_model() puts the network into eval mode after every epoch. Without switching
    # back, every epoch after the first would run BatchNorm in inference mode (frozen
    # running statistics) while the weights are still being updated.
    model.train()

    msg_iteration = 20  # report the mean loss every msg_iteration mini-batches
    epoch_startTime = dt.datetime.now()
    running_loss = 0.0
    for batch_idx, (data, target) in enumerate(train_loader):
        # Started after the loader has yielded, so data-loading time is not included.
        batch_startTime = dt.datetime.now()
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        outputs = model(data)
        loss = criterion(outputs, target)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        if batch_idx % msg_iteration == msg_iteration - 1:
            batch_endTime = dt.datetime.now()
            # Wall-clock time of this one batch, not of the whole reporting window.
            batch_time = "{:.2f}".format((batch_endTime - batch_startTime).total_seconds())
            print(f'epoch : {epoch + 1} batch_no:{batch_idx + 1:5d} MeanLoss_last_{msg_iteration}_batches: {running_loss / msg_iteration:.3f} current_batch_time: {batch_time} secs')
            running_loss = 0.0

    epoch_endTime = dt.datetime.now()
    epoch_time = "{:.2f}".format((epoch_endTime - epoch_startTime).total_seconds())
    print("Time taken for epoch : ",epoch + 1," = ",epoch_time," secs" )

    # The path is fixed, so each epoch overwrites the previous checkpoint.
    ckp = model.state_dict()
    PATH = "checkpoint.pt"
    torch.save(ckp, PATH)
    print(f"Epoch {epoch + 1} | Training checkpoint saved at {PATH}")

    return None

In [7]:
def test_model(model, test_loader, criterion):
    """
    Evaluate ``model`` on ``test_loader`` and print the average loss and accuracy.

    model (torch.nn.Module): The model to evaluate
    test_loader (pytorch data loader): Loader yielding (data, target) batches
    criterion (nn.CrossEntropyLoss) : Loss function evaluated on every batch

    The reported loss is the mean of the per-batch criterion values (total divided by
    the number of batches); accuracy is counted over ``len(test_loader.dataset)``.
    Batches are moved to the module-level ``device``.

    Note: the model is switched to eval mode and is left in that mode on return.
    Returns None.
    """
    model.eval()
    total_loss = 0.0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            # Accumulate a Python float (as train_model does) rather than a tensor.
            total_loss += criterion(output, target).item()
            # Predicted class = index of the largest score along the class dimension.
            pred = output.argmax(dim=1)
            correct += pred.eq(target.view_as(pred)).sum().item()

    num_batches = len(test_loader)
    num_samples = len(test_loader.dataset)
    # An empty loader/dataset reports NaN instead of raising ZeroDivisionError.
    test_loss = total_loss / num_batches if num_batches else float('nan')
    accuracy = 100. * correct / num_samples if num_samples else float('nan')
    print('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
            test_loss, correct, num_samples, accuracy))

In [8]:
def main():
    """
    Train VGG11 on CIFAR-10 for 30 epochs, evaluating on the test split after each epoch.

    Downloads CIFAR-10 into ``./data`` if needed, builds the train/test loaders with
    the module-level ``batch_size``, and optimises with SGD (lr=0.001, momentum=0.9,
    weight_decay=0.0001) on the module-level ``device``.
    """
    # Per-channel mean/std for Normalize. The constants are on the 0-255 scale and are
    # divided by 255 to match the [0, 1] range produced by ToTensor.
    channel_mean = [value / 255.0 for value in (125.3, 123.0, 113.9)]
    channel_std = [value / 255.0 for value in (63.0, 62.1, 66.7)]
    normalize = transforms.Normalize(mean=channel_mean, std=channel_std)

    # Training images are augmented (padded random crop + horizontal flip);
    # test images are only converted and normalised.
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])
    transform_test = transforms.Compose([transforms.ToTensor(), normalize])

    # Settings shared by both loaders.
    loader_options = dict(num_workers=2, batch_size=batch_size, pin_memory=True)

    training_set = datasets.CIFAR10(root="./data", train=True,
                                    download=True, transform=transform_train)
    train_loader = torch.utils.data.DataLoader(training_set, sampler=None,
                                               shuffle=True, **loader_options)

    test_set = datasets.CIFAR10(root="./data", train=False,
                                download=True, transform=transform_test)
    test_loader = torch.utils.data.DataLoader(test_set, shuffle=False,
                                              **loader_options)

    training_criterion = nn.CrossEntropyLoss().to(device)

    model = VGG11()
    model.to(device)
    optimizer = optim.SGD(model.parameters(), lr=0.001,
                          momentum=0.9, weight_decay=0.0001)

    # Alternate one training pass with one evaluation pass, 30 times.
    num_epochs = 30
    for epoch in range(num_epochs):
        train_model(model, train_loader, optimizer, training_criterion, epoch)
        test_model(model, test_loader, training_criterion)

In [9]:
# Start the full training run when this cell/script is executed directly
# (not when the code is imported as a module).
if __name__ == "__main__":
    main()

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:03<00:00, 45689962.39it/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified
epoch : 1 batch_no:   20 MeanLoss_last_20_batches: 2.092 current_batch_time: 0.05 secs
epoch : 1 batch_no:   40 MeanLoss_last_20_batches: 1.743 current_batch_time: 0.05 secs
epoch : 1 batch_no:   60 MeanLoss_last_20_batches: 1.560 current_batch_time: 0.05 secs
epoch : 1 batch_no:   80 MeanLoss_last_20_batches: 1.490 current_batch_time: 0.05 secs
epoch : 1 batch_no:  100 MeanLoss_last_20_batches: 1.399 current_batch_time: 0.05 secs
epoch : 1 batch_no:  120 MeanLoss_last_20_batches: 1.334 current_batch_time: 0.05 secs
epoch : 1 batch_no:  140 MeanLoss_last_20_batches: 1.309 current_batch_time: 0.05 secs
epoch : 1 batch_no:  160 MeanLoss_last_20_batches: 1.257 current_batch_time: 0.05 secs
epoch : 1 batch_no:  180 MeanLoss_last_20_batches: 1.226 current_batch_time: 0.04 secs
Time taken for epoch :  1  =  30.00  secs
Epoch 1 | Training checkpoint saved at checkpoint.pt
Test set: Average loss: 1.1641, A