In [49]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torch.utils.data import sampler

import torchvision.datasets as dset
import torchvision.transforms as T

import numpy as np

class ChunkSampler(sampler.Sampler):
    """Sampler that walks a contiguous run of indices in increasing order.

    Arguments:
        num_samples: how many indices the sampler produces
        start: the first index produced (defaults to 0)
    """
    def __init__(self, num_samples, start = 0):
        self.start = start
        self.num_samples = num_samples

    def __len__(self):
        # The length is exactly the requested number of indices.
        return self.num_samples

    def __iter__(self):
        # Half-open interval [start, start + num_samples).
        first = self.start
        stop = first + self.num_samples
        return iter(range(first, stop))

# The train=True split is divided by index: the first NUM_TRAIN samples are
# used for training and the NUM_VAL samples that follow are held out for
# validation (see the ChunkSampler offsets below).
NUM_TRAIN = 49000
NUM_VAL = 1000

# Training loader: indices [0, NUM_TRAIN) of the train=True split, in order.
cifar10_train = dset.CIFAR10('./cs231n/datasets', train=True, download=True,
                           transform=T.ToTensor())
loader_train = DataLoader(cifar10_train, batch_size=64, sampler=ChunkSampler(NUM_TRAIN, 0))

# Validation loader: the same train=True split, but indices
# [NUM_TRAIN, NUM_TRAIN + NUM_VAL), so it does not overlap the training chunk.
cifar10_val = dset.CIFAR10('./cs231n/datasets', train=True, download=True,
                           transform=T.ToTensor())
loader_val = DataLoader(cifar10_val, batch_size=64, sampler=ChunkSampler(NUM_VAL, NUM_TRAIN))

# Test loader: the train=False split, no custom sampler.
cifar10_test = dset.CIFAR10('./cs231n/datasets', train=False, download=True,
                          transform=T.ToTensor())
loader_test = DataLoader(cifar10_test, batch_size=64)

dtype = torch.FloatTensor # the CPU datatype

# Constant to control how frequently we print train loss
print_every = 100

def reset(m):
    """Re-initialize the parameters of ``m`` if it knows how to.

    Objects that do not expose a ``reset_parameters`` attribute are left
    untouched, so this helper can be called on any module to reset a model.
    """
    if not hasattr(m, 'reset_parameters'):
        return
    m.reset_parameters()

class Flatten(nn.Module):
    """Collapse every dimension after the first into a single one.

    An input of size (N, d1, d2, ...) is returned as a view of size
    (N, d1 * d2 * ...). The leading size is the only one that is read, so the
    layer accepts an (N, C, H, W) image batch as well as inputs of any other
    rank >= 1.
    """
    def forward(self, x):
        # Keep the first dimension, let view() infer the product of the rest.
        return x.view(x.size(0), -1)

Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified


In [52]:
# CUDA float tensor type; the batches, models and loss in this notebook are
# cast to it via .type(gpu_dtype).
gpu_dtype = torch.cuda.FloatTensor

def check_accuracy(model, loader):
    """Print how many samples served by ``loader`` the model classifies correctly.

    Arguments:
        model: module mapping a batch cast to ``gpu_dtype`` to per-class scores
        loader: iterable of (inputs, labels) batches; ``loader.dataset.train``
            only decides which banner is printed (truthy -> "validation",
            otherwise "test")

    The model is switched to eval mode and is left in that mode. Nothing is
    returned; the result is reported through ``print``.

    NOTE(review): ``Variable(..., volatile=True)`` is the pre-0.4 PyTorch way
    of disabling autograd for inference - confirm the targeted PyTorch version
    still honours it.
    """
    banner = ('Checking accuracy on validation set' if loader.dataset.train
              else 'Checking accuracy on test set')
    print(banner)

    model.eval() # inference behaviour (the counterpart of model.train())
    hits, seen = 0, 0
    for inputs, labels in loader:
        batch = Variable(inputs.type(gpu_dtype), volatile=True)
        # max(1) yields (values, indices); the indices are the predicted classes.
        # Scores are moved to the CPU so they can be compared with the labels.
        predictions = model(batch).data.cpu().max(1)[1]
        hits += (predictions == labels).sum()
        seen += predictions.size(0)

    acc = float(hits) / seen
    print('Got %d / %d correct (%.2f)' % (hits, seen, 100 * acc))

def train(model, loss_fn, optimizer, num_epochs = 1):
    """Train ``model`` on ``loader_train`` for ``num_epochs`` passes.

    Arguments:
        model: module to optimise; it receives batches cast to ``gpu_dtype``
        loss_fn: callable taking (scores, labels) and returning the loss
        optimizer: optimizer whose zero_grad()/step() drive the update
        num_epochs: number of passes over ``loader_train``

    Each epoch starts by reporting accuracy on ``loader_val`` and then puts
    the model back into training mode. The loss is printed every
    ``print_every`` iterations. Relies on the notebook-level names
    ``loader_train``, ``loader_val``, ``print_every`` and ``gpu_dtype``.

    NOTE(review): ``loss.data[0]`` is the pre-0.4 PyTorch way of reading a
    scalar loss - confirm against the targeted PyTorch version.
    """
    for epoch_number in range(1, num_epochs + 1):
        print('Starting epoch %d / %d' % (epoch_number, num_epochs))
        check_accuracy(model, loader_val)
        model.train() # check_accuracy left the model in eval mode

        for step, (inputs, labels) in enumerate(loader_train, 1):
            batch = Variable(inputs.type(gpu_dtype))
            targets = Variable(labels.type(gpu_dtype).long())

            loss = loss_fn(model(batch), targets)
            if step % print_every == 0:
                print('t = %d, loss = %.4f' % (step, loss.data[0]))

            # Standard update: clear old gradients, backpropagate, step.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

class NormReLU(nn.Module):
    """Batch normalisation followed by an in-place ReLU.

    Arguments:
        num_features: number of channels / features being normalised

    The layer wraps ``nn.BatchNorm2d`` and is used in this notebook both after
    convolutions (4-D input) and after ``nn.Linear`` layers (2-D input). A 2-D
    (N, C) input is lifted to (N, C, 1, 1) before normalisation and squeezed
    back afterwards, which normalises each feature over the batch and keeps
    the layer usable when ``BatchNorm2d`` rejects input that is not 4-D.
    """
    def __init__(self, num_features):
        super(NormReLU, self).__init__()
        self.bn = nn.BatchNorm2d(num_features)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        is_flat = x.dim() == 2
        if is_flat:
            # (N, C) -> (N, C, 1, 1) so BatchNorm2d sees the layout it expects.
            x = x.unsqueeze(2).unsqueeze(3)
        out = self.relu(self.bn(x))
        if is_flat:
            # Restore the caller's (N, C) layout.
            out = out.squeeze(3).squeeze(2)
        return out

# def NormRelu(num_features):
#     return nn.Sequential(
#         nn.BatchNorm2d(num_features),
#         nn.ReLU(inplace=True)
#     )
# NormRelu = nn.Sequential()

In [37]:
# Baseline CNN: four 3x3 conv + NormReLU layers (64 filters each) with two
# 2x2 max-pools, followed by a three-layer fully connected classifier.
model = nn.Sequential(
    nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1),
    NormReLU(64),
    nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1),
    NormReLU(64),
    nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1),
    NormReLU(64),
    nn.MaxPool2d(2, stride=2),
    nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1),
    NormReLU(64),
    nn.MaxPool2d(2, stride=2),
    Flatten(),
    # 4096 inputs = 64 channels x 64 spatial positions left after the two
    # pools (8x8, assuming 32x32 input images - TODO confirm).
    nn.Linear(4096, 1024),
    # NOTE(review): NormReLU wraps nn.BatchNorm2d but is fed the 2-D output
    # of nn.Linear here - confirm the installed PyTorch accepts that.
    NormReLU(1024),
    nn.Linear(1024, 1024),
    NormReLU(1024),
    nn.Linear(1024, 10),
).type(gpu_dtype)

loss_fn = nn.CrossEntropyLoss().type(gpu_dtype)
# Adam with its default hyper-parameters.
optimizer = optim.Adam(model.parameters())

train(model, loss_fn, optimizer, num_epochs=20)
# Accuracy on the validation loader after the last epoch has finished.
check_accuracy(model, loader_val)

Starting epoch 1 / 20
t = 100, loss = 1.3083
t = 200, loss = 1.1963
t = 300, loss = 0.9826
t = 400, loss = 0.7789
t = 500, loss = 0.7385
t = 600, loss = 0.8927
t = 700, loss = 1.1731
Starting epoch 2 / 20
t = 100, loss = 0.5525
t = 200, loss = 0.7502
t = 300, loss = 0.7011
t = 400, loss = 0.5488
t = 500, loss = 0.5556
t = 600, loss = 0.5953
t = 700, loss = 0.8116
Starting epoch 3 / 20
t = 100, loss = 0.3301
t = 200, loss = 0.5374
t = 300, loss = 0.4179
t = 400, loss = 0.3307
t = 500, loss = 0.3622
t = 600, loss = 0.3075
t = 700, loss = 0.5627
Starting epoch 4 / 20
t = 100, loss = 0.2026
t = 200, loss = 0.2604
t = 300, loss = 0.2310
t = 400, loss = 0.1658
t = 500, loss = 0.1967
t = 600, loss = 0.2934
t = 700, loss = 0.3703
Starting epoch 5 / 20
t = 100, loss = 0.0918
t = 200, loss = 0.0783
t = 300, loss = 0.0881
t = 400, loss = 0.1317
t = 500, loss = 0.1243
t = 600, loss = 0.1869
t = 700, loss = 0.3971
Starting epoch 6 / 20
t = 100, loss = 0.1143
t = 200, loss = 0.1283
t = 300, loss = 0

In [39]:
# Deeper CNN: three groups of two 3x3 conv + NormReLU layers (64, 64 and 128
# filters), each group followed by a 2x2 max-pool, then a three-layer fully
# connected classifier.
model = nn.Sequential(
    nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1),
    NormReLU(64),
    nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1),
    NormReLU(64),
    nn.MaxPool2d(2, stride=2),
    nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1),
    NormReLU(64),
    nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1),
    NormReLU(64),
    nn.MaxPool2d(2, stride=2),
    nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
    NormReLU(128),
    nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1),
    NormReLU(128),
    nn.MaxPool2d(2, stride=2),
    Flatten(),
    # 2048 inputs = 128 channels x 16 spatial positions left after the three
    # pools (4x4, assuming 32x32 input images - TODO confirm).
    nn.Linear(2048, 1024),
    # NOTE(review): NormReLU wraps nn.BatchNorm2d but is fed the 2-D output
    # of nn.Linear here - confirm the installed PyTorch accepts that.
    NormReLU(1024),
    nn.Linear(1024, 1024),
    NormReLU(1024),
    nn.Linear(1024, 10),
).type(gpu_dtype)

loss_fn = nn.CrossEntropyLoss().type(gpu_dtype)
# Adam with its default hyper-parameters.
optimizer = optim.Adam(model.parameters())

train(model, loss_fn, optimizer, num_epochs=5)
# Accuracy on the validation loader after the last epoch has finished.
check_accuracy(model, loader_val)

Starting epoch 1 / 5
t = 100, loss = 1.2715
t = 200, loss = 1.2823
t = 300, loss = 1.1503
t = 400, loss = 0.7735
t = 500, loss = 0.9118
t = 600, loss = 0.8282
t = 700, loss = 1.0795
Starting epoch 2 / 5
t = 100, loss = 0.6075
t = 200, loss = 0.7615
t = 300, loss = 0.8118
t = 400, loss = 0.6404
t = 500, loss = 0.6070
t = 600, loss = 0.4910
t = 700, loss = 0.7292
Starting epoch 3 / 5
t = 100, loss = 0.3637
t = 200, loss = 0.5214
t = 300, loss = 0.6097
t = 400, loss = 0.5482
t = 500, loss = 0.4808
t = 600, loss = 0.3414
t = 700, loss = 0.5616
Starting epoch 4 / 5
t = 100, loss = 0.2474
t = 200, loss = 0.4462
t = 300, loss = 0.4560
t = 400, loss = 0.3813
t = 500, loss = 0.3232
t = 600, loss = 0.2337
t = 700, loss = 0.4190
Starting epoch 5 / 5
t = 100, loss = 0.1474
t = 200, loss = 0.2500
t = 300, loss = 0.3246
t = 400, loss = 0.2570
t = 500, loss = 0.2150
t = 600, loss = 0.1830
t = 700, loss = 0.2991
Checking accuracy on validation set
Got 736 / 1000 correct (73.60)


In [None]:
class ResBlock(nn.Module):
    """Residual block: two 3x3 conv + batch-norm layers plus a shortcut.

    Arguments:
        num_filters: number of output channels of both convolutions
        channels_in: number of input channels. When omitted (or falsy) the
            input is taken to have ``num_filters`` channels and the shortcut
            is the identity; when given, the shortcut is a 1x1 convolution
            with the same ``stride`` that maps the input to the output shape.
        stride: stride of the first convolution (and of the projection)
        res: when False the shortcut is not added at all
    """
    def __init__(self, num_filters, channels_in=None, stride=1, res=True):
        super(ResBlock, self).__init__()

        if not channels_in:
            channels_in = num_filters
            self.projection = None
        else:
            self.projection = nn.Conv2d(channels_in, num_filters, kernel_size=1, stride=stride)
        self.res = res

        self.conv1 = nn.Conv2d(channels_in, num_filters, kernel_size=3, stride=stride, padding=1)
        self.bn1 = nn.BatchNorm2d(num_filters)
        self.relu1 = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(num_filters, num_filters, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(num_filters)
        self.relu2 = nn.ReLU(inplace=True)

    def forward(self, x):
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu1(out)
        out = self.conv2(out)
        out = self.bn2(out)
        if self.res:
            # Add the shortcut, not the raw input: when a projection exists the
            # raw input has a different channel count / spatial size than `out`.
            residual = x if self.projection is None else self.projection(x)
            out += residual
        out = self.relu2(out)
        return out

In [56]:
# way overkill.. I should learn to instinctively plot train/val accuracy since this was probably overfitting

# Stem: a single 3x3 convolution + ReLU.
layers = [
    nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1),
    nn.ReLU(inplace=True),
]
# Three stages of residual blocks (64 -> 128 -> 256 filters). Each new stage
# opens with a stride-2 block that also projects the shortcut.
layers += [ResBlock(64) for _ in range(12)]
layers.append(ResBlock(128, channels_in=64, stride=2))
layers += [ResBlock(128) for _ in range(25)]
layers.append(ResBlock(256, channels_in=128, stride=2))
layers += [ResBlock(256) for _ in range(4)]
# Head: 8x8 average pool, flatten, linear classifier over 10 classes.
# (The pool covers the whole feature map if inputs are 32x32 - TODO confirm.)
layers += [
    nn.AvgPool2d(8),
    Flatten(),
    nn.Linear(256, 10),
]
model = nn.Sequential(*layers).type(gpu_dtype)

# Total number of scalar parameters in the model.
total = sum(np.prod(list(p.size())) for p in model.parameters())
print(total)

loss_fn = nn.CrossEntropyLoss().type(gpu_dtype)
optimizer = optim.Adam(model.parameters())

train(model, loss_fn, optimizer, num_epochs=20)
check_accuracy(model, loader_val)

14159626
Starting epoch 1 / 20
Checking accuracy on validation set
Got 79 / 1000 correct (7.90)
t = 100, loss = 1.9811
t = 200, loss = 1.8765
t = 300, loss = 1.7176
t = 400, loss = 1.3571
t = 500, loss = 1.4340
t = 600, loss = 1.5515
t = 700, loss = 1.5966
Starting epoch 2 / 20
Checking accuracy on validation set
Got 426 / 1000 correct (42.60)
t = 100, loss = 1.2228
t = 200, loss = 1.3435
t = 300, loss = 1.3727
t = 400, loss = 0.9142
t = 500, loss = 1.1360
t = 600, loss = 1.0571
t = 700, loss = 1.1566
Starting epoch 3 / 20
Checking accuracy on validation set
Got 655 / 1000 correct (65.50)
t = 100, loss = 0.9211
t = 200, loss = 1.1600
t = 300, loss = 1.1699
t = 400, loss = 0.8681
t = 500, loss = 0.9561
t = 600, loss = 0.7822
t = 700, loss = 0.9473
Starting epoch 4 / 20
Checking accuracy on validation set
Got 608 / 1000 correct (60.80)
t = 100, loss = 0.7746
t = 200, loss = 0.8314
t = 300, loss = 0.7940
t = 400, loss = 0.6763
t = 500, loss = 0.7229
t = 600, loss = 0.5656
t = 700, loss = 

In [59]:
# resnet paper does not use this on CIFAR-10, I assume since the number of filters is already small
class BottleneckBlock(nn.Module):
    """Bottleneck residual block: 1x1 reduce -> 3x3 -> 1x1 expand, plus a shortcut.

    The two inner convolutions work on ``num_filters // 4`` channels.

    Arguments:
        num_filters: number of output channels of the block
        channels_in: number of input channels. When omitted (or falsy) the
            input is taken to have ``num_filters`` channels and the shortcut
            is the identity; when given, the shortcut is a 1x1 convolution
            with the same ``stride`` that maps the input to the output shape.
        stride: stride of the first convolution (and of the projection)
        res: when False the shortcut is not added at all
    """
    def __init__(self, num_filters, channels_in=None, stride=1, res=True):
        # Must name this class: super(ResBlock, self) fails because
        # BottleneckBlock is not a subclass of ResBlock.
        super(BottleneckBlock, self).__init__()

        if not channels_in:
            channels_in = num_filters
            self.projection = None
        else:
            self.projection = nn.Conv2d(channels_in, num_filters, kernel_size=1, stride=stride)
        self.res = res

        self.conv1 = nn.Conv2d(channels_in, num_filters//4, kernel_size=1, stride=stride)
        self.bn1 = nn.BatchNorm2d(num_filters//4)
        self.relu1 = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(num_filters//4, num_filters//4, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(num_filters//4)
        self.relu2 = nn.ReLU(inplace=True)
        self.conv3 = nn.Conv2d(num_filters//4, num_filters, kernel_size=1, stride=1)
        self.bn3 = nn.BatchNorm2d(num_filters)
        self.relu3 = nn.ReLU(inplace=True)

    def forward(self, x):
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu1(out)
        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu2(out)
        out = self.conv3(out)
        out = self.bn3(out)
        if self.res:
            # Add the shortcut, not the raw input: when a projection exists the
            # raw input has a different channel count / spatial size than `out`.
            residual = x if self.projection is None else self.projection(x)
            out += residual
        out = self.relu3(out)
        return out

In [79]:
# add dropout
class ResBlock(nn.Module):
    """Residual block made of two 3x3 conv + batch-norm layers and a shortcut.

    Arguments:
        num_filters: number of output channels of both convolutions
        channels_in: number of input channels. A falsy value means "same as
            num_filters" and gives an identity shortcut; otherwise the
            shortcut is a 1x1 convolution using the same ``stride``.
        stride: stride of the first convolution (and of the projection)
        res: when False the shortcut is not added to the output
    """
    def __init__(self, num_filters, channels_in=None, stride=1, res=True):
        super(ResBlock, self).__init__()

        # The projection is created before the other layers, exactly when an
        # explicit input width was supplied.
        has_projection = bool(channels_in)
        in_channels = channels_in if has_projection else num_filters
        self.projection = (
            nn.Conv2d(in_channels, num_filters, kernel_size=1, stride=stride)
            if has_projection else None
        )
        self.res = res

        self.conv1 = nn.Conv2d(in_channels, num_filters, kernel_size=3, stride=stride, padding=1)
        self.bn1 = nn.BatchNorm2d(num_filters)
        self.relu1 = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(num_filters, num_filters, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(num_filters)
        # (disabled experiment) self.dropout = nn.Dropout(inplace=True)
        self.relu2 = nn.ReLU(inplace=True)

    def forward(self, x):
        # Main branch: conv -> bn -> relu -> conv -> bn.
        out = self.relu1(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        # (disabled experiment) out = self.dropout(out)

        # Shortcut branch: identity, or the 1x1 projection when one exists.
        shortcut = x if self.projection is None else self.projection(x)
        if self.res:
            out += shortcut
        return self.relu2(out)

In [87]:
# Stem: 3x3 convolution + batch norm + ReLU at 16 filters.
layers = [
    nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1),
    nn.BatchNorm2d(16),
    nn.ReLU(inplace=True),
]
# Three stages of residual blocks (16 -> 32 -> 64 filters). Each new stage
# opens with a stride-2 block that also projects the shortcut.
layers += [ResBlock(16) for _ in range(7)]
layers.append(ResBlock(32, channels_in=16, stride=2))
layers += [ResBlock(32) for _ in range(6)]
layers.append(ResBlock(64, channels_in=32, stride=2))
layers += [ResBlock(64) for _ in range(6)]
# Head: 8x8 average pool, flatten, linear classifier over 10 classes.
layers += [
    nn.AvgPool2d(8),
    Flatten(),
    nn.Linear(64, 10),
]
model = nn.Sequential(*layers).type(gpu_dtype)

# print parameter count / model complexity
print(sum([np.prod(list(p.size())) for p in model.parameters()]))

loss_fn = nn.CrossEntropyLoss().type(gpu_dtype)

# hacky lr schedule: (learning rate, epochs) phases, one new SGD optimizer each.
# NOTE(review): a new optimizer per phase presumably restarts any optimizer
# state such as momentum buffers - confirm this is intended.
for lr, epochs in [(0.1, 15), (0.01, 10), (0.001, 5), (0.0001, 5)]:
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=0.0001)
    train(model, loss_fn, optimizer, num_epochs=epochs)

check_accuracy(model, loader_val)

662826
Starting epoch 1 / 15
Checking accuracy on validation set
Got 112 / 1000 correct (11.20)
t = 100, loss = 2.0076
t = 200, loss = 2.0408
t = 300, loss = 1.7891
t = 400, loss = 1.6665
t = 500, loss = 1.6167
t = 600, loss = 1.8515
t = 700, loss = 1.5904
Starting epoch 2 / 15
Checking accuracy on validation set
Got 361 / 1000 correct (36.10)
t = 100, loss = 1.5047
t = 200, loss = 1.3869
t = 300, loss = 1.4269
t = 400, loss = 1.0814
t = 500, loss = 1.2948
t = 600, loss = 1.3055
t = 700, loss = 1.4877
Starting epoch 3 / 15
Checking accuracy on validation set
Got 546 / 1000 correct (54.60)
t = 100, loss = 1.1177
t = 200, loss = 1.0946
t = 300, loss = 1.2080
t = 400, loss = 0.9127
t = 500, loss = 1.2242
t = 600, loss = 1.0198
t = 700, loss = 1.2012
Starting epoch 4 / 15
Checking accuracy on validation set
Got 621 / 1000 correct (62.10)
t = 100, loss = 0.8366
t = 200, loss = 1.0502
t = 300, loss = 1.0741
t = 400, loss = 0.9053
t = 500, loss = 1.0073
t = 600, loss = 0.9962
t = 700, loss = 

t = 500, loss = 0.2716
t = 600, loss = 0.2016
t = 700, loss = 0.2692
Starting epoch 9 / 10
Checking accuracy on validation set
Got 880 / 1000 correct (88.00)


KeyboardInterrupt: 