In [1]:
!pip install -Uqq fastbook
import fastbook
fastbook.setup_book()

[K     |████████████████████████████████| 720 kB 5.4 MB/s 
[K     |████████████████████████████████| 189 kB 32.2 MB/s 
[K     |████████████████████████████████| 1.2 MB 27.6 MB/s 
[K     |████████████████████████████████| 46 kB 3.5 MB/s 
[K     |████████████████████████████████| 56 kB 4.5 MB/s 
[K     |████████████████████████████████| 51 kB 297 kB/s 
[?25hMounted at /content/gdrive


In [2]:
from fastai.vision.all import *
from fastbook import *

matplotlib.rc('image', cmap='Greys')

In [3]:
path = untar_data(URLs.MNIST_SAMPLE)

In [4]:
URLs.MNIST_SAMPLE

'https://s3.amazonaws.com/fast-ai-sample/mnist_sample.tgz'

In [5]:
Path.BASE_PATH = path
path.ls()

(#3) [Path('valid'),Path('train'),Path('labels.csv')]

In [6]:
(path/'train').ls()

(#2) [Path('train/7'),Path('train/3')]

In [7]:
def get_img_loader(url, train_folder, valid_folder, presize, resize, bs):
  path = untar_data(url)
  dblock = DataBlock(
      blocks = (ImageBlock, CategoryBlock),
      splitter = GrandparentSplitter(train_name=train_folder, valid_name=valid_folder),
      get_items = get_image_files,
      get_y = parent_label,
      item_tfms = Resize(presize),
      batch_tfms = aug_transforms(min_scale=0.5, size=resize)
  )
  dls = dblock.dataloaders(path, bs=bs)
  return dls

In [8]:
dls = get_img_loader(URLs.MNIST_SAMPLE, 'train', 'valid', 28, 28, 64)
x_t, y_t = dls.train.one_batch()

torch.linalg.solve has its arguments reversed and does not return the LU factorization.
To get the LU factorization see torch.lu, which can be used with torch.lu_solve or torch.lu_unpack.
X = torch.solve(B, A).solution
should be replaced with
X = torch.linalg.solve(A, B) (Triggered internally at  ../aten/src/ATen/native/BatchLinearAlgebra.cpp:766.)
  ret = func(*args, **kwargs)


In [9]:
x_t.shape, y_t.shape

(torch.Size([64, 3, 28, 28]), torch.Size([64]))

In [10]:
def _single_conv(ch_in, ch_out, ks, stride=1, act=True, gammaZero=False):
        # do not reduce size due to ks mismatch
        padding = ks//2
        layers = [nn.Conv2d(ch_in, ch_out, ks, stride=stride, padding=padding)]
        # add batch norm to prevent activations from getting too high and adding some randomness to training
        bn = nn.BatchNorm2d(ch_out)
        if gammaZero:
            nn.init.zeros_(bn.weight.cuda())

        layers.append(bn)
        # check if this layer should have an activation - yes unless the final layer
        if act:
            layers.append(nn.ReLU())
        
        layers = nn.Sequential(*layers)
        return layers

In [11]:
class ResBlock(nn.Module):
    def __init__(self, ch_in, ch_out, stride=1):
        super().__init__()

        self.conv = self._resblock_conv(ch_in, ch_out, stride=stride)
        self.pool = self._return if stride == 1 else nn.AvgPool2d(stride, ceil_mode=True)
        self.id_conv = self._return if ch_in == ch_out else _single_conv(ch_in, ch_out, 1, act=False)
        self.relu = nn.ReLU()

    def _return(self, x):
        return x
    
    def _resblock_conv(self, ch_in, ch_out, stride=1, ks=3):
        conv_block = nn.Sequential(
            _single_conv(ch_in, ch_out//4, 1),
            _single_conv(ch_out//4, ch_out//4, ks, stride=stride), 
            _single_conv(ch_out//4, ch_out, 1, act=False, gammaZero=True)
        )
        return conv_block
    
    def forward(self, x):
        return self.relu(self.conv(x) + self.id_conv(self.pool(x)))

In [13]:
class ResNet(nn.Module):
    def __init__(self, layers, dls_out, expansion=1):
        super().__init__()
        # this contains the number of layers; the length of the array is one less than self.block_ch_sizes below
        self.layers = layers
        self.stem = self._stem([3, 32, 32, 64])
        self.block_ch_sizes = [64, 64, 128, 256, 512]
        # expand the number of channels by a scale factor if desired
        if expansion != 1:
            # do not change the first block to avoid size mismatches
            for i in range(1, len(self.block_ch_sizes)):
                self.block_ch_sizes[i] *= expansion
        self.res_layers = self._create_res_layers()

        self.ad_pool = nn.AdaptiveAvgPool2d(1)
        self.flatten = nn.Flatten()
        self.drop = nn.Dropout(0.4)
        self.fc1 = nn.Linear(self.block_ch_sizes[-1], dls_out*5)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(dls_out*5, dls_out)
        # self.sig = nn.Sigmoid()

    def _stem(self, sizes):
        stem = [
            *[_single_conv(sizes[i], sizes[i+1], 3, stride = 2 if i == 0 else 1)
                for i in range(len(sizes) - 1)
            ],
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        ]
        # print(stem)
        return nn.Sequential(*stem)
    
    def _create_res_layers(self):
        res_layers = []
        # loop through each layer which may be of different sizes (a layer is characterized by having the same number of channels)
        for i, n_layer in enumerate(self.layers):
            # get the input and output channels for the layer
            ch_in, ch_out = self.block_ch_sizes[i: i+2]
            # first layer follow a maxpool so stride 1
            stride = 1 if i == 0 else 2
            res_layers.append(
                nn.Sequential(*[
                    ResBlock(ch_in if j == 0 else ch_out, ch_out, stride = stride if j == 0 else 1)
                    for j in range(n_layer)
                ])
            )
        return nn.Sequential(*res_layers)
    
    def forward(self, x):
        x_stem = self.stem(x)
        # print(x_stem.shape)
        x_res = self.res_layers(x_stem)
        x_pre_dense = self.flatten(self.ad_pool(x_res))
        # print(x_pre_dense.shape)
        x_fc1 = self.relu(self.fc1(self.drop(x_pre_dense)))
        x_out = self.fc2(self.drop(x_fc1))

        return x_out


In [14]:
dls_out = dls.c
rn = ResNet([3, 4, 6, 3], dls_out, expansion=4)

In [22]:
def get_learner(dls, model, loss):
    learn = Learner(dls, model, loss_func=loss, metrics=accuracy)
    return learn


learn_rn = get_learner(dls, rn, LabelSmoothingCrossEntropyFlat())


In [17]:
learn_rn.fit_one_cycle(5, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,0.347908,0.79582,0.69578,00:55
1,0.31326,0.416281,0.913641,00:56
2,0.293462,0.345791,0.957802,00:58
3,0.282935,0.211013,0.995093,00:59
4,0.264067,0.202502,0.998037,00:58


## Train on the full MNIST dataset 
- First train with just a normal loss function
- Second increase epochs and train mixup
- Finally train label smoothing with mixup

In [18]:
untar_data(URLs.MNIST).ls()

(#2) [Path('/root/.fastai/data/mnist_png/testing'),Path('/root/.fastai/data/mnist_png/training')]

In [19]:
mnist_dls = get_img_loader(URLs.MNIST, 'training', 'testing', 28, 28, 128)
x_t, y_t = mnist_dls.train.one_batch()

In [20]:
x_t.shape, y_t.shape

(torch.Size([128, 3, 28, 28]), torch.Size([128]))

In [23]:
mnist_learn = get_learner(mnist_dls, rn, CrossEntropyLossFlat())

In [26]:
mnist_learn.lr_find()

RuntimeError: ignored