In [1]:
# https://github.com/sdoria/SimpleSelfAttention/blob/master/Imagenette%20Simple%20Self%20Attention.ipynb

In [1]:
import warnings
from fastai.vision import *
from fastai.callbacks import *
warnings.filterwarnings('ignore')
# NOTE(review): this `path` is reassigned by `untar_data(URLs.IMAGEWOOF_160)` in the next cell,
# so the Imagenette location below is never actually used by the rest of the notebook.
path = Path('../data/imagenette/imagenette-160')
# Put the relative `dev/` directory on the module search path.
sys.path.append("dev/")

In [2]:
# Resolve the ImageWoof-160 dataset folder through fastai's `untar_data`;
# this replaces the `path` value assigned in the first cell.
path = untar_data(URLs.IMAGEWOOF_160)

In [3]:
# List the entries of the training folder: one sub-folder per class id plus a `models` directory.
(path/'train').ls()

[PosixPath('/home/ubuntu/.fastai/data/imagewoof-160/train/n02093754'),
 PosixPath('/home/ubuntu/.fastai/data/imagewoof-160/train/n02088364'),
 PosixPath('/home/ubuntu/.fastai/data/imagewoof-160/train/n02089973'),
 PosixPath('/home/ubuntu/.fastai/data/imagewoof-160/train/n02087394'),
 PosixPath('/home/ubuntu/.fastai/data/imagewoof-160/train/n02111889'),
 PosixPath('/home/ubuntu/.fastai/data/imagewoof-160/train/n02099601'),
 PosixPath('/home/ubuntu/.fastai/data/imagewoof-160/train/n02105641'),
 PosixPath('/home/ubuntu/.fastai/data/imagewoof-160/train/n02086240'),
 PosixPath('/home/ubuntu/.fastai/data/imagewoof-160/train/n02115641'),
 PosixPath('/home/ubuntu/.fastai/data/imagewoof-160/train/models'),
 PosixPath('/home/ubuntu/.fastai/data/imagewoof-160/train/n02096294')]

In [4]:
# Training-time augmentation: horizontal flips only (no vertical flips), with
# rotation / zoom / lighting / warp limits, each family applied with probability 0.5.
tfms = get_transforms(
    do_flip=True,
    flip_vert=False,
    max_rotate=15,
    max_zoom=1.3,
    max_lighting=0.3,
    max_warp=0.2,
    p_affine=0.5,
    p_lighting=0.5,
)

In [5]:
# Side length (pixels) every image is resized to.
sz = 128
# Training databunch: images under `train/`, labelled by folder name, with a random 10%
# held out for validation. The split is seeded so that re-running the notebook selects
# the same validation images (and therefore the same "best" checkpoint criterion).
data = (ImageList.from_folder(path=path/'train')
        .random_split_by_pct(0.1, seed=42)
        .label_from_folder()
        .transform(tfms, size=sz)
        .databunch(bs=64)
        .normalize())

In [6]:
# test data
# Held-out images from the `val` folder, kept as a single unsplit set (they are read back
# later through `test_data.train_ds`), resized to `sz` with no augmentation and normalized
# with the statistics of `data`.
# NOTE(review): labels here are encoded by this databunch's own class list — presumably in
# the same order as the classes of `data`; verify before comparing them with predictions.
test_data = (ImageList.from_folder(path=path/'val')
            .no_split()
            .label_from_folder()
            .transform(None, size=sz)
            .databunch(bs=64)
            .normalize(data.stats))

In [7]:
# Attach the held-out images (inputs only, no labels) to `data` as its test set.
data.add_test(test_data.train_ds.x)

In [8]:
# Show the train / valid / test summary of the databunch.
data

ImageDataBunch;

Train: LabelList (11209 items)
x: ImageList
Image (3, 128, 128),Image (3, 128, 128),Image (3, 128, 128),Image (3, 128, 128),Image (3, 128, 128)
y: CategoryList
n02093754,n02093754,n02093754,n02093754,n02093754
Path: /home/ubuntu/.fastai/data/imagewoof-160/train;

Valid: LabelList (1245 items)
x: ImageList
Image (3, 128, 128),Image (3, 128, 128),Image (3, 128, 128),Image (3, 128, 128),Image (3, 128, 128)
y: CategoryList
n02087394,n02087394,n02111889,n02115641,n02105641
Path: /home/ubuntu/.fastai/data/imagewoof-160/train;

Test: LabelList (500 items)
x: ImageList
Image (3, 128, 128),Image (3, 128, 128),Image (3, 128, 128),Image (3, 128, 128),Image (3, 128, 128)
y: EmptyLabelList
,,,,
Path: /home/ubuntu/.fastai/data/imagewoof-160/train

In [9]:
# data.show_batch()

### Baseline

In [10]:
# Backbone architecture, looked up by name on fastai's `models` namespace.
model_name = 'resnet34'
arch = getattr(models, model_name)

# Callback instances passed to the learner as `callbacks`.
learn_callbacks = [TerminateOnNaNCallback()]
# Callback factories passed as `callback_fns`. All monitoring is on validation `accuracy`
# (higher is better): early stopping with patience 5, checkpointing under the name
# 'baseline' (the default name loaded by the scoring helpers further down), and a CSV log
# named after the architecture.
learn_callback_fns = [partial(EarlyStoppingCallback, monitor='accuracy', mode='max', patience=5),
                      partial(SaveModelCallback, monitor='accuracy', mode='max',
                              name='baseline'),
                      partial(CSVLogger, filename=f'../logs/{model_name}')]

### AlphaPooling

In [11]:
# Toy float tensor of shape (3, 2, 5): every innermost row is [1, 2, 3, 4, 5].
x1 = torch.arange(1, 6).float().repeat(3, 2, 1)

In [24]:
class AlphaPool(nn.Module):
    """Second-order pooling of a CNN feature map with a learnable exponent.

    For every spatial position the channel vector `x` is combined with
    `sign(x) * |x|**alpha` through an outer product; the outer products are averaged
    over all positions, flattened and L2-normalized per sample.  With `alpha=1`
    (the default) this is plain bilinear pooling `mean(x ⊗ x)`.

    Args:
        alpha: initial value of the exponent; stored as a learnable `nn.Parameter`.
        eps: lower bound applied to `|x|` before exponentiation so that the
            gradient with respect to `alpha` stays finite where activations are zero.

    Input:  tensor of shape `(batch, channels, height, width)`.
    Output: tensor of shape `(batch, channels**2)`.
    """
    def __init__(self, alpha:float=1., eps:float=1e-8):
        super().__init__()
        # Honour the constructor argument instead of a hard-coded start value.
        self.alpha = nn.Parameter(torch.tensor([float(alpha)]))
        self.eps = eps

    def forward(self, x):
        "Creates alpha-pooling features from a CNN like feature map"
        b,fn,h,w = x.shape
        # `reshape` (unlike `view`) also accepts non-contiguous inputs.
        x = x.reshape(b,fn,h*w)
        # The parameter is only read here: forward must not modify it, otherwise the
        # stored weights would drift on every call (including at inference time).
        powered = torch.sign(x) * torch.abs(x).clamp(min=self.eps) ** self.alpha
        # pooled[b, i, j] = mean_p x[b, i, p] * powered[b, j, p]; the batched matmul
        # sums over positions directly, avoiding a (b, h*w, fn, fn) intermediate.
        pooled = torch.bmm(x, powered.transpose(1, 2)) / (h*w)
        return F.normalize(pooled.reshape(b, fn*fn))

In [25]:
# Smoke test: pool a random (32, 512, 4, 4) feature map and inspect the output shape.
alpha_pool = AlphaPool()
x1 = torch.randn(32, 512, 4, 4)
f = alpha_pool(x1)
f.shape

torch.Size([32, 262144])

In [26]:
# Show the module's repr.
alpha_pool

AlphaPool()

In [27]:
# Pooled feature vector of the first sample in the batch.
f[0]

tensor([ 8.9807e-03,  4.3009e-03,  6.4197e-05,  ..., -1.9180e-03,
        -5.8776e-04,  8.3033e-03])

### Custom Head

In [28]:
from fastai.vision.learner import cnn_config

In [29]:
def create_head(nf:int, nc:int, lin_ftrs:Optional[Collection[int]]=None, ps:Floats=0.5,
                concat_pool:bool=True,alpha_pool:bool=True, bn_final:bool=False):
    """Model head that takes `nf` features, runs them through `lin_ftrs` and outputs `nc` classes.

    Args:
        nf: number of features produced by the pooling layer. With `alpha_pool=True`
            the pooling layer is `AlphaPool`, so the caller must pass the squared
            channel count of the body output.
        nc: number of output classes.
        lin_ftrs: sizes of the hidden linear layers (any collection); defaults to `[512]`.
        ps: dropout probability, or one probability per linear layer. A single value
            is halved for all layers except the last.
        concat_pool: when `alpha_pool` is False, choose `AdaptiveConcatPool2d` (True)
            or `nn.AdaptiveAvgPool2d(1)` (False).
        alpha_pool: use `AlphaPool`; takes precedence over `concat_pool`.
        bn_final: append a final `BatchNorm1d` over the class scores.

    Returns:
        An `nn.Sequential` of pooling, flatten and batchnorm/dropout/linear groups.
    """
    # `list(...)` so that tuples and other collections are accepted, as the annotation promises.
    lin_ftrs = [nf, 512, nc] if lin_ftrs is None else [nf] + list(lin_ftrs) + [nc]
    ps = listify(ps)
    if len(ps) == 1: ps = [ps[0]/2] * (len(lin_ftrs)-2) + ps
    actns = [nn.ReLU(inplace=True)] * (len(lin_ftrs)-2) + [None]
    # Build only the pooling layer that is actually used.
    if alpha_pool: pool = AlphaPool()
    elif concat_pool: pool = AdaptiveConcatPool2d()
    else: pool = nn.AdaptiveAvgPool2d(1)
    layers = [pool, Flatten()]
    for ni,no,p,actn in zip(lin_ftrs[:-1], lin_ftrs[1:], ps, actns):
        layers += bn_drop_lin(ni, no, True, p, actn)
    if bn_final: layers.append(nn.BatchNorm1d(lin_ftrs[-1], momentum=0.01))
    return nn.Sequential(*layers)

def create_cnn_model(base_arch:Callable, nc:int, cut:Union[int,Callable]=None, pretrained:bool=True,
        lin_ftrs:Optional[Collection[int]]=None, ps:Floats=0.5, custom_head:Optional[nn.Module]=None,
        split_on:Optional[SplitFuncOrIdxList]=None, bn_final:bool=False, concat_pool:bool=True,
        alpha_pool:bool=True):
    """Create custom convnet architecture: a body cut from `base_arch` followed by a head.

    When `custom_head` is None a head is built with `create_head`. Its input size is
    derived from the pooling layer that head will contain: the squared channel count
    for `alpha_pool`, otherwise the channel count (doubled for `concat_pool`).
    `split_on` is accepted for signature compatibility and is not used here.

    Returns:
        `nn.Sequential(body, head)`.
    """
    body = create_body(base_arch, pretrained, cut)
    if custom_head is None:
        n_channels = num_features_model(nn.Sequential(*body.children()))
        # The feature count must match what the chosen pooling layer emits; previously it
        # was always sized for concat/avg pooling while the head defaulted to AlphaPool.
        if alpha_pool: nf = n_channels ** 2
        else: nf = n_channels * (2 if concat_pool else 1)
        head = create_head(nf, nc, lin_ftrs, ps=ps, concat_pool=concat_pool,
                           alpha_pool=alpha_pool, bn_final=bn_final)
    else: head = custom_head
    return nn.Sequential(body, head)

def cnn_learner(data:DataBunch, base_arch:Callable, cut:Union[int,Callable]=None, pretrained:bool=True,
                lin_ftrs:Optional[Collection[int]]=None, ps:Floats=0.5, custom_head:Optional[nn.Module]=None,
                split_on:Optional[SplitFuncOrIdxList]=None, bn_final:bool=False, init=nn.init.kaiming_normal_,
                concat_pool:bool=True, **kwargs:Any)->Learner:
    """Build a convnet-style `Learner` for `data` on top of `base_arch`.

    The model comes from `create_cnn_model`; layer groups are split with `split_on`
    or, when that is not given, with the architecture's `'split'` entry from
    `cnn_config`. The learner is frozen when `pretrained` is set, and the head
    (`model[1]`) is initialised with `init` when one is given. Extra keyword
    arguments are forwarded to `Learner`.
    """
    arch_meta = cnn_config(base_arch)
    net = create_cnn_model(base_arch, data.c, cut, pretrained, lin_ftrs, ps=ps,
                           custom_head=custom_head, split_on=split_on,
                           bn_final=bn_final, concat_pool=concat_pool)
    learner = Learner(data, net, **kwargs)
    layer_groups = split_on if split_on else arch_meta['split']
    learner.split(layer_groups)
    if pretrained:
        learner.freeze()
    if init:
        apply_init(net[1], init)
    return learner

In [30]:
# Head sized for AlphaPool output: a 512-channel feature map pools to 512**2 = 262144 features
# (the shape seen in the smoke test above).
# NOTE(review): 512 is presumably the channel count of the resnet34 body — confirm if `arch` changes.
custom_head = create_head(512**2, data.c)

In [34]:
# Build the learner from the notebook's own `cnn_learner`, using the alpha-pooling head.
# With `pretrained=False` the learner is not frozen; the head is still initialised via `init`.
learn = cnn_learner(data=data, 
                    custom_head=custom_head,
                    base_arch=arch,
                    pretrained=False, 
                    metrics=[accuracy],
                    callbacks=learn_callbacks,
                    callback_fns=learn_callback_fns)

In [35]:
# Inspect the head (second element of the body/head `nn.Sequential`).
learn.model[1]

Sequential(
  (0): AlphaPool()
  (1): Flatten()
  (2): BatchNorm1d(262144, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (3): Dropout(p=0.25)
  (4): Linear(in_features=262144, out_features=512, bias=True)
  (5): ReLU(inplace)
  (6): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (7): Dropout(p=0.5)
  (8): Linear(in_features=512, out_features=10, bias=True)
)

In [36]:
# learn.lr_find()
# learn.recorder.plot()

In [33]:
# learn.mixup(0.2)
# learn.to_fp16()
# learn.loss_func = LabelSmoothingCrossEntropy()

In [None]:
# Train with the one-cycle policy for up to 50 epochs at a maximum learning rate of 1e-3;
# the callbacks configured earlier may end the run before that.
learn.fit_one_cycle(50, max_lr=1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,2.385319,2.081367,0.236948,01:27


Better model found at epoch 0 with accuracy value: 0.23694778978824615.


In [None]:
# Drop the callback instances attached for training before running evaluation.
learn.callbacks = []

def _print_top1(preds):
    "Print top-1 accuracy of `preds[0]` (per-class scores) against the labels of `test_data`."
    # NOTE(review): assumes `test_data` encodes classes in the same order as `data` — confirm.
    predicted = to_np(torch.argmax(preds[0], 1))
    actual = test_data.train_ds.y.items
    print(f"top1 acc: {np.mean(actual == predicted)}")

def TTA_score(load_pth='baseline'):
    "Load checkpoint `load_pth` and report test-set top-1 accuracy using test-time augmentation."
    learn.load(load_pth)
    _print_top1(learn.TTA(ds_type=DatasetType.Test))

def non_TTA_score(load_pth='baseline'):
    "Load checkpoint `load_pth` and report test-set top-1 accuracy from plain predictions."
    learn.load(load_pth)
    _print_top1(learn.get_preds(ds_type=DatasetType.Test))

In [None]:
# Score the default 'baseline' checkpoint without test-time augmentation.
non_TTA_score()

In [None]:
# Score the default 'baseline' checkpoint with test-time augmentation.
TTA_score()

### Bag of Tricks 

https://arxiv.org/pdf/1812.01187.pdf

#### 1. Large batch 

**Large Batch Size Training**

- Increase learning rate as lr x bs_new/bs_old

**Warmup with first m batches**

- Linearly increase the learning rate from 0 to lr over the first m batches

**Set $\gamma$ = 0 in BN layers in ResBlocks**

- Each residual block then starts as an identity mapping, which mimics a network with fewer layers at the beginning of training

**No wd (L2 reg) in bias, or BN params $\gamma, \beta$**

#### 2. Low Precision

**FP16 Training**

#### 3. ResNet Tweaks

ResNetB, ResNetC, ResNetD...

#### 4. Cosine Annealing LR

#### 5. Label Smoothing

LabelSmoothingCrossEntropy()

#### 6. Student Teacher

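$\ell(p,\mathrm{softmax}(z)) + T^{2}\,\ell(\mathrm{softmax}(r/T),\mathrm{softmax}(z/T))$, where $\ell$ is the cross-entropy loss, $p$ the true label distribution, $z$ and $r$ the student and teacher logits, and $T$ the temperature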

#### 7. Mixup Training