In [1]:
from fastai.vision.all import *
from self_supervised.layers import *

In [2]:
# Dataset root; the CSV tables and the images/Images folder are read relative to it.
datapath = Path("stanford-dogs-dataset/")

In [3]:
# Load the three CSV tables that live directly under `datapath`.
train_df, test_df, sample_df = (
    pd.read_csv(datapath/csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_train.csv')
)

In [4]:
# Shapes (rows, columns) of the three tables loaded above.
train_df.shape, test_df.shape, sample_df.shape

((12000, 2), (8580, 2), (6000, 3))

In [5]:
# Peek at the training table; its `filenames` and `labels` columns are used below.
train_df.head()

Unnamed: 0,filenames,labels
0,n02085620-Chihuahua/n02085620_5927.jpg,n02085620-Chihuahua
1,n02085620-Chihuahua/n02085620_4441.jpg,n02085620-Chihuahua
2,n02085620-Chihuahua/n02085620_1502.jpg,n02085620-Chihuahua
3,n02085620-Chihuahua/n02085620_1916.jpg,n02085620-Chihuahua
4,n02085620-Chihuahua/n02085620_13151.jpg,n02085620-Chihuahua


In [6]:
# Peek at the test table; it has the same `filenames` / `labels` columns as the training table.
test_df.head()

Unnamed: 0,filenames,labels
0,n02085620-Chihuahua/n02085620_2650.jpg,n02085620-Chihuahua
1,n02085620-Chihuahua/n02085620_4919.jpg,n02085620-Chihuahua
2,n02085620-Chihuahua/n02085620_1765.jpg,n02085620-Chihuahua
3,n02085620-Chihuahua/n02085620_3006.jpg,n02085620-Chihuahua
4,n02085620-Chihuahua/n02085620_1492.jpg,n02085620-Chihuahua


In [7]:
# Peek at the sample table; note the singular column names (`label`, `filename`) and the extra `split` column.
sample_df.head()

Unnamed: 0,label,filename,split
0,n02085620-Chihuahua,n02085620-Chihuahua/n02085620_7292.jpg,valid
1,n02085620-Chihuahua,n02085620-Chihuahua/n02085620_500.jpg,valid
2,n02085620-Chihuahua,n02085620-Chihuahua/n02085620_1298.jpg,valid
3,n02085620-Chihuahua,n02085620-Chihuahua/n02085620_4290.jpg,valid
4,n02085620-Chihuahua,n02085620-Chihuahua/n02085620_4998.jpg,valid


### Dataset

In [9]:
def read_image(filename):
    "Open `filename`, given relative to `datapath/'images/Images'`, with `PILImage.create`."
    image_path = datapath/'images/Images'/filename
    return PILImage.create(image_path)
def read_image_size(filename):
    "Return the `.shape` of the image stored at `filename` under `datapath/'images/Images'`."
    image = PILImage.create(datapath/'images/Images'/filename)
    return image.shape

In [12]:
# # sizes = parallel(read_image_size, filenames, progress=True)
# # Counter(sizes).most_common(10)
# [((375, 500), 1497),
#  ((333, 500), 602),
#  ((500, 375), 361),
#  ((500, 333), 196),
#  ((334, 500), 110),
#  ((332, 500), 78),
#  ((500, 500), 37),
#  ((500, 332), 37),
#  ((374, 500), 37),
#  ((357, 500), 34)]

In [13]:
# Alternative source: the smaller `sample_df` table (singular column names).
# filenames = sample_df['filename'].values
# labels = sample_df['label'].values
# fn2label = dict(zip(filenames, labels))

# Full training table: relative image paths and their class labels.
filenames, labels = train_df['filenames'].values, train_df['labels'].values
# Lookup from image path to label, consumed by `read_label`.
fn2label = {fname: label for fname, label in zip(filenames, labels)}

In [14]:
def read_label(filename):
    "Look up the label recorded for `filename` in the module-level `fn2label` mapping."
    label = fn2label[filename]
    return label

In [18]:
# train_idxs,valid_idxs = [],[]
# for idx,s in enumerate(train_df['split']):
#     if s == 'valid': valid_idxs.append(idx)
#     if s == 'train': train_idxs.append(idx)    

# len(train_idxs), len(valid_idxs)

In [19]:
size = 384  # side length passed to RandomResizedCrop
bs = 32     # batch size

# One pipeline per output: image path -> cropped image tensor, image path -> category.
x_tfms = [read_image, ToTensor, RandomResizedCrop(size, min_scale=.75)]
y_tfms = [read_label, Categorize()]
tfms = [x_tfms, y_tfms]

# No splits are passed, so every item lands in the training subset and the
# validation subset stays empty. To hold out data, pass e.g.
# splits=[train_idxs, valid_idxs] instead.
dsets = Datasets(filenames, tfms=tfms, splits=None)

stats = imagenet_stats
batch_augs = aug_transforms()  # use [] here to disable batch augmentation

batch_tfms = [IntToFloatTensor, *batch_augs, Normalize.from_stats(*stats)]
dls = dsets.dataloaders(bs=bs, after_batch=batch_tfms)

In [20]:
# Sizes of the train / valid datasets; valid is empty because `Datasets` was built with `splits=None`.
len(dls.train_ds), len(dls.valid_ds)

(12000, 0)

In [21]:
# dls.show_batch()

### Learner

In [22]:
from torch.utils.checkpoint import checkpoint_sequential
    
class CheckpointVisionTransformer(Module):
    "Wrap a ViT-style `vit_model` so its transformer blocks run through `checkpoint_sequential`."
    def __init__(self, vit_model, checkpoint_nchunks=2):
        # `vit_model` must expose patch_embed, cls_token, pos_embed, pos_drop,
        # blocks, norm, pre_logits and head -- all of them are used in the forward pass.
        self.vit_model = vit_model
        # Number of chunks handed to `checkpoint_sequential`.
        self.checkpoint_nchunks = checkpoint_nchunks

    def forward_features(self, x):
        "Embed `x`, run the checkpointed blocks and return the class-token features."
        vit = self.vit_model
        n_batch = x.shape[0]
        patches = vit.patch_embed(x)

        # Prepend one class token per sample (cls_tokens impl stolen from Phil Wang, thanks).
        cls_tokens = vit.cls_token.expand(n_batch, -1, -1)
        tokens = torch.cat((cls_tokens, patches), dim=1)
        tokens = vit.pos_drop(tokens + vit.pos_embed)

        tokens = checkpoint_sequential(vit.blocks, self.checkpoint_nchunks, tokens)
        # Keep only position 0 along dim 1, which is where the class token was concatenated.
        cls_out = vit.norm(tokens)[:, 0]
        return vit.pre_logits(cls_out)

    def forward(self, x):
        "Return `vit_model.head` applied to the output of `forward_features`."
        return self.vit_model.head(self.forward_features(x))

In [23]:
arch = "vit_base_patch16_384"

# Pretrained backbone, wrapped so its blocks are checkpointed in 2 chunks.
encoder = CheckpointVisionTransformer(
    create_encoder(arch, pretrained=True, n_in=3),
    2,
)

# Probe the encoder with a dummy batch to read off the size of its last output dimension.
with torch.no_grad():
    dummy_batch = torch.randn(2, 3, size, size)
    nf = encoder(dummy_batch).size(-1)

classifier = create_cls_module(nf, dls.c, lin_ftrs=[768], use_bn=False, first_bn=False, ps=0.)
model = nn.Sequential(encoder, classifier)



In [24]:
# Display the classification head created by `create_cls_module`.
classifier

Sequential(
  (0): Linear(in_features=768, out_features=768, bias=True)
  (1): ReLU(inplace=True)
  (2): Linear(in_features=768, out_features=120, bias=True)
)

In [25]:
def model_splitter(m):
    "Split `m` into two parameter groups: the params of `m[0]` and the params of `m[1]`."
    param_groups = L(m[0], m[1]).map(params)
    return param_groups

In [26]:
cbs = []
# if WANDB: cbs += [WandbCallback(log_preds=False,log_model=False)]
learn = Learner(
    dls,
    model,
    loss_func=LabelSmoothingCrossEntropyFlat(0.1),
    opt_func=ranger,
    splitter=model_splitter,
    metrics=[accuracy],
    cbs=cbs,
)
learn.to_fp16();

In [27]:
# learn.lr_find()

In [28]:
# Phase 1: fit after `learn.freeze()`. With the two-group `model_splitter` this
# presumably leaves only the last group (the classifier) trainable -- confirm
# against fastai's `freeze` semantics.
lr = 3e-3
learn.freeze()
learn.fit_one_cycle(n_epoch=2, lr_max=lr, pct_start=0.5)

epoch,train_loss,valid_loss,accuracy,time
0,1.355932,,,02:12
1,1.114322,,,02:12


  warn("Your generator is empty.")


In [29]:
# Phase 2: unfreeze and fine-tune at a third of the phase-1 learning rate,
# with the rate sliced across parameter groups from lr/100 up to lr.
# The reduced rate gets its own name (instead of `lr /= 3`) so that re-running
# this cell does not keep shrinking the learning rate.
lr_finetune = lr / 3
learn.unfreeze()
learn.fit_one_cycle(2, lr_max=slice(lr_finetune/100, lr_finetune), pct_start=0.5)

epoch,train_loss,valid_loss,accuracy,time
0,1.023887,,,04:03
1,0.963057,,,04:04


### Evaluate on Test

In [30]:
# Relative image paths and ground-truth labels of the test table.
test_filenames, test_labels = (test_df[col].values for col in ('filenames', 'labels'))

In [31]:
# Build a test DataLoader over the test filenames via `dls.test_dl`.
test_dl = dls.test_dl(test_filenames)

In [32]:
# Run the model over the test DataLoader; `preds[0]` is used below as the per-class scores.
preds = learn.get_preds(dl=test_dl)

In [33]:
# Index of the highest-scoring class for every test item.
pred_labels = preds[0].argmax(dim=-1)

In [34]:
# Map predicted class indices to label strings. The indices are recomputed from
# `preds` rather than read from `pred_labels`, so this cell can be re-run without
# trying to index the vocab with already-decoded strings.
pred_idxs = preds[0].argmax(dim=-1)
pred_labels = array(dls.vocab)[pred_idxs]

In [35]:
# Test accuracy: fraction of items whose predicted label string equals the ground-truth label.
np.mean(test_labels == pred_labels)

0.9271561771561772