In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from pathlib import Path

from fastai import *
from fastai.vision import *

import json

from utils import *
import ast

In [2]:
# Samples per class; matches the 70000 files found in a train_folders class
# directory in the data check below. Not referenced again in the visible cells.
NUM_SAMPLES_PER_CLASS = 70_000
# Number of shuffled indices held out for validation (see the split cell).
# 50 * 340: presumably 50 drawings for each of the 340 classes — TODO confirm.
NUM_VAL = 50 * 340

In [3]:
PATH = Path('../data/quickdraw/')

In [4]:
PATH.ls()

[PosixPath('../data/quickdraw/test_simplified.csv'),
 PosixPath('../data/quickdraw/train'),
 PosixPath('../data/quickdraw/submission'),
 PosixPath('../data/quickdraw/train_folders'),
 PosixPath('../data/quickdraw/dummy_train_folders'),
 PosixPath('../data/quickdraw/dfs_combined.csv'),
 PosixPath('../data/quickdraw/classes.pkl'),
 PosixPath('../data/quickdraw/test')]

### Data check

In [5]:
train_folders = (PATH/"train_folders").ls()

In [6]:
len(np.random.choice(train_folders).ls())

70000

### Create ImageDataBunch using the data block API

In [7]:
bs = 100  # batch size used by the BatchSampler and the valid/test DataLoaders below
sz = 256  # `size` passed to list2drawing when create_func renders a drawing

In [8]:
def create_func(item):
    """Load one drawing file and wrap it as a fastai `Image`.

    `item` is the path of a JSON file; its 'data' entry is rendered by
    `list2drawing` (size `sz`, line width 4, time colouring on). The rendered
    grayscale array is expanded to RGB, moved to channel-first order and
    divided by 255.
    """
    with open(item) as fh:
        payload = json.loads(fh.read())
    strokes = payload['data']
    gray = list2drawing(strokes, size=sz, lw=4, time_color=True)
    rgb = cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB)
    # (H, W, C) -> (C, H, W), then scale in place.
    chw = torch.from_numpy(rgb).float().permute((2, 0, 1))
    return Image(chw.div_(255))

In [9]:
# Build the training item list from dummy_train_folders, decoding items with create_func.
# NOTE(review): the data check above inspects train_folders, while this cell reads
# dummy_train_folders — confirm which directory is intended for training.
item_list = ItemList.from_folder(PATH/"dummy_train_folders", create_func=create_func)
# Seed NumPy's global RNG so the shuffled validation split is repeatable.
np.random.seed(42)
idxs = np.arange(item_list.items.shape[0])
np.random.shuffle(idxs)
# The first NUM_VAL shuffled indices form the validation set.
val_idxs = idxs[:NUM_VAL]
item_lists = item_list.split_by_idx(val_idxs)
# Presumably labels each item with the name of its parent folder — verify against fastai.
label_lists = item_lists.label_from_folder()
test_items = ItemList.from_folder(PATH/"test", create_func=create_func)
# Trailing ';' suppresses the cell output.
label_lists.add_test(test_items);

In [10]:
def chunks(l, n):
    """Generate consecutive slices of `l`, each holding at most `n` items.

    Every slice except possibly the last has exactly `n` items.
    """
    yield from (l[start:start + n] for start in range(0, len(l), n))

class RandomSamplerWithEpochSize(Sampler):
    """Sampler that serves a shuffled data source in fixed-size "epochs".

    Each call to `iter()` yields the indices of one epoch. All indices
    `0..n-1` are shuffled (with NumPy's global RNG) and cut into consecutive
    chunks of `epoch_size`; a new shuffle happens only once every chunk has
    been consumed, so (nearly) all examples are seen before any is repeated.

    A trailing chunk shorter than `epoch_size` is dropped, but only when it
    is not the sole chunk: a data source with fewer than `epoch_size` items
    yields all of its items as a single, shorter epoch instead of failing.
    A trailing chunk that is exactly `epoch_size` long is kept.

    Args:
        data_source: any sized collection; only its length is used.
        epoch_size: number of indices yielded per epoch (positive integer).

    Raises:
        ValueError: if `epoch_size` is smaller than 1.
    """
    def __init__(self, data_source, epoch_size):
        if epoch_size < 1:
            raise ValueError(f"epoch_size must be a positive integer, got {epoch_size}")
        self.n = len(data_source)
        self.epoch_size = epoch_size
        self._epochs = []  # epochs still to be served from the current shuffle

    def __iter__(self):
        return iter(self.next_epoch)

    @property
    def next_epoch(self):
        """Pop the next pending epoch, reshuffling first when none are left."""
        if not self._epochs: self.generate_epochs()
        # An empty data source produces no epochs at all; yield nothing then.
        return self._epochs.pop() if self._epochs else []

    def generate_epochs(self):
        """Shuffle all indices and split them into the pending epochs."""
        idxs = list(range(self.n))
        np.random.shuffle(idxs)
        epochs = [idxs[i:i + self.epoch_size] for i in range(0, self.n, self.epoch_size)]
        # Drop only a genuinely short trailing chunk, and never the only one.
        if len(epochs) > 1 and len(epochs[-1]) < self.epoch_size:
            epochs.pop()
        self._epochs = epochs

    def __len__(self):
        # Cannot yield more indices than the data source holds.
        return min(self.epoch_size, self.n)

In [11]:
# Augmentation: random flips (no vertical flips) and rotations of up to 10 degrees;
# zoom, lighting and warp are disabled (see the data_bunch.tfms output below).
tfms = get_transforms(do_flip=True, flip_vert=False, 
                      max_rotate=10, max_zoom=0, max_lighting=None, max_warp=None)

In [12]:
70000*340/1000000

23.8

In [13]:
# Training: each pass is one sampler "epoch" of 1_000_000 shuffled indices,
# grouped into full batches of `bs` (incomplete batches are dropped).
epoch_sampler = RandomSamplerWithEpochSize(label_lists.train, 1_000_000)
train_batches = BatchSampler(epoch_sampler, batch_size=bs, drop_last=True)
train_dl = DataLoader(label_lists.train, batch_sampler=train_batches, num_workers=8)

# Validation and test sets are read without shuffling.
valid_dl = DataLoader(label_lists.valid, batch_size=bs, shuffle=False, num_workers=8)
test_dl = DataLoader(label_lists.test, batch_size=bs, shuffle=False, num_workers=8)

data_bunch = ImageDataBunch(train_dl, valid_dl, test_dl, tfms=tfms)

In [14]:
data_bunch.tfms

[[RandTransform(tfm=TfmCrop (crop_pad), kwargs={'row_pct': (0, 1), 'col_pct': (0, 1)}, p=1.0, resolved={}, do_run=True, is_random=True),
  RandTransform(tfm=TfmAffine (flip_affine), kwargs={}, p=0.5, resolved={}, do_run=True, is_random=True),
  RandTransform(tfm=TfmAffine (rotate), kwargs={'degrees': (-10, 10)}, p=0.75, resolved={}, do_run=True, is_random=True)],
 [RandTransform(tfm=TfmCrop (crop_pad), kwargs={}, p=1.0, resolved={}, do_run=True, is_random=True)]]

In [15]:
#classes = data_bunch.classes
#pd.to_pickle(classes, PATH/"classes.pkl")

In [16]:
classes = pd.read_pickle(PATH/"classes.pkl")

In [17]:
classes[:10]

['horse',
 'flamingo',
 'wheel',
 'spreadsheet',
 'sun',
 'mosquito',
 'train',
 'pig',
 'hot_air_balloon',
 'tiger']

### Model

In [18]:
from fastai.callbacks import SaveModelCallback, EarlyStoppingCallback
import sys

In [19]:
# sys.path.append("pytorch-mobilenet-v2/")
# from MobileNetV2 import MobileNetV2

In [20]:
# se_resnet lives in a local checkout of senet.pytorch, so make it importable first.
sys.path.append("./senet.pytorch/")
from se_resnet import se_resnet50

# Checkpoint after every epoch under the "senet" name.
save_each_epoch = partial(SaveModelCallback, every="epoch", name="senet")

model = se_resnet50(340)
learn = Learner(
    data_bunch,
    model,
    metrics=[accuracy, map3],
    callback_fns=[save_each_epoch],
)

### Train

In [None]:
learn.lr_find(end_lr=1e-1)

In [None]:
learn.recorder.plot()

In [None]:
learn.fit_one_cycle(48, max_lr=5e-3)

In [None]:
learn.validate(data_bunch.valid_dl)

In [None]:
plt.plot([l.item() for l in learn.recorder.losses])

In [None]:
# Prefix for the stage checkpoints saved below. The model built in this notebook
# is se_resnet50, so tag them "senet" rather than the stale "mobilenet" label,
# which would store SENet weights under a MobileNet file name.
name = 'senet'

In [None]:
learn.save(f'{name}-stage-1')

In [None]:
learn.unfreeze()

In [None]:
learn.fit_one_cycle(6, max_lr=6e-4)

In [None]:
learn.save(f'{name}-stage-2')

### Predict

In [21]:
learn.data.test_dl

DeviceDataLoader(dl=<torch.utils.data.dataloader.DataLoader object at 0x7f7b046602e8>, device=device(type='cuda'), tfms=[[RandTransform(tfm=TfmCrop (crop_pad), kwargs={'row_pct': (0, 1), 'col_pct': (0, 1)}, p=1.0, resolved={}, do_run=True, is_random=True), RandTransform(tfm=TfmAffine (flip_affine), kwargs={}, p=0.5, resolved={}, do_run=True, is_random=True), RandTransform(tfm=TfmAffine (rotate), kwargs={'degrees': (-10, 10)}, p=0.75, resolved={}, do_run=True, is_random=True)], [RandTransform(tfm=TfmCrop (crop_pad), kwargs={}, p=1.0, resolved={}, do_run=True, is_random=True)]], collate_fn=<function data_collate at 0x7f7b2c749620>, skip_size1=False)

In [22]:
from utils import create_submission

In [23]:
sorted(Path('../quickdraw/models').ls())[:5]

[PosixPath('../quickdraw/models/final-mobilenet-stage-1.pth'),
 PosixPath('../quickdraw/models/final-mobilenet-stage-2.pth'),
 PosixPath('../quickdraw/models/final-senet-stage-1.pth'),
 PosixPath('../quickdraw/models/senet_1.pth'),
 PosixPath('../quickdraw/models/senet_10.pth')]

In [71]:
model_ckpt = "senet_19"

In [72]:
learn = learn.load(model_ckpt)

In [73]:
# The test DeviceDataLoader carries the train/valid transform lists (see its repr
# above, which includes the random flip and rotate), so clear them before predicting.
learn.data.test_dl.tfms = None

In [74]:
preds, _ = learn.get_preds(DatasetType.Test)

In [75]:
sub_name = f"{model_ckpt}"

In [76]:
def top_3_preds(preds):
    """Return, per row of `preds`, the column indices of its three largest values, best first."""
    ascending = np.argsort(preds.numpy())
    descending = ascending[:, ::-1]
    return descending[:, :3]

def top_3_pred_labels(preds, classes):
    """Turn predictions into one label string per row.

    Each string holds the class names of the row's top-3 indices (as given by
    `top_3_preds`), joined by single spaces, with spaces inside a class name
    replaced by underscores.
    """
    return [
        ' '.join(classes[idx].replace(" ", "_") for idx in row)
        for row in top_3_preds(preds)
    ]
    
def create_submission(test_preds, test_dl, name, classes):
    """Write a gzipped CSV with the top-3 label guesses for every test item.

    Key ids are the file stems of `test_dl.dataset.x.items`; the 'word' column
    holds the strings produced by `top_3_pred_labels`. The file is written to
    `{PATH}/submission/{name}.csv.gz` without the DataFrame index.

    Note: this definition replaces the `create_submission` imported from
    `utils` earlier in the notebook.
    """
    target = f'{PATH}/submission/{name}.csv.gz'
    frame = pd.DataFrame({
        'key_id': [path.stem for path in test_dl.dataset.x.items],
        'word': top_3_pred_labels(test_preds, classes),
    })
    frame.to_csv(target, index=False, compression='gzip')

In [77]:
create_submission(preds, test_dl, sub_name, classes)

In [78]:
sub_name

'senet_19'

In [79]:
len(test_items)

112199

In [80]:
pd.read_csv(f'../data/quickdraw/submission/{sub_name}.csv.gz').head(10)

Unnamed: 0,key_id,word
0,9736539586575506,horse frog bear
1,9156420504855747,skull mushroom bread
2,9725934544390889,sink flamingo mailbox
3,9903565523633750,trombone trumpet megaphone
4,9507032565495870,tennis_racquet fan microphone
5,9035256546065251,flower tree broccoli
6,9494182684488699,snowman bear frog
7,9280101504435248,camel horse dragon
8,9955106400063151,necklace goatee parachute
9,9912795502728975,cactus mosquito dragon


In [81]:
!kaggle competitions submit -c quickdraw-doodle-recognition -f '../data/quickdraw/submission/{sub_name}.csv.gz' -m "{sub_name}"

100%|██████████████████████████████████████| 1.53M/1.53M [00:01<00:00, 1.47MB/s]
Successfully submitted to Quick, Draw! Doodle Recognition Challenge