https://camelyon16.grand-challenge.org/Data

In [7]:
# Export this notebook to a plain Python script (written next to the .ipynb file).
!jupyter nbconvert --to script './kfold_end_to_end.ipynb'

[NbConvertApp] Converting notebook ./kfold_end_to_end.ipynb to script
[NbConvertApp] Writing 6844 bytes to ./kfold_end_to_end.py


In [8]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
import warnings
warnings.filterwarnings('ignore')

In [10]:
# `sys` is imported explicitly so this cell does not depend on the star import re-exporting it.
import sys

# fastai star import: later cells use names that are not imported anywhere else in this
# notebook (Path, pd, np, os, partial, models, cnn_learner, ...) and presumably come from here
# — verify against the installed fastai version.
from fastai.vision import *

# Make modules under dev/ importable (presumably where `metric_utils` lives — confirm).
# Guarded so that re-running the cell does not append duplicate entries.
if "dev/" not in sys.path:
    sys.path.append("dev/")

In [11]:
# Root data directory, relative to the notebook's working directory.
# Every other location used below (train/test images, models, preds, submissions) is built from it.
path = Path('../data/histopathologic/')

In [12]:
# Show what is inside the data root.
# NOTE(review): `.ls()` is not a stdlib pathlib method — presumably added to Path by fastai; confirm.
path.ls()

[PosixPath('../data/histopathologic/train'),
 PosixPath('../data/histopathologic/sample_submission.csv'),
 PosixPath('../data/histopathologic/preds'),
 PosixPath('../data/histopathologic/submissions'),
 PosixPath('../data/histopathologic/train_labels.csv'),
 PosixPath('../data/histopathologic/logs'),
 PosixPath('../data/histopathologic/models'),
 PosixPath('../data/histopathologic/test')]

In [13]:
# Number of entries in the train and test folders, in that order.
tuple(len((path/split).ls()) for split in ('train', 'test'))

(220025, 57458)

### Overview

Main folders (all under the data root `path`): `models/best_of_{model_name}`, `preds/best_of_{model_name}`, `submissions/best_of_{model_name}`

### cv data

In [7]:
# Disabled one-off setup: defines the 5-fold split over the training files and loads the label table.
# The per-fold data derived from it is cached to models/cv_data.pkl and read back in the
# "kfold training" section, so this only needs re-enabling to regenerate that cache.
# NOTE(review): KFold only honours random_state when shuffle=True; as written the folds would be
# unshuffled, and recent scikit-learn versions reject this argument combination — fix before re-enabling.
# NOTE(review): kfold.split(...) returns a one-shot generator; it can only be iterated once.
# from sklearn.model_selection import KFold
# kfold = KFold(5, random_state=42)
# n_train = len((path/'train').ls())
# cv_idxs = kfold.split(X=range(n_train))
# df = pd.read_csv(path/'train_labels.csv')

In [8]:
# Disabled helper: builds the data object for one fold.
#   trn_idx / val_idx: row positions in `df` used for training / validation.
# Images come from the train folder (.tif), are split by `val_idx`, labelled from df['label'],
# transformed with get_transforms (rotation, lighting and warp turned off), normalised with
# imagenet_stats, and the test folder is attached as the test set.
# def get_data(trn_idx, val_idx):
#     tfms = get_transforms(True, True, max_rotate=None, max_zoom=1., max_lighting=None,
#                       max_warp=0, p_affine=0.5, p_lighting=0.5, xtra_tfms=[])
#     data = (ImageList.from_df(df=df, path=path, folder='train', suffix='.tif')
#         .split_by_idx(valid_idx=val_idx)
#         .label_from_lists(train_labels=list(df['label'].iloc[trn_idx].values),
#                                         valid_labels=list(df['label'].iloc[val_idx].values))
#         .transform(tfms)
#         .databunch()
#         .normalize(imagenet_stats))
#     data.add_test(ImageList.from_folder(path/'test'))
#     return data

In [15]:
# Disabled: build the data object for every fold and cache the whole list as models/cv_data.pkl.
# The "kfold training" section loads this pickle instead of rebuilding the folds.
# cv_data = [get_data(*idxs) for idxs in cv_idxs]
# pd.to_pickle(cv_data, path/'models/cv_data.pkl')

### kfold training

In [14]:
from metric_utils import AUC
from fastai.callbacks import *

In [15]:
# Architecture to train; its function name is reused as the tag in model/pred/submission paths.
arch = models.densenet169
model_name = arch.__name__

# Cached per-fold data objects (the list is iterated fold by fold in the training loop).
cv_data = pd.read_pickle(path/'models/cv_data.pkl')

print(f"Starting Training with model: {model_name}")

Starting Training with model: densenet169


In [12]:
def _suggest_lr(learn):
    """Run the LR finder on `learn` and return the recorder's suggested learning rate.

    The plot is first requested with `k=5`; if that call raises, it is retried without
    `k` (presumably for fastai versions whose `plot` has no such argument — confirm).
    The suggestion is then read from `learn.recorder.min_grad_lr`.
    """
    learn.lr_find()
    try:
        learn.recorder.plot(suggestion=True, k=5)
    except Exception:
        # `except Exception` rather than a bare `except:` so that interrupting the kernel
        # (KeyboardInterrupt) is not swallowed and turned into a second plot call.
        learn.recorder.plot(suggestion=True)
    return learn.recorder.min_grad_lr


# Checkpoints of every fold are saved under models/best_of_{model_name}; create it once up front.
os.makedirs(path/f'models/best_of_{model_name}', exist_ok=True)

# Train with kfold models: one learner per fold, three training stages each.
for fold_num, fold_data in enumerate(cv_data):

    # Initialize Learner
    print(f"Initialize Learner at fold{fold_num}")
    auc = AUC()
    learn_callbacks = [TerminateOnNaNCallback()]
    learn_callback_fns = [
        # stop a stage once the monitored 'auc' has not improved for 2 epochs
        partial(EarlyStoppingCallback, monitor='auc', mode='max', patience=2),
        # keep the best-'auc' weights of this fold under best_of_{model_name}/fold{fold_num}
        partial(SaveModelCallback, monitor='auc', mode='max', every='improvement',
                name=f'best_of_{model_name}/fold{fold_num}'),
        # One log file per fold: with a shared file name every fold overwrote the previous fold's log.
        # NOTE(review): each fit call appears to reopen the log, so possibly only the last
        # stage's history survives — confirm against the installed CSVLogger.
        partial(CSVLogger, filename=f'logs/{model_name}_fold{fold_num}'),
    ]
    learn = cnn_learner(data=fold_data, base_arch=arch, metrics=[accuracy, auc], callbacks=learn_callbacks,
                        callback_fns=learn_callback_fns)

    # Stage-1 training: learner as created by cnn_learner, single learning rate
    print("Stage-1")
    max_lr = _suggest_lr(learn)
    print(f"Stage-1 training with lr={max_lr}")
    learn.fit_one_cycle(50, max_lr=max_lr)

    # Stage-2 training: freeze_to(1), then one learning rate per layer group
    # (assumes the learner has exactly three layer groups — confirm)
    print("Stage-2")
    learn.freeze_to(1)
    max_lr = _suggest_lr(learn)
    print(f"Stage-2 training with lr={max_lr}")
    learn.fit_one_cycle(20, max_lr=[max_lr/10, max_lr/3, max_lr])

    # Stage-3 training: everything unfrozen, same learning-rate spread
    print("Stage-3")
    learn.unfreeze()
    max_lr = _suggest_lr(learn)
    print(f"Stage-3 training with lr={max_lr}")
    learn.fit_one_cycle(5, max_lr=[max_lr/10, max_lr/3, max_lr])

    # Release the learner before the next fold; `learn` is unusable after this call.
    print(f"Training of fold{fold_num} model is done...destroying learner")
    learn.destroy()

0
1
2
3
4


### get_preds

In [None]:
# create preds and submissions directory for the model
os.makedirs(path/f'preds/best_of_{model_name}', exist_ok=True)
os.makedirs(path/f"submissions/best_of_{model_name}", exist_ok=True)

# The training loop finishes every fold with learn.destroy(), so no usable learner is left over.
# Build a fresh inference learner instead; it is created without TerminateOnNaNCallback, which
# replaces the old callback filter (that filter compared against `TerminateOnNaNCallback.__class__`,
# i.e. `type`, and therefore never matched the way its comment described).
# Assumes every fold's data object carries the same test set — confirm against how cv_data was built.
learn = cnn_learner(data=cv_data[0], base_arch=arch)

# One prediction file per trained fold (was hard-coded to 5).
for load_fold_num in range(len(cv_data)):
    # load the best checkpoint saved for this fold during training
    learn.load(f'best_of_{model_name}/fold{load_fold_num}');

    # get preds on the test set
    test_preds, _ = learn.get_preds(ds_type=DatasetType.Test)

    # save preds as pickle file
    pd.to_pickle(test_preds, path/f'preds/best_of_{model_name}/fold{load_fold_num}_preds.pkl')
    

### get_preds with TTA

In [47]:
# Make sure the output folder exists even when the plain get_preds cell was not run.
os.makedirs(path/f'preds/best_of_{model_name}', exist_ok=True)

# The training loop finishes every fold with learn.destroy(), so build a fresh inference learner.
# It is created without TerminateOnNaNCallback, which replaces the old callback filter (that
# filter compared against `TerminateOnNaNCallback.__class__`, i.e. `type`, and never matched as intended).
# Assumes every fold's data object carries the same test set — confirm against how cv_data was built.
learn = cnn_learner(data=cv_data[0], base_arch=arch)

# One TTA prediction file per trained fold (was hard-coded to 5).
for load_fold_num in range(len(cv_data)):
    # load the best checkpoint saved for this fold during training
    learn.load(f'best_of_{model_name}/fold{load_fold_num}');

    # get test-time-augmented preds on the test set
    tta_preds, _ = learn.TTA(ds_type=DatasetType.Test)

    # save preds as pickle file
    pd.to_pickle(tta_preds, path/f'preds/best_of_{model_name}/fold{load_fold_num}_TTA_preds.pkl')
    

### create submission

In [None]:
# Submission template shipped with the data.
sample_submission = pd.read_csv(path/'sample_submission.csv')

# Test-image ids: each file name up to its first '.', in the order of the learner's test set.
test_names = []
for test_item in learn.data.test_ds.items:
    test_names.append(test_item.name.split('.')[0])

In [23]:
# Every prediction file stored for the current model (plain and TTA together).
preds_dir = path/f'preds/best_of_{model_name}'
pred_fnames = preds_dir.ls()

In [61]:
# Split the prediction files into TTA and plain ones.
# The check is made on the file name only: testing the whole path would put every file in the
# TTA group if any parent directory (e.g. the model name) happened to contain "TTA".
TTA_fnames = [o for o in pred_fnames if "TTA" in o.name]
non_TTA_fnames = [o for o in pred_fnames if "TTA" not in o.name]

In [62]:
# Sanity check: the TTA prediction files that get averaged below.
TTA_fnames

[PosixPath('../data/histopathologic/preds/best_of_vgg16_bn/fold2_TTA_preds.pkl'),
 PosixPath('../data/histopathologic/preds/best_of_vgg16_bn/fold3_TTA_preds.pkl'),
 PosixPath('../data/histopathologic/preds/best_of_vgg16_bn/fold1_TTA_preds.pkl'),
 PosixPath('../data/histopathologic/preds/best_of_vgg16_bn/fold0_TTA_preds.pkl'),
 PosixPath('../data/histopathologic/preds/best_of_vgg16_bn/fold4_TTA_preds.pkl')]

In [63]:
# Sanity check: the plain (non-TTA) prediction files that get averaged below.
non_TTA_fnames

[PosixPath('../data/histopathologic/preds/best_of_vgg16_bn/fold0_preds.pkl'),
 PosixPath('../data/histopathologic/preds/best_of_vgg16_bn/fold1_preds.pkl'),
 PosixPath('../data/histopathologic/preds/best_of_vgg16_bn/fold4_preds.pkl'),
 PosixPath('../data/histopathologic/preds/best_of_vgg16_bn/fold3_preds.pkl'),
 PosixPath('../data/histopathologic/preds/best_of_vgg16_bn/fold2_preds.pkl')]

In [71]:
def _mean_class1_probs(fnames):
    """Average the pickled per-fold predictions in `fnames` and return column 1 of the mean.

    Each file is read with pd.read_pickle and converted with `.numpy()`; the element-wise
    mean over files is taken and its second column returned (presumably the positive-class
    probability — confirm against the label encoding).

    Raises:
        ValueError: if `fnames` is empty. Previously this surfaced as an opaque IndexError
            on a NaN scalar.
    """
    fnames = list(fnames)
    if not fnames:
        raise ValueError("no prediction files to average - run the get_preds cells first")
    # NOTE: read_pickle unpickles arbitrary objects; only point it at files written by this notebook.
    per_fold = [pd.read_pickle(fn).numpy() for fn in fnames]
    return np.mean(per_fold, axis=0)[:, 1]


# average TTA_preds and plain preds across folds
avg_TTA_labels = _mean_class1_probs(TTA_fnames)
avg_non_TTA_labels = _mean_class1_probs(non_TTA_fnames)

In [76]:
# create submission file
# Look-up tables from test-image id to averaged prediction, one per prediction flavour.
tta_label_by_id = dict(zip(test_names, avg_TTA_labels))
plain_label_by_id = dict(zip(test_names, avg_non_TTA_labels))

# Start from the template (so ids keep its order) and fill 'label' by id.
avg_TTA_submission = sample_submission.copy()
avg_TTA_submission['label'] = sample_submission['id'].map(tta_label_by_id)

avg_non_TTA_submission = sample_submission.copy()
avg_non_TTA_submission['label'] = sample_submission['id'].map(plain_label_by_id)

In [78]:
# save submissions
avg_TTA_path = path/f"submissions/best_of_{model_name}/{model_name}_avg_TTA.csv"
avg_non_TTA_path = path/f"submissions/best_of_{model_name}/{model_name}_avg_non_TTA.csv"

# Create the target folder here too, so saving does not depend on the get_preds cell
# (the only other place that creates it) having been run in this session.
os.makedirs(avg_TTA_path.parent, exist_ok=True)

# index=False keeps the DataFrame index out of the CSV.
avg_TTA_submission.to_csv(avg_TTA_path, index=False)
avg_non_TTA_submission.to_csv(avg_non_TTA_path, index=False)

In [79]:
# Show where the TTA submission file was written.
avg_TTA_path

PosixPath('../data/histopathologic/submissions/best_of_vgg16_bn/vgg16_bn_avg_TTA.csv')

In [None]:
# submit to Kaggle
!kaggle competitions submit -c histopathologic-cancer-detection -f {avg_TTA_path} -m f"{model_name}_avg_TTA"
!kaggle competitions submit -c histopathologic-cancer-detection -f {avg_non_TTA_path} -m f"{model_name}_avg_non_TTA"