# Imports

In [1]:
import os
import time
import skimage
import numpy as np
import pandas as pd
from PIL import Image
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
from sklearn.metrics import cohen_kappa_score
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import transforms
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset
from warmup_scheduler import GradualWarmupScheduler
from torch.utils.data.sampler import SubsetRandomSampler, RandomSampler, SequentialSampler
from efficientnet_pytorch import EfficientNet

# Configs

In [2]:
# Root folder of the competition data; everything below is resolved relative to it.
data_dir = '../data/'

# Folder holding the pre-extracted tiles (one JPEG per tile, see PANDADataset.read_tiles).
image_folder = os.path.join(data_dir, 'train_images_tiles_36_256x256')

# Per-biopsy metadata table (image_id, data_provider, isup_grade, gleason_score).
train_csv_path = os.path.join(data_dir, 'train.csv')
df_biopsy = pd.read_csv(train_csv_path)

# Experiment identity: used to name the log file and the saved checkpoints.
kernel_type = 'efficientnet-b0-36_256x256'
# Backbone name handed to EfficientNet.from_pretrained.
enet_type = 'efficientnet-b0'
# Index of the fold held out for validation; the remaining folds are used for training.
fold = 0
tile_size = 256
image_size = 256
# Number of tiles stitched into one input image per biopsy.
n_tiles = 36
batch_size = 2
num_workers = 16
# Number of output logits; also the length of the cumulative label vector.
out_dim = 5
# Target learning rate; the optimizer starts at init_lr / warmup_factor.
init_lr = 3e-4
warmup_factor = 10

warmup_epo = 1
n_epochs = 30

# Seed the RNGs used for sampling/augmentation so runs are repeatable
# (best effort: DataLoader workers and cuDNN may still introduce variation).
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)

# Fall back to the CPU instead of crashing when no CUDA device is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# K-fold CV

In [3]:
# Assign every row to exactly one validation fold, stratified on `isup_grade`.
# The fixed random_state keeps the fold assignment identical across runs.
skf = StratifiedKFold(5, shuffle=True, random_state=42)
df_biopsy['fold'] = -1
fold_col = df_biopsy.columns.get_loc('fold')
for i, (train_idx, valid_idx) in enumerate(skf.split(df_biopsy, df_biopsy['isup_grade'])):
    # split() yields *positional* indices, so write through .iloc; label-based
    # .loc would only be correct while the frame keeps its default RangeIndex.
    df_biopsy.iloc[valid_idx, fold_col] = i
df_biopsy.head()

Unnamed: 0,image_id,data_provider,isup_grade,gleason_score,fold
0,0005f7aaab2800f6170c399693a96917,karolinska,0,0+0,4
1,000920ad0b612851f8e01bcc880d9b3d,karolinska,0,0+0,0
2,0018ae58b01bdadc8e347995b69f99aa,radboud,4,4+4,3
3,001c62abd11fa4b57bf7a6c603a11bb9,karolinska,4,4+4,4
4,001d865e65ef5d2579c190a0e0350d8f,karolinska,0,0+0,4


# Model

In [4]:
class enetv2(nn.Module):
    """Thin wrapper around an EfficientNet backbone.

    Args:
        enet_type: Model name passed to ``EfficientNet.from_pretrained``
            (e.g. ``'efficientnet-b0'``).
        out_dim: Value passed as ``num_classes`` to the backbone, i.e. the
            number of logits produced per input image.
    """

    def __init__(self, enet_type, out_dim):
        super().__init__()
        self.enet = EfficientNet.from_pretrained(enet_type, num_classes=out_dim)

    def forward(self, x):
        """Return the backbone's raw output for the batch ``x`` (no activation applied here)."""
        return self.enet(x)

# Dataset

In [5]:
class PANDADataset(Dataset):
    """Dataset that stitches the pre-extracted tiles of one biopsy into a single image.

    Each item is a ``(image, label)`` pair where ``image`` is the tile grid
    (optionally transformed) and ``label`` is a cumulative binary vector with
    the first ``isup_grade`` entries set to 1.

    Args:
        df: Table with at least ``image_id`` and ``isup_grade`` columns.
            Its index is reset so items are addressed positionally.
        image_folder: Folder containing tiles named ``{image_id}_{i}.jpeg``.
        n_tiles: Number of tiles read per image. The grid is
            ``int(sqrt(n_tiles))`` tiles per side, so tiles beyond the largest
            square are read but not used.
        transform: Optional callable applied to the stitched PIL image.
        out_dim: Length of the label vector. When ``None`` (the default) the
            notebook-level ``out_dim`` is looked up at item-access time, which
            matches the previous behaviour.
    """

    def __init__(self,
                 df,
                 image_folder,
                 n_tiles,
                 transform=None,
                 out_dim=None,
                ):

        self.df = df.reset_index(drop=True)
        self.image_folder = image_folder
        self.n_tiles = n_tiles
        self.transform = transform
        self.out_dim = out_dim

    def __len__(self):
        return self.df.shape[0]

    def read_tiles(self, img_id):
        """Read the ``n_tiles`` tile images belonging to ``img_id``, in tile order."""
        # Import the submodule explicitly: a bare `import skimage` does not
        # guarantee that `skimage.io` is loaded on every scikit-image version.
        from skimage import io as skio

        return [
            skio.imread(os.path.join(self.image_folder,
                                     '{}_{}.jpeg'.format(img_id, i)))
            for i in range(self.n_tiles)
        ]

    @staticmethod
    def _stitch_tiles(tiles, n_rows):
        """Lay the first ``n_rows * n_rows`` tiles out row by row into one array."""
        rows = [
            np.concatenate(tiles[n_rows * r:n_rows * (r + 1)], axis=1)
            for r in range(n_rows)
        ]
        return np.concatenate(rows, axis=0)

    @staticmethod
    def _encode_label(grade, n_bins):
        """Return a float32 tensor of length ``n_bins`` whose first ``grade`` entries are 1."""
        label = np.zeros(n_bins).astype(np.float32)
        label[:grade] = 1.
        return torch.tensor(label)

    def __getitem__(self, index):
        row = self.df.iloc[index]
        tiles = self.read_tiles(row.image_id)

        n_rows = int(np.sqrt(self.n_tiles))
        tiled_image = Image.fromarray(self._stitch_tiles(tiles, n_rows))

        if self.transform is not None:
            tiled_image = self.transform(tiled_image)

        # Prefer the explicitly configured width; otherwise use the notebook global.
        n_bins = self.out_dim if self.out_dim is not None else out_dim
        return tiled_image, self._encode_label(row.isup_grade, n_bins)

# Transformations

In [6]:
# Per-channel statistics handed to transforms.Normalize in the transform pipelines.
# NOTE(review): Normalize runs after ToTensor, so inputs are presumably in [0, 1];
# for such data with these means the std cannot exceed sqrt(m * (1 - m)) (~0.29-0.39),
# yet every value below is larger. The std may have been computed on a different
# scale or with a different formula -- confirm before reusing these numbers.
mean = [
    0.90949707,
    0.8188697,
    0.87795304,
]
std = [
    0.36357649,
    0.49984502,
    0.40477625,
]

In [7]:
def _normalized_tensor_steps():
    """Build the steps shared by both pipelines: PIL image -> tensor -> normalised tensor."""
    return [
        transforms.ToTensor(),
        transforms.Normalize(mean, std),
    ]


# Training: random horizontal/vertical flips (each with probability 0.5),
# then tensor conversion and normalisation.
transform_train = transforms.Compose(
    [
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.RandomVerticalFlip(p=0.5),
    ]
    + _normalized_tensor_steps()
)

# Validation: no augmentation, only tensor conversion and normalisation.
transform_val = transforms.Compose(_normalized_tensor_steps())

# Creating the model, dataloaders, and optimizer

In [8]:
# Hold out fold `fold` for validation and train on all remaining folds.
train_idx = np.where((df_biopsy['fold'] != fold))[0]
valid_idx = np.where((df_biopsy['fold'] == fold))[0]

# np.where returns positional indices, so select rows with .iloc; label-based
# .loc would only be correct while df_biopsy keeps its default RangeIndex.
df_train = df_biopsy.iloc[train_idx]
df_valid = df_biopsy.iloc[valid_idx]

dataset_train = PANDADataset(df_train, image_folder, n_tiles, transform=transform_train)
dataset_valid = PANDADataset(df_valid, image_folder, n_tiles, transform=transform_val)

# Training batches are drawn in random order; validation keeps the row order of
# df_valid, which val_epoch relies on when it masks predictions by data provider.
train_loader = DataLoader(dataset_train, 
                          batch_size=batch_size, 
                          sampler=RandomSampler(dataset_train),
                          num_workers=num_workers)
valid_loader = DataLoader(dataset_valid,
                          batch_size=batch_size,
                          sampler=SequentialSampler(dataset_valid),
                          num_workers=num_workers)

model = enetv2(enet_type, out_dim=out_dim)
model = model.to(device)

# The optimizer starts at init_lr / warmup_factor. The warm-up scheduler is
# configured with multiplier=warmup_factor over warmup_epo epochs and then hands
# over to cosine annealing for the remaining n_epochs - warmup_epo epochs.
# NOTE(review): presumably the warm-up ends at init_lr -- confirm against the
# GradualWarmupScheduler implementation in use.
optimizer = optim.Adam(model.parameters(), lr=init_lr/warmup_factor)
scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, n_epochs-warmup_epo)
scheduler = GradualWarmupScheduler(optimizer, multiplier=warmup_factor, \
                                   total_epoch=warmup_epo, after_scheduler=scheduler_cosine)

# Binary cross-entropy on raw logits, matching the cumulative 0/1 label vectors
# produced by PANDADataset.
criterion = nn.BCEWithLogitsLoss()

print("Number of train samples : {}".format(len(dataset_train)))
print("Number of validation samples : {}".format(len(dataset_valid)))

Loaded pretrained weights for efficientnet-b0
Number of train samples : 8492
Number of validation samples : 2124


# Training and validation

In [9]:
def train_epoch(loader, optimizer):
    """Run one optimisation pass over ``loader`` with the notebook-level ``model``.

    Uses the notebook globals ``model``, ``criterion`` and ``device``.

    Args:
        loader: Iterable yielding ``(data, target)`` batches.
        optimizer: Optimizer stepped once per batch.

    Returns:
        List with the loss of every batch, in iteration order.
    """
    model.train()
    train_loss = []
    progress = tqdm(loader)
    for batch, labels in progress:
        batch = batch.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        loss = criterion(model(batch), labels)
        loss.backward()
        optimizer.step()

        step_loss = loss.detach().cpu().numpy()
        train_loss.append(step_loss)

        # Running average over at most the last 100 batches, shown in the progress bar.
        recent = train_loss[-100:]
        smooth_loss = sum(recent) / len(recent)
        progress.set_description('loss: %.5f, smth: %.5f' % (step_loss, smooth_loss))
    return train_loss


def val_epoch(loader):
    """Evaluate the notebook-level ``model`` on ``loader`` without gradient tracking.

    Uses the notebook globals ``model``, ``criterion``, ``device`` and
    ``df_valid``. Predictions are masked by ``df_valid['data_provider']``, so
    ``loader`` must yield samples in the row order of ``df_valid``.

    Args:
        loader: Iterable yielding ``(data, target)`` batches.

    Returns:
        Tuple ``(val_loss, acc, qwk)``: mean batch loss, exact-match accuracy
        in percent, and quadratic-weighted kappa over all samples.
    """
    model.eval()
    val_loss = []
    PREDS = []
    TARGETS = []

    with torch.no_grad():
        for batch, labels in tqdm(loader):
            batch = batch.to(device)
            labels = labels.to(device)
            logits = model(batch)

            val_loss.append(criterion(logits, labels).detach().cpu().numpy())
            # Predicted grade = rounded sum of per-logit sigmoids;
            # true grade = number of ones in the cumulative label vector.
            PREDS.append(logits.sigmoid().sum(1).detach().round())
            TARGETS.append(labels.sum(1))
        val_loss = np.mean(val_loss)

    PREDS = torch.cat(PREDS).cpu().numpy()
    TARGETS = torch.cat(TARGETS).cpu().numpy()
    acc = (PREDS == TARGETS).mean() * 100.

    qwk = cohen_kappa_score(PREDS, TARGETS, weights='quadratic')

    # Same metric restricted to each data provider.
    provider_qwk = {}
    for provider in ('karolinska', 'radboud'):
        from_provider = df_valid['data_provider'] == provider
        provider_qwk[provider] = cohen_kappa_score(
            PREDS[from_provider],
            df_valid[from_provider].isup_grade.values,
            weights='quadratic',
        )
    print('qwk', qwk, 'qwk_k', provider_qwk['karolinska'], 'qwk_r', provider_qwk['radboud'])

    return val_loss, acc, qwk


In [10]:
best_model = '{}_fold-{}_best.pth'.format(kernel_type, fold)
final_model = '{}_fold-{}_final.pth'.format(kernel_type, fold) 
save_path = '../trained_models'
# Create the checkpoint folder up front; otherwise the first torch.save would
# fail only after a full epoch of training has already been spent.
os.makedirs(save_path, exist_ok=True)

# Best validation kappa seen so far; a checkpoint is written only when it is exceeded.
qwk_max = 0.
for epoch in range(1, n_epochs+1):
    print(time.ctime(), 'Epoch:', epoch)
    # Set the learning rate for this epoch (0-based epoch index) before training.
    scheduler.step(epoch-1)

    train_loss = train_epoch(train_loader, optimizer)
    val_loss, acc, qwk = val_epoch(valid_loader)

    content = "{}, Epoch {}, lr: {:.7f}, train loss: {:.5f}," \
              " val loss: {:.5f}, acc: {:.5f}, qwk: {:.5f}".format(
                  time.ctime(), epoch, optimizer.param_groups[0]["lr"], 
                  np.mean(train_loss), np.mean(val_loss), acc, qwk
              )
    print(content)
    
    # Append the epoch summary to a per-experiment log file in the working directory.
    with open('log_{}_fold-{}.txt'.format(kernel_type, fold), 'a') as appender:
        appender.write(content + '\n')

    if qwk > qwk_max:
        print('score2 ({:.6f} --> {:.6f}).  Saving model ...'.format(qwk_max, qwk))
        torch.save(model.state_dict(), os.path.join(save_path, best_model))
        qwk_max = qwk

# Always keep the weights from the last epoch as well.
torch.save(model.state_dict(), os.path.join(save_path, final_model))

Sun Jun 14 02:40:05 2020 Epoch: 1




HBox(children=(FloatProgress(value=0.0, max=4246.0), HTML(value='')))




KeyboardInterrupt: 