<a href="https://colab.research.google.com/github/tanish-g/SIIM-ISIC-MELANOMA/blob/master/gold_medal_approach.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!nvidia-smi -L

In [None]:
import torch
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter()

In [None]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's `random`, NumPy, and PyTorch (CPU and all CUDA devices),
    exports PYTHONHASHSEED, and puts cuDNN into deterministic mode.

    Args:
        seed: integer seed applied to every generator.
    """
    # Imported locally: this cell runs before the notebook's main import cell,
    # and `random` is not imported anywhere else in the notebook, so relying on
    # module-level names raised NameError on a fresh kernel.
    import os
    import random

    import numpy as np

    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # manual_seed_all covers every visible GPU, not only the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels between runs; keep it off.
    torch.backends.cudnn.benchmark = False
seed_everything(42)

In [None]:
# memory footprint support libraries/code
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip install gputil
!pip install psutil
!pip install humanize
import psutil
import humanize
import os
import GPUtil as GPU
GPUs = GPU.getGPUs()

# XXX: only one GPU on Colab and it isn't guaranteed, so keep `gpu` as None
# instead of failing with IndexError on GPUs[0] when the runtime has no GPU.
gpu = GPUs[0] if GPUs else None
def printm():
    """Print free host RAM, this process's resident size, and GPU memory statistics.

    When no GPU was detected, a short notice is printed instead of the GPU line.
    """
    process = psutil.Process(os.getpid())
    print("Gen RAM Free: " + humanize.naturalsize( psutil.virtual_memory().available ), " | Proc size: " + humanize.naturalsize( process.memory_info().rss))
    if gpu is None:
        print("GPU RAM: no GPU detected")
        return
    print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
printm()

In [None]:
import os
from getpass import getpass

# Credentials must never be stored in the notebook: anything committed here is
# readable by everyone with access to the repository. The key that used to be
# hard-coded in this cell should be revoked and regenerated on Kaggle.
# Values already present in the environment are kept; otherwise they are asked
# for interactively (the key is read without echoing it).
if not os.environ.get('KAGGLE_USERNAME'):
    os.environ['KAGGLE_USERNAME'] = input('Kaggle username: ')
if not os.environ.get('KAGGLE_KEY'):
    os.environ['KAGGLE_KEY'] = getpass('Kaggle API key: ')

In [None]:
# Download the cdeotte/jpeg-melanoma-512x512 Kaggle dataset into its own folder
# under /content, unpack it in place, then delete the archive.
# presumably authenticated through the KAGGLE_* environment variables set in an
# earlier cell — verify.
# NOTE(review): `mkdir` reports an error if the folder already exists (e.g. on a
# re-run of this cell); the remaining commands still execute.
!mkdir /content/jpeg-melanoma-512x512/
%cd /content/jpeg-melanoma-512x512/
!kaggle datasets download -d cdeotte/jpeg-melanoma-512x512
!unzip -qq /content/jpeg-melanoma-512x512/jpeg-melanoma-512x512.zip
!rm -r /content/jpeg-melanoma-512x512/jpeg-melanoma-512x512.zip

In [None]:
# Download the external cdeotte/jpeg-isic2019-512x512 Kaggle dataset into its own
# folder under /content, unpack it in place, delete the archive, and return to
# /content so later relative paths resolve from there.
# NOTE(review): `mkdir` reports an error if the folder already exists (e.g. on a
# re-run of this cell); the remaining commands still execute.
%cd /content/
!mkdir /content/jpeg-isic2019-512x512/
%cd /content/jpeg-isic2019-512x512/
!kaggle datasets download -d cdeotte/jpeg-isic2019-512x512
!unzip -qq /content/jpeg-isic2019-512x512/jpeg-isic2019-512x512.zip
!rm -r /content/jpeg-isic2019-512x512/jpeg-isic2019-512x512.zip
%cd /content/

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install efficientnet-pytorch --quiet

In [None]:
from pathlib import Path
import pandas as pd
from torch.utils.data import Dataset,DataLoader
from PIL import Image
from torchvision import transforms as T
import torch.nn as nn
import torch
import torch.nn.functional as F
from sklearn.model_selection import GroupKFold
import numpy as np
from fastprogress.fastprogress import master_bar, progress_bar
from sklearn.metrics import accuracy_score, roc_auc_score
from efficientnet_pytorch import EfficientNet
from torchvision import models
import pdb
import albumentations as A
from albumentations.pytorch.transforms import ToTensor
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
from tqdm import tqdm
import gc
import pickle 

In [None]:
def list_files(path:Path):
    """Return every entry directly inside `path` as a list, in `iterdir()` order."""
    return list(path.iterdir())

In [None]:
train_df=pd.read_csv('x.csv')

In [None]:
# Split the combined metadata by the `year` column, re-indexing each subset from 0.
tr2020=train_df[train_df.year==2020].reset_index(drop=True)
# NOTE(review): the variable is called tr2019 (and its images are later read from
# the isic2019 folder) but the filter selects year == 2018 — confirm this is the
# intended label for the external data.
tr2019=train_df[train_df.year==2018].reset_index(drop=True)

In [None]:
# Turn bare image ids into absolute JPEG paths in one vectorised step; the
# previous per-row .loc/.iloc loop built a Series for every row and was very slow.
# Not idempotent: re-running this cell prefixes the paths a second time.
tr2020['image_name']='/content/jpeg-melanoma-512x512/train/'+tr2020['image_name']+'.jpg'

In [None]:
# Turn bare image ids into absolute JPEG paths in one vectorised step; the
# previous per-row .loc/.iloc loop built a Series for every row and was very slow.
# Not idempotent: re-running this cell prefixes the paths a second time.
tr2019['image_name']='/content/jpeg-isic2019-512x512/train/'+tr2019['image_name']+'.jpg'

In [None]:
tr_concat=pd.concat([tr2020,tr2019],join='inner',axis=0).reset_index(drop=True)
tr_concat

In [None]:
#tr_concat.drop(columns=['Unnamed: 0'],inplace=True)

In [None]:
test_df=pd.read_csv('/content/jpeg-melanoma-512x512/test.csv')
# Turn bare image ids into absolute JPEG paths in one vectorised step instead of
# assigning row by row with .loc/.iloc.
test_df['image_name']='/content/jpeg-melanoma-512x512/test/'+test_df['image_name']+'.jpg'

# Data Exploration

In [None]:
path = Path('/content/jpeg-melanoma-512x512/')
df_path = Path('/content/jpeg-melanoma-512x512/')
im_sz = 512
bs = 16

In [None]:
train_fnames = list_files(path/'train')
df = pd.read_csv(df_path/'train.csv')
df.head()

The dataset contains around 35K images out of which only 584 images are malignant. That makes only 1.8% of the total dataset.

In [None]:
df.target.value_counts(),df.shape

In [None]:
def show_samples(frame, target, title, n_rows=2, n_cols=5):
    """Print `title` and show the first images of one class in a grid.

    Args:
        frame: DataFrame with `target` and `image_name` (bare image id) columns.
        target: class value to select from the `target` column.
        title: text printed above the figure.
        n_rows, n_cols: grid shape; at most n_rows * n_cols images are drawn.
    """
    print(title)
    names = frame[frame.target==target]['image_name'].values
    _, axs = plt.subplots(n_rows, n_cols, figsize=(20, 8))
    for f_name, ax in zip(names[:n_rows*n_cols], axs.flatten()):
        # The context manager closes the file once imshow has copied the pixels.
        with Image.open(path/f'train/{f_name}.jpg') as img:
            ax.imshow(img)
        ax.axis('off')
    plt.show()

show_samples(df, 1, "Samples with Melanoma")
show_samples(df, 0, "Samples without Melanoma")


In [None]:
!git clone https://github.com/4uiiurz1/pytorch-auto-augment > /dev/null

In [None]:
import sys
sys.path.insert(0, './pytorch-auto-augment')
from auto_augment import AutoAugment, Cutout

In [None]:
#AUTO AUGMENT
def get_augmentations(p=0.5):
    """Build the torchvision/AutoAugment train and test pipelines.

    Both pipelines operate on PIL images and end with ToTensor + ImageNet
    normalisation; the training pipeline adds random flips, AutoAugment and
    Cutout in front. `p` is accepted for signature compatibility but not used.
    Note: a later cell defines another `get_augmentations` that replaces this one.

    Returns:
        (train_tfms, test_tfms) as `transforms.Compose` objects.
    """
    imagenet_stats = {'mean':[0.485, 0.456, 0.406], 'std':[0.229, 0.224, 0.225]}

    def _to_normalized_tensor():
        # Fresh instances per pipeline, as in a hand-written Compose list.
        return [
            transforms.ToTensor(),
            transforms.Normalize(mean=imagenet_stats['mean'],std=imagenet_stats['std']),
        ]

    random_ops = [
        transforms.RandomHorizontalFlip(),
        transforms.RandomVerticalFlip(),
        AutoAugment(),
        Cutout(),
    ]
    train_tfms = transforms.Compose(random_ops + _to_normalized_tensor())
    test_tfms = transforms.Compose(_to_normalized_tensor())
    return train_tfms, test_tfms

# Data Augmentation

We use albumentations to perform augmentations. Malignant samples are scarce even though this notebook also adds the external ISIC 2019 images, so heavier augmentation can be helpful. You can change the `p` argument of `get_augmentations()` to increase or decrease how often the random augmentations are applied.

In [None]:
def get_augmentations(p=0.5):
    """Build the albumentations train and test pipelines.

    Args:
        p: probability used for Cutout, Flip and RandomRotate90.

    Returns:
        (train_tfms, test_tfms) as `A.Compose` objects. The test pipeline only
        converts to a tensor normalised with ImageNet statistics; it does not resize.
    """
    imagenet_stats = {'mean':[0.485, 0.456, 0.406], 'std':[0.229, 0.224, 0.225]}
    side = 384
    train_ops = [
        A.RandomSizedCrop(min_max_height=(64,side),height=side,width=side,p=0.7),
        # Guarantees the output size on the 30% of samples where the crop is skipped.
        A.Resize(side,side,always_apply=1,p=1),
        A.Cutout(num_holes=8,max_h_size=64,max_w_size=64,p=p),
        A.IAAAffine(shear=2.0,rotate=180),
        # NOTE(review): these limits look like fractions; confirm the units
        # albumentations expects for uint8 images.
        A.HueSaturationValue(hue_shift_limit=0.2,sat_shift_limit=0.1,val_shift_limit=0.1),
        A.Flip(p=p),
        A.RandomRotate90(p=p),
        ToTensor(normalize=imagenet_stats),
    ]
    train_tfms = A.Compose(train_ops)
    test_tfms = A.Compose([ToTensor(normalize=imagenet_stats)])
    return train_tfms, test_tfms

# Train/Validation split
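- We use a simple 80/20 split of the 2020 data based on the `kfold` column of the triple stratified K-Fold split: 2020 images with `kfold == 4` form the validation set.
- Duplicate images are assumed to have been removed when the input CSV was prepared; the function below does not drop any rows itself.
- 2018 images with `kfold == 0` are kept aside as a separate hold-out set; all remaining 2018 and 2020 images go into the train split.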

In [None]:
def get_train_val_split(df):
    """Split the combined metadata into train, validation and hold-out frames.

    Validation is the 2020 rows with kfold == 4, hold-out is the 2018 rows with
    kfold == 0, and train is every other 2018/2020 row (2020 rows first).
    Rows from any other year are ignored.

    Returns:
        (train_df, valid_df, hold_df). valid_df and hold_df are re-indexed from 0;
        train_df keeps the per-year indices of its two parts, so labels repeat.
    """
    def _subset(frame, mask):
        return frame[mask].reset_index(drop=True)

    external = _subset(df, df.year == 2018)
    competition = _subset(df, df.year == 2020)

    valid_df = _subset(competition, competition.kfold == 4)
    hold_df = _subset(external, external.kfold == 0)

    train_parts = [
        _subset(competition, competition.kfold != 4),
        _subset(external, external.kfold != 0),
    ]
    train_df = pd.concat(train_parts, axis=0, join='inner')
    return train_df, valid_df, hold_df

# Dataset

In [None]:
class MelanomaDataset(Dataset):
    """Image dataset for albumentations-style transforms.

    Each item is read from the path stored in the `image_name` column of `df`.
    `transforms`, when given, is called with the image as a NumPy array through
    the `image=` keyword and its "image" entry is used. `im_path` is stored but
    not used for loading. Note: a later cell redefines this class.
    """
    def __init__(self,df,im_path,transforms=None,is_test=False):
        self.df, self.im_path = df, im_path
        self.transforms, self.is_test = transforms, is_test

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self,idx):
        """Return the image alone in test mode, otherwise (image, float32 target of shape (1,))."""
        record = self.df.iloc[idx]
        image = Image.open(record['image_name'])
        if self.transforms:
            image = self.transforms(image=np.array(image))["image"]

        if self.is_test:
            return image
        return image, torch.tensor([record['target']], dtype=torch.float32)
        

In [None]:
#Auto Augment
class MelanomaDataset(Dataset):
    """Image dataset that works with both transform conventions used in this notebook.

    * albumentations pipelines are called as `transforms(image=ndarray)["image"]`
      on the full-size image;
    * any other callable (e.g. a torchvision `Compose` with AutoAugment) receives
      a PIL image that was first resized to 256x256.

    Previously every pipeline was called positionally, which fails for the
    albumentations pipelines the notebook builds (they require named arguments).

    Args:
        df: DataFrame with an `image_name` column holding full image paths and,
            unless `is_test`, a `target` column.
        im_path: stored for reference only; images are loaded from `image_name`.
        transforms: optional transform pipeline (see above).
        is_test: when True, `__getitem__` returns only the image.
    """
    def __init__(self,df,im_path,transforms=None,is_test=False):
        self.df = df
        self.im_path = im_path
        self.transforms = transforms
        self.is_test = is_test

    @staticmethod
    def _is_albumentations(tfm):
        """Return True when `tfm` is an object defined inside the albumentations package."""
        return type(tfm).__module__.split('.')[0] == 'albumentations'

    def __getitem__(self,idx):
        """Return the image alone in test mode, otherwise (image, float32 target of shape (1,))."""
        row = self.df.iloc[idx]
        img = Image.open(row['image_name'])
        if self._is_albumentations(self.transforms):
            # albumentations only accepts named arguments and NumPy arrays.
            img = self.transforms(image=np.array(img))["image"]
        else:
            img = img.resize((256,256))
            if self.transforms:
                img = self.transforms(img)

        if self.is_test:
            return img
        return img,torch.tensor([row['target']],dtype=torch.float32)

    def __len__(self):
        return self.df.shape[0]

In [None]:
from albumentations.pytorch import ToTensor
from torchvision import transforms

In [None]:
train_tfms,test_tfms = get_augmentations(p=0.5)
#df = pd.read_csv(df_path/'train.csv')
# get_train_val_split returns three frames (train, validation, hold-out);
# unpacking into two names raised "ValueError: too many values to unpack".
train_df,valid_df,hold_df = get_train_val_split(tr_concat)
train_ds = MelanomaDataset(df=train_df,im_path=path/'train',transforms=train_tfms)
train_dl = DataLoader(dataset=train_ds,batch_size=bs,shuffle=True,num_workers=4)
# Pull a single batch for the visual sanity check in the next cell.
image,labels = next(iter(train_dl))

In [None]:
import torchvision
def show_img(img):
    """Display a channel-first image tensor after a rough de-normalisation.

    The tensor is mapped with `img / 4 + 0.5` (an approximation of undoing the
    normalisation), clipped to [0, 1], moved to channel-last and shown.
    """
    pixels = (img / 4+0.5).numpy()
    pixels = np.clip(pixels, 0., 1.)
    plt.figure(figsize=(18,15))
    plt.imshow(np.transpose(pixels, (1, 2, 0)))
    plt.show()

show_img(torchvision.utils.make_grid(image))


# Get a batch of training data


# Make a grid from batch

# Model 

EfficientNets have proved themselves over the last year as a key to winning competitions. You can try a different EfficientNet by passing the corresponding model name: changing the number in `efficientnet-b0` selects a different variant. The larger the number, the bigger and more complex the model, and generally the better it performs.

In [None]:
class MelanomaEfficientNet(nn.Module):
    """Pretrained EfficientNet feature extractor followed by a single-logit linear head.

    Args:
        model_name: name passed to `EfficientNet.from_pretrained`.
        pool_type: pooling function called as `pool_type(feature_map, 1)`.
    """
    def __init__(self,model_name='efficientnet-b3',pool_type=F.adaptive_avg_pool2d):
        super().__init__()
        self.pool_type = pool_type
        self.backbone = EfficientNet.from_pretrained(model_name)
        # The head takes as many inputs as the backbone's own `_fc` layer does.
        self.classifier = nn.Linear(self.backbone._fc.in_features,1)
    def forward(self,x):
        """Return one logit per sample in `x`."""
        feature_map = self.backbone.extract_features(x)
        pooled = self.pool_type(feature_map,1)
        flat = pooled.view(x.size(0),-1)
        return self.classifier(flat)

In [None]:
!pip install pytorch_ranger --quiet
from pytorch_ranger import Ranger

In [None]:
# Scratch call removed: `torch.optim.SGD()` without its required `params`
# argument raises TypeError and aborts "Run All".
# To train with SGD instead of Ranger, pass it to the model factory:
#     get_model(opt_fn=torch.optim.SGD)

# Helper functions 
- Split data to train and validation split
- Get model, choose different optimizer, freeze backbone, different learning rates/weight decay.
- The training method by default uses cosine annealing for scheduling the learning rate; you can experiment with other schedulers.

In [None]:
def get_device():
    """Return the CUDA device when one is available, otherwise the CPU device."""
    return torch.device("cuda" if torch.cuda.is_available() else "cpu")
def get_model(model_name='efficientnet-b3',lr=1e-3,wd=0.01,opt_fn=Ranger,device=None):
    """Create the classifier and its optimizer.

    Args:
        model_name: EfficientNet variant passed to MelanomaEfficientNet.
        lr: learning rate handed to the optimizer.
        wd: weight decay handed to the optimizer.
        opt_fn: optimizer factory called as `opt_fn(params, lr=..., weight_decay=...)`.
        device: target device; falls back to `get_device()` when falsy.

    Returns:
        (model, optimizer), with the model already moved to the device.
    """
    target_device = device or get_device()
    net = MelanomaEfficientNet(model_name=model_name)
    optimizer = opt_fn(net.parameters(),lr=lr,weight_decay=wd)
    return net.to(target_device), optimizer

def training_step(xb,yb,model,loss_fn,opt,device,scheduler):
    """Run one optimisation step on a batch and return its loss as a float.

    The scheduler is stepped once per batch, right after the optimizer.
    """
    inputs = xb.to(device)
    targets = yb.to(device)
    opt.zero_grad()
    batch_loss = loss_fn(model(inputs),targets)
    batch_loss.backward()
    opt.step()
    scheduler.step()
    return batch_loss.item()
    
def validation_step(xb,yb,model,loss_fn,device):
    """Score one batch without updating the model.

    Returns:
        (loss as float, sigmoid of the model output). The loss is computed on the
        raw model output, before the sigmoid.
    """
    inputs, targets = xb.to(device), yb.to(device)
    logits = model(inputs)
    batch_loss = loss_fn(logits,targets).item()
    return batch_loss, torch.sigmoid(logits)

def hold_step(xb,yb,model,loss_fn,device):
    """Score one hold-out batch; same computation and return value as validation_step."""
    return validation_step(xb,yb,model,loss_fn,device)

def get_data(train_df,valid_df,train_tfms,test_tfms,bs,hold_df=None):
    """Build the train, validation and hold-out DataLoaders.

    Args:
        train_df, valid_df: metadata frames for the train and validation splits.
        train_tfms: transforms for the training set (loader is shuffled).
        test_tfms: transforms for the validation and hold-out sets.
        bs: training batch size; evaluation loaders use twice this size.
        hold_df: metadata frame for the hold-out split. When omitted, the
            notebook-level variable `hold_df` is used, which is what this
            function silently depended on before the parameter existed.

    Returns:
        (train_dl, valid_dl, hold_dl)

    Raises:
        NameError: if `hold_df` is neither passed nor defined in the notebook.
    """
    if hold_df is None:
        try:
            hold_df = globals()['hold_df']
        except KeyError:
            raise NameError("hold_df is not defined; pass hold_df=... to get_data()") from None

    def _loader(frame, tfms, batch_size, shuffle):
        # One dataset + loader pair; every split reads from the same image root.
        ds = MelanomaDataset(df=frame,im_path=path/'train',transforms=tfms)
        return DataLoader(dataset=ds,batch_size=batch_size,shuffle=shuffle,num_workers=4)

    train_dl = _loader(train_df, train_tfms, bs, True)
    valid_dl = _loader(valid_df, test_tfms, bs*2, False)
    hold_dl = _loader(hold_df, test_tfms, bs*2, False)
    return train_dl,valid_dl,hold_dl

In [None]:
def plot_loss_update(epoch, epochs, mb, train_loss, valid_loss):
    """Redraw the live train/validation loss curves on the master bar `mb`.

    `epoch` is the number of epochs finished so far (counted from 1) and
    `epochs` the total planned; the x axis spans all planned epochs and the
    y axis is fitted to the losses seen so far, each with a small margin.
    """
    x_margin, y_margin = 0.2, 0.05
    epoch_axis = range(1, epoch+1)
    all_losses = np.concatenate((train_loss, valid_loss))

    x_bounds = [1-x_margin, epochs+x_margin]
    y_bounds = [np.min(all_losses)-y_margin, np.max(all_losses)+y_margin]
    curves = [[epoch_axis, train_loss], [epoch_axis, valid_loss]]

    mb.update_graph(curves, x_bounds, y_bounds)

In [None]:
#checkpoint=torch.load('/content/drive/My Drive/last_epochfold4.pth')

In [None]:
def _evaluate(dl, step_fn, model, loss_fn, device, mb):
    """Run `step_fn` over every batch of `dl`; return (mean batch loss, ROC AUC)."""
    n_samples = len(dl.dataset)
    preds = np.zeros((n_samples,1))
    targs = np.zeros((n_samples,1))
    total_loss, offset = 0.0, 0
    with torch.no_grad():
        for xb,yb in progress_bar(dl,parent=mb):
            loss,out = step_fn(xb,yb,model,loss_fn,device)
            total_loss += loss
            n_batch = xb.shape[0]
            # A running offset keeps a short final batch in the right rows;
            # indexing with i*batch_size misplaced it and left zeros at the end.
            preds[offset:offset+n_batch] = out.cpu().numpy()
            targs[offset:offset+n_batch] = yb.cpu().numpy()
            offset += n_batch
    roc = roc_auc_score(targs.reshape(-1),preds.reshape(-1))
    return total_loss/len(dl), roc

def fit(epochs,model,train_dl,valid_dl,opt,device=None,loss_fn=F.binary_cross_entropy_with_logits,hold_dl=None,start_epoch=9):
    """Train `model`, evaluating on the validation and hold-out loaders every epoch.

    Uses cosine annealing stepped once per batch over `len(train_dl)*epochs`
    steps. Each epoch logs the mean train/validation/hold-out losses to the
    notebook's TensorBoard `writer`, saves the weights with the best validation
    ROC AUC and a full last-epoch checkpoint to Google Drive, prints a summary
    line and appends it (with the hold-out ROC AUC) to a log file on Drive.

    Args:
        epochs: number of epochs to run.
        model: network to train; updated in place.
        train_dl, valid_dl: training and validation DataLoaders.
        opt: optimizer.
        device: target device; falls back to `get_device()` when falsy.
        loss_fn: loss called as `loss_fn(model_output, target)`.
        hold_dl: hold-out DataLoader. When omitted, the notebook-level variable
            `hold_dl` is used, as this function implicitly did before.
        start_epoch: number reported for the first epoch in logs and the
            progress bar (default 9, the previously hard-coded resume offset).

    Returns:
        The trained model.

    Raises:
        NameError: if `hold_dl` is neither passed nor defined in the notebook.
    """
    import os

    device = device if device else get_device()
    if hold_dl is None:
        try:
            hold_dl = globals()['hold_dl']
        except KeyError:
            raise NameError("hold_dl is not defined; pass hold_dl=... to fit()") from None

    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(opt, len(train_dl)*epochs)
    #checkpoint=torch.load('/content/drive/My Drive/effnet_b4_Ranger_fold1_last_epoch-further3.pth')
    #scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
    #del checkpoint
    best_path = '/content/drive/My Drive/effb4_best_auc_Ranger_fold4-further3.pth'
    last_path = '/content/drive/My Drive/effnet_b4_Ranger_fold4_last_epoch-further3.pth'
    log_path = '/content/drive/My Drive/effnetb4/log_fold4.txt'
    # open(..., 'a+') does not create missing parent folders.
    os.makedirs(os.path.dirname(log_path), exist_ok=True)

    val_rocs = []
    hold_rocs = []
    best_roc=0
    mb = master_bar(range(start_epoch,epochs+start_epoch))
    for epoch in mb:
        #Training
        model.train()
        trn_loss = 0.0
        for xb,yb in progress_bar(train_dl,parent=mb):
            trn_loss += training_step(xb,yb,model,loss_fn,opt,device,scheduler)
        trn_loss /= len(train_dl)
        # One point per epoch with the mean loss (previously the running sum was
        # written for every batch under the same step).
        writer.add_scalar("Loss/train", trn_loss, epoch)

        #Validation and hold-out
        model.eval()
        val_loss,val_roc = _evaluate(valid_dl,validation_step,model,loss_fn,device,mb)
        writer.add_scalar("Loss/val",val_loss, epoch)
        val_rocs.append(val_roc)

        hold_loss,hold_roc = _evaluate(hold_dl,hold_step,model,loss_fn,device,mb)
        writer.add_scalar("Loss/hold",hold_loss, epoch)
        hold_rocs.append(hold_roc)

        if val_roc>best_roc:
            best_roc=val_roc
            torch.save(model.state_dict(),best_path)
        torch.save({'model_state_dict':model.state_dict(),'scheduler_state_dict':scheduler.state_dict(),'optimizer_state_dict':opt.state_dict(),'val_loss':val_loss,'val_roc':val_roc},last_path)
        print(f'Epoch: {epoch},Train_loss: {trn_loss:.5f},Val_roc:{val_roc:.4f},Val_loss:{val_loss:.5f},lr:{get_lr(opt)}')
        with open(log_path,'a+') as f:
            f.write(f'Epoch: {epoch},Train_loss: {trn_loss:.5f},Val_roc:{val_roc:.4f},Hold_ROC:{hold_roc:.4f},Val_loss:{val_loss:.5f},lr:{get_lr(opt)}\n')
    writer.flush()
    return model

In [None]:
train_df,valid_df,hold_df= get_train_val_split(tr_concat)
train_tfms,test_tfms = get_augmentations(p=0.6)
train_dl,valid_dl,hold_dl = get_data(train_df,valid_df,train_tfms,test_tfms,bs)
model,opt = get_model(model_name='efficientnet-b4',lr=2e-5,wd=1e-4)

In [None]:
#model.load_state_dict(checkpoint['model_state_dict'])
#opt.load_state_dict(checkpoint['optimizer_state_dict'])
#del checkpoint

In [None]:
def get_lr(optimizer):
    """Return the learning rate of the optimizer's first parameter group (None if it has none)."""
    first_group = next(iter(optimizer.param_groups), None)
    if first_group is None:
        return None
    return first_group['lr']

I trained the model on my local machine. Run the cell below to start training here.

In [None]:
# fit() returns only the trained model; unpacking it into two names raised
# TypeError after the whole training run had already finished.
model = fit(35,model,train_dl,valid_dl,opt)

In [None]:
%load_ext tensorboard

# Generate test predictions

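- Test-time augmentation (TTA) only helps when the test dataset is built with random augmentations such as the training transforms; the cells below build it with the deterministic `test_tfms1`, so every pass yields the same predictions.
- Tweak the `tta` parameter of `get_preds()` to change the number of TTA passes.
- If you do not want TTA, keep the deterministic test transforms and call `get_preds()` with `tta=1`.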


In [None]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
imagenet_stats = {'mean':[0.485, 0.456, 0.406], 'std':[0.229, 0.224, 0.225]}
test_tfms1 = A.Compose([ 
    A.Resize(512,512),               
    ToTensor(normalize=imagenet_stats)
    ])

In [None]:
#test_df = pd.read_csv(path/'test.csv')

model, opt = get_model(model_name='efficientnet-b4',lr=1e-4,wd=1e-4)
# Path fixed: the original string had no '/' between the Drive folder and the
# file name and spelled the folder 'My drive', so it pointed at no real file.
# NOTE(review): the training cell saves its best weights with a '-further3'
# suffix; confirm which checkpoint is meant to be loaded here.
model.load_state_dict(torch.load('/content/drive/My Drive/effb4_best_auc_Ranger_fold4.pth',map_location=device))

#Testing with lighter augmentation
test_ds = MelanomaDataset(df=test_df,im_path='test',transforms=test_tfms1,is_test=True)
test_dl = DataLoader(dataset=test_ds,batch_size=32,shuffle=False,num_workers=4)

In [None]:
test_ds = MelanomaDataset(df=test_df,im_path='test',transforms=test_tfms1,is_test=True)
test_dl = DataLoader(dataset=test_ds,batch_size=32,shuffle=False,num_workers=4)

In [None]:
valid_ds = MelanomaDataset(df=valid_df,im_path='train',transforms=test_tfms1,is_test=False)
valid_dl = DataLoader(dataset=valid_ds,batch_size=32,shuffle=False,num_workers=4)

In [None]:
# Collect sigmoid probabilities for the validation set, in loader order.
model.train(False)
valid_preds2=[]
tk = tqdm(valid_dl, total=len(valid_dl), position=0, leave=True)
with torch.no_grad():
    for inputs,labels in tk:
        inputs=inputs.to(device)
        pred = torch.sigmoid(model(inputs))
        # One array of shape (1,) per sample is appended.
        valid_preds2.extend(pred.cpu().numpy())
# The original `gc.collect` lacked parentheses and never ran; collect once here.
gc.collect()

In [None]:
roc_auc_score(valid_df.target,valid_preds2)

In [None]:
# Collect sigmoid probabilities for the test set, in loader order.
# NOTE: this reuses (and overwrites) the name `valid_preds2` from the validation
# cell; the submission cell below reads the test predictions from it.
model.train(False)
valid_preds2=[]
tk = tqdm(test_dl, total=len(test_dl), position=0, leave=True)
with torch.no_grad():
    for inputs in tk:
        inputs=inputs.to(device)
        pred = torch.sigmoid(model(inputs))
        # One array of shape (1,) per sample is appended.
        valid_preds2.extend(pred.cpu().numpy())
# The original `gc.collect` lacked parentheses and never ran; collect once here.
gc.collect()

In [None]:
#Testing for Test Time Augmentation
def get_preds(model,device=None,tta=3,dl=None):
    """Average the model's sigmoid outputs over `tta` passes through a loader.

    The passes only differ when the loader's dataset applies random transforms;
    with deterministic transforms every pass gives the same result.

    Args:
        model: network returning one logit per sample; switched to eval mode here.
        device: device the batches are moved to (CUDA if available when None).
            The model is expected to be on that device already.
        tta: number of passes to average; must be at least 1.
        dl: DataLoader yielding input batches only. When omitted, the
            notebook-level `test_dl` is used, as before.

    Returns:
        1-D NumPy array with one averaged probability per sample.

    Raises:
        ValueError: if `tta` is smaller than 1.
        NameError: if `dl` is neither passed nor defined in the notebook.
    """
    if tta < 1:
        raise ValueError(f"tta must be at least 1, got {tta}")
    if device is None:
        device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    if dl is None:
        try:
            dl = globals()['test_dl']
        except KeyError:
            raise NameError("test_dl is not defined; pass dl=... to get_preds()") from None

    # Without eval mode, dropout and batch-norm would still behave as in training.
    model.eval()
    preds = np.zeros(len(dl.dataset))
    with torch.no_grad():
        for tta_id in range(tta):
            batch_preds = [
                torch.sigmoid(model(xb.to(device))).cpu().numpy().reshape(-1)
                for xb in dl
            ]
            if batch_preds:
                preds += np.concatenate(batch_preds)
            print(f'TTA {tta_id}')
    preds /= tta
    return preds

#Changing tta to 25 from 10
preds = get_preds(model,tta=1)  

In [None]:
subm = pd.read_csv('/content/jpeg-melanoma-512x512/sample_submission.csv')
# `valid_preds2` holds the test-set probabilities from the inference loop above
# (one small array per image); flatten them into a single 1-D vector.
test_scores = np.asarray(valid_preds2).reshape(-1)
# Assigning a DataFrame aligned on the index and silently wrote NaN when the
# lengths differed; fail loudly instead.
if len(test_scores) != len(subm):
    raise ValueError(f"expected {len(subm)} predictions, got {len(test_scores)}")
subm['target'] = test_scores
subm.to_csv('submission_b4_384-fold4-final.csv',index=False)

In [None]:
subm