In [1]:
from google.colab import drive
drive.mount("/content/drive")


Mounted at /content/drive


In [None]:
!ls 'drive/My Drive/TB_zindi/inputs'

In [None]:
# install this package in order to use ImageNet pretrained models
!pip install pretrainedmodels

In [None]:
# albumentation for data augmentation
!pip install albumentations

In [None]:
# main libraries and seed
# please don't forget to change the dataset path on your own workstation
import sys

sys.path.insert(0,'drive/My Drive/TB_zindi/')


import torch
import numpy as np
import pandas as pd
import os
from PIL import Image
import albumentations
from torch.utils.data import Dataset
import torch.nn as nn
import pretrainedmodels
from torch.nn import functional as F
from PIL import ImageFile
from PIL import Image
from sklearn import metrics
from torch.cuda import amp
from source.utils import TB_Dataset, Engine, resnet152, TB_Test_Dataset,TB_Dataset_lr
import random

ImageFile.LOAD_TRUNCATED_IMAGES = True

torch.backends.cudnn.benchmark = False


def set_all_seeds(seed):
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  torch.cuda.manual_seed(seed)
  torch.backends.cudnn.deterministic = True

set_all_seeds(1234) 


In [None]:

DEVICE = "cuda"
EPOCHS = 30
batch_size = 16
HEIGHT = 224
WIDTH = 224

def train(fold,save_model=False):
     
    df = pd.read_csv('drive/My Drive/TB_zindi/inputs/TB_stratified_kfold.csv')
    df_train = df[df.kfold!=fold].reset_index(drop=True)

    df_val = df[df.kfold==fold].reset_index(drop=True)

    x_train = df_train.loc[:,["ID"]]
    y_train = df_train.loc[:,["LABEL"]].values

    print(x_train.shape)
    print(y_train.shape)
    x_val = df_val.loc[:,["ID"]]
    y_val = df_val.loc[:,["LABEL"]].values
    
    print(x_val.shape)
    print(y_val.shape)

    mean = (0.485, 0.456, 0.406)
    std = (0.229, 0.224, 0.225)
    
    train_aug =albumentations.Compose([  
            albumentations.CLAHE(p=0.002),                                                                                                       
            albumentations.Resize(224,224,always_apply=True),
            albumentations.Normalize(mean,std,max_pixel_value=255.0,always_apply=True),
            albumentations.CenterCrop(height=224,width=224),
            albumentations.RandomBrightness(p=0.002),
            albumentations.HorizontalFlip(p=0.002),
            albumentations.Cutout(p=0.002),
            albumentations.Rotate(limit=30,p=0.002),           
        ])


    val_aug = albumentations.Compose([
            albumentations.Resize(224,224,always_apply=True),
            albumentations.Normalize(mean,std,max_pixel_value=255.0,always_apply=True),
            albumentations.CenterCrop(224, 224)
        ])
    
    train_data= TB_Dataset('drive/My Drive/TB_zindi/inputs/train_small',x_train, y_train,transform=train_aug)
    train_data_loader = torch.utils.data.DataLoader(train_data,batch_size=batch_size,
    shuffle=True,num_workers=2,pin_memory=True)


    val_data= TB_Dataset('drive/My Drive/TB_zindi/inputs/train_small',x_val, y_val,transform=val_aug)
    val_data_loader = torch.utils.data.DataLoader(val_data,batch_size=32,
    shuffle=False,num_workers=2,pin_memory=True)
    

    model = resnet152(pretrained=True)           
    model.to(DEVICE)

    optimizer = torch.optim.Adam(model.parameters(),lr=1e-4)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,patience=3,threshold=1e-6,mode='min',verbose=True,factor=0.5)
    
    
    eng = Engine(model,optimizer,DEVICE)
    scaler = amp.GradScaler()
    
    best_loss = np.inf
    early_stoping = 10
    early_stoping_counter = 0

    for epoch in range(EPOCHS):
        train_loss,outputs0,targets0 = eng.train(train_data_loader,scaler)
        train_acc = metrics.roc_auc_score(targets0,outputs0)

        val_loss, outputs,targets = eng.validate(val_data_loader)
        val_acc = metrics.roc_auc_score(targets,outputs)
        scheduler.step(val_loss)

        print(f"fold={fold}",f"epoch={epoch}",f"train_loss={train_loss}",f"train_acc={train_acc}",f"val_loss={val_loss}",f"val_acc={val_acc}")
        
        
        if val_loss<best_loss:
            best_loss = val_loss
            if save_model:
                torch.save(model.state_dict(),f"drive/My Drive/TB_zindi/models/model{fold}.bin")
                print(f"fold={fold}",f"best_val_loss={best_loss}")
                print('model saved!!')
                early_stoping_counter = 0
                
        else: 
            print(early_stoping_counter)
            early_stoping_counter += 1

        if early_stoping_counter > early_stoping:
            break


for i in range(5):
    train(fold=i,save_model=True)
    

(574, 1)
(574, 1)
(144, 1)
(144, 1)


Downloading: "https://download.pytorch.org/models/resnet152-b121ed2d.pth" to /root/.cache/torch/hub/checkpoints/resnet152-b121ed2d.pth


HBox(children=(FloatProgress(value=0.0, max=241530880.0), HTML(value='')))


fold=0 epoch=0 train_loss=0.41978105364574325 train_acc=0.8888443602448266 val_loss=0.4976296782493591 val_acc=0.9004437584410574
fold=0 best_val_loss=0.4976296782493591
model saved!!
fold=0 epoch=1 train_loss=0.19264883656675616 train_acc=0.9783165743709317 val_loss=0.5584522545337677 val_acc=0.9355585568203744
0
fold=0 epoch=2 train_loss=0.09740546001638803 train_acc=0.9946929952394831 val_loss=0.4942688763141632 val_acc=0.9289986494308314
fold=0 best_val_loss=0.4942688763141632
model saved!!
fold=0 epoch=3 train_loss=0.06486552737058243 train_acc=0.9978747692606625 val_loss=0.4607814162969589 val_acc=0.9475207408836581
fold=0 best_val_loss=0.4607814162969589
model saved!!
fold=0 epoch=4 train_loss=0.08678772510676128 train_acc=0.9955795200621783 val_loss=0.5900435328483582 val_acc=0.8969708662936524
0
fold=0 epoch=5 train_loss=0.11135088090991808 train_acc=0.9922520159331584 val_loss=0.35090322196483614 val_acc=0.941153771946749
fold=0 best_val_loss=0.35090322196483614
model saved!

In [None]:
##   Averaging all 5 folds

predictions = []
img_ids = []
DEVICE = "cuda"
mean = (0.485, 0.456, 0.406)
std = (0.229, 0.224, 0.225)
for fold in range(5):

    model = resnet152(pretrained=False)
    model.load_state_dict(torch.load(f"drive/My Drive/TB_zindi/models/model{fold}.bin"))
    model= model.to(DEVICE)
    model.eval()
    test_ids = pd.read_csv('drive/My Drive/TB_zindi/inputs/Test_TB.csv')
    test_ids['ID'] = test_ids['ID'] + '.png'
    test_aug = albumentations.Compose([
            albumentations.Resize(224,224,always_apply=True),
            albumentations.Normalize(mean,std,max_pixel_value=255.0,always_apply=True),
            #albumentations.CenterCrop(224, 224)
        ])
    test_data= TB_Test_Dataset('drive/My Drive/TB_zindi/inputs/test_small',test_ids,transform=test_aug)

    test_data_loader = torch.utils.data.DataLoader(test_data,batch_size=16,
    shuffle=False,num_workers=8,pin_memory=True)
  
    with torch.no_grad():
      for idx,_data in enumerate(test_data_loader):
        img = _data['x']
        img_id = _data['img']
        img = img.to(DEVICE,dtype=torch.float)
        #print(image.shape)
        pred = model(img)
        pred = torch.sigmoid(pred)
        pred = pred.detach().cpu().numpy()
        img_ids.append(img_id)
        predictions.append(pred)


        


In [None]:
# use only the second fold in which i obtained  highest accuracy: 1 

predictions = []
img_ids = []
DEVICE = "cuda"
mean = (0.485, 0.456, 0.406)
std = (0.229, 0.224, 0.225)
model = resnet152(pretrained=False)
model.load_state_dict(torch.load(f"drive/My Drive/TB_zindi/models/model2.bin"))
model= model.to(DEVICE)
model.eval()
test_ids = pd.read_csv('drive/My Drive/TB_zindi/inputs/Test_TB.csv')
test_ids['ID'] = test_ids['ID'] + '.png'
test_aug = albumentations.Compose([
            albumentations.Resize(224,224,always_apply=True),
            albumentations.Normalize(mean,std,max_pixel_value=255.0,always_apply=True),
            #albumentations.CenterCrop(224, 224)
        ])
test_data= TB_Test_Dataset('drive/My Drive/TB_zindi/inputs/test_small',test_ids,transform=test_aug)

test_data_loader = torch.utils.data.DataLoader(test_data,batch_size=16,
    shuffle=False,num_workers=8,pin_memory=True)
  
with torch.no_grad():
  for idx,_data in enumerate(test_data_loader):
    img = _data['x']
    img_id = _data['img']
    img = img.to(DEVICE,dtype=torch.float)
    #print(image.shape)
    pred = model(img)
    pred = torch.sigmoid(pred)
    pred = pred.detach().cpu().numpy()
    img_ids.append(img_id)
    predictions.append(pred)


        
# try fold==2 alone!!

In [None]:
# obtain img ids in the same order of the submission file
img_ID = np.concatenate(img_ids,axis=0)
print(len(img_ID[:82]))

82


In [None]:
# prediction ready to submission file
pred_0 = np.concatenate(predictions,axis=0)
print(pred_0)


[[5.3917407e-03]
 [1.8180790e-04]
 [9.9960536e-01]
 [4.2165989e-01]
 [8.0109143e-04]
 [9.9774629e-01]
 [9.9935466e-01]
 [7.7264122e-05]
 [1.5147006e-03]
 [9.9729306e-01]
 [6.0943961e-01]
 [9.9999988e-01]
 [9.1567296e-01]
 [9.9975544e-01]
 [4.4219524e-02]
 [9.9632227e-01]
 [9.9996698e-01]
 [6.8389018e-06]
 [9.9961436e-01]
 [9.1245264e-01]
 [2.2962425e-04]
 [5.9502308e-05]
 [2.6919810e-02]
 [5.4007852e-01]
 [1.1347594e-06]
 [2.6467058e-05]
 [9.9993598e-01]
 [9.9770409e-01]
 [6.5726861e-05]
 [8.8740177e-05]
 [5.7456682e-06]
 [8.3785403e-01]
 [9.8709803e-05]
 [9.9949026e-01]
 [2.2464305e-04]
 [2.2469017e-01]
 [8.3273834e-01]
 [6.5741545e-01]
 [1.6717026e-03]
 [1.0017170e-05]
 [7.3861657e-03]
 [9.1393816e-01]
 [3.0925542e-03]
 [1.3394571e-04]
 [1.4104177e-01]
 [1.8652753e-04]
 [1.3352209e-04]
 [5.8870237e-06]
 [4.4516990e-05]
 [6.8521607e-01]
 [2.2351628e-06]
 [1.1139723e-02]
 [1.1913622e-05]
 [9.9642712e-01]
 [7.0804469e-02]
 [3.4399595e-02]
 [6.5703380e-01]
 [1.1926265e-07]
 [9.9954331e-0

In [None]:
# obtain the average accuracy if you use 5 folds

n_pred = pred_0.reshape(5,82,1)
avg = np.mean(n_pred,axis=0)
print(avg.shape)

(82, 1)


In [None]:
# create submission file
# for sub['LABEL'] you can fill it with pred_0 if you choose to use only one fold prediction or replace it with the avg accuracy if you decide to use 5 folds
sub = pd.DataFrame()
sub["ID"] = img_ID[:82]
sub["ID"] = sub["ID"].str.replace('.png','')
sub["LABEL"] = pred_0

sub.to_csv('drive/My Drive/TB_zindi/inputs/nsub_TB.csv',index=False)
sub.head()

Unnamed: 0,ID,LABEL
0,GTWSHFYQ,0.005392
1,QTFSSMGD,0.000182
2,TBLBHSYT,0.999605
3,ZKETEOFG,0.42166
4,GKTPBGZP,0.000801


In [None]:
df = pd.read_csv('drive/My Drive/TB_zindi/inputs/SampleSubmission_TB.csv')
df.head()

Unnamed: 0,ID,LABEL
0,GTWSHFYQ,0
1,QTFSSMGD,0
2,TBLBHSYT,0
3,ZKETEOFG,0
4,GKTPBGZP,0
