# 📚 Import Libraries

In [1]:
!cp -r ../input/timm-pytorch-image-models . 
#!cp -r ../input/openslide .

In [2]:
!pip install -qU ./timm-pytorch-image-models/pytorch-image-models-master
#!pip install -qU ./openslide

[0m

In [3]:
import pandas as pd
import numpy as np
from glob import glob
from collections import defaultdict
from tqdm import tqdm
import time
import os 
import copy
import gc
from openslide import OpenSlide
from PIL import Image
# visualization
import cv2
import matplotlib.pyplot as plt

# Sklearn
from sklearn.model_selection import StratifiedKFold, KFold, StratifiedGroupKFold,GroupKFold 
import tifffile
# PyTorch 
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.cuda import amp

import timm

import zipfile
#import pyvips
# Albumentations for augmentations
import albumentations as A
from albumentations.pytorch import ToTensorV2

# Metrics 
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score


# For colored terminal text
from colorama import Fore, Back, Style
c_  = Fore.GREEN
sr_ = Style.RESET_ALL

import warnings
warnings.filterwarnings('ignore')

# ⚙️ Configuration

In [4]:
fake_inf = len(glob("/kaggle/input/mayo-clinic-strip-ai/test/*")) == 4 
print(fake_inf)

True


In [5]:
class CFG:
    seed          = 913
    debug         = False # set debug=False for Full Training
    comment       = "eff b7 more more 0.15 satruration on data"
    n_flods       = 5
    backbone      = "convnext_tiny"
    train_bs      = 1
    valid_bs      = 1
    epochs        = 25
    lr            = 1e-4
    scheduler     = 'CosineAnnealingLR'
    min_lr        = 1e-6
    T_max         = int(30000/train_bs*epochs)+50
    T_0           = 25
    warmup_epochs = 1
    wd            = 1e-6
    n_accumulate  = 2#max(1, 32//train_bs)
    device        = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    tile_size     = (1024,1024)
    t_thr         = 0.4 # percentage of required color pixles to |keep the pic 
    faster_inf    = True #only take N_slides slides from each image 
    N_slides      = 45

In [6]:
def set_seed(seed = 42):
    np.random.seed(seed)
    #random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed
    os.environ['PYTHONHASHSEED'] = str(seed)
    print('> SEEDING DONE')
    
set_seed(CFG.seed)

> SEEDING DONE


# ❗ Data

In [7]:
Data_dir = [f"../input/cleanedrescaled384-{i}/train/*" for i in range(1,9)]

In [8]:
def get_data_info(paths , train = True):
    img_prop = defaultdict(list)
    
    for i, path in tqdm(enumerate(paths), total = len(paths),desc = "making dataframe"):
        img_info =  path.split('/')[-1]

        patient_id , image_num  , centre_id,slice_num = img_info.split("_")
        #tl_pixel = tl_pixel.split('.')[0]
        #centre_id = centre_id.split('.jpg')[0]
        
        img_prop['image_id'].append(f"{patient_id}_{image_num}")
        img_prop['patient_id'].append(patient_id)
        img_prop['image_num'].append(image_num)
        img_prop['centre_id'].append(centre_id)
        img_prop['slice_num'].append(slice_num.split('.jpg')[0])
        
        img_prop['path'].append(path)
        #img_prop['tl_pixel'].append(tl_pixel)
        
        if train:
            label = train_data[train_data["image_id"]==f"{patient_id}_{image_num}"].label.item()
            
            img_prop['label'].append(label)
            
        
        #img_prop['density'].append(extra_info)
    
    image_data = pd.DataFrame(img_prop)

    image_data.sort_values(by='image_id', inplace=True)
    image_data.reset_index(inplace=True, drop=True)
    #image_data['density'] = image_data['density'].astype(np.float16)
    
    return image_data

In [9]:
train_data = pd.read_csv("../input/mayo-clinic-strip-ai/train.csv")

In [10]:
train_images = []
for path in Data_dir:
    train_images.extend(glob(path))
#print(train_images)
df = get_data_info(train_images)
df.head()

making dataframe: 100%|██████████| 35238/35238 [00:17<00:00, 2007.44it/s]


Unnamed: 0,image_id,patient_id,image_num,centre_id,slice_num,path,label
0,006388_0,6388,0,11,42,../input/cleanedrescaled384-2/train/006388_0_1...,CE
1,006388_0,6388,0,11,0,../input/cleanedrescaled384-2/train/006388_0_1...,CE
2,006388_0,6388,0,11,8,../input/cleanedrescaled384-2/train/006388_0_1...,CE
3,006388_0,6388,0,11,17,../input/cleanedrescaled384-2/train/006388_0_1...,CE
4,006388_0,6388,0,11,24,../input/cleanedrescaled384-2/train/006388_0_1...,CE


In [11]:


keep = df['label'].value_counts().LAA*2
df = df.drop(df[df['label']=="CE"][keep//2:].index).reset_index(drop=True)
df = df.drop(df[df['label']=="LAA"][keep//2:].index).reset_index(drop=True)
df.head()



Unnamed: 0,image_id,patient_id,image_num,centre_id,slice_num,path,label
0,006388_0,6388,0,11,42,../input/cleanedrescaled384-2/train/006388_0_1...,CE
1,006388_0,6388,0,11,0,../input/cleanedrescaled384-2/train/006388_0_1...,CE
2,006388_0,6388,0,11,8,../input/cleanedrescaled384-2/train/006388_0_1...,CE
3,006388_0,6388,0,11,17,../input/cleanedrescaled384-2/train/006388_0_1...,CE
4,006388_0,6388,0,11,24,../input/cleanedrescaled384-2/train/006388_0_1...,CE


In [12]:
temp = df.groupby(by="image_id").count()

In [13]:


df["image_id"].nunique()



398

# 🔨 Utility

In [14]:
def load_img(path):
    img = cv2.imread(path, cv2.IMREAD_UNCHANGED)

    img = img.astype('float32') 
    return img

def show_img(img, ground_truth, pred = "", conf = ""):
    plt.imshow(img)
    plt.title(f'true: {"CE" if ground_truth else "LAA"} | predicted: {pred} | conf: {conf}')
    plt.axis('off')
    

In [15]:


skf = StratifiedGroupKFold(n_splits=CFG.n_flods, shuffle=True, random_state=CFG.seed)

for fold, (train_idx, val_idx) in enumerate(skf.split(df, df['label'], groups = df["image_id"])):
    df.loc[val_idx, 'fold'] = fold
display(df.groupby(['fold','label'])['image_id'].count())



fold  label
0.0   CE       2000
      LAA      1955
1.0   CE       2000
      LAA      1950
2.0   CE       1850
      LAA      2098
3.0   CE       1904
      LAA      2048
4.0   CE       2147
      LAA      1850
Name: image_id, dtype: int64

# ❗ DataLoaders

In [16]:
class StripAiDataset(Dataset):
    def __init__(self, df, N_slides, label=False ,transforms = None):
        self.df = df
        self.transforms = transforms
        self.image_ids = df['image_id'].unique().tolist()
        self.file_names = df['path'].tolist()
        self.patient_id = df['patient_id'].tolist()
        self.labels = df.label.apply(lambda x: 1 if x == "CE" else 0).tolist()
        self.N_slides = N_slides
    def __len__(self):
        return len(self.image_ids)
  
    def __getitem__(self,index):
        img_id = self.image_ids[index]
        tempdf =self.df[self.df["image_id"] == img_id]
        patient_id = tempdf['patient_id'].tolist()
        img_paths = tempdf.path.sample(self.N_slides,replace=True).tolist()
        img = []
        for path in img_paths:  
            temp = self.transforms(image=load_img(path))["image"]
            temp = np.transpose(temp, (2, 0, 1))
            img.append(temp)
        #print(np.array(img).shape)

        img = np.stack(img , axis = 0).astype(np.float32)
        label = tempdf[0:1].label.item()
        label = 1 if label == "CE" else 0
        #print(img.shape)
        
    
    
        return torch.tensor(img), patient_id[0] , label

In [17]:


data_transforms = {
    "train": A.Compose([ #A.RandomBrightnessContrast(p=0.2),
                        A.Flip(p=0.5),
                        #A.ChannelShuffle(),
                        #A.RandomGridShuffle(), 
                        #A.ColorJitter(),
                       A.Normalize()], p=1.0),
    "valid": A.Compose([A.Normalize()], p=1.0)
}



In [18]:
def prepare_loaders(test_df,fold,debug=False):
 
    test_df = test_df.query("fold==@fold").reset_index(drop=True)
    test_dataset = StripAiDataset(test_df,transforms=data_transforms['valid'],N_slides = CFG.N_slides)

    test_loader = DataLoader(test_dataset, batch_size=CFG.valid_bs, 
                              num_workers=1, shuffle=False, pin_memory=False)
    
    return test_loader

In [19]:
class Flatten(nn.Module):
    def __init__(self, dim=1):
        super().__init__()
        self.dim = dim

    def forward(self, x): 
        input_shape = x.shape
        output_shape = [input_shape[i] for i in range(self.dim)] + [-1]
        return x.view(*output_shape)

In [20]:


class StripModel(nn.Module):

    def __init__(self, model_name, num_classes=2, pretrained=True ,num_instances=CFG.N_slides , path=""):
        super().__init__()
        self.num_instances = num_instances
        self.encoder = timm.create_model(model_name, pretrained=pretrained, num_classes=num_classes)

        
        
        feature_dim = self.encoder.get_classifier().in_features
        self.encoder.head.fc = nn.Identity()
        self.feature_dim = feature_dim
        print(feature_dim)
        
        self.head = nn.Sequential(
            nn.Conv3d(self.num_instances ,1,(1,12,12)), nn.ReLU(inplace=True), Flatten(),
            nn.Linear(feature_dim, 256), nn.ReLU(inplace=True), 
            nn.Linear(256, 64), nn.ReLU(inplace=True), 
            nn.Linear(64, num_classes)
        )


    def forward(self, x):
        # x: bs x N x C x W x W
        bs, _, ch, w, h = x.shape
        x = x.view(bs*self.num_instances, ch, w, h) # x: N bs x C x W x W
        x = self.encoder.forward_features(x) # x: N bs x C' x W' x W'

        # Concat and pool
        #bs2, ch2, w2, h2 = x.shape
        #x = x.view(-1, self.num_instances, ch2, w2, h2).permute(0, 2, 1, 3, 4)\
            #.contiguous().view(bs, ch2, self.num_instances*w2, h2) # x: bs x C' x N W'' x W''
        emb = self.head(x)

        return emb,self.encoder.head(x)
    

        
        
        



In [21]:
def get_model(path):
    model = StripModel(CFG.backbone ,pretrained = False)
    model.to(CFG.device)
    model.load_state_dict(torch.load(path,map_location=torch.device(CFG.device)))
    model.eval()
    return model

# 🔧 Loss Function

# 🚄 Training Function

In [22]:
@torch.no_grad()
def infer(model_ptahs, test_loader , preds):

    model     = get_model(model_ptahs) 
    soft = nn.Softmax()
    for idx, (img,img_id,label) in enumerate(tqdm(test_loader, total=len(test_loader), desc='Infer ')):
        img = img.to(CFG.device, dtype=torch.float)# .squeeze(0)
        out,_ = model(img)#.view(-1)
        out = soft(out).cpu().detach().numpy()
            
        preds["img_id"].extend(img_id)
        preds["CE"].extend(out[:,0])
        preds["LAA"].extend(out[:,1])
        preds["label"].extend(label)


            #out = model(img)#.squeeze()
        
        del img,  out
        gc.collect()
        torch.cuda.empty_cache()

    return preds

In [23]:
model_ptahs = ["../input/all5fold/best_epoch-0-37-0.6052189344461535.bin","../input/all5fold/best_epoch-1-24-0.6121961653862201.bin","../input/all5fold/best_epoch-2-03-0.6613527477627069.bin","../input/all5fold/best_epoch-3-06-0.6782178148417406.bin","../input/all5fold/best_epoch-4-07-0.655440097621509.bin" ]#,"../input/good0rbad/best_epoch-4-10-0.6704924667254091.bin"]

In [24]:
preds = defaultdict(list)
for i in range(5):
    test_loader  = prepare_loaders(df,i)
    preds = infer(model_ptahs[i],test_loader,preds)

768


Infer : 100%|██████████| 80/80 [01:12<00:00,  1.11it/s]


768


Infer : 100%|██████████| 79/79 [01:07<00:00,  1.17it/s]


768


Infer : 100%|██████████| 79/79 [01:05<00:00,  1.20it/s]


768


Infer : 100%|██████████| 80/80 [01:07<00:00,  1.19it/s]


768


Infer : 100%|██████████| 80/80 [01:07<00:00,  1.18it/s]


In [25]:
preds = pd.DataFrame(preds)

In [26]:
preds

Unnamed: 0,img_id,CE,LAA,label
0,006388,0.573381,0.426619,tensor(1)
1,029c68,0.381466,0.618534,tensor(1)
2,03d1ec,0.235467,0.764533,tensor(0)
3,0468a8,0.826008,0.173992,tensor(1)
4,055f6a,0.422620,0.577381,tensor(0)
...,...,...,...,...
393,dba56f,0.483596,0.516404,tensor(0)
394,f3e9f6,0.527918,0.472082,tensor(0)
395,f5ce23,0.590843,0.409157,tensor(0)
396,f83bf0,0.756116,0.243884,tensor(0)


In [27]:
preds

Unnamed: 0,img_id,CE,LAA,label
0,006388,0.573381,0.426619,tensor(1)
1,029c68,0.381466,0.618534,tensor(1)
2,03d1ec,0.235467,0.764533,tensor(0)
3,0468a8,0.826008,0.173992,tensor(1)
4,055f6a,0.422620,0.577381,tensor(0)
...,...,...,...,...
393,dba56f,0.483596,0.516404,tensor(0)
394,f3e9f6,0.527918,0.472082,tensor(0)
395,f5ce23,0.590843,0.409157,tensor(0)
396,f83bf0,0.756116,0.243884,tensor(0)


In [28]:
preds.to_csv("tile infos.csv", index = False)