In [1]:
!pip install /kaggle/input/pretrainedmodelswhl/pretrainedmodels-0.7.4-py3-none-any.whl
!pip install /kaggle/input/testtimeaug/ttach-0.0.2-py3-none-any.whl

Processing /kaggle/input/pretrainedmodelswhl/pretrainedmodels-0.7.4-py3-none-any.whl
Installing collected packages: pretrainedmodels
Successfully installed pretrainedmodels-0.7.4
Processing /kaggle/input/testtimeaug/ttach-0.0.2-py3-none-any.whl
Installing collected packages: ttach
Successfully installed ttach-0.0.2


In [2]:
import os

import math
import openslide
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import albumentations
from tqdm import tqdm
from joblib import Parallel, delayed
from matplotlib import pyplot as plt
from PIL import Image, ImageChops
import pretrainedmodels

import cv2

import torch
import torch.utils.data as data_utils
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from torch.optim import lr_scheduler
from torch import nn

from torchvision import transforms,models
import torch.nn.functional as F
from tqdm.auto import tqdm
from torch import Tensor

import ttach as tta

In [3]:
# Root folder of the competition data.
BASE_DIR = '/kaggle/input/prostate-cancer-grade-assessment'
# Alternative slide folder (disabled): the training images.
# DATA_DIR = os.path.join(BASE_DIR, 'train_images')
# Folder the slides are read from; later cells only run the pipeline when it exists.
DATA_DIR = os.path.join(BASE_DIR, 'test_images')
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [4]:
# test_df = pd.read_csv(os.path.join(BASE_DIR, 'train.csv'))[['image_id', 'data_provider']].loc[:10]
# test_df

In [5]:
# Slide-level test table and the submission template shipped with the competition data.
test_df = pd.read_csv(os.path.join(BASE_DIR, 'test.csv'))
sample_sub_df = pd.read_csv(os.path.join(BASE_DIR, 'sample_submission.csv'))

In [6]:
# Keep a slide-level copy: `test_df` itself is later replaced by a tile-level table.
org_test_df = test_df.copy()
org_test_df

Unnamed: 0,image_id,data_provider
0,005700be7e06878e6605e7a5a39de1b2,radboud
1,005c6e8877caf724c600fdce5d417d40,karolinska
2,0104f76634ff89bfff1ef0804a95c380,radboud


In [7]:
crop_size = 256  # Side length, in pixels, of each square tile
crop_level = 2  # Slide level the tiles are read from (0 gives the highest resolution)
down_samples = [1, 4, 16]  # Downsample factor of each level, indexed by level; assumed identical for every tiff

In [8]:
def split_image(openslide_image):
    """
    Split one level of a slide into a row-major grid of square tiles.

    The level (`crop_level`), the tile side (`crop_size`) and the per-level
    downsample factors (`down_samples`) are read from module-level settings.

    Args:
        openslide_image: Opened slide exposing `level_dimensions` and
            `read_region(location, level, size)`.

    Returns:
        Tuple `(splits, h_crops, v_crops)`: the tiles returned by
        `read_region`, ordered left to right then top to bottom, followed by
        the number of tile columns and the number of tile rows.
    """
    # Size of the slide at the level the tiles are read from
    width, height = openslide_image.level_dimensions[crop_level]

    # Round up so the partially covered tiles on the right/bottom edges are kept
    h_crops = math.ceil(width / crop_size)
    v_crops = math.ceil(height / crop_size)

    # read_region() expects the tile origin in level-0 coordinates, so moving
    # one tile at `crop_level` advances this many level-0 pixels.
    base_step = crop_size * down_samples[crop_level]

    splits = [
        openslide_image.read_region((h * base_step, v * base_step), crop_level, (crop_size, crop_size))
        for v in range(v_crops)
        for h in range(h_crops)
    ]
    return splits, h_crops, v_crops

In [9]:
def get_emptiness(arr):
    """Return the fraction of elements of `arr` that are exactly 0 or 255."""
    # An element cannot equal both values, so one combined mask yields the
    # same count as adding the two separate counts.
    blank_mask = (arr == 255) | (arr == 0)
    blank_count = np.count_nonzero(blank_mask)
    return blank_count / arr.size

In [10]:
# Tiles are kept only while their emptiness stays strictly below this value.
ignore_threshold = 0.95  # If the image is more than 95% empty, consider it as white and ignore

In [11]:
def filter_white_images(images):
    """
    Keep only the tiles whose emptiness is below `ignore_threshold`.

    Emptiness is measured by `get_emptiness` on the first three channels of
    each tile, so an alpha channel does not count. The returned list holds the
    original tile objects in their input order.
    """
    def _has_content(tile):
        rgb = np.array(tile)[..., :3]  # drop everything past the third channel
        return get_emptiness(rgb) < ignore_threshold

    return [tile for tile in images if _has_content(tile)]

In [12]:
# Module-level default so `dataset` exists even when the tiling cell is skipped.
dataset = []
def create_dataset(count):
    """
    Tile one slide, drop its empty tiles and save the remaining ones as JPEGs.

    The JPEG files are written to the current working directory as
    `<image_id>_<index>.jpg`.

    Args:
        count: Positional row of the slide in `test_df`. It is also stored as
            the `group` value that ties every tile back to its slide.

    Returns:
        A new list with one metadata dict (`image_id`, `data_provider`,
        `group`) per saved tile of this slide only.
    """
    image_id = test_df['image_id'].iloc[count]
    data_provider = test_df['data_provider'].iloc[count]

    slide = openslide.OpenSlide(os.path.join(DATA_DIR, f'{image_id}.tiff'))
    try:
        crops, _, _ = split_image(slide)
    finally:
        # Release the slide handle even when reading a tile fails
        slide.close()

    # Collect the rows in a local list. Appending to and returning the
    # module-level `dataset` made every call in the same process return all
    # earlier tiles again, which duplicates rows once results are flattened.
    records = []
    for index, tile in enumerate(filter_white_images(crops)):
        tile_id = f'{image_id}_{index}'
        tile.convert('RGB').save(f'{tile_id}.jpg', 'JPEG', quality=100, optimize=True, progressive=True)
        records.append({
            'image_id': tile_id,
            'data_provider': data_provider,
            'group': count,
        })
    return records

In [13]:
if os.path.exists(DATA_DIR):
    # Tile every slide in parallel; each job returns the tile rows of one slide.
    dataset = Parallel(n_jobs=8)(
        delayed(create_dataset)(slide_idx) for slide_idx in tqdm(range(len(test_df)))
    )

    # Flatten the per-slide lists into a single tile-level table.
    dataset = pd.DataFrame([row for slide_rows in dataset for row in slide_rows])

    # Persist the table, then reload it as the new (tile-level) `test_df`.
    dataset.to_csv('new_test.csv', index=False)
    test_df = pd.read_csv('new_test.csv')

In [14]:
class DenseNet121(nn.Module):
    """
    DenseNet121 backbone followed by a linear head with 10 outputs, one per
    Gleason score class.
    """
    def __init__(self, pretrained):
        super().__init__()
        # Only a literal `True` requests the ImageNet weights; any other value
        # builds the backbone without pretrained weights.
        weights = "imagenet" if pretrained is True else None
        self.model = pretrainedmodels.__dict__["densenet121"](pretrained=weights)

        # Classification head applied to the 1024 pooled backbone features
        self.l0 = nn.Linear(1024, 10)

    def forward(self, x):
        batch_size, _channels, _height, _width = x.shape
        feature_maps = self.model.features(x)
        # Global average pooling, then flatten to one feature vector per sample
        pooled = F.adaptive_avg_pool2d(feature_maps, 1)
        return self.l0(pooled.reshape(batch_size, -1))

In [15]:
# Number of per-fold checkpoints that are loaded and averaged at inference time.
FOLDS = 5

In [16]:
# Define TTA parameters
# Test-time augmentation pipeline (flips and scales) passed to the ttach
# wrapper that is built around every model.
# NOTE(review): this rebinds `transforms`, shadowing the `torchvision.transforms`
# import — confirm nothing later needs the torchvision module under that name.
transforms = tta.Compose(
    [
        tta.HorizontalFlip(),
        tta.VerticalFlip(),
        tta.Scale(scales=[1, 2])       
    ]
)

In [17]:
# Load one checkpoint per fold and wrap each model for test-time augmentation.
# NOTE(review): `models` rebinds the name imported from torchvision.
models = []
for fold in range(FOLDS):
    model = DenseNet121(False)  # no ImageNet weights: the checkpoint supplies them
    model.to(DEVICE)
    model.load_state_dict(torch.load(f'/kaggle/input/panda-densenet121-labelsmoothing-tta-fold{fold}/densenet121_ls_fold{fold}.pth', map_location=DEVICE))
    model = tta.ClassificationTTAWrapper(model, transforms)
    model.eval()  # evaluation mode for inference
    models.append(model)

In [18]:
# Directory PandaDataset reads the tile JPEGs from.
WORKING_DIR = os.path.join('/', 'kaggle', 'working')

In [19]:
class PandaDataset(Dataset):
    """
    Dataset over the tile JPEGs listed in a dataframe.

    Each item is one normalized tile in channel-first (C, H, W) layout; no
    label is returned.

    Args:
        df: Table with an `image_id` column; `<image_id>.jpg` is looked up in
            `WORKING_DIR`.
        mean: Per-channel mean passed to `albumentations.Normalize`.
        std: Per-channel standard deviation passed to `albumentations.Normalize`.
    """

    def __init__(self, df, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)):
        self.df = df
        self.aug = albumentations.Compose([
            albumentations.Normalize(mean, std, always_apply=True)
        ])

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """
        Load, normalize and return the tile at position `index`.

        Raises:
            FileNotFoundError: If the tile image cannot be read.
        """
        # Samplers hand out positions 0..len-1, so index by position rather
        # than by label; this also works when `df` has a non-default index.
        image_id = self.df['image_id'].iloc[index]
        image_path = os.path.join(WORKING_DIR, f'{image_id}.jpg')

        # cv2.imread reports failure by returning None instead of raising
        image = cv2.imread(image_path)
        if image is None:
            raise FileNotFoundError(f'Could not read tile image: {image_path}')

        # NOTE(review): cv2 loads channels in BGR order while the default
        # mean/std look like the usual RGB-ordered ImageNet statistics —
        # confirm this matches the training preprocessing before changing it.
        image = self.aug(image=image)['image']

        # Convert from HWC to CHW, the per-image layout PyTorch expects
        return np.transpose(image, (2, 0, 1))

In [20]:
# Number of tiles per batch in the inference DataLoader.
BATCH_SIZE=16

In [21]:
def inference(model, test_loader, device):
    """
    Run `model` over every batch of `test_loader` and collect its outputs.

    Args:
        model: Callable mapping a batch tensor to an output tensor. It is
            called as-is; switching it to evaluation mode is left to the caller.
        test_loader: Iterable yielding batches of image tensors.
        device: Device the batches are moved to before the forward pass.

    Returns:
        numpy array with the outputs of all batches concatenated along axis 0.

    Raises:
        ValueError: If `test_loader` yields no batches (nothing to concatenate).
    """
    preds = []
    # Gradients are never needed here, so autograd is disabled for the whole loop.
    with torch.no_grad():
        # Wrap the loader itself (not `enumerate(...)`) so tqdm can read its
        # length and show a progress total.
        for images in tqdm(test_loader):
            # NOTE(review): PandaDataset already runs albumentations.Normalize,
            # so this extra division by 255 looks like a second rescaling. It
            # is kept because the checkpoints were presumably trained with the
            # same preprocessing — confirm against the training code.
            images = images.to(device, dtype=torch.float) / 255
            preds.append(model(images).to('cpu').numpy())
    return np.concatenate(preds)

In [22]:
# Gleason score 6 = ISUP grade 1
# Gleason score 7 (3 + 4) = ISUP grade 2
# Gleason score 7 (4 + 3) = ISUP grade 3
# Gleason score 8 = ISUP grade 4
# Gleason score 9-10 = ISUP grade 5

'''
Mappings of gleason scores when model was trained
     0: '0+0',
     1: '3+3',
     2: '3+4',
     3: '3+5',
     4: '4+3',
     5: '4+4',
     6: '4+5',
     7: '5+3',
     8: '5+4',
     9: '5+5'
'''

# Model class index -> ISUP grade, following the Gleason-to-ISUP table above.
mappings = {
    0: 0, # (0+0: ISUP grade 0)
    1: 1, # (3+3: ISUP grade 1)
    2: 2, # (3+4: ISUP grade 2)
    3: 4, # (3+5: ISUP grade 4)
    4: 3, # (4+3: ISUP grade 3)
    5: 4, # (4+4: ISUP grade 4)
    6: 5, # (4+5: ISUP grade 5)
    7: 4, # (5+3: ISUP grade 4)
    8: 5, # (5+4: ISUP grade 5)
    9: 5  # (5+5: ISUP grade 5)
}

In [23]:
def submit(sample):
    """
    Predict one ISUP grade per slide and return the submission table.

    Tile-level outputs of all fold models are averaged, the arg-max class of
    each tile is mapped to an ISUP grade, and every slide receives the most
    frequent grade among its tiles. As a side effect the tile-level grades are
    stored in the `preds` column of the module-level `test_df`.

    Args:
        sample: Slide-level table with `image_id` and `data_provider` columns,
            in the same row order that was used to number the tile groups.

    Returns:
        A copy of `sample` without `data_provider` and with an integer
        `isup_grade` column. When `DATA_DIR` does not exist, the unchanged
        `sample_sub_df` template is returned instead.
    """
    if not os.path.exists(DATA_DIR):
        return sample_sub_df

    test_dataset = PandaDataset(test_df)
    test_loader = data_utils.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

    # Average the raw outputs over the folds, then pick one class per tile
    fold_outputs = np.array([inference(models[fold], test_loader, DEVICE) for fold in range(FOLDS)])
    tile_classes = np.argmax(np.mean(fold_outputs, axis=0), axis=1)
    test_df['preds'] = np.vectorize(mappings.get)(tile_classes)

    # Majority vote of the tile grades within each slide (`group` = slide position)
    grade_by_group = test_df.groupby('group')['preds'].agg(lambda x: x.value_counts().index[0])

    sample = sample.drop(['data_provider'], axis=1)
    # Look the grade up by slide position instead of relying on index-label
    # alignment. Slides whose tiles were all filtered out have no group; they
    # default to grade 0 so the column stays integer instead of holding NaN.
    slide_positions = pd.Series(np.arange(len(sample)), index=sample.index)
    sample['isup_grade'] = slide_positions.map(grade_by_group).fillna(0).astype(int)
    return sample

In [24]:
# Build the submission from the original slide-level test table.
submission = submit(org_test_df)
submission['isup_grade'] = submission['isup_grade'].astype(int)  # store the grades as integers
submission.head()

Unnamed: 0,image_id,isup_grade
0,005700be7e06878e6605e7a5a39de1b2,0
1,005c6e8877caf724c600fdce5d417d40,0
2,0104f76634ff89bfff1ef0804a95c380,0


In [25]:
# Delete every non-hidden file in the current directory (tile JPEGs, new_test.csv)
# before the submission file is written.
!rm -rf *

In [26]:
# Write the final submission file without the dataframe index.
submission.to_csv('submission.csv', index=False)