# Blend notebook "SE-ResNeXt50 full GPU decoding" and "RSNA ConvNextV2 Inference Tensorflow"
## "SE-ResNeXt50 full GPU decoding" is:
https://www.kaggle.com/code/christofhenkel/se-resnext50-full-gpu-decoding/notebook

## "RSNA ConvNextV2 Inference Tensorflow" is:
https://www.kaggle.com/code/markwijkhuizen/rsna-convnextv2-inference-tensorflow/notebook

**Thank you Dieter (@christofhenkel)** for public pretrained model `rsna-seresnext50-5fold`

and

**Thank you Mark Wijkhuizen (@markwijkhuizen)** for public pretrained model `rsna-efficientnetv2-training-tensorflow-tpu-ds`

In [1]:
# Install Keras CV Attention Model Pip Package for ConvNextV2 Models
!pip install --no-deps /kaggle/input/keras-cv-attention-models/keras_cv_attention_models-1.3.9-py3-none-any.whl

Processing /kaggle/input/keras-cv-attention-models/keras_cv_attention_models-1.3.9-py3-none-any.whl
Installing collected packages: keras-cv-attention-models
Successfully installed keras-cv-attention-models-1.3.9
[0m

In [2]:
!pip install -q timm==0.6.5 --no-index --find-links=/kaggle/input/rsna-bc-pip-requirements
!pip install -q albumentations==1.2.1 --no-index --find-links=/kaggle/input/rsna-bc-pip-requirements
!pip install -q pylibjpeg-libjpeg==1.3.1 --no-index --find-links=/kaggle/input/rsna-bc-pip-requirements
!pip install -q pydicom==2.0.0 --no-index --find-links=/kaggle/input/rsna-bc-pip-requirements
!pip install -q python-gdcm==3.0.20 --no-index --find-links=/kaggle/input/rsna-bc-pip-requirements
!pip install -q dicomsdl==0.109.1 --no-index --find-links=/kaggle/input/rsna-bc-pip-requirements

[0m

Then we install the latest DALI packaging which we will use for GPU decoding

In [3]:
!pip install -q /kaggle/input/nvidia-dali-nightly-cuda110-1230dev/nvidia_dali_nightly_cuda110-1.23.0.dev20230203-7187866-py3-none-manylinux2014_x86_64.whl

[0m

Next, we import all the packages we need and patch a function to allow for INT16 support

In [4]:
import timm
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
from copy import copy
import gc
import shutil 

import glob
from scipy.special import expit

import albumentations as A
import cv2
cv2.setNumThreads(0)

import dicomsdl
import pydicom
from pydicom.filebase import DicomBytesIO

from os.path import join

from tqdm import tqdm

from joblib import Parallel, delayed
import multiprocessing as mp

from types import SimpleNamespace
from typing import Any, Dict

import torch
import torch.nn.functional as F
from torch import nn
from torch.nn.parameter import Parameter
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import GradScaler, autocast


import nvidia.dali.fn as fn
import nvidia.dali.types as types
from nvidia.dali import pipeline_def
from nvidia.dali.types import DALIDataType

In [5]:
#we need to patch DALI for Int16 support


from nvidia.dali.backend import TensorGPU, TensorListGPU
from nvidia.dali.pipeline import Pipeline
import nvidia.dali.ops as ops
from nvidia.dali import types
from nvidia.dali.plugin.base_iterator import _DaliBaseIterator
from nvidia.dali.plugin.base_iterator import LastBatchPolicy
import torch
import torch.utils.dlpack as torch_dlpack
import ctypes
import numpy as np
import torch.nn.functional as F
import pydicom

to_torch_type = {
    types.DALIDataType.FLOAT:   torch.float32,
    types.DALIDataType.FLOAT64: torch.float64,
    types.DALIDataType.FLOAT16: torch.float16,
    types.DALIDataType.UINT8:   torch.uint8,
    types.DALIDataType.INT8:    torch.int8,
    types.DALIDataType.UINT16:  torch.int16,
    types.DALIDataType.INT16:   torch.int16,
    types.DALIDataType.INT32:   torch.int32,
    types.DALIDataType.INT64:   torch.int64
}


def feed_ndarray(dali_tensor, arr, cuda_stream=None):
    """
    Copy contents of DALI tensor to PyTorch's Tensor.

    Parameters
    ----------
    `dali_tensor` : nvidia.dali.backend.TensorCPU or nvidia.dali.backend.TensorGPU
                    Tensor from which to copy
    `arr` : torch.Tensor
            Destination of the copy
    `cuda_stream` : torch.cuda.Stream, cudaStream_t or any value that can be cast to cudaStream_t.
                    CUDA stream to be used for the copy
                    (if not provided, an internal user stream will be selected)
                    In most cases, using pytorch's current stream is expected (for example,
                    if we are copying to a tensor allocated with torch.zeros(...))
    """
    dali_type = to_torch_type[dali_tensor.dtype]

    assert dali_type == arr.dtype, ("The element type of DALI Tensor/TensorList"
                                    " doesn't match the element type of the target PyTorch Tensor: "
                                    "{} vs {}".format(dali_type, arr.dtype))
    assert dali_tensor.shape() == list(arr.size()), \
        ("Shapes do not match: DALI tensor has size {0}, but PyTorch Tensor has size {1}".
            format(dali_tensor.shape(), list(arr.size())))
    cuda_stream = types._raw_cuda_stream(cuda_stream)

    # turn raw int to a c void pointer
    c_type_pointer = ctypes.c_void_p(arr.data_ptr())
    if isinstance(dali_tensor, (TensorGPU, TensorListGPU)):
        stream = None if cuda_stream is None else ctypes.c_void_p(cuda_stream)
        dali_tensor.copy_to_external(c_type_pointer, stream, non_blocking=True)
    else:
        dali_tensor.copy_to_external(c_type_pointer)
    return arr





Next I set major variables which handle the public run and the re-run on the hidden test set, and also allow for simulating the size of the hidden test set by setting RAM_CHECK = True

In [6]:
# Params

CROP_IMAGE_1st = False
CROP_IMAGE_2nd = True

COMP_FOLDER = '/kaggle/input/rsna-breast-cancer-detection/'
DATA_FOLDER = COMP_FOLDER + 'test_images/'

sample_submission = pd.read_csv(COMP_FOLDER + 'sample_submission.csv')

PUBLIC_RUN = len(sample_submission) == 2

N_CORES = mp.cpu_count()
MIXED_PRECISION = False
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

RAM_CHECK = True
DEBUG = True

test_df = pd.read_csv('/kaggle/input/rsna-breast-cancer-detection/test.csv')
test_df['cancer'] = 0 #dummy value


if PUBLIC_RUN is False:
    RAM_CHECK = False
    DEBUG = False

if RAM_CHECK is True:
    test_df = pd.read_csv('/kaggle/input/rsna-breast-cancer-detection/train.csv')
    patient_filter = list(sorted((set(test_df.patient_id.unique()))))[:8000]
    test_df = test_df[test_df.patient_id.isin(patient_filter)]
    DATA_FOLDER = DATA_FOLDER.replace('test','train')

if DEBUG is True:
    test_df = test_df.head(1500)

test_df

Unnamed: 0,site_id,patient_id,image_id,laterality,view,age,cancer,biopsy,invasive,BIRADS,implant,density,machine_id,difficult_negative_case
0,2,10006,462822612,L,CC,61.0,0,0,0,,0,,29,False
1,2,10006,1459541791,L,MLO,61.0,0,0,0,,0,,29,False
2,2,10006,1864590858,R,MLO,61.0,0,0,0,,0,,29,False
3,2,10006,1874946579,R,CC,61.0,0,0,0,,0,,29,False
4,2,10011,220375232,L,CC,55.0,0,0,0,0.0,0,,21,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,1,11575,2120330769,R,CC,44.0,0,0,0,1.0,0,C,49,False
1496,1,11583,376242680,L,MLO,61.0,0,0,0,,0,B,49,False
1497,1,11583,1044545235,L,MLO,61.0,0,0,0,,0,B,49,False
1498,1,11583,1338884652,L,CC,61.0,0,0,0,,0,B,49,False


In [7]:
print(f'Len df : {len(test_df)}')
test_df['patient_id'].nunique()

Len df : 1500


328

In [8]:
test_df["fns"] = test_df['patient_id'].astype(str) + '/' + test_df['image_id'].astype(str) + '.dcm'

Next, we define the function for GPU-based decoding using DALI and processing the dicom images

In [9]:
# Smooth vector used to smoothen sums/stds of axes
def smooth(l):
    # kernel size is 1% of vector
    kernel_size = int(len(l) * 0.01)
    kernel = np.ones(kernel_size) / kernel_size
    return np.convolve(l, kernel, mode='same')

# X Crop offset based on first column with sum below 5% of maximum column sums*std
def get_x_offset(image, max_col_sum_ratio_threshold=0.05, debug=None):
    # Image Dimensions
    H, W = image.shape
    # Percentual margin added to offset
    margin = int(image.shape[1] * 0.00)
    # Threshold values based on smoothed sum x std to capture varying intensity columns
    vv = smooth(image.sum(axis=0).squeeze()) * smooth(image.std(axis=0).squeeze())
    # Find maximum sum in first 75% of columns
    vv_argmax = vv[:int(image.shape[1] * 0.75)].argmax()
    # Threshold value
    vv_threshold = vv.max() * max_col_sum_ratio_threshold
    
    # Find first column after maximum column below threshold value
    for offset, v in enumerate(vv):
        # Start searching from vv_argmax
        if offset < vv_argmax:
            continue
        
        # Column below threshold value found
        if v < vv_threshold:
            offset = min(W, offset + margin)
            break
            
    if isinstance(debug, np.ndarray):
        debug[1].imshow(image)
        debug[1].set_title('X Offset')
        vv_scale = H / vv.max() * 0.90
        # Values
        debug[1].plot(H - vv * vv_scale , c='red', label='vv')
        # Threshold
        debug[1].hlines(H - vv_threshold * vv_scale, 0, W -1, colors='orange', label='threshold')
        # Max Value
        debug[1].scatter(vv_argmax, H - vv[vv_argmax] * vv_scale, c='blue', s=100, label='Max', zorder=np.PINF)
        # First Column Below Threshold
        debug[1].scatter(offset, H - vv[offset] * vv_scale, c='purple', s=100, label='Offset', zorder=np.PINF)
        debug[1].set_ylim(H, 0)
        debug[1].legend()
        debug[1].axis('off')
        
    return offset

# Y Crop offset based on first bottom and top rows with sum below 10% of maximum row sum*std
def get_y_offsets(image, max_row_sum_ratio_threshold=0.10, debug=None):
    # Image Dimensions
    H, W = image.shape
    # Margin to add to offsets
    margin = 0
    # Threshold values based on smoothed sum x std to capture varying intensity columns
    vv = smooth(image.sum(axis=1).squeeze()) * smooth(image.std(axis=1).squeeze())
    # Find maximum sum * std row in inter quartile rows
    vv_argmax = int(image.shape[0] * 0.25) + vv[int(image.shape[0] * 0.25):int(image.shape[0] * 0.75)].argmax()
    # Threshold value
    vv_threshold = vv.max() * max_row_sum_ratio_threshold
    # Default crop offsets
    offset_bottom = 0
    offset_top = H

    # Bottom offset, search from argmax to bottom
    for offset in reversed(range(0, vv_argmax)):
        v = vv[offset]
        if v < vv_threshold:
            offset_bottom = offset
            break
    
    if isinstance(debug, np.ndarray):
        debug[2].imshow(image)
        debug[2].set_title('Y Bottom Offset')
        vv_scale = W / vv.max() * 0.90
        # Values
        debug[2].plot(vv * vv_scale, np.arange(H), c='red', label='vv')
        # Threshold
        debug[2].vlines(vv_threshold * vv_scale, 0, H -1, colors='orange', label='threshold')
        # Max Value
        debug[2].scatter(vv[vv_argmax] * vv_scale, vv_argmax, c='blue', s=100, label='Max', zorder=np.PINF)
        # First Column Below Threshold
        debug[2].scatter(vv[offset_bottom] * vv_scale, offset_bottom, c='purple', s=100, label='Offset', zorder=np.PINF)
        debug[2].set_ylim(H, 0)
        debug[2].legend()
        debug[2].axis('off')
            
    # Top offset, search from argmax to top
    for offset in range(vv_argmax, H):
        v = vv[offset]
        if v < vv_threshold:
            offset_top = offset
            break
            
    if isinstance(debug, np.ndarray):
        debug[3].imshow(image)
        debug[3].set_title('Y Top Offset')
        vv_scale = W / vv.max() * 0.90
        # Values
        debug[3].plot(vv * vv_scale, np.arange(H) , c='red', label='vv')
        # Threshold
        debug[3].vlines(vv_threshold * vv_scale, 0, H -1, colors='orange', label='threshold')
        # Max Value
        debug[3].scatter(vv[vv_argmax] * vv_scale, vv_argmax, c='blue', s=100, label='Max', zorder=np.PINF)
        # First Column Below Threshold
        debug[3].scatter(vv[offset_top] * vv_scale, offset_top, c='purple', s=100, label='Offset', zorder=np.PINF)
        debug[2].set_ylim(H, 0)
        debug[3].legend()
        debug[3].axis('off')
            
    return max(0, offset_bottom - margin), min(image.shape[0], offset_top + margin)

# Crop image and pad offsets to target image height/width ratio to preserve information
def crop(image, size=None, debug=False):
    # Image dimensions
    H, W = image.shape
    # Compute x/bottom/top offsets
    x_offset = get_x_offset(image, debug=debug)
    offset_bottom, offset_top = get_y_offsets(image[:,:x_offset], debug=debug)
    # Crop Height and Width
    h_crop = offset_top - offset_bottom
    w_crop = x_offset
    
    # Pad crop offsets to target aspect ratio
    if size is not None:
        # Height too large, pad x offset
        if (h_crop / w_crop) > TARGET_HEIGHT_WIDTH_RATIO:
            x_offset += int(h_crop / TARGET_HEIGHT_WIDTH_RATIO - w_crop)
        else:
            # Height too small, pad bottom/top offsets
            offset_bottom -= int(0.50 * (w_crop * TARGET_HEIGHT_WIDTH_RATIO - h_crop))
            offset_bottom_correction = max(0, -offset_bottom)
            offset_bottom += offset_bottom_correction

            offset_top += int(0.50 * (w_crop * TARGET_HEIGHT_WIDTH_RATIO - h_crop))
            offset_top += offset_bottom_correction
        
    # Crop Image
    image = image[offset_bottom:offset_top:,:x_offset]
        
    return image

In [10]:
def convert_dicom_to_jpg(file, save_folder=""):
    patient = file.split('/')[-2]
    image = file.split('/')[-1][:-4]
    dcmfile = pydicom.dcmread(file)

    if dcmfile.file_meta.TransferSyntaxUID == '1.2.840.10008.1.2.4.90':
        with open(file, 'rb') as fp:
            raw = DicomBytesIO(fp.read())
            ds = pydicom.dcmread(raw)
        offset = ds.PixelData.find(b"\x00\x00\x00\x0C")  #<---- the jpeg2000 header info we're looking for
        hackedbitstream = bytearray()
        hackedbitstream.extend(ds.PixelData[offset:])
        with open(save_folder + f"{patient}_{image}.jpg", "wb") as binary_file:
            binary_file.write(hackedbitstream)
            
    if dcmfile.file_meta.TransferSyntaxUID == '1.2.840.10008.1.2.4.70':
        with open(file, 'rb') as fp:
            raw = DicomBytesIO(fp.read())
            ds = pydicom.dcmread(raw)
        offset = ds.PixelData.find(b"\xff\xd8\xff\xe0")  #<---- the jpeg lossless header info we're looking for
        hackedbitstream = bytearray()
        hackedbitstream.extend(ds.PixelData[offset:])
        with open(save_folder + f"{patient}_{image}.jpg", "wb") as binary_file:
            binary_file.write(hackedbitstream)

            
@pipeline_def
def jpg_decode_pipeline(jpgfiles):
    jpegs, _ = fn.readers.file(files=jpgfiles)
    images = fn.experimental.decoders.image(jpegs, device='mixed', output_type=types.ANY_DATA, dtype=DALIDataType.UINT16)
    return images

def parse_window_element(elem):
    if type(elem)==list:
        return float(elem[0])
    if type(elem)==str:
        return float(elem)
    if type(elem)==float:
        return elem
    if type(elem)==pydicom.dataelem.DataElement:
        try:
            return float(elem[0])
        except:
            return float(elem.value)
    return None

def linear_window(data, center, width):
    lower, upper = center - width // 2, center + width // 2
    data = torch.clamp(data, min=lower, max=upper)
    return data 

def process_dicom(img, dicom):
    try:
        invert = getattr(dicom, "PhotometricInterpretation", None) == "MONOCHROME1"
    except:
        invert = False
        
    center = parse_window_element(dicom["WindowCenter"]) 
    width = parse_window_element(dicom["WindowWidth"])
        
    if (center is not None) & (width is not None):
        img = linear_window(img, center, width)

    img = (img - img.min()) / (img.max() - img.min())
    if invert:
        img = 1 - img
    return img

In [11]:
cfg = SimpleNamespace(**{})
cfg.img_size = 1024
cfg.backbone = 'seresnext50_32x4d'
cfg.pretrained=False
cfg.in_channels = 1
cfg.classes = ['cancer']
cfg.batch_size = 8
cfg.data_folder = "/tmp/output/"
cfg.val_aug = A.CenterCrop(always_apply=False, p=1.0, height=cfg.img_size, width=cfg.img_size)
cfg.device = DEVICE

We will process the dicoms in chunks so the disk space does not become an issue. 

In [12]:
SAVE_SIZE = int(cfg.img_size * 1.125)
SAVE_FOLDER = cfg.data_folder
os.makedirs(SAVE_FOLDER, exist_ok=True)
N_CHUNKS = len(test_df["fns"]) // 2000 if len(test_df["fns"]) > 2000 else 1
CHUNKS = [(len(test_df["fns"]) / N_CHUNKS * k, len(test_df["fns"]) / N_CHUNKS * (k + 1)) for k in range(N_CHUNKS)]
CHUNKS = np.array(CHUNKS).astype(int)
JPG_FOLDER = "/tmp/jpg/"

In [13]:


for ttt, chunk in enumerate(CHUNKS):
    print(f'chunk {ttt} of {len(CHUNKS)} chunks')
    os.makedirs(JPG_FOLDER, exist_ok=True)

    _ = Parallel(n_jobs=2)(
        delayed(convert_dicom_to_jpg)(f'{DATA_FOLDER}/{img}', save_folder=JPG_FOLDER)
        for img in test_df["fns"].tolist()[chunk[0]: chunk[1]]
    )
    
    jpgfiles = glob.glob(JPG_FOLDER + "*.jpg")


    pipe = jpg_decode_pipeline(jpgfiles, batch_size=1, num_threads=2, device_id=0)
    pipe.build()

    for i, f in enumerate(tqdm(jpgfiles)):
        
        patient, dicom_id = f.split('/')[-1][:-4].split('_')
        dicom = pydicom.dcmread(DATA_FOLDER + f"/{patient}/{dicom_id}.dcm")
        try:
            out = pipe.run()
            # Dali -> Torch
            img = out[0][0]
            img_torch = torch.empty(img.shape(), dtype=torch.int16, device="cuda")
            feed_ndarray(img, img_torch, cuda_stream=torch.cuda.current_stream(device=0))
            img = img_torch.float()

            
            del out, img_torch

            #apply dicom preprocessing
            img = process_dicom(img, dicom)

            #resize the torch image
            img = F.interpolate(img.view(1, 1, img.size(0), img.size(1)), (SAVE_SIZE*2, SAVE_SIZE*2), mode="bilinear")[0, 0]

            img = (img * 255).clip(0,255).to(torch.uint8).cpu().numpy()
            out_file_name = SAVE_FOLDER + f"{patient}_{dicom_id}.png"
            cv2.imwrite(out_file_name, img)

            del out_file_name, img, dicom

        except Exception as e:
            print(i, e)
            pipe = jpg_decode_pipeline(jpgfiles[i+1:], batch_size=1, num_threads=2, device_id=0)
            pipe.build()
            continue

    shutil.rmtree(JPG_FOLDER)
print(f'DALI Raw image load complete')

chunk 0 of 1 chunks


100%|██████████| 1500/1500 [04:32<00:00,  5.51it/s]


DALI Raw image load complete


In [14]:
fns = glob.glob(f'{SAVE_FOLDER}/*.png')
n_saved = len(fns)
print(f'Image on disk count : {n_saved}')

Image on disk count : 1500


A few hidden test set images might not be decoded via DALI, so we fallback to CPU for those

In [15]:
gpu_processed_files = [fn.split('/')[-1].replace('_','/').replace('png','dcm') for fn in fns]
to_process = [f for f in test_df["fns"].values if f not in gpu_processed_files]
len(gpu_processed_files), len(to_process)

(1500, 0)

In [16]:

def process(f, save_folder=""):
    patient = f.split('/')[-2]
    dicom_id = f.split('/')[-1][:-4]
    
    dicom = dicomsdl.open(f)
    img = dicom.pixelData()
    
    img = torch.from_numpy(img)
    img = process_dicom(img, dicom)
    
    img = F.interpolate(img.view(1, 1, img.size(0), img.size(1)), (SAVE_SIZE*2, SAVE_SIZE*2), mode="bilinear")[0, 0]

    img = (img * 255).clip(0,255).to(torch.uint8).cpu().numpy()
    out_file_name = SAVE_FOLDER + f"{patient}_{dicom_id}.png"
    cv2.imwrite(out_file_name, img)
    return out_file_name

In [17]:
cpu_processed_filenames = Parallel(n_jobs=2)(
    delayed(process)(f'{DATA_FOLDER}/{img}', save_folder=SAVE_FOLDER)
    for img in tqdm(to_process)
)
cpu_processed_filenames = [f for f in cpu_processed_filenames if f]
print(f'CPU Raw image load complete with {len(cpu_processed_filenames)} loaded')

0it [00:00, ?it/s]

CPU Raw image load complete with 0 loaded





In [18]:
gc.collect()
torch.cuda.empty_cache()

In [19]:
n_saved = len(glob.glob(f'{SAVE_FOLDER}/*.png'))
print(f'Image on disk count : {n_saved}')

Image on disk count : 1500


In [20]:
assert n_saved == len(test_df)

We finished with preprocessing all the dicoms to images. So next, we set-up the dataloading and model

In [21]:

def batch_to_device(batch, device):
    batch_dict = {key: batch[key].to(device) for key in batch}
    return batch_dict


class CustomDataset(Dataset):
    def __init__(self, df, cfg, aug):

        self.cfg = cfg
        self.df = df.copy()
        self.df = self.df[self.df['image_id'].astype(str) != '1942326353']
        self.labels = self.df[self.cfg.classes].values
        self.df["fns"] = self.df['patient_id'].astype(str) + '_' + self.df['image_id'].astype(str) + '.png'
        self.fns = self.df["fns"].astype(str).values
        self.aug = aug
        self.data_folder = cfg.data_folder

    def __getitem__(self, idx):

        label = self.labels[idx]
        img = self.load_one(idx)

        if self.aug:
            img = self.augment(img)

        img = self.normalize_img(img)
        torch_img = torch.tensor(img).float().permute(2,0,1)
        
        feature_dict = {
            "input": torch_img,
            "target": torch.tensor(label),
        }
        return feature_dict

    def __len__(self):
        return len(self.fns)

    def load_one(self, idx):
        path = self.data_folder + self.fns[idx]
        try:
            img = cv2.imread(path, cv2.IMREAD_UNCHANGED)
            shape = img.shape

            if CROP_IMAGE_1st:
                if len(img.shape) == 2:
                    image_c = crop(img)
                    if min(*image_c.shape) > 100:
                        img = image_c
        
            img = cv2.resize(img, (SAVE_SIZE, SAVE_SIZE))
            
            if len(img.shape) == 2:
                img = img[:,:,None]
                    
        except Exception as e:
            print(e)
        return img

    def augment(self, img):
        img = img.astype(np.float32)
        transformed = self.aug(image=img)
        trans_img = transformed["image"]
        return trans_img

    def normalize_img(self, img):
        img = img / 255
        return img


In [22]:
def gem(x, p=3, eps=1e-6):
    return F.avg_pool2d(x.clamp(min=eps).pow(p), (x.size(-2), x.size(-1))).pow(1.0 / p)


class GeM(nn.Module):
    def __init__(self, p=3, eps=1e-6, p_trainable=False):
        super(GeM, self).__init__()
        if p_trainable:
            self.p = Parameter(torch.ones(1) * p)
        else:
            self.p = p
        self.eps = eps

    def forward(self, x):
        ret = gem(x, p=self.p, eps=self.eps)
        return ret

    def __repr__(self):
        return (self.__class__.__name__  + f"(p={self.p.data.tolist()[0]:.4f},eps={self.eps})")



class Net(nn.Module):

    def __init__(self, cfg: Any):
        super(Net, self).__init__()

        self.cfg = cfg
        self.n_classes = len(cfg.classes)
        self.backbone = timm.create_model(cfg.backbone, 
                                          pretrained=cfg.pretrained, 
                                          num_classes=0, 
                                          global_pool="", 
                                          in_chans=self.cfg.in_channels)
    
        backbone_out = self.backbone.feature_info[-1]['num_chs']

        self.global_pool = GeM(p_trainable=False)
        self.head = torch.nn.Linear(backbone_out, self.n_classes)
        self.loss_fn = nn.BCEWithLogitsLoss()

    def forward(self, batch):

        x = batch['input']

        x = self.backbone(x)
        x = self.global_pool(x)
        x = x[:,:,0,0]

        logits = self.head(x)
        
        
        outputs = {}
        
        
        if self.training:
            loss = self.loss_fn(logits,batch["target"].float())
            outputs['loss'] = loss
        else:
            outputs["logits"] = logits
        
 

        return outputs


In [23]:
def get_dl(test_df, cfg):

    test_ds = CustomDataset(test_df, cfg, cfg.val_aug)
    test_dl = DataLoader(test_ds, shuffle=False, batch_size=cfg.batch_size, num_workers=N_CORES, pin_memory=True)

    return test_dl, batch_to_device

def get_state_dict(sd_fp):
    sd = torch.load(sd_fp, map_location="cpu")['model']
    sd = {k.replace("module.", ""):v for k,v in sd.items()}
    return sd

def get_nets(cfg,state_dicts):

    nets = []

    for i,state_dict in enumerate(state_dicts):
        net = Net(cfg).eval().to(DEVICE)
        print("loading dict")
        sd = get_state_dict(state_dict)
        net.load_state_dict(sd, strict=True)
        nets += [net]
        del sd
        gc.collect()
    return nets

In [24]:
sub_dl, batch_to_device = get_dl(test_df, cfg)

In [25]:
state_dicts = sorted(glob.glob('/kaggle/input/rsna-seresnext50-5fold/check*.pth'))
print(state_dicts)

nets = get_nets(cfg,state_dicts)

['/kaggle/input/rsna-seresnext50-5fold/checkpoint_last_seed298515.pth', '/kaggle/input/rsna-seresnext50-5fold/checkpoint_last_seed334760.pth', '/kaggle/input/rsna-seresnext50-5fold/checkpoint_last_seed607282.pth', '/kaggle/input/rsna-seresnext50-5fold/checkpoint_last_seed758935.pth', '/kaggle/input/rsna-seresnext50-5fold/checkpoint_last_seed779477.pth']
loading dict
loading dict
loading dict
loading dict
loading dict


In [26]:
print(f'Dataloader length : {len(sub_dl.dataset)}')

Dataloader length : 1500


In [27]:
with torch.inference_mode():

    preds = [[] for i in range(len(nets))]
    for batch in tqdm(sub_dl):
        batch = batch_to_device(batch, cfg.device)
        for i, net in enumerate(nets):
            logits = net(batch)['logits'].sigmoid().float().detach().cpu().numpy()
            preds[i] += [logits]
            
preds = np.array([np.concatenate(p, axis=0) for p in preds])
preds = preds.mean(0) #average fold predictions
preds = preds[:,0]
preds.shape

100%|██████████| 188/188 [07:55<00:00,  2.53s/it]


(1500,)

In [28]:
preds.shape

(1500,)

In [29]:
del batch, logits, i, net, nets, batch_to_device, state_dicts, cpu_processed_filenames, ttt, chunk, jpgfiles, pipe, gpu_processed_files, to_process

In [30]:
gc.collect()
torch.cuda.empty_cache()

In [31]:
import tensorflow as tf
from keras_cv_attention_models import convnext

In [32]:
TARGET_HEIGHT = 1344
TARGET_WIDTH = 768
N_CHANNELS = 1
INPUT_SHAPE = (TARGET_HEIGHT, TARGET_WIDTH, N_CHANNELS)
TARGET_HEIGHT_WIDTH_RATIO = TARGET_HEIGHT / TARGET_WIDTH

In [33]:
def normalize(image):
    # Repeat channels to create 3 channel images required by pretrained ConvNextV2 models
    image = tf.repeat(image, repeats=3, axis=3)
    # Cast to float 32
    image = tf.cast(image, tf.float32)
    # Normalize with respect to ImageNet mean/std
    image = tf.keras.applications.imagenet_utils.preprocess_input(image, mode='torch')

    return image

In [34]:
def get_model():
    # Inputs, note the names are equal to the dictionary keys in the dataset
    image = tf.keras.layers.Input(INPUT_SHAPE, name='image', dtype=tf.uint8)

    # Normalize Input
    image_norm = normalize(image)

    # CNN Feature Maps
    x = convnext.ConvNeXtV2Tiny(
        input_shape=(TARGET_HEIGHT, TARGET_WIDTH, 3),
        pretrained=None,
        num_classes=0,
    )(image_norm)

    # Average Pooling BxHxWxC -> BxC
    x = tf.keras.layers.GlobalAveragePooling2D()(x)
    # Dropout to prevent Overfitting
    x = tf.keras.layers.Dropout(0.30)(x)
    # Output value between [0, 1] using Sigmoid function
    outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)

    # Define model with inputs and outputs
    model = tf.keras.models.Model(inputs=image, outputs=outputs)

    # Load pretrained Model Weights
    model.load_weights('/kaggle/input/rsna-efficientnetv2-training-tensorflow-tpu-ds/model.h5')

    # Set model non-trainable
    model.trainable = False

    # Compile model
    model.compile()

    return model

In [35]:
# Pretrained File Path: '/kaggle/input/sartorius-training-dataset/model.h5'
tf.keras.backend.clear_session()
# enable XLA optmizations
tf.config.optimizer.set_jit(True)

model = get_model()

In [36]:
test = sub_dl.dataset.df
SUBMISSION_ROWS = []
# Iterate over all patient_id/laterality combinations groups
for idx, ((patient_id, laterality), g) in enumerate(tqdm(test.groupby(['patient_id', 'laterality']))):
    # Cancer target is mean of predicted cancer values
    cancer = 0
    # Iterate over all scans in group
    for row_idx, row in g.iterrows():
        # Load Image
        image_id = row['image_id']
        image = cv2.imread(f'/tmp/output/{patient_id}_{image_id}.png', -1)
        
        if CROP_IMAGE_2nd:
            if len(image.shape) == 2:
                image_c = crop(image)
                if min(*image_c.shape) > 100:
                    image = image_c
        
        image = cv2.resize(image, (TARGET_WIDTH, TARGET_HEIGHT))
        
        # Expand to Batch HxW -> 1xHxWx1
        image = np.expand_dims(image, [0, 3])
        # Make Prediction
        cancer += model.predict_on_batch(image).squeeze() / len(g)
                
    # Add Submission Row
    SUBMISSION_ROWS.append({
        'prediction_id': f'{patient_id}_{laterality}',
        'cancer': np.float32(cancer),
    })
    
    if np.random.rand() > 0.99:
        gc.collect()

  keepdims=keepdims, where=where)
  subok=False)
  ret, rcount, out=ret, casting='unsafe', subok=False)
100%|██████████| 656/656 [03:37<00:00,  3.01it/s]


In [37]:
sub1_std = np.std(preds)
sub1_mean = np.std(preds)
sub2_std = np.std([r['cancer'] for r in SUBMISSION_ROWS])
sub2_mean = np.std([r['cancer'] for r in SUBMISSION_ROWS])

In [38]:
submission_2 = {r['prediction_id']:sub1_std*((r['cancer']-sub2_mean)/sub2_std)+sub1_mean for r in SUBMISSION_ROWS}

In [39]:
patient_id = sub_dl.dataset.df['patient_id'].values
laterality = sub_dl.dataset.df['laterality'].values

ab_rate = 0.7459703039378771
prediction_id = [f'{i}_{j}' for i,j in  zip(patient_id, laterality)]
preds_marged = [sub_1*ab_rate + submission_2[p]*(1.0-ab_rate) for sub_1,p in zip(preds,prediction_id)]

pred_df = pd.DataFrame({'prediction_id': prediction_id, 'cancer_raw': preds_marged})

#aggregate by prediction_id , i.e. by patient_laterality
sub = pred_df.groupby('prediction_id')[['cancer_raw']].agg('mean')

# binarize predictions
th = np.quantile(sub['cancer_raw'].values,0.97935)
sub['cancer'] = (sub['cancer_raw'].values > th).astype(int)

In [40]:
sub[['cancer']].to_csv('submission.csv')

for debugging purpose we can calculate the pF1 score if we infered on the train data by setting RAM_CHECK=True in the beginning

In [41]:
if RAM_CHECK:

    def pfbeta(labels, predictions, beta):
        #official implementation
        y_true_count = 0
        ctp = 0
        cfp = 0

        for idx in range(len(labels)):
            prediction = min(max(predictions[idx], 0), 1)
            if (labels[idx]):
                y_true_count += 1
                ctp += prediction
    #             cfp += 1 - prediction #bugfix
            else:
                cfp += prediction

        beta_squared = beta * beta
        c_precision = ctp / (ctp + cfp)
        c_recall = ctp / y_true_count
        if (c_precision > 0 and c_recall > 0):
            result = (1 + beta_squared) * (c_precision * c_recall) / (beta_squared * c_precision + c_recall)
            return result
        else:
            return 0

    #aggregate by prediction_id , i.e. by patient_laterality
    test_df['prediction_id'] = test_df.apply(lambda x: f'{x.patient_id}_{x.laterality}', 1)
    test_df_gr = test_df.groupby('prediction_id')[['cancer']].agg('mean')

    # Sort both the same
    test_df_gr = test_df_gr.loc[sub.index]

    y = test_df_gr['cancer'].values#.astype(np.float32)
    y_pred = sub['cancer'].values

#     print(y.shape, y_pred.shape)

    score = pfbeta(y, y_pred, 1)
    print(th, score)

0.15779858778382835 0.8275862068965518


In [42]:
if RAM_CHECK:
    import optuna
    def opt(trial):
        ab_rate = trial.suggest_uniform('ab_rate', 0.1, 0.9)
        th_rate = trial.suggest_uniform('th_rate', 0.0, 0.1)

        preds_marged = [sub_1*ab_rate + submission_2[p]*(1.0-ab_rate) for sub_1,p in zip(preds,prediction_id)]

        pred_df = pd.DataFrame({'prediction_id': prediction_id, 'cancer_raw': preds_marged})

        #aggregate by prediction_id , i.e. by patient_laterality
        sub = pred_df.groupby('prediction_id')[['cancer_raw']].agg('mean')

        # binarize predictions
        th = np.quantile(sub['cancer_raw'].values,0.9+th_rate)
        sub['cancer'] = (sub['cancer_raw'].values > th).astype(int)

        y = test_df_gr['cancer'].values#.astype(np.float32)
        y_pred = sub['cancer'].values

        score = pfbeta(y, y_pred, 1)

        return score
    study = optuna.create_study(direction='maximize')
    study.optimize(opt, n_trials=100)
    print(study.best_params)

[32m[I 2023-02-24 13:36:20,685][0m A new study created in memory with name: no-name-016b5fda-d488-4576-979e-e7fb87aa8b9e[0m
  after removing the cwd from sys.path.
  """
[32m[I 2023-02-24 13:36:20,701][0m Trial 0 finished with value: 0.5490196078431373 and parameters: {'ab_rate': 0.5884342186079625, 'th_rate': 0.04537116607979849}. Best is trial 0 with value: 0.5490196078431373.[0m
[32m[I 2023-02-24 13:36:20,713][0m Trial 1 finished with value: 0.608695652173913 and parameters: {'ab_rate': 0.5965222439103397, 'th_rate': 0.08796399299860265}. Best is trial 1 with value: 0.608695652173913.[0m
[32m[I 2023-02-24 13:36:20,727][0m Trial 2 finished with value: 0.4545454545454545 and parameters: {'ab_rate': 0.10678073821511883, 'th_rate': 0.09080804825882223}. Best is trial 1 with value: 0.608695652173913.[0m
[32m[I 2023-02-24 13:36:20,738][0m Trial 3 finished with value: 0.42857142857142855 and parameters: {'ab_rate': 0.7526332864794275, 'th_rate': 0.016963158652849263}. Best is

{'ab_rate': 0.765419116504856, 'th_rate': 0.0813489867070476}
