# ###Outline image here###

# Table of contents

1. [Libraries & Environment](#Libraries-&-Environment)
1. [Data Preprocessing](#Data-Preprocessing)
    1. Tiling
    1. Tumor detection
1. [Training Deep Learning Models](#Training-Deep-Learning-Models)
    1. Data splitting
    1. Model and data loading
    1. Common hardware bottlenecks
    1. Real-time performance monitoring
    1. Misc.
1. [Evaluating Performance](#Evaluating-Performance)
    1. Patient-level vs. tile-level evaluation
    1. AUROC vs. accuracy
    1. On improving performance
1. [Visualizing Results](#Visualizing-Results)
    1. TODO

# Libraries & Environment

The base environment that I use can be installed using the create_conda_env.sh bash script.

NB: As of June 2021, when installing OpenSlide on Linux, it will not work correctly with some image types due to a broken dependency. (I've noticed this problem for .mrxs images in particular) In order to repair this issue, you can install version 0.40.0 of the pixman library. (Installed automatically in the create_conda_env.sh script) If you notice the slide images look like like the image below, or throw an error when you view them, try this solution.

TODO: insert image

In [5]:
import numpy as np
from openslide import OpenSlide, OpenSlideError
from pathlib import Path
from PIL import Image
from scipy import ndimage
import shutil
import tqdm
import traceback
import warnings

# Pytorch imports
import torch

# Custom imports
from library.MacenkoNormalizer import MacenkoNormalizer

# Data Preprocessing

In order to prepare the WSI images for deep learning training and inference, a number of preprocessing steps must be applied:

1. Images are broken into many small tiles (usually 256x256 microns)
1. Tiles are filtered to exclude non-tissue background regions
1. Tiles are Macenko-normalized
1. Tiles are filtered to exclude non-tumorous tissue regions

These steps are laid out in example code below. However, when applying this pipeline at scale, the implementation should include multiprocessing and/or CuPy (for Macenko normalization) as these additions provide enormous speedups.

In [6]:
MICRONS_PER_TILE = 256.

# Initialize the Macenko Normalizer
reference_img = np.array(Image.open('library/macenko_reference_img.png').convert('RGB'))
normalizer = MacenkoNormalizer()
normalizer.fit(reference_img)

# Find all WSIs and check for errors opening the file or finding the microns-per-pixel values 
base_path = Path('WSIs')
wsi_paths = base_path.rglob('*.svs')
save_paths = []
wsi_paths_to_normalize = []
total_num_tiles = 0
for wsi_path in wsi_paths:
    try:
        with OpenSlide(str(wsi_path)) as wsi:
            sub_path = Path(str(wsi_path)[len(str(base_path)) + 1:-len(wsi_path.suffix)])
            save_path = Path('tiled_WSIs') / sub_path

            if (save_path / 'Finished.txt').exists():
                print('Ignoring {}, as it has already been processed.'.format(wsi_path))
            else:
                pixels_per_tile_x = int(MICRONS_PER_TILE / float(wsi.properties['openslide.mpp-x']))
                pixels_per_tile_y = int(MICRONS_PER_TILE / float(wsi.properties['openslide.mpp-y']))
                wsi_paths_to_normalize.append(wsi_path)
                save_paths.append(save_path)
                save_path.mkdir(parents=True, exist_ok=True)
                total_num_tiles += (
                        len(range(pixels_per_tile_x, wsi.dimensions[0] - pixels_per_tile_x, pixels_per_tile_x)) *
                        len(range(pixels_per_tile_y, wsi.dimensions[1] - pixels_per_tile_y, pixels_per_tile_y)))
    except OpenSlideError:
        print('Ignoring {}, as it cannot be opened by OpenSlide.'.format(wsi_path))
    except KeyError:
        print('Ignoring {}, as it does not have a defined microns-per-pixel value'.format(wsi_path))

print(f'Masking and normalizing {total_num_tiles} tiles from {len(wsi_paths_to_normalize)} whole slide images.')

Masking and normalizing 32853 tiles from 4 whole slide images.


This function, given a whole slide image path and target save path, masks and normalizes all tissue tiles and then saves them into pngs.

In [7]:
def mask_and_normalize_wsi(wsi_path, save_path, pbar):
    num_tiles_kept = 0
    try:
        with OpenSlide(str(wsi_path)) as wsi:
            pptx = int(MICRONS_PER_TILE / float(wsi.properties['openslide.mpp-x']))
            ppty = int(MICRONS_PER_TILE / float(wsi.properties['openslide.mpp-y']))
            # Leave out border of image
            for x in range(pptx, wsi.dimensions[0] - pptx, pptx):
                for y in range(ppty, wsi.dimensions[1] - ppty, ppty):
                    tile = wsi.read_region((x, y), level=0, size=(pptx, ppty)).convert('RGB')
                    # Mask away all-white and all-black background regions
                    mask = tile.convert(mode='L').point(lut=lambda p: 220 > p > 10, mode='1')
                    mask = ndimage.binary_fill_holes(mask)
                    if np.sum(mask).astype(float) / mask.size > 0.5:
                        with warnings.catch_warnings():
                            warnings.simplefilter('ignore')
                            try:
                                # Normalize the tile
                                tile = normalizer.transform(np.array(tile))
                                tile = Image.fromarray(tile)
                                # Resize the image to 224x224
                                tile = tile.resize((224, 224), Image.LANCZOS)
                                num_tiles_kept += 1
                                filename = f'{wsi_path.stem}__x{x}_y{y}_dx{pptx}_dy{ppty}.png'
                                tile.save(save_path / filename, format='PNG')
                            except np.linalg.LinAlgError:
                                pass
                    pbar.update()
    except OpenSlideError as ex:
        print('\nUnable to process {}:'.format(wsi_path))
        print(''.join(traceback.format_exception(etype=type(ex), value=ex, tb=ex.__traceback__)))
        shutil.rmtree(save_path)
        return 0

    with open(save_path / 'Finished.txt', 'w+') as file:
        file.write('Kept and processed {} tiles.'.format(num_tiles_kept))
    return num_tiles_kept

In [10]:
assert len(wsi_paths_to_normalize) == len(save_paths)
with tqdm.tqdm(total=total_num_tiles) as pbar:
    num_tiles_kept_results = []
    for wsi_path, save_path in zip(wsi_paths_to_normalize, save_paths):
        num_tiles_kept_results.append(mask_and_normalize_wsi(wsi_path, save_path, pbar))
# Wait a moment for pbar to close
time.sleep(0.25)

for wsi_path, save_path, num_tiles_kept in enumerate(zip(wsi_paths_to_normalize, save_paths, num_tiles_kept_results)):
    print(f'{num_tiles_kept} tiles from {wsi_path} saved to {save_path}')
print(f'{sum(num_tiles_kept)} of {total_num_tiles} were saved and normalized')

100%|██████████| 32853/32853 [1:48:05<00:00,  5.07it/s]  


NameError: name 'time' is not defined

In [None]:
# For each image, check for tumor status and then log tumor status to CSV
# Then do the same for train/validation/test splitting
"""
print('Loading images for tumor detection')
img_dataset = datasets.ImageFolder(destination_folder,
                                   transforms.Compose([
                                       transforms.Resize(224),
                                       transforms.ToTensor()
                                   ]))
img_dataloader = data.DataLoader(img_dataset,
                                 batch_size=512,
                                 num_workers=8,
                                 shuffle=False,
                                 pin_memory=True)
tumor_detection_model = load_saved_model_for_inference(
    '/home/pressmi/Desktop2/Current_selected_models/tumor_detection_1x/resnet18_tumor_detection_exp9.pt',
    #'/home/pressmi/Desktop2/Current_selected_models/tumor_detection_2x/exp2_resnet18_acc0.973_loss0.067_rocauc0.999.pt',
    num_classes=2,
).to(device)

print('Getting tumor predictions for {} tiles in {} batches.'.format(
    len(img_dataset),
    len(img_dataloader)))
all_preds = []
prog_bar = ProgressBar(len(img_dataloader))
with torch.no_grad():
    for i, (inputs, _) in enumerate(img_dataloader):
        inputs = inputs.to(device, non_blocking=True)
        outputs = tumor_detection_model(inputs).cpu()
        all_preds.append(outputs)
        prog_bar.animate()
prog_bar.close()
all_preds = torch.cat(all_preds, dim=0)

non_tumorous_tiles = (all_preds.argmax(dim=1) == 0).flatten()
print('Deleting {} tiles that do not contain tumorous tissue'.format(non_tumorous_tiles.sum()))

prog_bar = ProgressBar(non_tumorous_tiles.sum(), print_freq=100)
for i, (image_path, _) in enumerate(img_dataset.samples):
    if non_tumorous_tiles[i] == True:
        os.remove(image_path)
        prog_bar.animate()
prog_bar.close()
"""

# Training Deep Learning Models

# Evaluating Performance

In [None]:
print('Testing')