# Preparation of the IF-labeled data set used for development

## Import

### Import terminal H&E + IF images into workspace

In [None]:
import constants as c
from pathlib import Path
from datasets.labeled import import_images


# Import the terminal H&E + IF images whose names are the keys of
# `c.terminal2serial` from the source location into the scratch workspace.
target_dir = c.scratch_dir.joinpath("dataset_208")
source_dir = c.terminal_he_source_dir
image_names = [*c.terminal2serial.keys()]


import_images(source_dir, target_dir, image_names=image_names)


## Reading and preprocessing

### Read and extract H&E channels at different scale levels

In [None]:
import constants as c
import utils
from datasets.labeled import extract_he


# Extract the H&E channels of every ".ome.tiff" file in the imported data set
# at several pyramid levels; results are grouped in one folder per level.
source_dir = c.scratch_dir / "dataset_208"
pyramid_levels = [0, 2, 3]
target_dir = c.scratch_dir / "dataset_208_preprocessed" / "he"


image_paths = utils.list_files(source_dir, file_extension=".ome.tiff")
for level in pyramid_levels:
    level_dir = target_dir / f"level_{level}"
    # exist_ok=True keeps the cell re-runnable (e.g. after an interrupted run)
    # and matches how the tiling cells in this notebook create their folders.
    level_dir.mkdir(parents=True, exist_ok=True)
    for image_path in image_paths:
        # The original file name is kept in full and ".npy" is appended.
        save_path = level_dir / (image_path.name + ".npy")
        extract_he(image_path, save_path, pyramid_level=level)


### Read and extract CKSOX channel at full scale

In [None]:
import constants as c
import utils
from datasets.labeled import extract_cksox


# Extract the CKSOX channel of every ".ome.tiff" file in the imported data
# set; one save path per image is passed to `extract_cksox`.
source_dir = c.scratch_dir / "dataset_208"
target_dir = c.scratch_dir / "dataset_208_preprocessed" / "cksox"


image_paths = utils.list_files(source_dir, file_extension=".ome.tiff")
# parents=True: "dataset_208_preprocessed" may not exist yet when this cell is
# run before the H&E extraction. exist_ok=True: allow re-running the cell.
target_dir.mkdir(parents=True, exist_ok=True)
for image_path in image_paths:
    # The original file name is kept in full and ".npy" is appended.
    save_path = target_dir / (image_path.name + ".npy")
    extract_cksox(image_path, save_path)


### Binarize CKSOX channel

In [None]:
import constants as c
from preprocessing import threshold_images


# Binarize the extracted CKSOX images with a single threshold value. The masks
# go into a subfolder of the CKSOX directory that is named after the threshold.
threshold = 1000
source_dir = c.scratch_dir.joinpath("dataset_208_preprocessed", "cksox")
target_dir = source_dir / f"masks_thr_{threshold}"


threshold_images(source_dir, target_dir, threshold=threshold)


### Fill holes in binarized masks

In [None]:
import constants as c
from preprocessing import fill_holes_in_masks


# Fill holes in the binarized CKSOX masks using a per-image diameter.
#
# `threshold` is defined here instead of being inherited from the binarization
# cell so that this cell also runs on its own (e.g. after a kernel restart).
# It must match the threshold that was used to create the masks.
threshold = 1000
source_dir = (
    c.scratch_dir / "dataset_208_preprocessed" / "cksox" / f"masks_thr_{threshold}"
)
# One diameter for the images whose name ends in "BR", another for those
# ending in "LU"; both values are also encoded in the output folder name.
dia_BR = 15
dia_LU = 50
diameters = {
    "H2021-192_exp2_s01_HP-224363BR.ome.tiff": dia_BR,
    "H2021-192_exp2_s02_HP-224470BR.ome.tiff": dia_BR,
    "H2021-192_exp2_s03_HP-224551BR.ome.tiff": dia_BR,
    "H2021-192_exp2_s04_HP-224388BR.ome.tiff": dia_BR,
    "H2021-192_exp2_s05_HP-82163LU.ome.tiff": dia_LU,
    "H2021-192_exp2_s06_HP-70699LU.ome.tiff": dia_LU,
    "H2021-192_exp2_s07_HP-58283LU.ome.tiff": dia_LU,
    "H2021-192_exp2_s08_HP-58289LU.ome.tiff": dia_LU,
    "H2021-192_exp2_s09_HP-58982LU.ome.tiff": dia_LU,
}
# The filled masks live in a subfolder of the masks they were derived from.
target_dir = source_dir / f"holes_filled_dia_BR_{dia_BR}_LU_{dia_LU}"


fill_holes_in_masks(source_dir, target_dir, diameters=diameters)


## Tiling #1
For the models from Schmitz et al. (2021).

### Split the H&E images at the different scale levels into tiles

In [None]:
import constants as c
from preprocessing import tile_images


# Tile the extracted H&E images of every pyramid level. Stride and anchor are
# specified in level-0 pixels and divided by 2**level for the respective level,
# while the tile shape in pixels is used as given at every level.
source_dir = c.scratch_dir.joinpath("dataset_208_preprocessed", "he")
target_dir = c.scratch_dir.joinpath("dataset_208_tiled", "he")
pyramid_levels = [0, 2, 3]
tile_shapes = [(512, 512), (512, 512), (512, 512)]
stride = (512, 512)


for level, tile_shape in zip(pyramid_levels, tile_shapes):
    level_dir = target_dir / f"level_{level}"
    level_dir.mkdir(parents=True, exist_ok=True)

    # The folder name carries the level-0 stride as "shape" and, as "overlap",
    # half of the amount by which the tile exceeds that stride per axis.
    overlap = tuple((side - step) // 2 for side, step in zip(tile_shape, stride))
    tiling_name = "shape_{}_{}_overlap_{}_{}".format(*stride, *overlap)
    tiling_dir = level_dir / tiling_name

    # Per-level stride, and an anchor of half that stride.
    downscale = 2**level
    stride_y, stride_x = stride[0] // downscale, stride[1] // downscale
    anchor_y, anchor_x = stride[0] // (2 * downscale), stride[1] // (2 * downscale)

    tile_images(
        source_dir / f"level_{level}",
        tiling_dir,
        tile_shape,
        (anchor_y, anchor_x),
        (stride_y, stride_x),
    )


### Split the full-resolution CKSOX masks into tiles

In [None]:
import constants as c
from preprocessing import tile_images


# Tile the hole-filled, full-resolution CKSOX masks with the same 512 x 512
# shape and stride as the level-0 H&E tiling of "Tiling #1".
mask_subpath = ("cksox", "masks_thr_1000", "holes_filled_dia_BR_15_LU_50")
source_dir = c.scratch_dir.joinpath("dataset_208_preprocessed", *mask_subpath)
target_dir = c.scratch_dir.joinpath("dataset_208_tiled", *mask_subpath)
tile_shape = (512, 512)
stride = (512, 512)


level_dir = target_dir / "level_0"
level_dir.mkdir(parents=True, exist_ok=True)

# Folder name: stride as "shape", half the tile/stride difference as "overlap".
overlap = tuple((side - step) // 2 for side, step in zip(tile_shape, stride))
tiling_dir = level_dir / "shape_{}_{}_overlap_{}_{}".format(*stride, *overlap)

# The anchor is half a stride along each axis.
stride_y, stride_x = stride
anchor_y, anchor_x = stride_y // 2, stride_x // 2

tile_images(
    source_dir, tiling_dir, tile_shape, (anchor_y, anchor_x), (stride_y, stride_x)
)


## Index structures for guidance of tile sampling at training time

### Create lookup tables for tissue foreground-to-background ratio

In [None]:
import constants as c
import numpy as np
from datasets.labeled import extract_tissue_fg
from preprocessing import compute_tile_statistics


# Level-0 H&E tiling written by the "Tiling #1" section (512 x 512, overlap 0).
tilings_dir = c.scratch_dir.joinpath(
    "dataset_208_tiled", "he", "level_0", "shape_512_512_overlap_0_0"
)


def compute_tissue_fg_ratio(tile: np.ndarray) -> float:
    """Return the tissue foreground ratio of a single H&E tile.

    The ratio is the sum of the mask returned by `extract_tissue_fg` divided
    by the number of mask elements.

    Args:
        tile: Tile with a floating-point dtype (enforced by an assertion).

    Returns:
        Mask sum divided by mask size.
    """
    is_floating_tile = np.issubdtype(tile.dtype, np.floating)
    assert is_floating_tile
    foreground = extract_tissue_fg(tile)
    foreground_sum = foreground.sum()
    return foreground_sum / float(foreground.size)


# Run `compute_tissue_fg_ratio` over the tiling. Presumably one value per tile
# is stored under the name "tissue_fg_ratios" -- confirm in
# `preprocessing.compute_tile_statistics`.
compute_tile_statistics(tilings_dir, "tissue_fg_ratios", compute_tissue_fg_ratio)


### Create lookup tables for label foreground-to-background ratio

In [None]:
import constants as c
import numpy as np
from preprocessing import compute_tile_statistics


# Level-0 CKSOX mask tiling written by the "Tiling #1" section
# (512 x 512, overlap 0).
tilings_dir = c.scratch_dir.joinpath(
    "dataset_208_tiled",
    "cksox",
    "masks_thr_1000",
    "holes_filled_dia_BR_15_LU_50",
    "level_0",
    "shape_512_512_overlap_0_0",
)


def compute_label_fg_ratio(tile: np.ndarray) -> float:
    """Return the fraction of True elements in a boolean label tile.

    Args:
        tile: Mask tile of dtype bool (enforced by an assertion).

    Returns:
        Number of True elements divided by the total number of elements.
    """
    assert tile.dtype == "bool"
    foreground_count = tile.sum()
    element_count = float(tile.size)
    return foreground_count / element_count


# Run `compute_label_fg_ratio` over the tiling. Presumably one value per tile
# is stored under the name "label_fg_ratios" -- confirm in
# `preprocessing.compute_tile_statistics`.
compute_tile_statistics(tilings_dir, "label_fg_ratios", compute_label_fg_ratio)


## Tiling #2
For the model from Bulten et al. (2019).

### Split the full-resolution H&E images into tiles

In [None]:
import constants as c
from preprocessing import tile_images


# Tile the full-resolution (level 0) H&E images into 1024 x 1024 tiles with a
# stride equal to the tile size.
source_dir = c.scratch_dir.joinpath("dataset_208_preprocessed", "he")
target_dir = c.scratch_dir.joinpath("dataset_208_tiled", "he")
tile_shape = (1024, 1024)
stride = (1024, 1024)


level_dir = target_dir / "level_0"
level_dir.mkdir(parents=True, exist_ok=True)

# Folder name: stride as "shape", half the tile/stride difference as "overlap".
overlap = tuple((side - step) // 2 for side, step in zip(tile_shape, stride))
tiling_dir = level_dir / "shape_{}_{}_overlap_{}_{}".format(*stride, *overlap)

# The anchor is half a stride along each axis.
stride_y, stride_x = stride
anchor_y, anchor_x = stride_y // 2, stride_x // 2

tile_images(
    source_dir / "level_0",
    tiling_dir,
    tile_shape,
    (anchor_y, anchor_x),
    (stride_y, stride_x),
)


### Split the full-resolution CKSOX masks into tiles

In [None]:
import constants as c
from preprocessing import tile_images


# Tile the hole-filled, full-resolution CKSOX masks with the same 1024 x 1024
# shape and stride as the H&E tiling of "Tiling #2".
mask_subpath = ("cksox", "masks_thr_1000", "holes_filled_dia_BR_15_LU_50")
source_dir = c.scratch_dir.joinpath("dataset_208_preprocessed", *mask_subpath)
target_dir = c.scratch_dir.joinpath("dataset_208_tiled", *mask_subpath)
tile_shape = (1024, 1024)
stride = (1024, 1024)


level_dir = target_dir / "level_0"
level_dir.mkdir(parents=True, exist_ok=True)

# Folder name: stride as "shape", half the tile/stride difference as "overlap".
overlap = tuple((side - step) // 2 for side, step in zip(tile_shape, stride))
tiling_dir = level_dir / "shape_{}_{}_overlap_{}_{}".format(*stride, *overlap)

# The anchor is half a stride along each axis.
stride_y, stride_x = stride
anchor_y, anchor_x = stride_y // 2, stride_x // 2

tile_images(
    source_dir, tiling_dir, tile_shape, (anchor_y, anchor_x), (stride_y, stride_x)
)


## Index structures for guidance of tile sampling

### Create lookup tables for tissue foreground-to-background ratio

In [None]:
import constants as c
import numpy as np
from datasets.labeled import extract_tissue_fg
from preprocessing import compute_tile_statistics


# Level-0 H&E tiling written by the "Tiling #2" section (1024 x 1024, overlap 0).
tilings_dir = c.scratch_dir.joinpath(
    "dataset_208_tiled", "he", "level_0", "shape_1024_1024_overlap_0_0"
)


def compute_tissue_fg_ratio(tile: np.ndarray) -> float:
    """Return the tissue foreground ratio of a single H&E tile.

    The ratio is the sum of the mask returned by `extract_tissue_fg` divided
    by the number of mask elements.

    Args:
        tile: Tile with a floating-point dtype (enforced by an assertion).

    Returns:
        Mask sum divided by mask size.
    """
    is_floating_tile = np.issubdtype(tile.dtype, np.floating)
    assert is_floating_tile
    foreground = extract_tissue_fg(tile)
    foreground_sum = foreground.sum()
    return foreground_sum / float(foreground.size)


# Run `compute_tissue_fg_ratio` over the tiling. Presumably one value per tile
# is stored under the name "tissue_fg_ratios" -- confirm in
# `preprocessing.compute_tile_statistics`.
compute_tile_statistics(tilings_dir, "tissue_fg_ratios", compute_tissue_fg_ratio)


### Create lookup tables for label foreground-to-background ratio

In [None]:
import constants as c
import numpy as np
from preprocessing import compute_tile_statistics


# Level-0 CKSOX mask tiling written by the "Tiling #2" section
# (1024 x 1024, overlap 0).
tilings_dir = c.scratch_dir.joinpath(
    "dataset_208_tiled",
    "cksox",
    "masks_thr_1000",
    "holes_filled_dia_BR_15_LU_50",
    "level_0",
    "shape_1024_1024_overlap_0_0",
)


def compute_label_fg_ratio(tile: np.ndarray) -> float:
    """Return the fraction of True elements in a boolean label tile.

    Args:
        tile: Mask tile of dtype bool (enforced by an assertion).

    Returns:
        Number of True elements divided by the total number of elements.
    """
    assert tile.dtype == "bool"
    foreground_count = tile.sum()
    element_count = float(tile.size)
    return foreground_count / element_count


# Run `compute_label_fg_ratio` over the tiling. Presumably one value per tile
# is stored under the name "label_fg_ratios" -- confirm in
# `preprocessing.compute_tile_statistics`.
compute_tile_statistics(tilings_dir, "label_fg_ratios", compute_label_fg_ratio)


## Tiling #3
For some stain transfer experiments.

### Split the full-resolution H&E images into tiles

In [None]:
import constants as c
from preprocessing import tile_images


# Tile the full-resolution (level 0) H&E images into 256 x 256 tiles with a
# stride equal to the tile size.
source_dir = c.scratch_dir / "dataset_208_preprocessed" / "he"
pyramid_levels = [0]
# Exactly one tile shape per entry of `pyramid_levels`. Surplus entries would
# be dropped silently by zip() below, so the list is kept in sync instead.
tile_shapes = [(256, 256)]
stride = (256, 256)
target_dir = c.scratch_dir / "dataset_208_tiled" / "he"


if len(tile_shapes) != len(pyramid_levels):
    raise ValueError("tile_shapes must contain exactly one entry per pyramid level")

for level, tile_shape in zip(pyramid_levels, tile_shapes):
    level_dir = target_dir / f"level_{level}"
    level_dir.mkdir(parents=True, exist_ok=True)
    # The folder name carries the level-0 stride as "shape" and, as "overlap",
    # half of the amount by which the tile exceeds that stride per axis.
    overlap = (tile_shape[0] - stride[0]) // 2, (tile_shape[1] - stride[1]) // 2
    tiling_dir = (
        level_dir / f"shape_{stride[0]}_{stride[1]}_overlap_{overlap[0]}_{overlap[1]}"
    )

    # Stride is given in level-0 pixels and divided by 2**level for this
    # level; the anchor is half of the resulting stride.
    anchor_y = stride[0] // 2 ** (level + 1)
    anchor_x = stride[1] // 2 ** (level + 1)
    stride_y = stride[0] // 2**level
    stride_x = stride[1] // 2**level

    tile_images(
        source_dir / f"level_{level}",
        tiling_dir,
        tile_shape,
        (anchor_y, anchor_x),
        (stride_y, stride_x),
    )


### Split the full-resolution CKSOX masks into tiles

In [None]:
import constants as c
from preprocessing import tile_images


# Tile the hole-filled, full-resolution CKSOX masks with the same 256 x 256
# shape and stride as the H&E tiling of "Tiling #3".
mask_subpath = ("cksox", "masks_thr_1000", "holes_filled_dia_BR_15_LU_50")
source_dir = c.scratch_dir.joinpath("dataset_208_preprocessed", *mask_subpath)
target_dir = c.scratch_dir.joinpath("dataset_208_tiled", *mask_subpath)
tile_shape = (256, 256)
stride = (256, 256)


level_dir = target_dir / "level_0"
level_dir.mkdir(parents=True, exist_ok=True)

# Folder name: stride as "shape", half the tile/stride difference as "overlap".
overlap = tuple((side - step) // 2 for side, step in zip(tile_shape, stride))
tiling_dir = level_dir / "shape_{}_{}_overlap_{}_{}".format(*stride, *overlap)

# The anchor is half a stride along each axis.
stride_y, stride_x = stride
anchor_y, anchor_x = stride_y // 2, stride_x // 2

tile_images(
    source_dir, tiling_dir, tile_shape, (anchor_y, anchor_x), (stride_y, stride_x)
)


## Index structures for guidance of tile sampling

### Create lookup tables for tissue foreground-to-background ratio

In [None]:
import constants as c
import numpy as np
from datasets.labeled import extract_tissue_fg
from preprocessing import compute_tile_statistics


# Level-0 H&E tiling written by the "Tiling #3" section (256 x 256, overlap 0).
tilings_dir = c.scratch_dir.joinpath(
    "dataset_208_tiled", "he", "level_0", "shape_256_256_overlap_0_0"
)


def compute_tissue_fg_ratio(tile: np.ndarray) -> float:
    """Return the tissue foreground ratio of a single H&E tile.

    The ratio is the sum of the mask returned by `extract_tissue_fg` divided
    by the number of mask elements.

    Args:
        tile: Tile with a floating-point dtype (enforced by an assertion).

    Returns:
        Mask sum divided by mask size.
    """
    is_floating_tile = np.issubdtype(tile.dtype, np.floating)
    assert is_floating_tile
    foreground = extract_tissue_fg(tile)
    foreground_sum = foreground.sum()
    return foreground_sum / float(foreground.size)


# Run `compute_tissue_fg_ratio` over the tiling. Presumably one value per tile
# is stored under the name "tissue_fg_ratios" -- confirm in
# `preprocessing.compute_tile_statistics`.
compute_tile_statistics(tilings_dir, "tissue_fg_ratios", compute_tissue_fg_ratio)


## Tiling #4

For stain transfer (inference).

In [None]:
import constants as c
from preprocessing import tile_images


# Tile the full-resolution (level 0) H&E images into 2048 x 2048 tiles placed
# every 512 pixels, i.e. each tile exceeds its 512 x 512 stride cell by 768
# pixels on every side.
source_dir = c.scratch_dir / "dataset_208_preprocessed" / "he"
pyramid_levels = [0]
# Exactly one tile shape per entry of `pyramid_levels`. Surplus entries would
# be dropped silently by zip() below, so the list is kept in sync instead.
tile_shapes = [(2048, 2048)]
stride = (512, 512)
target_dir = c.scratch_dir / "dataset_208_tiled" / "he"


if len(tile_shapes) != len(pyramid_levels):
    raise ValueError("tile_shapes must contain exactly one entry per pyramid level")

for level, tile_shape in zip(pyramid_levels, tile_shapes):
    level_dir = target_dir / f"level_{level}"
    level_dir.mkdir(parents=True, exist_ok=True)
    # The folder name carries the level-0 stride as "shape" and, as "overlap",
    # half of the amount by which the tile exceeds that stride per axis.
    overlap = (tile_shape[0] - stride[0]) // 2, (tile_shape[1] - stride[1]) // 2
    tiling_dir = (
        level_dir / f"shape_{stride[0]}_{stride[1]}_overlap_{overlap[0]}_{overlap[1]}"
    )

    # Stride is given in level-0 pixels and divided by 2**level for this
    # level; the anchor is half of the resulting stride.
    anchor_y = stride[0] // 2 ** (level + 1)
    anchor_x = stride[1] // 2 ** (level + 1)
    stride_y = stride[0] // 2**level
    stride_x = stride[1] // 2**level

    tile_images(
        source_dir / f"level_{level}",
        tiling_dir,
        tile_shape,
        (anchor_y, anchor_x),
        (stride_y, stride_x),
    )