# Preparation of the labeled and independent data sets for stain transfer

## Additional index structures for guidance of tile sampling at training time

In [None]:
import constants as c
import datasets.labeled
import datasets.independent
from preprocessing import compute_tile_tissue_fg_overlap


# Common parent directories of the level-0 H&E tilings: serial sections and terminal sections (dataset 208).
serial_he_level_dir = c.scratch_dir / "serial_he_tiled" / "level_0"
terminal_he_level_dir = c.scratch_dir / "dataset_208_tiled" / "he" / "level_0"

# One (serial tilings dir, terminal tilings dir) pair per tile shape: first 256x256, then 512x512.
serial_terminal_tilings_dir_pairs = [
    (serial_he_level_dir / tiling_name, terminal_he_level_dir / tiling_name)
    for tiling_name in ("shape_256_256_overlap_0_0", "shape_512_512_overlap_0_0")
]

# Tissue foreground extractors, passed positionally in the same order as the directories (serial side first,
# terminal side second).
tissue_fg_extractors = (
    datasets.independent.extract_tissue_fg,
    datasets.labeled.extract_tissue_fg,
)

for serial_tilings_dir, terminal_tilings_dir in serial_terminal_tilings_dir_pairs:
    # Note: NumPy runtime warnings are expected here. When a pair of tiles contains no tissue foreground, the
    # overlap ratio is a division by zero and yields NaN; these NaNs are replaced by an overlap ratio of zero.
    compute_tile_tissue_fg_overlap(
        serial_tilings_dir,
        terminal_tilings_dir,
        c.serial2terminal,
        *tissue_fg_extractors,
    )


In [None]:
import constants as c
from preprocessing import compute_tile_histograms


# Common parent directories of the level-0 H&E tilings: serial sections and terminal sections (dataset 208).
serial_tilings_root = c.scratch_dir / "serial_he_tiled" / "level_0"
terminal_tilings_root = c.scratch_dir / "dataset_208_tiled" / "he" / "level_0"

# Resulting order: serial 256x256, terminal 256x256, serial 512x512, terminal 512x512.
tilings_dirs = [
    tilings_root / tiling_name
    for tiling_name in ("shape_256_256_overlap_0_0", "shape_512_512_overlap_0_0")
    for tilings_root in (serial_tilings_root, terminal_tilings_root)
]


# Compute the tile histograms for every tilings directory, one directory at a time.
for tilings_dir in tilings_dirs:
    compute_tile_histograms(tilings_dir)


## Extract independent data set's tissue foregrounds

The extracted tissue masks are currently used to restrict the computation of segmentation performance metrics on stain-transferred images to their tissue foreground regions (i.e., excluding the glass slide).
Ideally, this step and the other steps that extract tissue foreground (e.g., the tile overlap computation above) should be merged for efficiency reasons.

In [None]:
import constants as c
import numpy as np
from datasets.independent import extract_tissue_fg
from logging import info
from utils import list_files


# Input: 512x512 level-0 tilings of the serial H&E images.
source_tilings_dir = (
    c.scratch_dir / "serial_he_tiled" / "level_0" / "shape_512_512_overlap_0_0"
)
# Output: mirrors the input layout (one sub-directory per tiling, one file per tile) under a separate root.
target_tilings_dir = (
    c.scratch_dir
    / "serial_he_tissue_masks_tiled"
    / "level_0"
    / "shape_512_512_overlap_0_0"
)


# Each entry matched by "*/" is treated as one tiling (presumably one sub-directory per image -- TODO confirm
# against `utils.list_files`).
source_tiling_dirs = list_files(source_tilings_dir, file_pattern="*/")
info(
    f"Extracting tissue masks from {len(source_tiling_dirs)} tilings in directory {source_tilings_dir}."
)
for source_tiling_dir in source_tiling_dirs:
    source_tile_paths = list_files(source_tiling_dir, file_extension=".npy")
    info(
        f"Extracting tissue masks from tiling {source_tiling_dir.name} consisting of {len(source_tile_paths)} tiles."
    )
    target_tiling_dir = target_tilings_dir / source_tiling_dir.name
    info(f"Tissue masks will be saved to {target_tiling_dir}.")
    # `exist_ok=True` keeps the cell re-runnable: without it, a second run (or resuming after a partial run) fails
    # with FileExistsError. Masks that already exist are simply overwritten by `np.save` below.
    target_tiling_dir.mkdir(parents=True, exist_ok=True)
    for tile_path in source_tile_paths:
        tile = np.load(tile_path)
        tissue_mask = extract_tissue_fg(tile)
        # The mask is stored under the same file name as its source tile.
        tissue_mask_save_path = target_tiling_dir / tile_path.name
        np.save(tissue_mask_save_path, tissue_mask)
info("Extracting tissue masks done.")
