Connected to provPath (Python 3.1.-1)

In [1]:
import sys

if __name__ == '__main__':
    sys.path.append(os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../..')))

import functools
import logging
import shutil
import tempfile
import traceback
import warnings
from pathlib import Path
from typing import Any, Dict, Iterable, Optional, Sequence, Tuple, Union

import numpy as np
import pandas as pd
import PIL
from matplotlib import collections, patches, pyplot as plt
from monai.data import Dataset
from monai.data.wsi_reader import WSIReader
from openslide import OpenSlide
from tqdm import tqdm
from gigapath.preprocessing.data import tiling
from gigapath.preprocessing.data.foreground_segmentation import LoadROId, segment_foreground



def select_tiles(foreground_mask: np.ndarray, occupancy_threshold: float) \
        -> Tuple[np.ndarray, np.ndarray]:
    """Exclude tiles that are mostly background based on estimated occupancy.

    :param foreground_mask: Boolean array of shape (*, H, W).
    :param occupancy_threshold: Tiles with lower occupancy (between 0 and 1) will be discarded.
    :return: A tuple containing which tiles were selected and the estimated occupancies. These will
    be boolean and float arrays of shape (*,), or scalars if `foreground_mask` is a single tile.
    """
    if occupancy_threshold < 0. or occupancy_threshold > 1.:
        raise ValueError("Tile occupancy threshold must be between 0 and 1")
    occupancy = foreground_mask.mean(axis=(-2, -1), dtype=np.float16)
    return (occupancy > occupancy_threshold).squeeze(), occupancy.squeeze()  # type: ignore


def get_tile_descriptor(tile_location: Sequence[int]) -> str:
    """Format the XY tile coordinates into a tile descriptor."""
    return f"{tile_location[0]:05d}x_{tile_location[1]:05d}y"


def get_tile_id(slide_id: str, tile_location: Sequence[int]) -> str:
    """Format the slide ID and XY tile coordinates into a unique tile ID."""
    return f"{slide_id}.{get_tile_descriptor(tile_location)}"


def save_image(array_chw: np.ndarray, path: Path) -> PIL.Image:
    """Save an image array in (C, H, W) format to disk."""
    path.parent.mkdir(parents=True, exist_ok=True)
    array_hwc = np.moveaxis(array_chw, 0, -1).astype(np.uint8).squeeze()
    pil_image = PIL.Image.fromarray(array_hwc)
    pil_image.convert('RGB').save(path)
    return pil_image


def check_empty_tiles(tiles: np.ndarray, std_th: int = 5, extreme_value_portion_th: float = 0.5) -> np.ndarray:
    """Determine if a tile is empty. Hacky.

    :param tiles: The tile array in (N, C, H, W) format.
    :return: Boolean array of shape (N,).
    """
    # calculate standard deviation of rgb image
    b, c, h, w = tiles.shape
    flattned_tiles = tiles.reshape(b, c, h * w)

    std_rgb = flattned_tiles[:, :, :].std(axis=2)
    std_rgb_mean = std_rgb.mean(axis=1)

    low_std_mask = std_rgb_mean < std_th

    # count 0 pixel values
    extreme_value_count = ((flattned_tiles == 0)).sum(axis=2)
    extreme_value_proportion = extreme_value_count / (h * w)
    extreme_value_mask = extreme_value_proportion.max(axis=1) > extreme_value_portion_th

    return low_std_mask | extreme_value_mask


def generate_tiles(slide_image: np.ndarray, tile_size: int, foreground_threshold: float,
                   occupancy_threshold: float) -> Tuple[np.ndarray, np.ndarray, np.ndarray, int]:
    """Split the foreground of an input slide image into tiles.

    :param slide_image: The RGB image array in (C, H, W) format.
    :param tile_size: Lateral dimensions of each tile, in pixels.
    :param foreground_threshold: Luminance threshold (0 to 255) to determine tile occupancy.
    :param occupancy_threshold: Threshold (between 0 and 1) to determine empty tiles to discard.
    :return: A tuple containing the image tiles (N, C, H, W), tile coordinates (N, 2), occupancies
    (N,), and total number of discarded empty tiles.
    """
    image_tiles, tile_locations = tiling.tile_array_2d(slide_image, tile_size=tile_size,
                                                       constant_values=255)
    logging.info(f"image_tiles.shape: {image_tiles.shape}, dtype: {image_tiles.dtype}")
    logging.info(f"Tiled {slide_image.shape} to {image_tiles.shape}")
    foreground_mask, _ = segment_foreground(image_tiles, foreground_threshold)
    selected, occupancies = select_tiles(foreground_mask, occupancy_threshold)
    n_discarded = (~selected).sum()
    logging.info(f"Percentage tiles discarded: {n_discarded / len(selected) * 100:.2f}")

    # FIXME: this uses too much memory
    # empty_tile_bool_mask = check_empty_tiles(image_tiles)
    # selected = selected & (~empty_tile_bool_mask)
    # n_discarded = (~selected).sum()
    # logging.info(f"Percentage tiles discarded after filtering empty tiles: {n_discarded / len(selected) * 100:.2f}")

    # logging.info(f"Before filtering: min y: {tile_locations[:, 0].min()}, max y: {tile_locations[:, 0].max()}, min x: {tile_locations[:, 1].min()}, max x: {tile_locations[:, 1].max()}")

    image_tiles = image_tiles[selected]
    tile_locations = tile_locations[selected]
    occupancies = occupancies[selected]

    if len(tile_locations) == 0:
        logging.warn("No tiles selected")
    else:
        logging.info(f"After filtering: min y: {tile_locations[:, 0].min()}, max y: {tile_locations[:, 0].max()}, min x: {tile_locations[:, 1].min()}, max x: {tile_locations[:, 1].max()}")

    return image_tiles, tile_locations, occupancies, n_discarded


def get_tile_info(sample: Dict["SlideKey", Any], occupancy: float, tile_location: Sequence[int],
                  rel_slide_dir: Path) -> Dict["TileKey", Any]:
    """Map slide information and tiling outputs into tile-specific information dictionary.

    :param sample: Slide dictionary.
    :param occupancy: Estimated tile foreground occuppancy.
    :param tile_location: Tile XY coordinates.
    :param rel_slide_dir: Directory where tiles are saved, relative to dataset root.
    :return: Tile information dictionary.
    """
    slide_id = sample["slide_id"]
    descriptor = get_tile_descriptor(tile_location)
    rel_image_path = f"{rel_slide_dir}/{descriptor}.png"

    tile_info = {
        "slide_id": slide_id,
        "tile_id": get_tile_id(slide_id, tile_location),
        "image": rel_image_path,
        "label": sample.get("label", None),
        "tile_x": tile_location[0],
        "tile_y": tile_location[1],
        "occupancy": occupancy,
        "metadata": {"slide_" + key: value for key, value in sample["metadata"].items()}
    }

    return tile_info


def format_csv_row(tile_info: Dict["TileKey", Any], keys_to_save: Iterable["TileKey"],
                   metadata_keys: Iterable[str]) -> str:
    """Format tile information dictionary as a row to write to a dataset CSV tile.

    :param tile_info: Tile information dictionary.
    :param keys_to_save: Which main keys to include in the row, and in which order.
    :param metadata_keys: Likewise for metadata keys.
    :return: The formatted CSV row.
    """
    tile_slide_metadata = tile_info.pop("metadata")
    fields = [str(tile_info[key]) for key in keys_to_save]
    fields.extend(str(tile_slide_metadata[key]) for key in metadata_keys)
    dataset_row = ','.join(fields)
    return dataset_row


def load_image_dict(sample: dict, level: int, margin: int, foreground_threshold: Optional[float] = None) -> Dict["SlideKey", Any]:
    """
    Load image from metadata dictionary
    :param sample: dict describing image metadata. Example:
        {'image_id': ['1ca999adbbc948e69783686e5b5414e4'],
        'image': ['/tmp/datasets/PANDA/train_images/1ca999adbbc948e69783686e5b5414e4.tiff'],
         'mask': ['/tmp/datasets/PANDA/train_label_masks/1ca999adbbc948e69783686e5b5414e4_mask.tiff'],
         'data_provider': ['karolinska'],
         'isup_grade': tensor([0]),
         'gleason_score': ['0+0']}
    :param level: level of resolution to be loaded
    :param margin: margin to be included
    :return: a dict containing the image data and metadata
    """
    loader = LoadROId(WSIReader(backend="OpenSlide"), level=level, margin=margin,
                      foreground_threshold=foreground_threshold)
    img = loader(sample)

    return img


def save_thumbnail(slide_path, output_path, size_target=1024):
    with OpenSlide(str(slide_path)) as openslide_obj:
        scale = size_target / max(openslide_obj.dimensions)
        thumbnail = openslide_obj.get_thumbnail([int(m * scale) for m in openslide_obj.dimensions])
        thumbnail.save(output_path)
        logging.info(f"Saving thumbnail {output_path}, shape {thumbnail.size}")


def visualize_tile_locations(slide_sample, output_path, tile_info_list, tile_size, origin_offset):
    # check slide_image size. should be thumbnail size?
    slide_image = slide_sample["image"]
    downscale_factor = slide_sample["scale"]

    fig, ax = plt.subplots()
    ax.imshow(slide_image.transpose(1, 2, 0))
    rects = []
    for tile_info in tile_info_list:
        # change coordinate to the current level from level-0
        # tile location is in the original image cooridnate, while the slide image is after selecting ROI
        xy = ((tile_info["tile_x"] - origin_offset[0]) / downscale_factor,
              (tile_info["tile_y"] - origin_offset[1]) / downscale_factor)
        rects.append(patches.Rectangle(xy, tile_size, tile_size))
    pc = collections.PatchCollection(rects, match_original=True, alpha=0.5, edgecolor="black")
    pc.set_array(np.array([100] * len(tile_info_list)))
    ax.add_collection(pc)
    fig.savefig(output_path)
    plt.close()


def is_already_processed(output_tiles_dir):
    if not output_tiles_dir.exists():
        return False

    if len(list(output_tiles_dir.glob("*.png"))) == 0:
        return False

    dataset_csv_path = output_tiles_dir / "dataset.csv"
    try:
        df = pd.read_csv(dataset_csv_path)
    except:
        return False

    return len(df) > 0


def process_slide(sample: Dict["SlideKey", Any], level: int, margin: int, tile_size: int,
                  foreground_threshold: Optional[float], occupancy_threshold: float, output_dir: Path,
                  thumbnail_dir: Path,
                  tile_progress: bool = False) -> str:
    """Load and process a slide, saving tile images and information to a CSV file.

    :param sample: Slide information dictionary, returned by the input slide dataset.
    :param level: Magnification level at which to process the slide.
    :param margin: Margin around the foreground bounding box, in pixels at lowest resolution.
    :param tile_size: Lateral dimensions of each tile, in pixels.
    :param foreground_threshold: Luminance threshold (0 to 255) to determine tile occupancy.
    If `None` (default), an optimal threshold will be estimated automatically.
    :param occupancy_threshold: Threshold (between 0 and 1) to determine empty tiles to discard.
    :param output_dir: Root directory for the output dataset; outputs for a single slide will be
    saved inside `output_dir/slide_id/`.
    :param tile_progress: Whether to display a progress bar in the terminal.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    thumbnail_dir.mkdir(parents=True, exist_ok=True)
    slide_metadata: Dict[str, Any] = sample["metadata"]
    keys_to_save = ("slide_id", "tile_id", "image", "label",
                    "tile_x", "tile_y", "occupancy")
    metadata_keys = tuple("slide_" + key for key in slide_metadata)
    csv_columns: Tuple[str, ...] = (*keys_to_save, *metadata_keys)
    print(csv_columns)
    slide_id: str = sample["slide_id"]
    rel_slide_dir = Path(slide_id)
    output_tiles_dir = output_dir / rel_slide_dir
    logging.info(f">>> Slide dir {output_tiles_dir}")
    if is_already_processed(output_tiles_dir):
        logging.info(f">>> Skipping {output_tiles_dir} - already processed")
        return output_tiles_dir

    else:
        output_tiles_dir.mkdir(parents=True, exist_ok=True)
        dataset_csv_path = output_tiles_dir / "dataset.csv"
        dataset_csv_file = dataset_csv_path.open('w')
        dataset_csv_file.write(','.join(csv_columns) + '\n')  # write CSV header

        n_failed_tiles = 0
        failed_tiles_csv_path = output_tiles_dir / "failed_tiles.csv"
        failed_tiles_file = failed_tiles_csv_path.open('w')
        failed_tiles_file.write('tile_id' + '\n')

        slide_image_path = Path(sample["image"])
        logging.info(f"Loading slide {slide_id} ...\nFile: {slide_image_path}")

        # Somehow it's very slow on Datarbicks
        # hack: copy the slide file to a temporary directory
        tmp_dir = tempfile.TemporaryDirectory()
        tmp_slide_image_path = Path(tmp_dir.name) / slide_image_path.name
        logging.info(f">>> Copying {slide_image_path} to {tmp_slide_image_path}")
        shutil.copy(slide_image_path, tmp_slide_image_path)
        sample["image"] = tmp_slide_image_path
        logging.info(f">>> Finished copying {slide_image_path} to {tmp_slide_image_path}")

        # Save original slide thumbnail
        save_thumbnail(slide_image_path, thumbnail_dir / (slide_image_path.name + "_original.png"))

        loader = LoadROId(WSIReader(backend="OpenSlide"), level=level, margin=margin,
                          foreground_threshold=foreground_threshold)
        sample = loader(sample)  # load 'image' from disk

        # Save ROI thumbnail
        slide_image = sample["image"]
        plt.figure()
        plt.imshow(slide_image.transpose(1, 2, 0))
        plt.savefig(thumbnail_dir / (slide_image_path.name + "_roi.png"))
        plt.close()
        logging.info(f"Saving thumbnail {thumbnail_dir / (slide_image_path.name + '_roi.png')}, shape {slide_image.shape}")

        logging.info(f"Tiling slide {slide_id} ...")
        image_tiles, rel_tile_locations, occupancies, _ = \
            generate_tiles(sample["image"], tile_size,
                            sample["foreground_threshold"],
                            occupancy_threshold)

        # origin in level-0 coordinate
        # location in the current level coordiante
        # tile_locations in level-0 coordinate
        tile_locations = (sample["scale"] * rel_tile_locations
                            + sample["origin"]).astype(int)  # noqa: W503

        n_tiles = image_tiles.shape[0]
        logging.info(f"{n_tiles} tiles found")

        tile_info_list = []

        logging.info(f"Saving tiles for slide {slide_id} ...")
        for i in tqdm(range(n_tiles), f"Tiles ({slide_id[:6]}…)", unit="img", disable=not tile_progress):
            try:
                tile_info = get_tile_info(sample, occupancies[i], tile_locations[i], rel_slide_dir)
                tile_info_list.append(tile_info)

                save_image(image_tiles[i], output_dir / tile_info["image"])
                dataset_row = format_csv_row(tile_info, keys_to_save, metadata_keys)
                dataset_csv_file.write(dataset_row + '\n')
            except Exception as e:
                n_failed_tiles += 1
                descriptor = get_tile_descriptor(tile_locations[i])
                failed_tiles_file.write(descriptor + '\n')
                traceback.print_exc()
                warnings.warn(f"An error occurred while saving tile "
                                f"{get_tile_id(slide_id, tile_locations[i])}: {e}")

        dataset_csv_file.close()
        failed_tiles_file.close()

        # tile location overlay
        visualize_tile_locations(sample, thumbnail_dir / (slide_image_path.name + "_roi_tiles.png"), tile_info_list, tile_size, origin_offset=sample["origin"])

        if n_failed_tiles > 0:
            # TODO what we want to do with slides that have some failed tiles?
            logging.warning(f"{slide_id} is incomplete. {n_failed_tiles} tiles failed.")

        logging.info(f"Finished processing slide {slide_id}")

        return output_tiles_dir


def merge_dataset_csv_files(dataset_dir: Path) -> Path:
    """Combines all "*/dataset.csv" files into a single "dataset.csv" file in the given directory."""
    full_csv = dataset_dir / "dataset.csv"
    # TODO change how we retrieve these filenames, probably because mounted, the operation is slow
    #  and it seems to find many more files
    # print("List of files")
    # print([str(file) + '\n' for file in dataset_dir.glob("*/dataset.csv")])
    with full_csv.open('w') as full_csv_file:
        # full_csv_file.write(','.join(CSV_COLUMNS) + '\n')  # write CSV header
        first_file = True
        for slide_csv in tqdm(dataset_dir.glob("*/dataset.csv"), desc="Merging dataset.csv", unit='file'):
            logging.info(f"Merging slide {slide_csv}")
            content = slide_csv.read_text()
            if not first_file:
                content = content[content.index('\n') + 1:]  # discard header row for all but the first file
            full_csv_file.write(content)
            first_file = False
    return full_csv


# def main(slides_dataset: "SlidesDataset", root_output_dir: Union[str, Path],
#          level: int, tile_size: int, margin: int, foreground_threshold: Optional[float],
#          occupancy_threshold: float, parallel: bool = False, overwrite: bool = False,
#          n_slides: Optional[int] = None) -> None:
#     """Process a slides dataset to produce a tiles dataset.

#     :param slides_dataset: Input tiles dataset object.
#     :param root_output_dir: The root directory of the output tiles dataset.
#     :param level: Magnification level at which to process the slide.
#     :param tile_size: Lateral dimensions of each tile, in pixels.
#     :param margin: Margin around the foreground bounding box, in pixels at lowest resolution.
#     :param foreground_threshold: Luminance threshold (0 to 255) to determine tile occupancy.
#     If `None` (default), an optimal threshold will be estimated automatically.
#     :param occupancy_threshold: Threshold (between 0 and 1) to determine empty tiles to discard.
#     :param parallel: Whether slides should be processed in parallel with multiprocessing.
#     :param overwrite: Whether to overwrite an existing output tiles dataset. If `True`, will delete
#     and recreate `root_output_dir`, otherwise will resume by skipping already processed slides.
#     :param n_slides: If given, limit the total number of slides for debugging.
#     """

#     # Ignoring some types here because mypy is getting confused with the MONAI Dataset class
#     # to select a subsample use keyword n_slides
#     dataset = Dataset(slides_dataset)[:n_slides]  # type: ignore

#     # make sure all slide files exist in the image dir
#     for sample in dataset:
#         image_path = Path(sample["image_path"])
#         assert image_path.exists(), f"{image_path} doesn't exist"

#     output_dir = Path(root_output_dir)
#     logging.info(f"Creating dataset of level-{level} {tile_size}x{tile_size} "
#                  f"{slides_dataset.__class__.__name__} tiles at: {output_dir}")

#     if overwrite and output_dir.exists():
#         shutil.rmtree(output_dir)
#     output_dir.mkdir(parents=True, exist_ok=not overwrite)
#     thumbnail_dir = output_dir / "thumbnails"
#     thumbnail_dir.mkdir(exist_ok=True)
#     logging.info(f"Thumbnail directory: {thumbnail_dir}")

#     func = functools.partial(process_slide, level=level, margin=margin, tile_size=tile_size,
#                              foreground_threshold=foreground_threshold,
#                              occupancy_threshold=occupancy_threshold, output_dir=output_dir,
#                              thumbnail_dir=thumbnail_dir,
#                              tile_progress=not parallel)

#     if parallel:
#         import multiprocessing

#         pool = multiprocessing.Pool()
#         map_func = pool.imap_unordered  # type: ignore
#     else:
#         map_func = map  # type: ignore

#     list(tqdm(map_func(func, dataset), desc="Slides", unit="img", total=len(dataset)))  # type: ignore

#     if parallel:
#         pool.close()

#     logging.info("Merging slide files in a single file")
#     merge_dataset_csv_files(output_dir)




################   My code ##############

import openslide
import numpy as np

def compute_tile_size(slide_path, patch_size_target, upp_target):
    slide = openslide.OpenSlide(slide_path)
    
    # Check for MPP information
    mpp_x = slide.properties.get('openslide.mpp-x')
    mpp_y = slide.properties.get('openslide.mpp-y')
    
    if mpp_x and mpp_y:
        mpp_x = float(mpp_x)
        mpp_y = float(mpp_y)
    else:
        # Check for resolution information
        x_resolution = slide.properties.get('tiff.XResolution')
        y_resolution = slide.properties.get('tiff.YResolution')
        resolution_unit = slide.properties.get('tiff.ResolutionUnit')
        
        if x_resolution and y_resolution:
            x_resolution = float(x_resolution)
            y_resolution = float(y_resolution)
            
            if resolution_unit == 'centimeter':
                mpp_x = 10000 / x_resolution
                mpp_y = 10000 / y_resolution
            elif resolution_unit == 'inch':
                mpp_x =  x_resolution
                mpp_y = y_resolution
            else:
                raise ValueError(f"Unknown resolution unit: {resolution_unit} for slide {slide_path}")
        else:
            raise ValueError(f"Slide {slide_path} does not have MPP or resolution information.")
    print(mpp_x)
    slide_resolution = (mpp_x + mpp_y) / 2  # Average resolution if x and y resolutions are different
    tile_size = int(patch_size_target * (upp_target / slide_resolution))
    return tile_size


def process_and_check_slide(sample, level, margin, foreground_threshold, occupancy_threshold, output_dir, thumbnail_dir, parallel, patch_size_target, upp_target):
    slide_path = sample["image"]
    tile_size = compute_tile_size(slide_path, patch_size_target, upp_target)
    print(f"Processing {slide_path} with tile size {tile_size}")

    slide_dir = process_slide(
        sample=sample,
        level=level,
        margin=margin,
        tile_size=tile_size,
        foreground_threshold=foreground_threshold,
        occupancy_threshold=occupancy_threshold,
        output_dir=output_dir,
        thumbnail_dir=thumbnail_dir,
        tile_progress=not parallel
    )

    # Additional verification steps
    dataset_csv_path = slide_dir / "dataset.csv"
    dataset_df = pd.read_csv(dataset_csv_path)
    assert len(dataset_df) > 0
    failed_csv_path = slide_dir / "failed_tiles.csv"
    failed_df = pd.read_csv(failed_csv_path)
    assert len(failed_df) == 0

    print(f"Slide {slide_path} has been tiled. {len(dataset_df)} tiles saved to {slide_dir}.")
 
def main(slides_dataset: "SlidesDataset", root_output_dir: Union[str, Path],
         patch_size_target: int, upp_target: float, level: int, margin: int,
         foreground_threshold: Optional[float], occupancy_threshold: float, 
         parallel: bool = False, overwrite: bool = False, n_slides: Optional[int] = None) -> None:
    """Process a slides dataset to produce a tiles dataset with appropriate tile size.

    :param slides_dataset: Input slides dataset object.
    :param root_output_dir: The root directory of the output tiles dataset.
    :param patch_size_target: Target patch size.
    :param upp_target: Target microns per pixel (MPP).
    :param level: Magnification level at which to process the slide.
    :param margin: Margin around the foreground bounding box, in pixels at lowest resolution.
    :param foreground_threshold: Luminance threshold (0 to 255) to determine tile occupancy.
    If `None` (default), an optimal threshold will be estimated automatically.
    :param occupancy_threshold: Threshold (between 0 and 1) to determine empty tiles to discard.
    :param parallel: Whether slides should be processed in parallel with multiprocessing.
    :param overwrite: Whether to overwrite an existing output tiles dataset. If `True`, will delete
    and recreate `root_output_dir`, otherwise will resume by skipping already processed slides.
    :param n_slides: If given, limit the total number of slides for debugging.
    """
    dataset = Dataset(slides_dataset)[:n_slides]  # type: ignore

    # Make sure all slide files exist in the image dir
    for sample in dataset:
        image_path = Path(sample["image"])
        assert image_path.exists(), f"{image_path} doesn't exist"

    output_dir = Path(root_output_dir)
    logging.info(f"Creating dataset of level-{level} {patch_size_target}x{patch_size_target} "
                 f"{slides_dataset.__class__.__name__} tiles at: {output_dir}")

    if overwrite and output_dir.exists():
        shutil.rmtree(output_dir)
    output_dir.mkdir(parents=True, exist_ok=not overwrite)
    thumbnail_dir = output_dir / "thumbnails"
    thumbnail_dir.mkdir(exist_ok=True)
    logging.info(f"Thumbnail directory: {thumbnail_dir}")


    func = functools.partial(process_and_check_slide, level=level, margin=margin, 
                             foreground_threshold=foreground_threshold, 
                             occupancy_threshold=occupancy_threshold, 
                             output_dir=output_dir, thumbnail_dir=thumbnail_dir, 
                             parallel=parallel, patch_size_target=patch_size_target, 
                             upp_target=upp_target)

    if parallel:
        import multiprocessing
        pool = multiprocessing.Pool()
        map_func = pool.imap_unordered  # type: ignore
    else:
        map_func = map  # type: ignore

    list(tqdm(map_func(func, dataset), desc="Slides", unit="img", total=len(dataset)))  # type: ignore

    if parallel:
        pool.close()

    logging.info("Merging slide files in a single file")
    merge_dataset_csv_files(output_dir)



if __name__ == "__main__": 
   import os
   import pandas as pd
   
   # Define the path to the CSV file and the root directory
   slides_csv_path = "/projects/wispermed_rp18/braf-main/prov-gigapath/prov-gigapath/dataset_csv/braf/slide_labels.csv"
   root_dir = "/projects/wispermed/TCGA/Whole Slide Images"
   image_type= ".svs"
   # Read the CSV file 
   slides_df = pd.read_csv(slides_csv_path)
   slides_dataset = []
   for index, row in slides_df.iterrows():
       slide_path = os.path.join(root_dir, row['slide_id'])
       slides_dataset.append({"image": slide_path + image_type, "slide_id": str(row['slide_id']).split('/')[0], "label": row['label'], "metadata": {}})
   
   # Define other parameters
   root_output_dir = "/projects/wispermed_rp18/braf-main/prov-gigapath/prov-gigapath/data/tcga_data"
   patch_size_target = 256
   upp_target = 0.5 
   level = 0
   margin = 0  
   foreground_threshold = None   
   occupancy_threshold = 0.1
   parallel = True 
   overwrite = False      
   n_slides = None
   
   # Call the main function with the prepared slides_dataset
   main(slides_dataset, root_output_dir, patch_size_target, upp_target, level, margin,
        foreground_threshold, occupancy_threshold, parallel, overwrite, n_slides)

  from .autonotebook import tqdm as notebook_tqdm
  warn(
Slides:   0%|          | 0/276 [00:00<?, ?img/s]

0.248
Processing /projects/wispermed/TCGA/Whole Slide Images/01c251e9-cf0f-415a-b4cc-bc1b44f59d16/TCGA-D9-A4Z6-06Z-00-DX1.9AE8803C-4560-4E79-97A8-0CD6DA1890DC.svs with tile size 516
0.248
0.2456
0.504
0.5040.5040.504
Processing /projects/wispermed/TCGA/Whole Slide Images/0495a87c-1ce4-49e1-ac81-228c8e864124/TCGA-EE-A2MS-06Z-00-DX1.76FF55CE-10D0-4A93-AB1D-81AFB910410B.svs with tile size 521Processing /projects/wispermed/TCGA/Whole Slide Images/19de5c45-87b3-40c3-b4d6-8f1a8f76d115/TCGA-D3-A5GL-06Z-00-DX1.94EB7EB4-7906-4D03-BBF3-D25BD7517D91.svs with tile size 2530.2525

0.504Processing /projects/wispermed/TCGA/Whole Slide Images/163f044f-4ca2-4516-8995-e8421a1df6ce/TCGA-D3-A1Q6-06Z-00-DX1.01DE293A-473C-4011-88DF-4863F7CE1843.svs with tile size 253

Processing /projects/wispermed/TCGA/Whole Slide Images/0e24042d-2f15-4635-a7bf-8f138055ab3d/TCGA-D3-A5GO-06Z-00-DX1.DF1F3837-B866-40F4-B2E7-244F94DD0065.svs with tile size 253Processing /projects/wispermed/TCGA/Whole Slide Images/09b5761f-5852

Slides:   0%|          | 1/276 [00:00<02:28,  1.85img/s]




0.2456

Processing /projects/wispermed/TCGA/Whole Slide Images/09d1bbf4-f52c-4804-ac22-a640dbfb3338/TCGA-GN-A8LN-01Z-00-DX1.68E5D36A-6C6D-4BCD-B005-25B978441EE6.svs with tile size 506Processing /projects/wispermed/TCGA/Whole Slide Images/22737292-1d31-4875-90cc-6a87b59ca4d0/TCGA-FS-A1Z7-06Z-00-DX7.6E09A9E9-82D9-4D29-9E7F-14BC9C5DC105.svs with tile size 521

Processing /projects/wispermed/TCGA/Whole Slide Images/286a5c62-fe96-4c7c-a32b-de6527a6e4a8/TCGA-XV-AAZW-01Z-00-DX1.26C215F6-0EFA-42D9-A3EF-58466997594B.svs with tile size 506
Processing /projects/wispermed/TCGA/Whole Slide Images/21dc1039-3f54-4f09-933b-a03c9d2a9245/TCGA-BF-A3DN-01Z-00-DX1.4A5A739C-FB1D-44A5-9CE2-FC4966D843B6.svs with tile size 506

Processing /projects/wispermed/TCGA/Whole Slide Images/2b0aea10-2963-4db4-8b40-f89ee679a67d/TCGA-EB-A5SE-01Z-00-DX1.2E10E7EF-B902-479F-ACBF-E852836FF7B9.svs with tile size 5210.2527
Processing /projects/wispermed/TCGA/Whole Slide Images/2e2b1176-4737-4148-a235-551f0279eb17/TCGA-BF-A3

Slides:   1%|          | 3/276 [00:00<00:59,  4.57img/s]



Slide /projects/wispermed/TCGA/Whole Slide Images/19c863c7-3ce7-44c4-aece-73fefdf6c569/TCGA-FW-A3I3-06Z-00-DX1.C00AB7EB-C272-4B74-B540-47B31D731EDB.svs has been tiled. 2230 tiles saved to /projects/wispermed_rp18/braf-main/prov-gigapath/prov-gigapath/data/tcga_data/19c863c7-3ce7-44c4-aece-73fefdf6c569.Processing /projects/wispermed/TCGA/Whole Slide Images/279cbaee-a6a7-45e0-95db-158ab6186cfa/TCGA-FS-A1Z7-06Z-00-DX9.7F36F443-CADB-4146-8B26-82E60DC88E55.svs with tile size 521('slide_id', 'tile_id', 'image', 'label', 'tile_x', 'tile_y', 'occupancy')Slide /projects/wispermed/TCGA/Whole Slide Images/0e24042d-2f15-4635-a7bf-8f138055ab3d/TCGA-D3-A5GO-06Z-00-DX1.DF1F3837-B866-40F4-B2E7-244F94DD0065.svs has been tiled. 9958 tiles saved to /projects/wispermed_rp18/braf-main/prov-gigapath/prov-gigapath/data/tcga_data/0e24042d-2f15-4635-a7bf-8f138055ab3d.


('slide_id', 'tile_id', 'image', 'label', 'tile_x', 'tile_y', 'occupancy')('slide_id', 'tile_id', 'image', 'label', 'tile_x', 'tile_y', 'occ

Slides:   3%|▎         | 9/276 [00:00<00:17, 15.11img/s]

('slide_id', 'tile_id', 'image', 'label', 'tile_x', 'tile_y', 'occupancy')('slide_id', 'tile_id', 'image', 'label', 'tile_x', 'tile_y', 'occupancy')('slide_id', 'tile_id', 'image', 'label', 'tile_x', 'tile_y', 'occupancy')
Slide /projects/wispermed/TCGA/Whole Slide Images/2d190099-a1d6-4be4-a5ec-dcc41a3eded6/TCGA-EB-A3XC-01Z-00-DX1.F50679D7-D525-4F20-980F-AF43D7E9EBE6.svs has been tiled. 3974 tiles saved to /projects/wispermed_rp18/braf-main/prov-gigapath/prov-gigapath/data/tcga_data/2d190099-a1d6-4be4-a5ec-dcc41a3eded6.
('slide_id', 'tile_id', 'image', 'label', 'tile_x', 'tile_y', 'occupancy')

('slide_id', 'tile_id', 'image', 'label', 'tile_x', 'tile_y', 'occupancy')Slide /projects/wispermed/TCGA/Whole Slide Images/294d319f-1a8b-40c5-9ed5-0f81f9b8cae7/TCGA-EB-A44N-01Z-00-DX1.C36462FD-3059-4652-9E6B-2BE0B2CBC9BF.svs has been tiled. 2763 tiles saved to /projects/wispermed_rp18/braf-main/prov-gigapath/prov-gigapath/data/tcga_data/294d319f-1a8b-40c5-9ed5-0f81f9b8cae7.





0.504

Process

Slides:   5%|▌         | 14/276 [00:01<00:13, 18.73img/s]



('slide_id', 'tile_id', 'image', 'label', 'tile_x', 'tile_y', 'occupancy')0.24680.2527Processing /projects/wispermed/TCGA/Whole Slide Images/3d3794ec-04ba-4ce4-8c63-e09dee4f597f/TCGA-D3-A8GO-06Z-00-DX1.357CD90F-23D3-45BB-BA13-DCF5AED677C1.svs with tile size 510

Slide /projects/wispermed/TCGA/Whole Slide Images/0a6ef6d7-4e0b-4b23-90a7-da39e0eaf5af/TCGA-D3-A2JL-06Z-00-DX1.3258F79C-866E-4AC5-BB16-F4DF65E9DFC2.svs has been tiled. 10202 tiles saved to /projects/wispermed_rp18/braf-main/prov-gigapath/prov-gigapath/data/tcga_data/0a6ef6d7-4e0b-4b23-90a7-da39e0eaf5af.
Slide /projects/wispermed/TCGA/Whole Slide Images/11817697-9ee0-493e-96b9-72fb42bb92f1/TCGA-D3-A3CF-06Z-00-DX1.34257FE7-9EF7-454E-8384-3BB81771F629.svs has been tiled. 13743 tiles saved to /projects/wispermed_rp18/braf-main/prov-gigapath/prov-gigapath/data/tcga_data/11817697-9ee0-493e-96b9-72fb42bb92f1.
0.504

('slide_id', 'tile_id', 'image', 'label', 'tile_x', 'tile_y', 'occupancy')
Processing /projects/wispermed/TCGA/Whole S

Slides:   6%|▌         | 17/276 [00:01<00:14, 18.21img/s]



('slide_id', 'tile_id', 'image', 'label', 'tile_x', 'tile_y', 'occupancy')('slide_id', 'tile_id', 'image', 'label', 'tile_x', 'tile_y', 'occupancy')

('slide_id', 'tile_id', 'image', 'label', 'tile_x', 'tile_y', 'occupancy')


Slide /projects/wispermed/TCGA/Whole Slide Images/3cdabb85-888e-4392-9fbe-bbf896579a8d/TCGA-D3-A51H-06Z-00-DX1.33682556-776F-42B4-BA36-05E973FF276E.svs has been tiled. 11264 tiles saved to /projects/wispermed_rp18/braf-main/prov-gigapath/prov-gigapath/data/tcga_data/3cdabb85-888e-4392-9fbe-bbf896579a8d.
0.24720.2468Slide /projects/wispermed/TCGA/Whole Slide Images/37639dcb-c628-429c-944f-ad1d776aa006/TCGA-D3-A3C7-06Z-00-DX1.A58D75D7-369A-4CC0-B2AB-E781AA162915.svs has been tiled. 18982 tiles saved to /projects/wispermed_rp18/braf-main/prov-gigapath/prov-gigapath/data/tcga_data/37639dcb-c628-429c-944f-ad1d776aa006.
0.248
Processing /projects/wispermed/TCGA/Whole Slide Images/428f3fbb-413d-464a-b27e-1da07456daf1/TCGA-ER-A3ET-06Z-00-DX1.8EBAFF1A-E8FE-446A-8A10-FF4

Slides:   7%|▋         | 20/276 [00:01<00:15, 16.07img/s]


Processing /projects/wispermed/TCGA/Whole Slide Images/40bcb89b-3f07-49a9-9bbc-280e4a0e4a41/TCGA-IH-A3EA-01Z-00-DX1.4CB876AA-E46B-4C4C-B6B3-3EF3C4525E7F.svs with tile size 518
('slide_id', 'tile_id', 'image', 'label', 'tile_x', 'tile_y', 'occupancy')0.2527('slide_id', 'tile_id', 'image', 'label', 'tile_x', 'tile_y', 'occupancy')
Processing /projects/wispermed/TCGA/Whole Slide Images/43116fe4-e179-4114-a3c5-233232bacf50/TCGA-WE-A8ZN-06Z-00-DX1.9134D58D-8DE3-4310-9E0E-8095CF3FDA6C.svs with tile size 506


('slide_id', 'tile_id', 'image', 'label', 'tile_x', 'tile_y', 'occupancy')
0.2485
Processing /projects/wispermed/TCGA/Whole Slide Images/40fae70d-723f-40bb-9aab-cb8452a57c9b/TCGA-EB-A5UL-06Z-00-DX1.1E65A435-D10E-416E-A23B-6C31BAA9A975.svs with tile size 515
('slide_id', 'tile_id', 'image', 'label', 'tile_x', 'tile_y', 'occupancy')


Slides:   7%|▋         | 20/276 [00:20<00:15, 16.07img/s]

0.504
Processing /projects/wispermed/TCGA/Whole Slide Images/43aeb055-d4a9-42fc-9a97-3422c301a872/TCGA-D3-A2JA-06Z-00-DX1.B2D576DF-C876-4385-9F85-23ACBDF30AF7.svs with tile size 253
('slide_id', 'tile_id', 'image', 'label', 'tile_x', 'tile_y', 'occupancy')
0.2505
Processing /projects/wispermed/TCGA/Whole Slide Images/4465db67-b865-48c9-af75-846170f8bb92/TCGA-D3-A8GK-06Z-00-DX1.EB94CDEC-CA59-4901-B979-0B22132BCA0E.svs with tile size 510
('slide_id', 'tile_id', 'image', 'label', 'tile_x', 'tile_y', 'occupancy')


Slides:   7%|▋         | 20/276 [01:40<21:31,  5.04s/img]
IOStream.flush timed out


Slide /projects/wispermed/TCGA/Whole Slide Images/43aeb055-d4a9-42fc-9a97-3422c301a872/TCGA-D3-A2JA-06Z-00-DX1.B2D576DF-C876-4385-9F85-23ACBDF30AF7.svs has been tiled. 21923 tiles saved to /projects/wispermed_rp18/braf-main/prov-gigapath/prov-gigapath/data/tcga_data/43aeb055-d4a9-42fc-9a97-3422c301a872.
0.2456
Processing /projects/wispermed/TCGA/Whole Slide Images/44f988cb-1896-4f12-95f0-84044d4677fd/TCGA-FS-A1ZW-06Z-00-DX2.EBF59BD5-B85F-4FDF-B52C-2062F03BA59E.svs with tile size 521
('slide_id', 'tile_id', 'image', 'label', 'tile_x', 'tile_y', 'occupancy')
0.2485
Processing /projects/wispermed/TCGA/Whole Slide Images/4517f63e-a7e3-4fbf-9832-88dcb0502f69/TCGA-EB-A97M-01Z-00-DX1.631C79E5-6973-4BA2-A4B3-E57B3991851A.svs with tile size 515
('slide_id', 'tile_id', 'image', 'label', 'tile_x', 'tile_y', 'occupancy')


In [9]:
import os
import pandas as pd

# Define the path to the directory containing the files
path = "/projects/wispermed_rp18/braf-main/prov-gigapath/prov-gigapath/data/tcga_tiles_embeddings"

# Get a list of all files in the directory
files = os.listdir(path)
print("len: ", len(files))
# Remove the endings from the filenames to get the slide_id
slide_ids = [os.path.splitext(file)[0] for file in files]

# Create a DataFrame from the list of slide_ids
df_slide_ids = pd.DataFrame(slide_ids, columns=["slide_id"])

# Read the CSV file where slide_id and their labels are saved
labels_csv_path = "/projects/wispermed_rp18/braf-main/prov-gigapath/prov-gigapath/dataset_csv/braf/slide_labels.csv"
df_labels = pd.read_csv(labels_csv_path)

# Merge the DataFrame with slide_ids with the labels DataFrame on the slide_id
df_merged = df_slide_ids.merge(df_labels, on="slide_id", how="left")

# Define the output CSV file name
output_csv = "/projects/wispermed_rp18/braf-main/prov-gigapath/prov-gigapath/dataset_csv/braf/tiles_embeddings_with_labels.csv"

# Save the merged DataFrame to a CSV file
df_merged.to_csv(output_csv, index=False)

print(f"Slide IDs and labels saved to {output_csv}")


len:  79
Slide IDs and labels saved to /projects/wispermed_rp18/braf-main/prov-gigapath/prov-gigapath/dataset_csv/braf/tiles_embeddings_with_labels.csv


In [10]:
import os
import h5py
import numpy as np

class SlideDataset:
    def __init__(self, dataset_dir):
        self.dataset_dir = dataset_dir

    def get_images_from_path(self, slide_path):
        assets, valid = self.read_assets_from_h5(slide_path)
        if not valid:
            raise ValueError(f"Invalid data found in {slide_path}")
        return assets

    def read_assets_from_h5(self, img_path):
        assets = {}
        try:
            with h5py.File(img_path, 'r') as f:
                for key in f.keys():
                    try:
                        if isinstance(f[key], h5py.Dataset):
                            if f[key].shape == ():
                                assets[key] = f[key][()]
                            else:
                                assets[key] = f[key][:]
                        else:
                            assets[key] = f[key][:]
                    except Exception as e:
                        print(f"Error reading key {key} in {img_path}: {e}")
                        return assets, False
        except Exception as e:
            print(f"Error opening {img_path}: {e}")
            return assets, False
        return assets, True

    def get_sample_with_try(self, idx):
        try:
            return self.get_one_sample(idx)
        except Exception as e:
            print(f"Error fetching sample {idx}: {e}")
            return None

    def get_one_sample(self, idx):
        slide_path = self.get_slide_path(idx)
        return self.get_images_from_path(slide_path)

    def get_slide_path(self, idx):
        # Implement logic to get the slide path from idx
        pass

# Example usage
dataset_dir = "/projects/wispermed_rp18/braf-main/prov-gigapath/prov-gigapath/data/tcga_tiles_embeddings"
slide_dataset = SlideDataset(dataset_dir)
files = os.listdir(dataset_dir)
for path in files:
    try:
        assets = slide_dataset.get_images_from_path(dataset_dir+"/"+ path)
        print("Assets loaded successfully:", assets)
    except ValueError as e:
        print("Error reading: ", path)
        print(e)
        break 



Assets loaded successfully: {'coords': array([[ 89.,  70.],
       [ 90., 139.],
       [105., 111.],
       [118.,  58.],
       [125.,  72.],
       [145.,  25.],
       [150.,  43.],
       [153., 126.],
       [159., 106.],
       [170., 118.]], dtype=float32), 'features': array([[ 0.049443  ,  0.97804344, -0.8197156 , ...,  2.3214386 ,
         1.4356278 , -0.52892303],
       [-0.9823823 ,  1.5186924 , -0.82085717, ..., -0.5988735 ,
        -0.70286864, -0.45147374],
       [ 0.5559826 , -0.80002284, -0.1242203 , ..., -0.36832497,
        -0.3149539 , -0.65201813],
       ...,
       [-0.21340914,  0.68733644, -0.8772799 , ..., -1.5954112 ,
         0.03791266,  0.03512036],
       [-0.26507366,  0.33329794, -0.98981035, ..., -0.9901672 ,
        -0.78840816, -0.7272701 ],
       [-0.7281139 , -0.28826854, -0.17802751, ..., -1.1287873 ,
        -0.19616517, -0.1366063 ]], dtype=float32), 'label': array([0])}
Assets loaded successfully: {'coords': array([[ 23.,  36.],
       [ 23.

No kernel connected

No kernel connected

No kernel connected

No kernel connected

No kernel connected

No kernel connected

No kernel connected

No kernel connected

No kernel connected