The data is from the hiPSC single cell image dataset from the Allen Institute for Cell Science: https://open.quiltdata.com/b/allencell/packages/aics/hipsc_single_cell_image_dataset. Original publication of the data: 

Viana, M.P., Chen, J., Knijnenburg, T.A. et al. Integrated intracellular organization and its variations in human iPS cells. Nature 613, 345–354 (2023). https://doi.org/10.1038/s41586-022-05563-7

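This notebook gathers the data used to demonstrate 3D unsupervised segmentation of the Golgi (ST6GAL1), mitochondria (TOMM20) and nuclei (HIST1H2BJ). It prepares one cell line per run: set `cline` in the configuration cell below and re-run the notebook to prepare another cell line.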

In [None]:
# you need quilt3 package to download the data:
! pip install quilt3

In [None]:
import pandas as pd
import quilt3
from pathlib import Path
from aicsimageio import AICSImage
from aicsimageio.writers import OmeTiffWriter
from random import random, sample
from shutil import move
import numpy as np

In [None]:
# Cell line to prepare in this run. The demo covers three lines:
# "ST6GAL1", "TOMM20" and "HIST1H2BJ".
cline = "HIST1H2BJ"

# Folder layout (everything lives under parent_path):
#   download/         raw files fetched from quilt
#   <cline>/train/    training images and masks
#   <cline>/holdout/  held-out images and masks
#   tmp/              staging area for training masks
parent_path = Path("../../data/unsupervise3D")
raw_path = parent_path / "download"
train_path = parent_path / cline / "train"
holdout_path = parent_path / cline / "holdout"
tmp_path = parent_path / "tmp"

# parent_path comes first, so every other folder finds its parent in place
for folder in (parent_path, raw_path, train_path, holdout_path, tmp_path):
    folder.mkdir(parents=True, exist_ok=True)

In [None]:
# connect to quilt and download the metadata table of the dataset
pkg = quilt3.Package.browse(
    "aics/hipsc_single_cell_image_dataset", registry="s3://allencell"
)
meta_df_obj = pkg["metadata.csv"]
meta_df_obj.fetch(parent_path / "meta.csv")
meta_df = pd.read_csv(parent_path / "meta.csv")

# Build the per-FOV table of the selected cell line:
#   1. keep only the rows of the specific cell line
#   2. collapse the table to one row per FOVId
#   3. reset the index so that rows are numbered 0..N-1
# The steps are chained without inplace=True: modifying the result of
# query() in place can raise pandas' SettingWithCopyWarning.
meta_df_line = (
    meta_df.query("structure_name==@cline")
    .drop_duplicates(subset="FOVId")
    .reset_index(drop=True)
)

In [None]:
# download the images and segmentation
# we only need a small amount of data for the purpose of demonstration
num_of_sample = 200

# FOV ids whose images went to the train folder; their segmentations are
# parked in tmp_path and get shuffled into the train folder afterwards
tmp_list = []

# head() limits the loop to exactly num_of_sample FOVs, independent of the
# index labels of the table
for row in meta_df_line.head(num_of_sample).itertuples():
    # fetch the raw image (multi-channel); fov_path is "<subdir>/<file>"
    subdir_name, file_name = row.fov_path.split("/")[:2]
    local_fn = raw_path / f"{row.FOVId}_original.tiff"
    pkg[subdir_name][file_name].fetch(local_fn)

    # extract the structure channel
    reader = AICSImage(local_fn)
    img = reader.get_image_data("ZYX", C=row.ChannelNumberStruct, S=0, T=0)

    # fetch segmentation (use nuclear segmentation for H2B,
    # and use structure segmentation for the others)
    if cline == "HIST1H2BJ":
        seg_rel_path = row.fov_seg_path
    else:
        seg_rel_path = row.struct_seg_path
    subdir_name, file_name = seg_rel_path.split("/")[:2]
    local_fn = raw_path / f"{row.FOVId}_seg.tiff"
    pkg[subdir_name][file_name].fetch(local_fn)

    # extract the segmentation and turn it into a binary 0/1 mask.
    # Binarize BEFORE casting to uint8: casting first would wrap label
    # values that are multiples of 256 to 0 and drop them from the mask.
    reader = AICSImage(local_fn)
    seg = reader.get_image_data("ZYX", C=0, S=0, T=0)
    seg = (seg > 0).astype(np.uint8)

    if random() < 0.2:
        # roughly 20% of the FOVs: save image and mask side by side as holdout
        im_fn = holdout_path / f"{row.FOVId}_IM.tiff"
        gt_fn = holdout_path / f"{row.FOVId}_GT.tiff"
    else:
        # save the grayscale image in the train path and save the
        # corresponding segmentation in tmp, and shuffle at the end
        im_fn = train_path / f"{row.FOVId}_IM.tiff"
        gt_fn = tmp_path / f"{row.FOVId}_GT.tiff"
        tmp_list.append(row.FOVId)

    OmeTiffWriter.save(img, im_fn, dim_order="ZYX")
    OmeTiffWriter.save(seg, gt_fn, dim_order="ZYX")

In [None]:
# Move the training masks from tmp into the train folder under randomly
# permuted FOV ids. After this step "<id>_GT.tiff" in the train folder no
# longer necessarily belongs to "<id>_IM.tiff" (presumably intended, so the
# training images and masks are unpaired for the unsupervised demo).
new_list = sample(tmp_list, k=len(tmp_list))  # random permutation of the ids
for source_id, target_id in zip(tmp_list, new_list):
    move(
        tmp_path / f"{source_id}_GT.tiff",
        train_path / f"{target_id}_GT.tiff",
    )

In [None]:
# Optional clean-up: the intermediate files are no longer needed.
# This deletes the download folder, the tmp staging folder and the
# local copy of the metadata table.
import os
from shutil import rmtree

for scratch_dir in (raw_path, tmp_path):
    rmtree(scratch_dir)
os.remove(parent_path / "meta.csv")
