The data is from the hiPSC single cell image dataset from the Allen Institute for Cell Science: https://open.quiltdata.com/b/allencell/packages/aics/hipsc_single_cell_image_dataset. Original publication of the data: 

Viana, M.P., Chen, J., Knijnenburg, T.A. et al. Integrated intracellular organization and its variations in human iPS cells. Nature 613, 345–354 (2023). https://doi.org/10.1038/s41586-022-05563-7

This demo gathers the data needed to demonstrate how to train a model that generates 2D synthetic microscopy images from binary masks, using the nucleophosmin (NPM1) cell line.

In [None]:
# you need quilt3 package to download the data:
! pip install quilt3

In [None]:
import pandas as pd
import quilt3
from pathlib import Path
from aicsimageio.writers import OmeTiffWriter
from random import random, sample
from shutil import move
import numpy as np
from bioio import BioImage

In [None]:
# this demo uses a single cell line: NPM1 (nucleophosmin)
cline = "NPM1"

# root folder for everything this demo writes to disk
parent_path = Path("../../data/synthetic2D")

# raw downloads go to "download"; the 2D training / holdout pairs are stored
# per cell line
raw_path = parent_path / "download"
train_path = parent_path / cline / "train"
holdout_path = parent_path / cline / "holdout"

# create the folders (the root first), leaving existing ones untouched
for folder in (parent_path, raw_path, train_path, holdout_path):
    folder.mkdir(exist_ok=True, parents=True)

In [None]:
# connect to quilt and open the dataset package
pkg = quilt3.Package.browse(
    "aics/hipsc_single_cell_image_dataset", registry="s3://allencell"
)

# download the metadata table and load it
meta_csv = parent_path / "meta.csv"
meta_df_obj = pkg["metadata.csv"]
meta_df_obj.fetch(meta_csv)
meta_df = pd.read_csv(meta_csv)

# Select the rows of the chosen cell line, keep a single row per FOVId and
# renumber the rows from 0.
# The steps are chained without inplace=True: modifying the result of
# query() in place operates on a derived frame and makes pandas emit a
# SettingWithCopyWarning.
meta_df_line = (
    meta_df.query("structure_name==@cline")
    .drop_duplicates(subset="FOVId")
    .reset_index(drop=True)
)

In [None]:
# Download the images and segmentations and save one 2D (mask, image) pair
# per FOV, taken at the middle z-slice of the segmented structure.
# We only need a small amount of data for the purpose of demonstration.
num_of_sample = 400

# head() limits the loop to exactly num_of_sample FOVs (the previous
# "Index > num_of_sample" check let one extra FOV through) and does not
# depend on how the table is indexed.
for row in meta_df_line.head(num_of_sample).itertuples():
    # fetch the raw image (multi-channel)
    subdir_name, file_name = row.fov_path.split("/")[:2]
    raw_fn = raw_path / f"{row.FOVId}_original.tiff"
    pkg[subdir_name][file_name].fetch(raw_fn)

    # extract the structure channel as a ZYX volume
    img = BioImage(raw_fn).get_image_data(
        "ZYX", C=row.ChannelNumberStruct, S=0, T=0
    )

    # fetch the structure segmentation
    subdir_name, file_name = row.struct_seg_path.split("/")[:2]
    seg_raw_fn = raw_path / f"{row.FOVId}_seg.tiff"
    pkg[subdir_name][file_name].fetch(seg_raw_fn)

    # Binarize BEFORE casting to uint8: casting first would wrap label values
    # that are multiples of 256 to 0 and silently drop foreground.
    seg_labels = BioImage(seg_raw_fn).get_image_data("ZYX", C=0, S=0, T=0)
    seg = (seg_labels > 0).astype(np.uint8)

    # find mid z: halfway between the first and last slice with foreground
    z_range = np.flatnonzero(seg.any(axis=(1, 2)))
    if z_range.size == 0:
        # nothing segmented in this FOV, so there is no slice to pick
        print(f"FOV {row.FOVId}: empty structure segmentation, skipped")
        continue
    mid_z = int((z_range[0] + z_range[-1]) // 2)

    # random split: roughly 20% holdout, 80% train
    out_path = holdout_path if random() < 0.2 else train_path

    # Save the data. The real image gets the _GT suffix and the binary mask
    # the _IM suffix; this looks intentional for mask-to-image training
    # (presumably _IM is the model input and _GT the target - confirm
    # against the training config).
    gt_fn = out_path / f"{row.FOVId}_GT.tiff"
    im_fn = out_path / f"{row.FOVId}_IM.tiff"
    OmeTiffWriter.save(img[mid_z, :, :], gt_fn, dim_order="YX")
    OmeTiffWriter.save(seg[mid_z, :, :], im_fn, dim_order="YX")

In [None]:
# You may remove the download folder and the metadata table now.
# Each removal is guarded so that re-running this cell after the files are
# already gone does not raise FileNotFoundError.
from shutil import rmtree
import os

if raw_path.exists():
    rmtree(raw_path)

meta_csv = parent_path / "meta.csv"
if meta_csv.exists():
    os.remove(meta_csv)
