The data is from the hiPSC single cell image dataset from the Allen Institute for Cell Science: https://open.quiltdata.com/b/allencell/packages/aics/hipsc_single_cell_image_dataset. Original publication of the data: 

Viana, M.P., Chen, J., Knijnenburg, T.A. et al. Integrated intracellular organization and its variations in human iPS cells. Nature 613, 345–354 (2023). https://doi.org/10.1038/s41586-022-05563-7

This demo will gather data to demonstrate how to train a model to generate 3D synthetic microscopy images from binary masks, using two types of data: nucleophosmin (NPM1) and nuclei (HIST1H2BJ).

In [None]:
# you need quilt3 package to download the data:
! pip install quilt3

In [1]:
import pandas as pd
import quilt3
from pathlib import Path
from aicsimageio import AICSImage
from aicsimageio.writers import OmeTiffWriter
from random import random, sample
from shutil import move
import numpy as np

In [None]:
# we use the NPM1 and HIST1H2BJ cell lines for this demo
cline = "HIST1H2BJ" #"HISTH2BJ" # "NPM1"
segmentation_details = "coarse" # "coarse" # "fine"   # coarse or fine segmentation, we don't use coarse segmentation for NPM1

# set up path
parent_path = Path("../../data/synthetic3D")

raw_path = parent_path / Path("download")
raw_path.mkdir(exist_ok=True)
train_path = parent_path / f"{cline}_{segmentation_details}" / Path("train")
train_path.mkdir(exist_ok=True, parents=True)
holdout_path = parent_path / f"{cline}_{segmentation_details}"/ Path("holdout")
holdout_path.mkdir(exist_ok=True, parents=True)

In [9]:
# connect to quilt and load meta table
pkg = quilt3.Package.browse(
    "aics/hipsc_single_cell_image_dataset", registry="s3://allencell"
)
meta_df_obj = pkg["metadata.csv"]
meta_df_obj.fetch(parent_path / "meta.csv")
meta_df = pd.read_csv(parent_path / "meta.csv")

# fetch the data of the specific cell line
meta_df_line = meta_df.query("structure_name==@cline")

# collapse the data table based on FOVId
meta_df_line.drop_duplicates(subset="FOVId", inplace=True)

# reset index
meta_df_line.reset_index(drop=True, inplace=True)

  meta_df = pd.read_csv(parent_path / "meta.csv")
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  meta_df_line.drop_duplicates(subset="FOVId", inplace=True)


In [10]:
# download the images and segmentation
# we only need a small amount of data for the purpose of demonstration
num_of_sample = 50  # we used 400 for demo, but choose the amount you need
tmp_list = []
for row in meta_df_line.itertuples():
    if row.Index > num_of_sample:
        break
    # fetch the raw image (multi-channel)
    subdir_name = row.fov_path.split("/")[0]
    file_name = row.fov_path.split("/")[1]

    local_fn = raw_path / f"{row.FOVId}_original.tiff"
    pkg[subdir_name][file_name].fetch(local_fn)

    # extract the structure channel
    reader = AICSImage(local_fn)
    img = reader.get_image_data("ZYX", C=row.ChannelNumberStruct, S=0, T=0)

    # fetch segmentation (use nuclear segmentation for H2B,
    # and use structure segmentation for the others)
    if cline == "HIST1H2BJ" and segmentation_details == "coarse":
        subdir_name = row.fov_seg_path.split("/")[0]
        file_name = row.fov_seg_path.split("/")[1]
    else:
        subdir_name = row.struct_seg_path.split("/")[0]
        file_name = row.struct_seg_path.split("/")[1]

    local_fn = raw_path / f"{row.FOVId}_seg.tiff"
    pkg[subdir_name][file_name].fetch(local_fn)

    # extract the structure segmentation
    reader = AICSImage(local_fn)
    seg = reader.get_image_data("ZYX", C=0, S=0, T=0).astype(np.uint8)
    seg[seg > 0] = 1

    if random() < 0.2:
        out_path = holdout_path
    else:
        out_path = train_path
    
    # save the data
    im_fn = out_path / f"{row.FOVId}_GT.tiff"
    seg_fn = out_path / f"{row.FOVId}_IM.tiff"
    OmeTiffWriter.save(img, im_fn, dim_order="ZYX")
    OmeTiffWriter.save(seg, seg_fn, dim_order="ZYX")

100%|██████████| 190M/190M [00:07<00:00, 23.9MB/s] 
  d = to_dict(os.fspath(xml), parser=parser, validate=validate)
100%|██████████| 479k/479k [00:06<00:00, 79.6kB/s]
  d = to_dict(os.fspath(xml), parser=parser, validate=validate)
  d = to_dict(os.fspath(xml), parser=parser, validate=validate)
100%|██████████| 191M/191M [00:14<00:00, 12.9MB/s] 
100%|██████████| 575k/575k [00:03<00:00, 166kB/s] 
  d = to_dict(os.fspath(xml), parser=parser, validate=validate)
  d = to_dict(os.fspath(xml), parser=parser, validate=validate)
100%|██████████| 192M/192M [00:09<00:00, 19.8MB/s] 
100%|██████████| 579k/579k [00:07<00:00, 76.3kB/s]
  d = to_dict(os.fspath(xml), parser=parser, validate=validate)
  d = to_dict(os.fspath(xml), parser=parser, validate=validate)
100%|██████████| 195M/195M [00:09<00:00, 20.3MB/s] 
100%|██████████| 535k/535k [00:03<00:00, 150kB/s] 
  d = to_dict(os.fspath(xml), parser=parser, validate=validate)
  d = to_dict(os.fspath(xml), parser=parser, validate=validate)
100%|███████

In [11]:
# you may remove the download folder now.
from shutil import rmtree
import os
rmtree(raw_path)
os.remove(parent_path / "meta.csv")
