This notebook will download 3D data from the hiPSC single cell image dataset from the Allen Institute for Cell Science: https://open.quiltdata.com/b/allencell/packages/aics/hipsc_single_cell_image_dataset. 

Original publication of the data: 
Viana, M.P., Chen, J., Knijnenburg, T.A. et al. Integrated intracellular organization and its variations in human iPS cells. Nature 613, 345–354 (2023). https://doi.org/10.1038/s41586-022-05563-7.

In addition to the 3D images, we extract the middle Z-slice from each 3D image to form a 2D dataset.

In [None]:
# you need quilt3 package to download the data:
! pip install quilt3

In [3]:
# standard library
import random
from pathlib import Path

# third-party packages
import pandas as pd
import quilt3
from aicsimageio import AICSImage
from aicsimageio.writers import OmeTiffWriter

# seed Python's global random number generator so that repeated runs behave the same
random.seed(0)

In [4]:
# turn off pandas parser warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
# tunr off ome_types parser warning
warnings.filterwarnings("ignore", category=FutureWarning)

In [6]:
# ---- parameters ----

# Cell line to download. In the paper, four nuclear structures were tested:
#   "FBL"       -> fibrillarin
#   "NPM1"      -> nucleophosmin
#   "LMNB1"     -> lamin B1
#   "HIST1H2BJ" -> histone H2B
cline = "FBL"

# number of samples to download; they are divided roughly 80/20 into training/validation
num_samples_per_cell_line = 500

# common root of all generated data, relative to the notebook's working directory
data_root = Path("../../data")

# ---- 3D folders ----
parent_path_3d = data_root / "labelfree3D" / cline
raw_path_3d = parent_path_3d / "download"
train_path_3d = parent_path_3d / "train"
holdout_path_3d = parent_path_3d / "holdout"

# ---- 2D folders ----
parent_path_2d = data_root / "labelfree2D" / cline
train_path_2d = parent_path_2d / "train"
holdout_path_2d = parent_path_2d / "holdout"

# Create the per-cell-line parent folders first (including any missing ancestors) ...
for top_dir in (parent_path_3d, parent_path_2d):
    top_dir.mkdir(parents=True, exist_ok=True)

# ... then the sub-folders that live directly inside them.
for sub_dir in (
    raw_path_3d,
    train_path_3d,
    holdout_path_3d,
    train_path_2d,
    holdout_path_2d,
):
    sub_dir.mkdir(exist_ok=True)

In [None]:
# connect to quilt and load meta table
pkg = quilt3.Package.browse(
    "aics/hipsc_single_cell_image_dataset", registry="s3://allencell"
)
meta_df_obj = pkg["metadata.csv"]
meta_df_obj.fetch(parent_path_3d / "meta.csv")
meta_df = pd.read_csv(parent_path_3d / "meta.csv")

# Build the per-cell-line table in one non-mutating chain:
#   1. keep only the rows of the selected cell line,
#   2. collapse the table so that each FOVId appears once,
#   3. renumber the index from 0.
# Each step returns a new frame instead of modifying the filtered result in place,
# which avoids pandas' SettingWithCopyWarning and keeps this cell safe to re-run.
meta_df_line = (
    meta_df.query("structure_name==@cline")
    .drop_duplicates(subset="FOVId")
    .reset_index(drop=True)
)

In [None]:
# download the images and re-slice into input (BF) and ground truth (fluorescent) images.
# takes around 4 hours

# Dedicated random generator for the train/holdout assignment, seeded locally with 0.
# Using a local generator (instead of the module-level random state) means the
# assignment is the same every time this cell runs, no matter how often the cell is
# re-executed or whether other code has drawn random numbers in between.
split_rng = random.Random(0)
holdout_fraction = 0.2  # roughly 80/20 training/validation split

# head() limits the loop to the first `num_samples_per_cell_line` rows by position,
# so the cap does not depend on the values of the index labels.
for row in meta_df_line.head(num_samples_per_cell_line).itertuples():
    # fetch the raw image (multi-channel);
    # fov_path is "<sub-directory>/<file name>" inside the quilt package
    subdir_name, file_name = row.fov_path.split("/")[:2]

    local_fn = raw_path_3d / f"{row.FOVId}_original.tiff"
    pkg[subdir_name][file_name].fetch(local_fn)

    # extract the bf and structures channel as ZYX volumes
    reader = AICSImage(local_fn)
    bf_img = reader.get_image_data(
        "ZYX", C=row.ChannelNumberBrightfield, S=0, T=0
    )
    str_img = reader.get_image_data(
        "ZYX", C=row.ChannelNumberStruct, S=0, T=0
    )

    # decide once per FOV whether it goes to holdout or train;
    # the 3D and the 2D version of a FOV always land in the same split
    if split_rng.random() < holdout_fraction:
        data_path_3d = holdout_path_3d
        data_path_2d = holdout_path_2d
    else:
        data_path_3d = train_path_3d
        data_path_2d = train_path_2d

    # save the 3D pair: *_IM is the brightfield input, *_GT the structure channel
    im_fn = data_path_3d / f"{row.FOVId}_IM.tiff"
    gt_fn = data_path_3d / f"{row.FOVId}_GT.tiff"
    OmeTiffWriter.save(bf_img, im_fn, dim_order="ZYX")
    OmeTiffWriter.save(str_img, gt_fn, dim_order="ZYX")

    # extract the middle slice (along Z, the first axis) from each 3D image
    middle_slice_index = bf_img.shape[0] // 2
    bf_img_2d = bf_img[middle_slice_index, :, :]
    str_img_2d = str_img[middle_slice_index, :, :]

    # save the 2D pair under the same file names in the 2D folder
    im_fn = data_path_2d / f"{row.FOVId}_IM.tiff"
    gt_fn = data_path_2d / f"{row.FOVId}_GT.tiff"
    OmeTiffWriter.save(bf_img_2d, im_fn, dim_order="YX")
    OmeTiffWriter.save(str_img_2d, gt_fn, dim_order="YX")



In [9]:
# you may remove the download folder now.
from shutil import rmtree
import os

# Both deletions are guarded so that running this cell again after the clean-up has
# already happened does not fail with FileNotFoundError.
if raw_path_3d.exists():
    rmtree(raw_path_3d)

meta_csv = parent_path_3d / "meta.csv"
if meta_csv.exists():
    os.remove(meta_csv)