The data is from the hiPSC single cell image dataset from the Allen Institute for Cell Science: https://open.quiltdata.com/b/allencell/packages/aics/hipsc_single_cell_image_dataset. Original publication of the data: 

Viana, M.P., Chen, J., Knijnenburg, T.A. et al. Integrated intracellular organization and its variations in human iPS cells. Nature 613, 345–354 (2023). https://doi.org/10.1038/s41586-022-05563-7

This demo will gather data to demonstrate 2D unsupervised segmentation of tight junction (TJP1).

In [None]:
# you need quilt3 package to download the data:
! pip install quilt3

In [1]:
import pandas as pd
import quilt3
from pathlib import Path
from aicsimageio import AICSImage
from aicsimageio.writers import OmeTiffWriter
from random import random, sample
from shutil import move
import numpy as np

In [7]:
# we use the TJP1 cell line for this demo
cline = "TJP1"

# set up path
parent_path = Path("../../data/unsupervise2D")

raw_path = parent_path / Path("download")
raw_path.mkdir(exist_ok=True)
train_path = parent_path / Path("train")
train_path.mkdir(exist_ok=True)
holdout_path = parent_path / Path("holdout")
holdout_path.mkdir(exist_ok=True)
tmp_path = parent_path / Path("tmp")
tmp_path.mkdir(exist_ok=True)

In [9]:
# connect to quilt and load meta table
pkg = quilt3.Package.browse(
    "aics/hipsc_single_cell_image_dataset", registry="s3://allencell"
)
meta_df_obj = pkg["metadata.csv"]
meta_df_obj.fetch(parent_path / "meta.csv")
meta_df = pd.read_csv(parent_path / "meta.csv")

# fetch the data of the specific cell line
meta_df_line = meta_df.query("structure_name==@cline")

# collapse the data table based on FOVId
meta_df_line.drop_duplicates(subset="FOVId", inplace=True)

# reset index
meta_df_line.reset_index(drop=True, inplace=True)

Loading manifest: 100%|██████████| 484465/484465 [00:12<00:00, 37.9k/s]
100%|██████████| 1.69G/1.69G [00:44<00:00, 38.4MB/s] 
  meta_df = pd.read_csv(parent_path / "meta.csv")
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  meta_df_line.drop_duplicates(subset="FOVId", inplace=True)


In [11]:
# download the images and segmentation
# we only need a small amount of data for the purpose of demonstration
num_of_sample = 100
tmp_list = []
for row in meta_df_line.itertuples():
    if row.Index > num_of_sample:
        break
    # fetch the raw image (multi-channel)
    subdir_name = row.fov_path.split("/")[0]
    file_name = row.fov_path.split("/")[1]

    local_fn = raw_path / f"{row.FOVId}_original.tiff"
    pkg[subdir_name][file_name].fetch(local_fn)

    # extract the structure channel
    reader = AICSImage(local_fn)
    img = reader.get_image_data("ZYX", C=row.ChannelNumberStruct, S=0, T=0)
    img_proj = np.amax(img, axis=0)

    # fetch structure segmentation
    subdir_name = row.struct_seg_path.split("/")[0]
    file_name = row.struct_seg_path.split("/")[1]

    local_fn = raw_path / f"{row.FOVId}_seg.tiff"
    pkg[subdir_name][file_name].fetch(local_fn)

    # extract the structure segmentation
    reader = AICSImage(local_fn)
    seg = reader.get_image_data("ZYX", C=0, S=0, T=0).astype(np.uint8)
    seg[seg > 0] = 1
    seg_proj = np.amax(seg, axis=0)

    if random() < 0.2:
        # save as holdout
        im_fn = holdout_path / f"{row.FOVId}_IM.tiff"
        gt_fn = holdout_path / f"{row.FOVId}_GT.tiff"
        OmeTiffWriter.save(img_proj, im_fn, dim_order="YX")
        OmeTiffWriter.save(seg_proj, gt_fn, dim_order="YX")
    else:
        # save the grayscale image in the train path and save the
        # corresponding segmentation in tmp, and shuffle at the end
        im_fn = train_path / f"{row.FOVId}_IM.tiff"
        gt_fn = tmp_path / f"{row.FOVId}_GT.tiff"
        tmp_list.append(row.FOVId)
        OmeTiffWriter.save(img_proj, im_fn, dim_order="YX")
        OmeTiffWriter.save(seg_proj, gt_fn, dim_order="YX")

100%|██████████| 327M/327M [00:17<00:00, 18.3MB/s]  
  d = to_dict(os.fspath(xml), parser=parser, validate=validate)
100%|██████████| 124k/124k [00:04<00:00, 28.7kB/s]
  d = to_dict(os.fspath(xml), parser=parser, validate=validate)
  d = to_dict(os.fspath(xml), parser=parser, validate=validate)
100%|██████████| 323M/323M [00:16<00:00, 20.0MB/s]  
100%|██████████| 128k/128k [00:03<00:00, 40.0kB/s]
  d = to_dict(os.fspath(xml), parser=parser, validate=validate)
  d = to_dict(os.fspath(xml), parser=parser, validate=validate)
100%|██████████| 322M/322M [00:14<00:00, 21.9MB/s]  
100%|██████████| 121k/121k [00:03<00:00, 36.7kB/s]
  d = to_dict(os.fspath(xml), parser=parser, validate=validate)
  d = to_dict(os.fspath(xml), parser=parser, validate=validate)
100%|██████████| 327M/327M [00:14<00:00, 23.2MB/s]  
100%|██████████| 122k/122k [00:03<00:00, 39.7kB/s]
  d = to_dict(os.fspath(xml), parser=parser, validate=validate)
  d = to_dict(os.fspath(xml), parser=parser, validate=validate)
100%|███

In [12]:
new_list = sample(tmp_list, len(tmp_list))
for old_id, new_id in zip(tmp_list, new_list):
    src_fn = tmp_path / f"{old_id}_GT.tiff"
    tar_fn = train_path / f"{new_id}_GT.tiff"
    move(src_fn, tar_fn)

In [13]:
# you may remove the download folder now.
from shutil import rmtree
import os
rmtree(raw_path)
rmtree(tmp_path)
os.remove(parent_path / "meta.csv")
