In [1]:
%load_ext autoreload
%autoreload 2

from pathlib import Path

import geopandas as gpd
import pandas as pd
import tifffile

import segtraq as st

# NOTE(review): absolute path, only valid where this location exists — point it at your own copy of the test data
PATH = Path("/g/huber/projects/CODEX/segtraq/valid_testdata/BC_cellseg_10x/BC_Xenium_cellseg_sample")

  from pkg_resources import DistributionNotFound, get_distribution
  left = partial(_left_join_spatialelement_table)
  left_exclusive = partial(_left_exclusive_join_spatialelement_table)
  inner = partial(_inner_join_spatialelement_table)
  right = partial(_right_join_spatialelement_table)
  right_exclusive = partial(_right_exclusive_join_spatialelement_table)


# Reading files into spatialdata

Before assessing the quality of a segmentation, we first need to get the data into `spatialdata` format. The following sections show how this can be achieved for the outputs of several segmentation methods.

We always require a transcript dataframe, and any additional data can come as shapes, labels, or images.

In [2]:
# read the transcript table into a pandas dataframe
transcript_df = pd.read_csv(PATH / "transcripts.csv")

# optional: an image (here dapi_um.tif) to store alongside the transcripts
image = tifffile.imread(PATH / "dapi_um.tif")

# preview the first rows of the transcript table
transcript_df.head()

Unnamed: 0,transcript_id,cell_id,overlaps_nucleus,feature_name,x_location,y_location,z_location,qv,fov_name,nucleus_distance,codeword_index,codeword_category,is_gene
0,282643208155248,bfnbkogm-1,0,AAMP,57.15332,163.2998,20.15625,35.75,V12,0.15625,3277,predesigned_gene,True
1,282643207975264,UNASSIGNED,0,ABCA1,69.262695,175.47168,23.5625,40.0,V12,0.9375,9629,predesigned_gene,True
2,282716222414197,nhlaipjn-1,1,ABCA1,73.387695,189.01855,21.59375,40.0,W12,0.0,9629,predesigned_gene,True
3,282643207864202,bfndjloi-1,0,ABCA1,100.74707,141.87793,20.546875,36.25,V12,0.9375,9629,predesigned_gene,True
4,282716222540753,nhlcfdpj-1,1,ABCA7,85.856445,192.45605,22.890625,40.0,W12,0.0,8133,predesigned_gene,True


If you want to load the data into a spatialdata object yourself, you can use the `create_spatialdata()` method. Alternatively, you can opt for one of our technology-specific readers, which are detailed below.

In [3]:
# manual construction: transcripts go in as points, the image is optional,
# and the coordinate columns are named explicitly
sdata = st.fs.create_spatialdata(
    points=transcript_df,
    images=image,
    coord_columns=["x_location", "y_location", "z_location"],
)
sdata

[34mINFO    [0m no axes information specified in the object, setting `dims` to: [1m([0m[32m'c'[0m, [32m'y'[0m, [32m'x'[0m[1m)[0m                           


SpatialData object
├── Images
│     └── 'image': DataArray[cyx] (1, 551, 680)
└── Points
      └── 'transcripts': DataFrame with shape: (<Delayed>, 13) (3D points)
with coordinate systems:
    ▸ 'global', with elements:
        image (Images), transcripts (Points)

If you already have a spatialdata object, you can quickly assess its consistency with `validate_spatialdata()`. This will check if the cell IDs match between the transcripts and the shapes, if the labels and shapes contain the same number of cells, and a couple of other things. If everything is okay, the method will simply return `True`.

In [4]:
# returns True when every consistency check passes
st.fs.validate_spatialdata(sdata)

True

## ProSeg

In [5]:
# cell polygons from the ProSeg output folder (GeoJSON)
shapes = gpd.read_file(PATH / "proseg_output_v3/cell-polygons-layers.geojson")

In [6]:
# ProSeg identifies cells by a numeric ID, while the transcripts carry the original
# string ID, so we translate one into the other using ProSeg's cell metadata
var_df = pd.read_csv(PATH / "proseg_output_v3/cell-metadata.csv.gz", compression="gzip")
# lookup table: original cell ID (e. g. bfnbkogm-1) -> numeric ID (e. g. 1)
cell_id_dict = {
    original_id: numeric_id for original_id, numeric_id in zip(var_df["original_cell_id"], var_df["cell"])
}
# store the translated ID as a nullable integer column; IDs without an entry in the lookup become <NA>
transcript_df["cell_id_numeric"] = (
    transcript_df["cell_id"]
    .map(cell_id_dict)
    .astype("Int64")
)

In [7]:
# points are keyed by the numeric ID column added above, shapes by their "cell" column
# NOTE(review): the exact effect of relabel_points / relabel_shapes / consolidate_shapes is not
# visible in this notebook — see the create_spatialdata documentation
sdata = st.fs.create_spatialdata(
    points=transcript_df,
    images=image,
    shapes=shapes,
    coord_columns=["x_location", "y_location", "z_location"],
    cell_key_points="cell_id_numeric",
    cell_key_shapes="cell",
    relabel_points=True,
    relabel_shapes=True,
    consolidate_shapes=True,
)
sdata

[34mINFO    [0m no axes information specified in the object, setting `dims` to: [1m([0m[32m'c'[0m, [32m'y'[0m, [32m'x'[0m[1m)[0m                           


SpatialData object
├── Images
│     └── 'image': DataArray[cyx] (1, 551, 680)
├── Points
│     └── 'transcripts': DataFrame with shape: (<Delayed>, 14) (3D points)
└── Shapes
      ├── 'cell_boundaries_layer_0': GeoDataFrame shape: (2078, 3) (2D shapes)
      ├── 'cell_boundaries_layer_1': GeoDataFrame shape: (2094, 3) (2D shapes)
      ├── 'cell_boundaries_layer_2': GeoDataFrame shape: (2094, 3) (2D shapes)
      └── 'cell_boundaries_layer_3': GeoDataFrame shape: (2072, 3) (2D shapes)
with coordinate systems:
    ▸ 'global', with elements:
        image (Images), transcripts (Points), cell_boundaries_layer_0 (Shapes), cell_boundaries_layer_1 (Shapes), cell_boundaries_layer_2 (Shapes), cell_boundaries_layer_3 (Shapes)

## Segger

In [8]:
# Segger transcripts, read from a run-specific folder below PATH / "segger_output/benchmarks"
# NOTE: the name `transcripts` is reassigned later in the BIDcell section
transcripts = pd.read_parquet(
    PATH / "segger_output/benchmarks/segger_output_0.5_False_4_12_15_3_20250817/segger_transcripts.parquet"
)
transcripts

Unnamed: 0,transcript_id,cell_id,overlaps_nucleus,feature_name,x_location,y_location,z_location,qv,fov_name,nucleus_distance,codeword_index,codeword_category,is_gene,score,segger_cell_id,bound
0,282643207824235,UNASSIGNED,0,DeprecatedCodeword_2163,22.887695,177.737305,21.156250,40.00,V12,1.546875,2163,deprecated_codeword,False,,UNASSIGNED,
1,282643207824313,UNASSIGNED,0,ATF2,25.840820,177.190430,22.531250,40.00,V12,3.984375,6802,predesigned_gene,True,,UNASSIGNED,
2,282643207824939,UNASSIGNED,1,DeprecatedCodeword_2163,46.684570,161.659180,20.734375,40.00,V12,0.000000,2163,deprecated_codeword,False,,UNASSIGNED,
3,282643207824972,UNASSIGNED,0,DeprecatedCodeword_2163,47.481445,173.299805,24.734375,40.00,V12,1.156250,2163,deprecated_codeword,False,,UNASSIGNED,
4,282643207824999,UNASSIGNED,1,DeprecatedCodeword_2163,48.372070,165.190430,24.078125,40.00,V12,0.000000,2163,deprecated_codeword,False,,UNASSIGNED,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
389540,282720518222957,nedkeclc-1,1,CD79A,666.122070,332.643555,19.718750,40.00,W13,0.000000,17359,predesigned_gene,True,0.905907,bfjhblik-1,1.0
389541,282720518222958,necpdkpf-1,0,TOX4,666.887695,367.612305,23.093750,40.00,W13,0.218750,16650,predesigned_gene,True,0.632168,necpdakf-1,1.0
389542,282720518222964,UNASSIGNED,0,UBE2D3,668.418945,325.784180,21.046875,7.50,W13,1.140625,13702,predesigned_gene,True,,UNASSIGNED,
389543,282720518222968,bfjhblik-1,1,IFIT3,669.872070,335.502930,22.953125,40.00,W13,0.000000,13477,predesigned_gene,True,0.872147,bfjhblik-1,1.0


In [None]:
# TODO: read the Segger cell boundaries, e.g.
# cell_shapes = gpd.read_parquet(
#     PATH / "segger_output/benchmarks/segger_output_0.5_False_4_12_15_3_20250817/segger_boundaries.parquet"
# )

In [10]:
# TODO: read nuclei

In [None]:
# TODO: these are missing
# "/g/huber/projects/CODEX/segtraq/valid_testdata/BC_cellseg_10x/BC_Xenium_cellseg_sample/segger_output/benchmarks/
# segger_output_0.5_False_4_12_15_3_20250817/"

## BIDcell

In [12]:
bidcell_path = PATH / "bidcell_output"

# reading cell labels
# the label TIFF sits in a run folder matched by "202*", so it is located with a glob;
# sorting makes the pick reproducible when several run folders match
cell_labels_path = sorted(bidcell_path.glob("model_outputs/202*/test_output/epoch_4_step_100_connected.tif"))
if len(cell_labels_path) == 0:
    # fail with a clear message instead of an IndexError on the lookup below
    raise FileNotFoundError(
        "No cell labels found under model_outputs/202*/test_output/epoch_4_step_100_connected.tif"
    )
cell_labels = tifffile.imread(cell_labels_path[0])

# reading nucleus labels
nucleus_labels = tifffile.imread(bidcell_path / "nuclei.tif")

# reading the resized image
image = tifffile.imread(bidcell_path / "dapi_resized.tif")

# reading the processed transcripts (the first CSV column is used as the index)
transcripts = pd.read_csv(bidcell_path / "transcripts_processed.csv", index_col=0)

In [13]:
# gather BIDcell's cell-by-gene CSVs into one table, ordered by cell ID
bidcell_path = PATH / "bidcell_output"

# Table for sdata
all_files = list(bidcell_path.glob("cell_gene_matrices/202*/cell*.csv"))
if not all_files:
    raise FileNotFoundError("No CSVs found under cell_gene_matrices/202*/cell*.csv")

dfs = [pd.read_csv(csv_file) for csv_file in all_files]
merged_df = (
    pd.concat(dfs, ignore_index=True)
    .sort_values("cell_id")
    .reset_index(drop=True)
    # cell IDs are stored as integers
    .astype({"cell_id": int})
    # shorter, tool-independent column names
    .rename(
        columns={
            "cell_size": "cell_area",
            "cell_centroid_x": "centroid_x",
            "cell_centroid_y": "centroid_y",
        }
    )
)
merged_df

Unnamed: 0,cell_id,centroid_x,centroid_y,cell_area,A2ML1,AAMP,AAR2,AARSD1,ABAT,ABCA1,...,ZPR1,ZSCAN1,ZSCAN12,ZSCAN16,ZSCAN20,ZSCAN26,ZSWIM6,ZUP1,ZYG11B,ZYX
0,1,307.135593,3.372881,1306.574394,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,298.063830,5.404255,1040.830450,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,302.214286,4.428571,310.034602,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,333.245283,7.641509,1173.702422,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,287.787879,6.333333,730.795848,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2253,2254,340.725000,545.100000,885.813149,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2254,2255,299.938776,546.836735,1085.121107,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2255,2256,319.705128,545.538462,1727.335640,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2256,2257,277.612903,546.580645,686.505190,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# label masks are passed as a dict; each mask is stored under its dict key
sdata = st.fs.create_spatialdata(
    points=transcripts,
    labels={"cell_labels": cell_labels, "nucleus_labels": nucleus_labels},
    images=image,
    # optional, if your coordinates are not named x, y, z
    coord_columns=["x_location", "y_location", "z_location"],
)
sdata

[34mINFO    [0m no axes information specified in the object, setting `dims` to: [1m([0m[32m'y'[0m, [32m'x'[0m[1m)[0m                                
[34mINFO    [0m no axes information specified in the object, setting `dims` to: [1m([0m[32m'c'[0m, [32m'y'[0m, [32m'x'[0m[1m)[0m                           


SpatialData object
├── Images
│     └── 'image': DataArray[cyx] (1, 551, 680)
├── Labels
│     ├── 'cell_labels': DataArray[yx] (551, 680)
│     └── 'nucleus_labels': DataArray[yx] (551, 680)
└── Points
      └── 'transcripts': DataFrame with shape: (<Delayed>, 13) (3D points)
with coordinate systems:
    ▸ 'global', with elements:
        image (Images), cell_labels (Labels), nucleus_labels (Labels), transcripts (Points)

In [15]:
# shapes can be derived from a label mask; here they are computed from "cell_labels"
# and appear as "cell_boundaries" in the returned object
sdata = st.fs.compute_shapes(sdata, labels_key="cell_labels")
sdata

SpatialData object
├── Images
│     └── 'image': DataArray[cyx] (1, 551, 680)
├── Labels
│     ├── 'cell_labels': DataArray[yx] (551, 680)
│     └── 'nucleus_labels': DataArray[yx] (551, 680)
├── Points
│     └── 'transcripts': DataFrame with shape: (<Delayed>, 13) (3D points)
└── Shapes
      └── 'cell_boundaries': GeoDataFrame shape: (2276, 2) (2D shapes)
with coordinate systems:
    ▸ 'global', with elements:
        image (Images), cell_labels (Labels), nucleus_labels (Labels), transcripts (Points), cell_boundaries (Shapes)

In [16]:
# inspect the processed BIDcell transcripts loaded above
transcripts

Unnamed: 0,transcript_id,cell_id,overlaps_nucleus,feature_name,x_location,y_location,z_location,qv,fov_name,nucleus_distance,codeword_index,codeword_category,is_gene
0,282643208155248,bfnbkogm-1,0,AAMP,57.153320,163.29980,20.156250,35.75,V12,0.156250,3277,predesigned_gene,True
1,282643207975264,UNASSIGNED,0,ABCA1,69.262695,175.47168,23.562500,40.00,V12,0.937500,9629,predesigned_gene,True
2,282716222414197,nhlaipjn-1,1,ABCA1,73.387695,189.01855,21.593750,40.00,W12,0.000000,9629,predesigned_gene,True
3,282643207864202,bfndjloi-1,0,ABCA1,100.747070,141.87793,20.546875,36.25,V12,0.937500,9629,predesigned_gene,True
4,282716222540753,nhlcfdpj-1,1,ABCA7,85.856445,192.45605,22.890625,40.00,W12,0.000000,8133,predesigned_gene,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
293060,282720517240514,UNASSIGNED,0,ZNF395,618.950200,452.58105,21.093750,40.00,W13,0.468750,76,predesigned_gene,True
293061,282720517607183,nfjgchil-1,0,ZNF592,601.434600,451.25293,20.656250,40.00,W13,1.671875,12984,predesigned_gene,True
293062,282720518212562,UNASSIGNED,1,ZNF687,604.950200,459.14355,21.203125,40.00,W13,0.000000,3031,predesigned_gene,True
293063,282720517371680,UNASSIGNED,1,ZNF687,605.028300,460.28418,21.125000,30.75,W13,0.000000,3031,predesigned_gene,True


In [17]:
# TODO: FIX THE MAPPING FROM CELL TO LABEL IDS
# might want to create a method that does this
# st.fs.validate_spatialdata(sdata, cell_key_points='cell_id')

## Xenium

In [18]:
# image and label masks (TIFF)
image = tifffile.imread(PATH / "dapi_um.tif")
cell_labels = tifffile.imread(PATH / "cell_mask_um.tif")
nucleus_labels = tifffile.imread(PATH / "nuc_mask_um.tif")
# boundaries are read as plain pandas dataframes and converted to geopandas in the next step
cell_shapes = pd.read_parquet(PATH / "cell_boundaries.parquet")
nucleus_shapes = pd.read_parquet(PATH / "nucleus_boundaries.parquet")
# transcript table
transcript_df = pd.read_csv(PATH / "transcripts.csv")

In [19]:
# converting the data frames into geopandas dfs
# NOTE: the raw frames are overwritten here, so re-run the loading cell before executing this cell again
cell_shapes = st.fs.create_geopandas_df(cell_shapes)
nucleus_shapes = st.fs.create_geopandas_df(nucleus_shapes)

In [20]:
# build the Xenium SpatialData from the files loaded in this section
sdata = st.fs.create_spatialdata(
    # use the Xenium transcripts read above, not the `transcripts` frame left over from the BIDcell section
    points=transcript_df,
    labels={"cell_labels": cell_labels, "nucleus_labels": nucleus_labels},
    images=image,
    shapes={"cell_boundaries": cell_shapes, "nucleus_boundaries": nucleus_shapes},
    # optional, if your coordinates are not named x, y, z
    coord_columns=["x_location", "y_location", "z_location"],
    consolidate_shapes=True,
)
sdata

[34mINFO    [0m no axes information specified in the object, setting `dims` to: [1m([0m[32m'y'[0m, [32m'x'[0m[1m)[0m                                
[34mINFO    [0m no axes information specified in the object, setting `dims` to: [1m([0m[32m'c'[0m, [32m'y'[0m, [32m'x'[0m[1m)[0m                           


  validate_spatialdata(


SpatialData object
├── Images
│     └── 'image': DataArray[cyx] (1, 551, 680)
├── Labels
│     ├── 'cell_labels': DataArray[yx] (551, 680)
│     └── 'nucleus_labels': DataArray[yx] (551, 680)
├── Points
│     └── 'transcripts': DataFrame with shape: (<Delayed>, 13) (3D points)
└── Shapes
      ├── 'cell_boundaries': GeoDataFrame shape: (2198, 2) (2D shapes)
      └── 'nucleus_boundaries': GeoDataFrame shape: (2195, 2) (2D shapes)
with coordinate systems:
    ▸ 'global', with elements:
        image (Images), cell_labels (Labels), nucleus_labels (Labels), transcripts (Points), cell_boundaries (Shapes), nucleus_boundaries (Shapes)

In [21]:
# ensuring that the new spatialdata is consistent; "cell_id" is passed as the cell key of the points
st.fs.validate_spatialdata(sdata, cell_key_points="cell_id")

  st.fs.validate_spatialdata(sdata, cell_key_points="cell_id")


True