In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import anndata
import geopandas as gpd
import scanpy as sc
from PIL import Image # Converting image
import os # for parquet file
from pathlib import Path

# from tifffile import imread, imwrite
# from csbdeep.utils import normalize
# from stardist.models import StarDist2D
from shapely.geometry import Polygon, Point
from scipy import sparse
from matplotlib.colors import ListedColormap
from matplotlib.widgets import LassoSelector
from matplotlib.path import Path as MplPath 
%matplotlib widget

In [3]:
SEGMENTATION_PATH = Path("/mnt/c/Users/jonan/Documents/1Work/RoseLab/Spatial/"
                         "dietary_droject/data/cell_segmentation")

sample_id = "F07833"

adata_path = SEGMENTATION_PATH / sample_id / f"{sample_id}_grouped_filtered_adata_rebinned.h5ad"
gdf_file = SEGMENTATION_PATH / sample_id / f"{sample_id}_gdf_rebinned.gpkg"

# Load adata
ST_sample = sc.read_h5ad(adata_path)
# load your polygons
geo_file = gpd.read_file(gdf_file)

In [4]:
print(ST_sample.obs.columns) # shows columns in observation metadata (cells/spots)
print(ST_sample.var.columns) # shows columns in variable metadata (genes)

Index(['id'], dtype='object')
Index(['gene_ids', 'feature_types', 'genome'], dtype='object')


In [5]:
print(geo_file.columns)

Index(['id', 'area', 'geometry'], dtype='object')


In [14]:
ST_sample.obs['id']

ID_10            ID_10
ID_100          ID_100
ID_1000        ID_1000
ID_10000      ID_10000
ID_100000    ID_100000
               ...    
ID_99993      ID_99993
ID_99994      ID_99994
ID_99995      ID_99995
ID_99997      ID_99997
ID_99999      ID_99999
Name: id, Length: 216518, dtype: object

In [17]:
# 1. Grab ID strings
id_strings = ST_sample.obs['id']

# 2. Strip "ID_" and convert to integers
id_numbers = id_strings.str.replace("ID_", "").astype(int)

# 3. Compute summary statistics
id_summary = {
    "n_ids": len(id_numbers),
    "id_min": np.min(id_numbers),
    "id_q1": np.percentile(id_numbers, 25),
    "id_median": np.median(id_numbers),
    "id_mean": np.mean(id_numbers),
    "id_q3": np.percentile(id_numbers, 75),
    "id_max": np.max(id_numbers),
    "id_std": np.std(id_numbers),
    "id_iqr": np.percentile(id_numbers, 75) - np.percentile(id_numbers, 25)
}

# 4. Show nicely
summary_df = pd.DataFrame([id_summary])
print(summary_df.T)


                       0
n_ids      216518.000000
id_min          5.000000
id_q1       68713.500000
id_median  138935.500000
id_mean    139613.980939
id_q3      210232.750000
id_max     281723.000000
id_std      81568.878439
id_iqr     141519.250000


In [19]:
id_numbers.mode()

0              5
1              6
2              8
3              9
4             10
           ...  
216513    281718
216514    281719
216515    281720
216516    281721
216517    281723
Name: id, Length: 216518, dtype: int64

In [16]:

ST_sample.obs['id'].value_counts()


id
ID_99999     1
ID_10        1
ID_100       1
ID_1000      1
ID_10000     1
            ..
ID_100008    1
ID_100007    1
ID_100006    1
ID_100003    1
ID_100002    1
Name: count, Length: 216518, dtype: int64

In [7]:
ST_sample.var.head()

Unnamed: 0,gene_ids,feature_types,genome
Xkr4,ENSMUSG00000051951,Gene Expression,mm10
Rp1,ENSMUSG00000025900,Gene Expression,mm10
Sox17,ENSMUSG00000025902,Gene Expression,mm10
Lypla1,ENSMUSG00000025903,Gene Expression,mm10
Tcea1,ENSMUSG00000033813,Gene Expression,mm10


In [8]:
ST_sample.uns.keys()  # unstructured data (e.g., clustering results, metadata)

odict_keys([])

In [9]:
ST_sample.obsm.keys() # multidimensional annotations (e.g., PCA, UMAP coordinates)

KeysView(AxisArrays with keys: )

In [10]:
ST_sample.varm.keys()

KeysView(AxisArrays with keys: )

# Exploring geo_file

In [21]:
SEGMENTATION_PATH = Path("/mnt/c/Users/jonan/Documents/1Work/RoseLab/Spatial/"
                         "dietary_droject/data/cell_segmentation")

sample_id = "F07833"

# adata_path = SEGMENTATION_PATH / sample_id / f"{sample_id}_grouped_filtered_adata_rebinned.h5ad"
gdf_file = SEGMENTATION_PATH / sample_id / f"{sample_id}_gdf_rebinned.gpkg"

# Load adata
# ST_sample = sc.read_h5ad(adata_path)
# load your polygons
geo_file = gpd.read_file(gdf_file)

# Check ID and barcode mappings

In [24]:
# Checking the ID matching
import pandas as pd

SEGMENTATION_PATH = Path("/mnt/c/Users/jonan/Documents/1Work/RoseLab/Spatial/"
                         "dietary_droject/data/cell_segmentation")

sample_id = "F07833"

# Load your AnnData (filtered and rebinned version)
import scanpy as sc
ST_sample = sc.read_h5ad(SEGMENTATION_PATH / sample_id /f"{sample_id}_grouped_filtered_adata_rebinned.h5ad")
# Load your saved barcode-to-nucleus mapping
barcode_to_nucleus = pd.read_csv( SEGMENTATION_PATH / sample_id / f"{sample_id}_barcode_to_nucleus_mapping.csv")

In [25]:
# Extract IDs from ST_sample
ids_from_adata = set(ST_sample.obs['id'])

# Extract IDs from CSV
ids_from_csv = set(barcode_to_nucleus['id'])

# sanity checks:
missing_in_csv = ids_from_adata - ids_from_csv
missing_in_adata = ids_from_csv - ids_from_adata

print(f"Number of IDs in ST_sample: {len(ids_from_adata)}")
print(f"Number of IDs in CSV: {len(ids_from_csv)}")

if len(missing_in_csv) == 0 and len(missing_in_adata) == 0:
    print("✅ Perfect match: All IDs match between ST_sample and the CSV!")
else:
    print("❌ Mismatch detected:")
    if missing_in_csv:
        print(f" - IDs in ST_sample but missing from CSV: {missing_in_csv}")
    if missing_in_adata:
        print(f" - IDs in CSV but missing from ST_sample: {missing_in_adata}")

Number of IDs in ST_sample: 216518
Number of IDs in CSV: 216578
❌ Mismatch detected:
 - IDs in CSV but missing from ST_sample: {'ID_2309', 'ID_95063', 'ID_147850', 'ID_144575', 'ID_81345', 'ID_233710', 'ID_141685', 'ID_218323', 'ID_224233', 'ID_7804', 'ID_32781', 'ID_233267', 'ID_157798', 'ID_39639', 'ID_276835', 'ID_252433', 'ID_2', 'ID_1181', 'ID_190638', 'ID_180609', 'ID_53657', 'ID_90822', 'ID_266508', 'ID_278767', 'ID_253977', 'ID_194227', 'ID_28641', 'ID_260392', 'ID_210114', 'ID_26761', 'ID_129647', 'ID_271074', 'ID_175809', 'ID_184614', 'ID_272505', 'ID_196795', 'ID_146187', 'ID_260190', 'ID_207337', 'ID_253223', 'ID_253053', 'ID_197586', 'ID_263387', 'ID_449', 'ID_246860', 'ID_42258', 'ID_245044', 'ID_30802', 'ID_229150', 'ID_2187', 'ID_172435', 'ID_119020', 'ID_83355', 'ID_60796', 'ID_94443', 'ID_280549', 'ID_208474', 'ID_260134', 'ID_57794', 'ID_272197'}
