In [15]:
#!/usr/bin/env python3
import scanpy as sc
import pandas as pd
import geopandas as gpd
import pickle
import anndata
from pathlib import Path
from shapely.geometry import Polygon, Point
from scipy import sparse
import re
import matplotlib.pyplot as plt
import numpy as np


TODO:

1. Figure out the discrepancy source of polynomial ID in the adata file and that of the mapping file. There shouldn't be a discrepancy. Seems good overall though

In [16]:
# constants
BASE_DIR = Path("/mnt/c/Users/jonan/Documents/1Work/RoseLab/Spatial/"
                "dietary_droject/data/Rose_Li_VisiumHD")
SEG_PATH = Path("/mnt/c/Users/jonan/Documents/1Work/RoseLab/Spatial/"
                "dietary_droject/data/cell_segmentation")
SAMPLES = [
    "BANOSSM_SSM0015_1_PR_Whole_C1_VISHD_F07833_22WJCYLT3",
    "BANOSSM_SSM0015_1_PR_Whole_C1_VISHD_F07834_22WJCYLT3",
    "BANOSSM_SSM0015_1_PR_Whole_C1_VISHD_F07835_22WJCYLT3_swapped",
    "BANOSSM_SSM0015_1_PR_Whole_C1_VISHD_F07836_22WJCYLT3_swapped",
    "BANOSSM_SSM0015_1_PR_Whole_C1_VISHD_F07837_22WJCYLT3",
    "BANOSSM_SSM0015_1_PR_Whole_C1_VISHD_F07838_22WJCYLT3",
]

# Initialize a list to collect summary stats
umi_stats = []

# To make sure cells don't have a repeat ID
offset = 0

#Initializing list for UMI summary statistics
for sample in SAMPLES:
    sample_id = re.search(r'F\d{5}', sample).group(0)
    print(f"=== Rebinning {sample_id} ===\n")
    print(f"offset value starting: {offset}")
    
    #==============================================================================================#
    # load StarDist polygons
    #==============================================================================================#
    pkl_file = SEG_PATH / sample_id / "model_output" / f"{sample_id}_polys.pkl"
    with open(pkl_file, "rb") as f:
        polys = pickle.load(f)


    # Creating a list to store Polygon geometries
    geometries = []
    
    # Iterating through each nuclei in the 'polys' DataFrame
    for nuclei in range(len(polys['coord'])):
        # Extracting coordinates for the current nuclei and converting them to (y, x) format
        coords = [(y, x) for x, y in zip(polys['coord'][nuclei][0], polys['coord'][nuclei][1])]
        # Creating a Polygon geometry from the coordinates
        geometries.append(Polygon(coords))
    # Creating a GeoDataFrame using the Polygon geometries
    gdf = gpd.GeoDataFrame(geometry=geometries)
    gdf['id']   = [f"ID_{offset + i + 1}" for i, _ in enumerate(gdf.index)]
    gdf['area'] = gdf.geometry.area
    offset += len(gdf)

    #==============================================================================================#
    # load original Visium HD data
    #==============================================================================================#
    spatial_dir = BASE_DIR / sample / "outs" / "binned_outputs" / "square_002um"
    raw_h5_file  = spatial_dir / "filtered_feature_bc_matrix.h5"
    pq   = spatial_dir / "spatial" / "tissue_positions.parquet"
    
    adata = sc.read_10x_h5(str(raw_h5_file ))
    adata.var_names_make_unique()
    
    df_tissue_positions = pd.read_parquet(str(pq))
    df_tissue_positions = df_tissue_positions.set_index("barcode")
    df_tissue_positions['index'] = df_tissue_positions.index
    adata.obs = pd.merge(adata.obs, df_tissue_positions, left_index=True, right_index=True)

    #==============================================================================================#
    # create GeoDataFrame of barcodes
    #==============================================================================================#
    
    geometry = [Point(xy) for xy in zip(
        df_tissue_positions['pxl_col_in_fullres'], df_tissue_positions['pxl_row_in_fullres']
    )]
    gdf_coordinates = gpd.GeoDataFrame(df_tissue_positions, geometry=geometry)

    #==============================================================================================#
    # spatial join and filtering
    #==============================================================================================#
    # Perform a spatial join to check which coordinates are in a cell nucleus
    result_spatial_join = gpd.sjoin(
        gdf_coordinates, gdf, how='left', predicate='within'
    )
    
    # Identify nuclei associated barcodes and find barcodes that are in more than one nucleus
    result_spatial_join['is_within_polygon'] = ~result_spatial_join['index_right'].isna()
    barcodes_in_overlaping_polygons  = pd.unique(
        result_spatial_join[result_spatial_join.duplicated(subset=['index'])]['index']
    )
    result_spatial_join['is_not_in_an_polygon_overlap'] = ~result_spatial_join['index'].isin(barcodes_in_overlaping_polygons)


    # Remove barcodes in overlapping nuclei
    barcodes_in_one_polygon = result_spatial_join[
        result_spatial_join['is_within_polygon'] & 
        result_spatial_join['is_not_in_an_polygon_overlap']
    ]

    # The AnnData object is filtered to only contain the barcodes that are in non-overlapping polygon regions
    filtered_obs_mask = adata.obs_names.isin(barcodes_in_one_polygon['index'])
    filtered_adata = adata[filtered_obs_mask,:]

    # Save barcode -> nucleus ID mapping
    barcode_nucleus_mapping = barcodes_in_one_polygon[['index', 'id']].copy()
    barcode_nucleus_mapping.to_csv(SEG_PATH / sample_id / f"{sample_id}_barcode_to_nucleus_mapping.csv", index=False)
    print(f"Saved barcode-to-nucleus mapping for {sample_id}")


    # Add the results of the point spatial join to the Anndata object
    filtered_adata.obs = pd.merge(
        filtered_adata.obs, 
        barcodes_in_one_polygon[['index','geometry','id','is_within_polygon','is_not_in_an_polygon_overlap']], 
        left_index=True, right_index=True)
    
    #==============================================================================================#
    # summation
    #==============================================================================================#
    # Group the data by unique nucleous IDs
    groupby_object = filtered_adata.obs.groupby(['id'], observed=True)

    # Extract the gene expression counts from the AnnData object
    counts = filtered_adata.X

    # Obtain the number of unique nuclei and the number of genes in the expression data
    N_groups = groupby_object.ngroups
    N_genes = counts.shape[1]

    # Initialize a sparse matrix to store the summed gene counts for each nucleus
    summed_counts = sparse.lil_matrix((N_groups, N_genes))

    # Lists to store the IDs of polygons and the current row index
    polygon_id = []
    row = 0

    # Iterate over each unique polygon to calculate the sum of gene counts.
    for polygons, idx_ in groupby_object.indices.items():
        summed_counts[row] = counts[idx_].sum(0)
        row += 1
        polygon_id.append(polygons)
        

    # Create and AnnData object from the summed count matrix
    summed_counts = summed_counts.tocsr()
    rebinned_adata = anndata.AnnData(
        X=summed_counts,
        obs=pd.DataFrame(polygon_id, columns=['id'], index=polygon_id),
        var=filtered_adata.var
    )
    %store rebinned_adata
    
    #==============================================================================================#
    # save with "_rebinned" suffix
    #==============================================================================================#
    out_dir = SEG_PATH / sample_id
    rebinned_adata.write(out_dir / f"{sample_id}_grouped_filtered_adata_rebinned.h5ad")
    gdf.to_file(out_dir / f"{sample_id}_gdf_rebinned.gpkg", driver="GPKG")
    
    print(f"→ Saved rebinned for {sample_id}")
    
    #===========================================================================#
    # UMI Figure
    #===========================================================================#

    # ensure your figures folder exists
    fig_out = SEG_PATH / sample_id / "figures"
    fig_out.mkdir(parents=True, exist_ok=True)
    
    # Compute total UMI per nucleus from the rebinned AnnData
    total_umis = rebinned_adata.X.sum(axis=1).A1   # flatten sparse matrix
    rebinned_adata.obs['total_umis'] = total_umis
    
    # Merge UMI counts into the rebinned GeoDataFrame (gdf)
    gdf_umi = gdf.merge(
        rebinned_adata.obs[['total_umis']],
        left_on='id',
        right_index=True
    )

    # Fig
    fig, ax = plt.subplots(figsize=(20, 20))
    
    # Draw bounding box
    xmin, ymin, xmax, ymax = gdf_umi.total_bounds
    ax.plot(
        [xmin, xmax, xmax, xmin, xmin],  # x coordinates (loop closed)
        [ymin, ymin, ymax, ymax, ymin],  # y coordinates
        color="cyan", linewidth=2, linestyle="--", label="Bounding Box"
    )
    
    # Plot with color scale capped
    gdf_umi.plot(
        column='total_umis',
        cmap='inferno',
        legend=True,
        linewidth=0.1,
        edgecolor='black',
        ax=ax,
        vmin=0,    # start colorbar at 0
        vmax=300   # cap colorbar at 200
    )
    
    # Set title and enable axis coordinates
    ax.set_title(f"UMI Counts per Nucleus (rebinned {sample_id})", fontsize=18)
    ax.set_xlabel("X Coordinate (pixels)")
    ax.set_ylabel("Y Coordinate (pixels)")
    ax.legend()
    ax.axis('on')  # Turn axes back ON
    
    # Tight layout and save
    plt.tight_layout()
    out_file = fig_out / f"{sample_id}_umi_rebinned.png"
    fig.savefig(out_file, dpi=600, bbox_inches='tight')
    plt.close(fig)

    print(f"→ Saved UMI map with bounding box: {out_file}")

    # ===========================================
    # Now make second figure: filtered at UMI > 50
    # ===========================================

    # Filter gdf_umi to only nuclei with total_umis > 50
    gdf_umi_filtered = gdf_umi[gdf_umi['total_umis'] > 50]
    
    # Create new figure
    fig2, ax2 = plt.subplots(figsize=(20, 20))
    
    # Plot filtered data
    gdf_umi_filtered.plot(
        column='total_umis',
        cmap='inferno',
        legend=True,
        linewidth=0.1,
        edgecolor='black',
        ax=ax2,
        vmin=50,
        vmax=300
    )
    
    # Draw bounding box (same xmin, xmax, ymin, ymax as before)
    ax2.plot(
        [xmin, xmax, xmax, xmin, xmin],
        [ymin, ymin, ymax, ymax, ymin],
        color="cyan", linewidth=2, linestyle="--", label="Bounding Box"
    )
    
    # Set title and labels
    ax2.set_title(f"UMI > 50 per Nucleus (rebinned {sample_id})", fontsize=18)
    ax2.set_xlabel("X Coordinate (pixels)")
    ax2.set_ylabel("Y Coordinate (pixels)")
    ax2.legend()
    ax2.axis('on')
    
    # Save second figure
    plt.tight_layout()
    out_file2 = fig_out / f"{sample_id}_umi_rebinned_filtered.png"
    fig2.savefig(out_file2, dpi=600, bbox_inches='tight')
    plt.close(fig2)
    
    print(f"→ Saved UMI map (filtered) with bounding box: {out_file2}")

    umi_values = rebinned_adata.obs['total_umis']
    stats = {
        'sample_id': sample_id,
        'n_nuclei': len(umi_values),
        'umi_min': np.min(umi_values),
        'umi_q1': np.percentile(umi_values, 25),
        'umi_median': np.median(umi_values),
        'umi_mean': np.mean(umi_values),
        'umi_q3': np.percentile(umi_values, 75),
        'umi_max': np.max(umi_values),
        'umi_std': np.std(umi_values)
    }
    umi_stats.append(stats)

    print(f"Analysis complete for {sample_id} \n")
    print(f"offset value ending: {offset}")
        
    

=== Rebinning F07833 ===

offset value starting: 0


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


Saved barcode-to-nucleus mapping for F07833


  db[ 'autorestore/' + arg ] = obj


Stored 'rebinned_adata' (AnnData)


  write(


→ Saved rebinned for F07833
→ Saved UMI map with bounding box: /mnt/c/Users/jonan/Documents/1Work/RoseLab/Spatial/dietary_droject/data/cell_segmentation/F07833/figures/F07833_umi_rebinned.png
→ Saved UMI map (filtered) with bounding box: /mnt/c/Users/jonan/Documents/1Work/RoseLab/Spatial/dietary_droject/data/cell_segmentation/F07833/figures/F07833_umi_rebinned_filtered.png
Analysis complete for F07833 

offset value ending: 281723
=== Rebinning F07834 ===

offset value starting: 281723


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


Saved barcode-to-nucleus mapping for F07834


  db[ 'autorestore/' + arg ] = obj


Stored 'rebinned_adata' (AnnData)


  write(


→ Saved rebinned for F07834
→ Saved UMI map with bounding box: /mnt/c/Users/jonan/Documents/1Work/RoseLab/Spatial/dietary_droject/data/cell_segmentation/F07834/figures/F07834_umi_rebinned.png
→ Saved UMI map (filtered) with bounding box: /mnt/c/Users/jonan/Documents/1Work/RoseLab/Spatial/dietary_droject/data/cell_segmentation/F07834/figures/F07834_umi_rebinned_filtered.png
Analysis complete for F07834 

offset value ending: 546384
=== Rebinning F07835 ===

offset value starting: 546384


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


Saved barcode-to-nucleus mapping for F07835


  db[ 'autorestore/' + arg ] = obj


Stored 'rebinned_adata' (AnnData)


  write(


→ Saved rebinned for F07835
→ Saved UMI map with bounding box: /mnt/c/Users/jonan/Documents/1Work/RoseLab/Spatial/dietary_droject/data/cell_segmentation/F07835/figures/F07835_umi_rebinned.png
→ Saved UMI map (filtered) with bounding box: /mnt/c/Users/jonan/Documents/1Work/RoseLab/Spatial/dietary_droject/data/cell_segmentation/F07835/figures/F07835_umi_rebinned_filtered.png
Analysis complete for F07835 

offset value ending: 856466
=== Rebinning F07836 ===

offset value starting: 856466


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


Saved barcode-to-nucleus mapping for F07836


  db[ 'autorestore/' + arg ] = obj


Stored 'rebinned_adata' (AnnData)


  write(


→ Saved rebinned for F07836
→ Saved UMI map with bounding box: /mnt/c/Users/jonan/Documents/1Work/RoseLab/Spatial/dietary_droject/data/cell_segmentation/F07836/figures/F07836_umi_rebinned.png
→ Saved UMI map (filtered) with bounding box: /mnt/c/Users/jonan/Documents/1Work/RoseLab/Spatial/dietary_droject/data/cell_segmentation/F07836/figures/F07836_umi_rebinned_filtered.png
Analysis complete for F07836 

offset value ending: 1144565
=== Rebinning F07837 ===

offset value starting: 1144565


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


Saved barcode-to-nucleus mapping for F07837


  db[ 'autorestore/' + arg ] = obj


Stored 'rebinned_adata' (AnnData)


  write(


→ Saved rebinned for F07837
→ Saved UMI map with bounding box: /mnt/c/Users/jonan/Documents/1Work/RoseLab/Spatial/dietary_droject/data/cell_segmentation/F07837/figures/F07837_umi_rebinned.png
→ Saved UMI map (filtered) with bounding box: /mnt/c/Users/jonan/Documents/1Work/RoseLab/Spatial/dietary_droject/data/cell_segmentation/F07837/figures/F07837_umi_rebinned_filtered.png
Analysis complete for F07837 

offset value ending: 1440475
=== Rebinning F07838 ===

offset value starting: 1440475


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


Saved barcode-to-nucleus mapping for F07838


  db[ 'autorestore/' + arg ] = obj


Stored 'rebinned_adata' (AnnData)


  write(


→ Saved rebinned for F07838
→ Saved UMI map with bounding box: /mnt/c/Users/jonan/Documents/1Work/RoseLab/Spatial/dietary_droject/data/cell_segmentation/F07838/figures/F07838_umi_rebinned.png
→ Saved UMI map (filtered) with bounding box: /mnt/c/Users/jonan/Documents/1Work/RoseLab/Spatial/dietary_droject/data/cell_segmentation/F07838/figures/F07838_umi_rebinned_filtered.png
Analysis complete for F07838 

offset value ending: 1648501


In [7]:
offset

0

In [11]:
# Save UMI stats summary table
umi_stats_df = pd.DataFrame(umi_stats)
summary_file = SEG_PATH / "umi_summary_rebinned.csv"
umi_stats_df.to_csv(summary_file, index=False)
print(f"→ Saved UMI summary table: {summary_file}")

→ Saved UMI summary table: /mnt/c/Users/jonan/Documents/1Work/RoseLab/Spatial/dietary_droject/data/cell_segmentation/umi_summary_rebinned.csv


In [9]:
bounds = gdf_umi.total_bounds
xmin, ymin, xmax, ymax = bounds

print(f"xmin: {xmin}")
print(f"xmax: {xmax}")
print(f"ymin: {ymin}")
print(f"ymax: {ymax}")


xmin: 2033.7506103515625
xmax: 5859.31640625
ymin: 2374.890380859375
ymax: 5647.9951171875


In [3]:
# Helper function
def print_bounds(name, bounds_array):
    xmin, ymin, xmax, ymax = bounds_array
    print(f"{name}:")
    print(f"  xmin = {xmin:.2f}")
    print(f"  xmax = {xmax:.2f}")
    print(f"  ymin = {ymin:.2f}")
    print(f"  ymax = {ymax:.2f}")
    print("-" * 40)

# 1. polys
# polys is just coordinates, so manually calculate
all_x = []
all_y = []
for coord in polys["coord"]:
    ys, xs = coord
    all_x.extend(xs)
    all_y.extend(ys)

xmin_poly = min(all_x)
xmax_poly = max(all_x)
ymin_poly = min(all_y)
ymax_poly = max(all_y)
print_bounds("polys", [xmin_poly, ymin_poly, xmax_poly, ymax_poly])

# 2. df_tissue_positions
xmin_tp = df_tissue_positions["pxl_col_in_fullres"].min()
xmax_tp = df_tissue_positions["pxl_col_in_fullres"].max()
ymin_tp = df_tissue_positions["pxl_row_in_fullres"].min()
ymax_tp = df_tissue_positions["pxl_row_in_fullres"].max()
print_bounds("df_tissue_positions_parquet", [xmin_tp, ymin_tp, xmax_tp, ymax_tp])

# 3. gdf_coordinates
print_bounds("gdf_coordinates", gdf_coordinates.total_bounds)

# 4. barcodes_in_one_polygon
# First merge back into coordinates to get geometry
barcodes_df = gdf_coordinates[gdf_coordinates.index.isin(barcodes_in_one_polygon["index"])]
print_bounds("barcodes_in_one_polygon", barcodes_df.total_bounds)

# 5. filtered_adata
# filtered_adata.obs contains pxl_col_in_fullres and pxl_row_in_fullres from earlier merge
xmin_filt = filtered_adata.obs["pxl_col_in_fullres"].min()
xmax_filt = filtered_adata.obs["pxl_col_in_fullres"].max()
ymin_filt = filtered_adata.obs["pxl_row_in_fullres"].min()
ymax_filt = filtered_adata.obs["pxl_row_in_fullres"].max()
print_bounds("filtered_adata", [xmin_filt, ymin_filt, xmax_filt, ymax_filt])


NameError: name 'polys' is not defined

In [None]:
# The image being used doesn't match

In [5]:
from PIL import Image
import numpy as np

img_path = "/mnt/c/Users/jonan/Documents/1Work/RoseLab/Spatial/dietary_droject/data/Rose_Li_VisiumHD/BANOSSM_SSM0015_1_PR_Whole_C1_VISHD_F07834_22WJCYLT3/outs/binned_outputs/square_002um/spatial/tissue_hires_image.png"
# img_path = "/mnt/c/Users/jonan/Documents/1Work/RoseLab/Spatial/dietary_droject/data/images_for_alignments/111324-111424_Training_TMA2/hne/MycCAP_TMA2_Slide1.tif"
img = Image.open(img_path)
img_np = np.array(img)
print("Image shape (height, width, channels):", img_np.shape)


Image shape (height, width, channels): (5870, 6000, 3)


In [4]:
import tifffile

img_path = "/mnt/c/Users/jonan/Documents/1Work/RoseLab/Spatial/dietary_droject/data/images_for_alignments/111324-111424_Training_TMA2/hne/MycCAP_TMA2_Slide1.tif"

img = tifffile.imread(img_path)
print("TIFF image shape:", img.shape)


TIFF image shape: (12536, 12814, 3)


In [3]:
HNE_TIF_PATHS = {
    "F07833": Path("/mnt/c/Users/jonan/Documents/1Work/RoseLab/Spatial/dietary_droject/data/images_for_alignments/111324-111424_Training_TMA2/hne/MycCAP_TMA2_Slide2.tif"),
    "F07834": Path("/mnt/c/Users/jonan/Documents/1Work/RoseLab/Spatial/dietary_droject/data/images_for_alignments/111324-111424_Training_TMA2/hne/MycCAP_TMA2_Slide1.tif"),
    "F07835": Path("/mnt/c/Users/jonan/Documents/1Work/RoseLab/Spatial/dietary_droject/data/images_for_alignments/120524-120624_MycCap_TMA1_1_TMA3_1/hne/MycCap_TMA1_slide1.tif"),
    "F07836": Path("/mnt/c/Users/jonan/Documents/1Work/RoseLab/Spatial/dietary_droject/data/images_for_alignments/120524-120624_MycCap_TMA1_1_TMA3_1/hne/RL_MycCap_TMA3_slide1.tif"),
    "F07837": Path("/mnt/c/Users/jonan/Documents/1Work/RoseLab/Spatial/dietary_droject/data/images_for_alignments/121124-121224_SKO_TMA1_1_n_TMA2_1/hne/12_11_2024_RL_SKOTMA1_Slide_1.tif"),
    "F07838": Path("/mnt/c/Users/jonan/Documents/1Work/RoseLab/Spatial/dietary_droject/data/images_for_alignments/121124-121224_SKO_TMA1_1_n_TMA2_1/hne/12_11_2024_RL_SKOTMA2 Slide_1.tif"),
}

for sample_id, path in HNE_TIF_PATHS.items():
    if path.exists():
        print(f"✅ {sample_id}: Found {path}")
    else:
        print(f"❌ {sample_id}: MISSING {path}")


✅ F07833: Found /mnt/c/Users/jonan/Documents/1Work/RoseLab/Spatial/dietary_droject/data/images_for_alignments/111324-111424_Training_TMA2/hne/MycCAP_TMA2_Slide2.tif
✅ F07834: Found /mnt/c/Users/jonan/Documents/1Work/RoseLab/Spatial/dietary_droject/data/images_for_alignments/111324-111424_Training_TMA2/hne/MycCAP_TMA2_Slide1.tif
✅ F07835: Found /mnt/c/Users/jonan/Documents/1Work/RoseLab/Spatial/dietary_droject/data/images_for_alignments/120524-120624_MycCap_TMA1_1_TMA3_1/hne/MycCap_TMA1_slide1.tif
✅ F07836: Found /mnt/c/Users/jonan/Documents/1Work/RoseLab/Spatial/dietary_droject/data/images_for_alignments/120524-120624_MycCap_TMA1_1_TMA3_1/hne/RL_MycCap_TMA3_slide1.tif
✅ F07837: Found /mnt/c/Users/jonan/Documents/1Work/RoseLab/Spatial/dietary_droject/data/images_for_alignments/121124-121224_SKO_TMA1_1_n_TMA2_1/hne/12_11_2024_RL_SKOTMA1_Slide_1.tif
✅ F07838: Found /mnt/c/Users/jonan/Documents/1Work/RoseLab/Spatial/dietary_droject/data/images_for_alignments/121124-121224_SKO_TMA1_1_n_TMA2

In [1]:
import pandas as pd
import tifffile
from shapely.geometry import Point
import geopandas as gpd
from pathlib import Path

# Paths
BASE_DIR = Path("/mnt/c/Users/jonan/Documents/1Work/RoseLab/Spatial/dietary_droject/data/Rose_Li_VisiumHD")
HNE_TIF_PATHS = {
    "F07833": Path("/mnt/c/Users/jonan/Documents/1Work/RoseLab/Spatial/dietary_droject/data/images_for_alignments/111324-111424_Training_TMA2/hne/MycCAP_TMA2_Slide2.tif"),
    "F07834": Path("/mnt/c/Users/jonan/Documents/1Work/RoseLab/Spatial/dietary_droject/data/images_for_alignments/111324-111424_Training_TMA2/hne/MycCAP_TMA2_Slide1.tif"),
    "F07835": Path("/mnt/c/Users/jonan/Documents/1Work/RoseLab/Spatial/dietary_droject/data/images_for_alignments/120524-120624_MycCap_TMA1_1_TMA3_1/hne/MycCap_TMA1_slide1.tif"),
    "F07836": Path("/mnt/c/Users/jonan/Documents/1Work/RoseLab/Spatial/dietary_droject/data/images_for_alignments/120524-120624_MycCap_TMA1_1_TMA3_1/hne/RL_MycCap_TMA3_slide1.tif"),
    "F07837": Path("/mnt/c/Users/jonan/Documents/1Work/RoseLab/Spatial/dietary_droject/data/images_for_alignments/121124-121224_SKO_TMA1_1_n_TMA2_1/hne/12_11_2024_RL_SKOTMA1_Slide_1.tif"),
    "F07838": Path("/mnt/c/Users/jonan/Documents/1Work/RoseLab/Spatial/dietary_droject/data/images_for_alignments/121124-121224_SKO_TMA1_1_n_TMA2_1/hne/12_11_2024_RL_SKOTMA2 Slide_1.tif"),
}

# Matching samples
SAMPLES = [
    "BANOSSM_SSM0015_1_PR_Whole_C1_VISHD_F07833_22WJCYLT3",
    "BANOSSM_SSM0015_1_PR_Whole_C1_VISHD_F07834_22WJCYLT3",
    "BANOSSM_SSM0015_1_PR_Whole_C1_VISHD_F07835_22WJCYLT3",
    "BANOSSM_SSM0015_1_PR_Whole_C1_VISHD_F07836_22WJCYLT3",
    "BANOSSM_SSM0015_1_PR_Whole_C1_VISHD_F07837_22WJCYLT3",
    "BANOSSM_SSM0015_1_PR_Whole_C1_VISHD_F07838_22WJCYLT3",
]

print("\n=== Checking Parquet Coordinates and TIFF Image Shapes ===\n")

for sample in SAMPLES:
    sample_id = sample.split("_F")[-1].split("_")[0]  # F07833, etc.

    # --- Load parquet file ---
    pq_path = BASE_DIR / sample / "outs" / "binned_outputs" / "square_002um" / "spatial" / "tissue_positions.parquet"
    if pq_path.exists():
        df_pos = pd.read_parquet(pq_path).set_index("barcode")
        geometry = [Point(xy) for xy in zip(df_pos["pxl_col_in_fullres"], df_pos["pxl_row_in_fullres"])]
        gdf_coords = gpd.GeoDataFrame(df_pos, geometry=geometry)
        xmin, ymin, xmax, ymax = gdf_coords.total_bounds
        print(f"📄 {sample_id} Parquet: xmin={xmin:.2f}, xmax={xmax:.2f}, ymin={ymin:.2f}, ymax={ymax:.2f}")
    else:
        print(f"❌ {sample_id} Parquet missing at {pq_path}")

    # --- Load tif image ---
    tif_path = HNE_TIF_PATHS.get(sample_id)
    if tif_path and tif_path.exists():
        img = tifffile.imread(str(tif_path))
        print(f"🖼️  {sample_id} TIF shape: {img.shape}\n")
    else:
        print(f"❌ {sample_id} TIF missing at {tif_path}\n")



=== Checking Parquet Coordinates and TIFF Image Shapes ===

📄 07833 Parquet: xmin=933.89, xmax=10739.15, ymin=1045.67, ymax=10850.66
❌ 07833 TIF missing at None



KeyboardInterrupt: 