In [None]:
#%pip install openslide-bin
#%pip install openslide-python
#%pip install geojson
#%pip install openpyxl

In [None]:
import pandas as pd
import os, glob, ast, geojson
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib.colors import to_rgb
import openslide
from openslide import OpenSlide

In [None]:
# ---------------------------------------------------------------------------
# Load metadata
# ---------------------------------------------------------------------------
# Three sheets are read from the sample-list workbook by position. Sheet 1 is
# read as strings because its matrix columns contain commas that have to be
# stripped before the float conversion further down.
xls = pd.ExcelFile("/storage/homefs/ha25g949/pannet_metabolism/scripts/parallelized/240208_sample_list_DESI_RNA_coregistration.xlsx")
df_preview = pd.read_excel(xls, sheet_name=0)
coreg_IHC = pd.read_excel(xls, sheet_name=1, dtype="str")
coreg_mat = pd.read_excel(xls, sheet_name=2)

df_preview["patient_uid"] = df_preview["aPnumber"] + "_" + df_preview["B_Nr"]
print(df_preview["patient_uid"] )

# List all HE images; the sample name is the directory that holds the slide.
HE_files =  glob.glob("/storage/research/igmp_grp_perren/raw_data_DESI_imaging/PanNET_Umara/HE_and_IHC_scans/**/*[NP]_HE.mrxs", recursive=True)
HE_images = pd.DataFrame(HE_files, columns=["HE_path"])
HE_images["sample"] = HE_images["HE_path"].str.extract(r".*/([^/]+)/[^/]*[PN]_HE\.mrxs$")
print(HE_images["sample"])

# List all scan regions; the sample name is the directory above "QuPath".
geojson_files = glob.glob("/storage/research/igmp_grp_perren/raw_data_DESI_imaging/PanNET_Umara/HE_and_IHC_scans/*/*/QuPath/*HE.geojson")
scan_regions = pd.DataFrame(geojson_files, columns=["geojson_path"])
scan_regions["sample"] = scan_regions["geojson_path"].str.extract(r".*/([^/]+)/QuPath/[^/]*HE\.geojson$")

# Merge into meta data. Both merges are left joins on "sample", so every row of
# df_preview is kept and samples without a slide / GeoJSON get NaN paths.
DESI_meta_data = df_preview.merge(HE_images, on="sample", how="left")
DESI_meta_data = DESI_meta_data.merge(scan_regions, on="sample", how="left", suffixes=('', '_geojson'))
print(DESI_meta_data)

# ---------------------------------------------------------------------------
# Prepare coregistration matrices
# ---------------------------------------------------------------------------
# Keep only rows with at least three non-missing cells, then propagate the
# sample name downwards so every matrix row carries the name of its sample.
coreg_IHC.dropna(inplace=True, thresh=3)
coreg_IHC['normal_sample'] = coreg_IHC['normal_sample'].ffill()

# Columns that hold the matrix entries (one matrix row per sheet row).
matrix_columns = ['manual_transform', 'Unnamed: 2', 'Unnamed: 3']

# Clean and convert matrix columns: drop the commas, then parse as float.
for col in matrix_columns:
    coreg_IHC[col] = coreg_IHC[col].str.replace(',', '', regex=False).astype(float)

# Build matrix DataFrame: one row per distinct sample name, matrix filled below.
coreg_IHC_matrices = pd.DataFrame({
    'sample': coreg_IHC['normal_sample'].unique(),
    'transformation_matrix': None
})

# Fill transformation matrices: stack all sheet rows of a sample into one 2-D array.
for row in coreg_IHC_matrices.index:
    sample_name = coreg_IHC_matrices.at[row, 'sample']
    belongs_to_sample = coreg_IHC['normal_sample'] == sample_name
    coreg_IHC_matrices.at[row, 'transformation_matrix'] = coreg_IHC.loc[belongs_to_sample, matrix_columns].values

# Drop any rows without a matrix (or without a sample name)
coreg_IHC_matrices.dropna(inplace=True)

In [None]:
def add_DESI_overlay(image_name, umap=True, cluster_dir=None):
    """
    Create GeoJSON overlays of the DESI clustering for one sample, for viewing in QuPath.

    The sample's Leiden-cluster CSV is located, every row of it is turned into a
    square polygon placed inside the slide's 'DESI_area' annotation, and the
    polygons are written next to the CSV:

    * ``<csv stem>.geojson``       -- polygons classified and colored per cluster
    * ``<csv stem>_umap.geojson``  -- polygons colored by ``RGB_color`` (only if ``umap``)

    Parameters
    ----------
    image_name : str
        Path of the H&E slide. The image ID is taken from fixed positions of the
        underscore-separated path (see step 1), so the path layout matters.
    umap : bool, default True
        If True, also parse list/tuple/dict-like strings in the ``cluster``
        column and write the additional UMAP-colored overlay.
    cluster_dir : str or os.PathLike, optional
        Directory searched for ``*leiden_clusters.csv`` files. Defaults to the
        positive-mode ``leiden_clustering`` directory used so far.

    Returns
    -------
    None
        Errors are not raised to the caller: they are caught and reported with
        ``print`` so that a batch loop over many slides keeps going.

    Notes
    -----
    Reads the module-level ``DESI_meta_data`` frame (columns ``sample``,
    ``HE_path``, ``geojson_path``, ``pixel_size``).
    """
    try:

        # 1. Extract image ID.
        # NOTE(review): this relies on the number of underscores in the absolute
        # path (pieces 10..12) -- it breaks if the data is moved; confirm layout.
        parts = image_name.split("_")
        if len(parts) < 13:
            raise ValueError(f"Image name format unexpected: {image_name}")
        image_id = "_".join(parts[10:13]).split("/")[1]

        # 2. Find matching cluster file
        if cluster_dir is None:
            cluster_dir = "/storage/homefs/ha25g949/pannet_metabolism/parallel/rms/total/positive/frequency_0.1/leiden_clustering/"
        cluster_files = [
            os.path.join(cluster_dir, f)
            for f in os.listdir(cluster_dir)
            if f.endswith('leiden_clusters.csv') and image_id in f
        ]
        print(f"Cluster Files: {cluster_files}")

        if not cluster_files:
            raise FileNotFoundError(f"No clustering files found for: {image_id}")

        # If several files match, the first one in directory-listing order is used.
        cluster_csv_path = cluster_files[0]
        if not os.path.exists(cluster_csv_path):
            raise FileNotFoundError(f"Invalid cluster CSV path: {cluster_csv_path}")

        # Output files share the CSV's path without its extension.
        cluster_path = os.path.splitext(cluster_csv_path)[0]

        # 3. Get metadata for this image
        meta = DESI_meta_data.loc[DESI_meta_data["sample"] == image_id]
        if meta.empty:
            raise ValueError(f"Sample {image_id} not found in DESI_meta_data.")

        wsi_path = meta["HE_path"].values[0]
        geojson_path = meta["geojson_path"].values[0]
        DESI_pixel_size = meta["pixel_size"].values[0]
        print(f"wsi_path: {wsi_path}")
        # The isinstance check rejects NaN placeholders left by the left merges.
        for path_name, path in [("H&E slide", wsi_path), ("GeoJSON", geojson_path)]:
            if not isinstance(path, (str, os.PathLike)) or not os.path.exists(path):
                raise FileNotFoundError(f"{path_name} not found or invalid: {path}")

        # 4. Open WSI and extract resolution. Only the property is needed, so the
        # slide handle is released again right away instead of leaking per call.
        with OpenSlide(wsi_path) as slide:
            mpp = float(slide.properties["openslide.mpp-x"])

        # 5. Load DESI area from geojson
        with open(geojson_path) as f:
            data = geojson.load(f)

        geometry = None
        for feature in data["features"]:
            if feature["properties"].get("name") == "DESI_area":
                geometry = feature['geometry']
                break
        if geometry is None:
            raise ValueError(f"No 'DESI_area' found in {geojson_path}.")

        # Bounding box of the annotation: min/max over the vertices of each ring.
        sr_coords = np.array(geometry['coordinates'])
        sr_origin = np.min(sr_coords, axis=1)
        sr_max = np.max(sr_coords, axis=1)
        new_origin = (int(sr_origin[0, 0]), int(sr_origin[0, 1]))

        # 6. Load cluster data
        cluster_data = pd.read_csv(cluster_csv_path)
        if cluster_data.empty:
            raise ValueError(f"Clustering data is empty in {cluster_csv_path}")
        if umap:
            cluster_data['cluster'] = cluster_data['cluster'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) and (x.startswith("[") or x.startswith("(") or x.startswith("{")) else x)

        # 7. Compute scaling: edge length of one DESI pixel in slide pixels.
        DESI_pixel_width = DESI_pixel_size / mpp
        if not np.isfinite(DESI_pixel_width) or DESI_pixel_width <= 0:
            # Without this guard NaN/inf/negative sizes end up as garbage
            # coordinates after the unsigned-integer cast below.
            raise ValueError(
                f"Invalid DESI pixel width {DESI_pixel_width} "
                f"(pixel_size={DESI_pixel_size}, mpp={mpp}) for sample {image_id}."
            )

        # Closed square ring for the pixel sitting at the annotation's origin.
        first_pixel = np.array([
            [new_origin[0], new_origin[1]],
            [new_origin[0], new_origin[1] + DESI_pixel_width],
            [new_origin[0] + DESI_pixel_width, new_origin[1] + DESI_pixel_width],
            [new_origin[0] + DESI_pixel_width, new_origin[1]],
            [new_origin[0], new_origin[1]],
        ])

        # The DESI "x" index moves the polygon along the slide's second coordinate
        # and the DESI "y" index (flipped against max_y) along the first one.
        # NOTE(review): presumably the DESI image is rotated relative to the
        # slide -- confirm against a rendered overlay.
        x_shift = np.array([[0, DESI_pixel_width]])
        y_shift = np.array([[DESI_pixel_width, 0]])
        max_y = (sr_max[0][0] - sr_origin[0][0]) // DESI_pixel_width

        # 8. Generate overlays
        indi_clust_features = []
        umap_features = []

        for _, pixel in cluster_data.iterrows():
            new_roi = first_pixel + x_shift * pixel["x"] + y_shift * (max_y - pixel["y"])
            new_roi = new_roi.astype(np.uint64)

            # GeoJSON polygon = list of rings, each ring a list of (x, y) tuples.
            new_roi_list = [list(map(tuple, new_roi.tolist()))]

            # Cluster overlay: matplotlib color -> 0..255 RGB triple
            color = tuple(np.multiply(to_rgb(pixel["cluster_color"]), 255))
            indi_clust_features.append(
                geojson.Feature(
                    geometry=geojson.Polygon(new_roi_list),
                    properties={
                        "objectType": "detection",
                        "classification": {"name": f"cluster_{pixel['cluster']}", "color": color},
                    },
                )
            )

            # Optional UMAP overlay
            if umap:
                try:
                    umap_rgb = ast.literal_eval(pixel["RGB_color"]) if isinstance(pixel["RGB_color"], str) else pixel["RGB_color"]
                    umap_color = tuple(np.multiply(umap_rgb, 255))
                except (ValueError, SyntaxError, TypeError) as e:
                    print(f"Failed to parse RGB color: {pixel['RGB_color']}, error: {e}")
                    umap_color = (128, 128, 128)  # fallback to gray

                umap_features.append(
                    geojson.Feature(
                        geometry=geojson.Polygon(new_roi_list),
                        properties={"objectType": "detection", "color": umap_color},
                    )
                )

        # 9. Save results
        with open(f"{cluster_path}.geojson", "w") as f:
            geojson.dump(geojson.FeatureCollection(indi_clust_features), f)

        if umap:
            with open(f"{cluster_path}_umap.geojson", "w") as f:
                geojson.dump(geojson.FeatureCollection(umap_features), f)

        print(f"Successfully created overlays for {cluster_csv_path}")

    except FileNotFoundError as e:
        print(f"File error: {e}")
    except Exception as e:
        # Deliberately broad: one bad sample must not abort a batch run.
        print(f"Error processing: {e}")


In [None]:
# Root of the slide scans and the directory that holds the clustering results.
base_path = "/storage/research/igmp_grp_perren/raw_data_DESI_imaging/PanNET_Umara/HE_and_IHC_scans"
save_path = "/storage/homefs/ha25g949/pannet_metabolism/parallel/rms/total/positive/frequency_0.1/leiden_clustering" #"/storage/homefs/ha25g949/pannet_metabolism/parallel/rms/total/negative/frequency_0.1/leiden_clustering"

# Alternative sample list, currently inactive.
# NOTE(review): presumably the counterpart for the alternative ("negative")
# save_path above -- confirm before switching.
#samples = [
#    "010622_B11965_TN", "010622_B13739_NN", "010622_B27088_TN", "010622_B28832_TN",
#    "010622_B32263_TN", "010622_B65049_TN", "030622_B71407_NN", "040422_B12_1795nN",
#    "040422_B12_1795nT", "040422_B15_18042nT", "041222_B10757_TN", "041222_B12545_TN",
#    "041222_B16206_TN", "041222_B9472_TN", "050422_B14_27003nN", "050422_B14_27003nT",
#    "051222_B20582_TN", "051222_B21201_TN", "051222_B26548_TN", "051222_B28272a_TN",
#    "051222_B28272b_TN", "051222_B31750_TN", "051222_B4881_TN", "051222_B9393_TN",
#    "061222_B31750_NN", "061222_B9393_NN", "310522_B27992_TN", "310522_B71407_TN",
#]

# Each sample is listed once: repeating a name only regenerates the same
# overlay files again.
samples = [
    "031222_B11965_NP", "031222_B31750_NP", "040422_B14_27003T", "070622_B41770_NP",
    "070622_B71407_NP", "140322_B12_1795N", "140322_B12_1795T", "150322_B14_27003N",
    "150322_B15_18042T", "201122_B10757_TP", "201122_B12545_TP", "201122_B16206_TP",
    "201122_B20582_TP", "201122_B21201_TP", "201122_B26548_TP", "201122_B28272A_TP",
    "201122_B28272B_TP", "201122_B31750_TP", "201122_B4881_TP", "201122_B9393_TP",
    "201122_B9472_TP", "310522_B11965_TP", "310522_B27088_TP", "310522_B27992_TP",
    "310522_B28832_TP", "310522_B32263_TP", "310522_B65049_TP", "310522_B71407_TP",
    "91122_B9393_NP",
]

# List all geojson files beforehand
geojson_files = glob.glob(os.path.join(base_path, '**', 'QuPath', '**', '*HE.geojson'), recursive=True)
scan_regions = pd.DataFrame(geojson_files, columns=["geojson_path"])

for sample in samples:
    # Search recursively for H&E slides inside any directory whose name contains the sample name
    matching_paths = glob.glob(os.path.join(base_path, '**', f'*{sample}*','*HE.mrxs'), recursive=True)

    if not matching_paths:
        print(f"No match found for {sample}")
        continue

    # Write the cluster / UMAP overlays for every slide found for this sample.
    # add_DESI_overlay reports its own errors, so one failure does not stop the loop.
    for complete_path in matching_paths:
        add_DESI_overlay(complete_path)

    # Get the geojson file associated with the current sample. The lookup only
    # depends on the sample, so it runs once per sample; the name is matched as
    # a plain substring rather than as a regular expression.
    sample_geojson = scan_regions[scan_regions["geojson_path"].str.contains(sample, regex=False)]

    if sample_geojson.empty:
        print(f"No geojson file found for sample {sample}")
        continue

    geojson_path = sample_geojson["geojson_path"].values[0]

    # Ensure save directory exists
    os.makedirs(save_path, exist_ok=True)

    # Create unique output filename
    # NOTE(review): nothing in this cell writes to full_save_path -- the path is
    # only computed here; confirm whether a save step is still missing.
    output_filename = f"{sample}_overlay.geojson"
    full_save_path = os.path.join(save_path, output_filename)