In [1]:
import pandas as pd
import json
import os
import scanpy as sc
import numpy as np
import logging
from hest.HESTData import iter_hest
from typing import List

In [4]:
# Helper: save a gene list as "<filename_prefix>_genes.json"
def save_json(
    filename_prefix,
    genes,
    output_folder='/project/simmons_hts/kxu/hest/eval/data/XeniumPR1/',
):
    """Write a gene list to ``<output_folder>/<filename_prefix>_genes.json``.

    Args:
        filename_prefix: Leading part of the file name; ``_genes.json`` is appended.
        genes: JSON-serialisable sequence of gene names, stored under the
            ``"genes"`` key.
        output_folder: Destination directory. Defaults to the XeniumPR1 folder
            that this helper previously hard-coded, so existing calls behave
            the same.

    NOTE(review): a later cell defines another ``save_json`` whose default
    folder is different; the definition that was executed last is the one in
    effect.
    """
    filename = f"{filename_prefix}_genes.json"
    # Create the destination on demand so a missing folder does not abort the save.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)
    filepath = os.path.join(output_folder, filename)
    with open(filepath, "w") as f:
        json.dump({"genes": genes}, f, indent=2)
    print(f"✅ Saved {filepath}")

In [2]:
# Column headers of the curated spreadsheet; each one is later passed to
# extract_genes to build one gene list.
full_panel_col = "480 panel full list"
cell_type_col = "Cell Type Specific"
conditional_col = "Conditional"
coeliac_col = 'Coeliac'
tcr_col = 'TCR'

# === Input Excel file ===
# Read with pandas.read_excel and no extra arguments.
input_file = "/project/simmons_hts/kxu/hest/curated_gene_list.xlsx"
df = pd.read_excel(input_file)

# clean up a column into a list of genes
def extract_genes(series):
    """Return the sorted, de-duplicated gene names found in one column.

    Missing cells are skipped. String cells may hold several names separated
    by commas or semicolons; each name is stripped of surrounding whitespace
    and empty pieces are ignored. Non-string cells are kept as ``str(cell)``.
    """
    unique_genes = set()
    for cell in series.dropna():
        if not isinstance(cell, str):
            unique_genes.add(str(cell))
            continue
        # Treat ';' like ',' so both separators split the cell the same way.
        for token in cell.replace(";", ",").split(","):
            name = token.strip()
            if name:
                unique_genes.add(name)
    return sorted(unique_genes)

# Extract lists
# Build one cleaned gene list per spreadsheet column.
(
    full_panel_genes,
    cell_type_genes,
    conditional_genes,
    coeliac_genes,
    tcr_genes,
) = (
    extract_genes(df[column])
    for column in (full_panel_col, cell_type_col, conditional_col, coeliac_col, tcr_col)
)

# Report the size of each list, in the same order as above.
for gene_list in (
    full_panel_genes,
    cell_type_genes,
    conditional_genes,
    coeliac_genes,
    tcr_genes,
):
    print(len(gene_list))

480
50
29
9
9


In [6]:
# Save the curated gene lists as JSON (not yet filtered for presence in the Xenium panel)
def save_json(
    filename_prefix,
    genes,
    output_folder='/project/simmons_hts/kxu/hest/gene_json/',
):
    """Write a gene list to ``<output_folder>/<filename_prefix>_genes.json``.

    Args:
        filename_prefix: Leading part of the file name; ``_genes.json`` is appended.
        genes: JSON-serialisable sequence of gene names, stored under the
            ``"genes"`` key.
        output_folder: Destination directory. Defaults to the gene_json folder
            that this helper previously hard-coded, so existing calls behave
            the same.

    NOTE(review): an earlier cell also defines ``save_json`` with a different
    default folder (.../eval/data/XeniumPR1/); the definition that was
    executed last is the one in effect.
    """
    filename = f"{filename_prefix}_genes.json"
    # Create the destination on demand so a missing folder does not abort the save.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)
    filepath = os.path.join(output_folder, filename)
    with open(filepath, "w") as f:
        json.dump({"genes": genes}, f, indent=2)
    print(f"✅ Saved {filepath}")
    
# Write each curated list under its own file-name prefix, in a fixed order.
for prefix, gene_list in (
    ("full_panel", full_panel_genes),
    ("cell_specific", cell_type_genes),
    ("conditional", conditional_genes),
    ("coeliac", coeliac_genes),
    ("tcr", tcr_genes),
):
    save_json(prefix, gene_list)

✅ Saved /project/simmons_hts/kxu/hest/gene_json/full_panel_genes.json
✅ Saved /project/simmons_hts/kxu/hest/gene_json/cell_specific_genes.json
✅ Saved /project/simmons_hts/kxu/hest/gene_json/conditional_genes.json
✅ Saved /project/simmons_hts/kxu/hest/gene_json/coeliac_genes.json
✅ Saved /project/simmons_hts/kxu/hest/gene_json/tcr_genes.json


# Save all common genes as JSON

In [7]:
# Load every .h5ad sample found in the data folder. The genes shared by all
# samples are computed in a later cell from these AnnData objects.
data_dir = "/project/simmons_hts/kxu/hest/eval/data/XeniumPR1/adata"

# os.listdir returns names in arbitrary order; sort them so adata_list[i]
# refers to the same sample on every run.
h5ad_files = sorted(fname for fname in os.listdir(data_dir) if fname.endswith(".h5ad"))
if not h5ad_files:
    # Fail with a clear message instead of an IndexError on adata_list[0] below.
    raise FileNotFoundError(f"No .h5ad files found in {data_dir}")

adata_list = [sc.read_h5ad(os.path.join(data_dir, fname)) for fname in h5ad_files]

print(f"Loaded {len(adata_list)} samples")
print(f"First sample shape: {adata_list[0].shape}")

Loaded 15 samples
First sample shape: (22198, 9655)


In [8]:
# Shapes of two further samples. Each line is labelled with the sample's index
# in adata_list (the previous labels called both of them "First sample").
for sample_idx in (1, 2):
    print(f"adata_list[{sample_idx}] shape: {adata_list[sample_idx].shape}")

# Observation-level table (.obs) of the first sample.
adata_list[0].obs

First sample shape: (22173, 9655)
First sample shape: (23133, 9655)


Unnamed: 0,in_tissue,pxl_col_in_fullres,pxl_row_in_fullres,array_col,array_row,n_counts,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,pct_counts_in_top_50_genes,pct_counts_in_top_100_genes,pct_counts_in_top_200_genes,pct_counts_in_top_500_genes
000x185,True,10018.687481,140.926634,185,0,16,15,2.772589,16,2.833213,100.0,100.0,100.0,100.0
000x186,True,10247.491975,140.926634,186,0,2,2,1.098612,2,1.098612,100.0,100.0,100.0,100.0
000x187,True,10476.296469,140.926634,187,0,14,13,2.639057,14,2.708050,100.0,100.0,100.0,100.0
000x188,True,10705.100962,140.926634,188,0,6,6,1.945910,6,1.945910,100.0,100.0,100.0,100.0
000x189,True,10933.905456,140.926634,189,0,13,11,2.484907,13,2.639057,100.0,100.0,100.0,100.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123x006,True,-30937.316917,28283.879377,6,123,2,2,1.098612,2,1.098612,100.0,100.0,100.0,100.0
123x007,True,-30708.512423,28283.879377,7,123,3,3,1.386294,3,1.386294,100.0,100.0,100.0,100.0
123x008,True,-30479.707929,28283.879377,8,123,1,1,0.693147,1,0.693147,100.0,100.0,100.0,100.0
123x010,True,-30022.098942,28283.879377,10,123,1,1,0.693147,1,0.693147,100.0,100.0,100.0,100.0


In [10]:
# Get all feature names of the first sample as a list
feature_names = adata_list[0].var_names.tolist()

# Names containing any of these substrings are dropped
# (presumably control / placeholder features rather than genes - confirm).
excluded_markers = ("Codeword", "Intergenic_Region", "NegControl")
filtered_features = [
    name for name in feature_names
    if not any(marker in name for marker in excluded_markers)
]
print(len(filtered_features))

# Display the kept names, longest first. (An earlier sort of the unfiltered
# list was removed: its result was discarded and never displayed.)
sorted(filtered_features, key=len, reverse=True)

5100


['TNFRSF13B',
 'SERPINA12',
 'TNFRSF10D',
 'HNRNPA1L2',
 'RAB11FIP5',
 'TNFRSF11A',
 'SECISBP2L',
 'TNFRSF10C',
 'CTTNBP2NL',
 'TNFRSF11B',
 'ADCYAP1R1',
 'TNFRSF13C',
 'TMPRSS11E',
 'EEF1AKNMT',
 'TNFRSF10A',
 'NIPSNAP3A',
 'EIF4ENIF1',
 'TNFRSF10B',
 'NOTCH2NLA',
 'TNFRSF12A',
 'EIF4EBP3',
 'SERPIND1',
 'SERPINA9',
 'EIF4EBP1',
 'SERPINB7',
 'SLC25A13',
 'SLC25A31',
 'SLC25A39',
 'TMPRSS13',
 'HSD17B10',
 'MAP1LC3A',
 'MAPK8IP3',
 'NUDT16L1',
 'SPATA5L1',
 'ARHGAP35',
 'PPARGC1A',
 'TNFRSF18',
 'SLC25A11',
 'SLC2A4RG',
 'TNFRSF1B',
 'TNFRSF6B',
 'PPP1R16B',
 'TRAF3IP2',
 'ANKRD13D',
 'GPATCH11',
 'MRFAP1L1',
 'RASGEF1C',
 'SMARCAL1',
 'TNFRSF21',
 'PPP1R15B',
 'TMEM185A',
 'SIGLEC11',
 'DNASE1L3',
 'ZMPSTE24',
 'CABCOCO1',
 'DEFB103B',
 'TMEM106B',
 'ADAMTS19',
 'MAPKAPK2',
 'NIPSNAP1',
 'PRICKLE1',
 'SERPINB5',
 'KIAA0408',
 'LEPROTL1',
 'MAP3K7CL',
 'MAPK8IP2',
 'SERPINA5',
 'SERPINH1',
 'SLC9A3R1',
 'SMARCAD1',
 'TNFRSF14',
 'TNFRSF1A',
 'TSNAXIP1',
 'APOBEC3B',
 'EPM2AIP1',
 'CSN

In [11]:
# use the function in hest library
def get_common_genes(
    adata_list: List["sc.AnnData"],
    save_dir: str = None,
    min_cells_pct: float = 0 # keep all genes with no filtering of expression threshold
) -> List[str]:
    """
    Get the common genes across all samples in adata_list.

    Args:
        adata_list (List[sc.AnnData]): list of scanpy AnnData objects. The
            inputs are never modified; filtering is applied to a copy.
        save_dir (str, optional): path of the JSON *file* to write (it is
            passed directly to ``open``). If None, nothing is saved.
        min_cells_pct (float): fraction (not a percentage) of a sample's
            observations; it is multiplied directly by the number of
            observations, so 0.1 means 10%. Genes detected in fewer
            observations than that are dropped from that sample before
            intersecting. A falsy value (default 0) disables the filter.

    Returns:
        List[str]: genes present in every sample, excluding names containing
        'BLANK' or 'Control'. With two or more samples the result is sorted
        and unique (np.intersect1d); with a single sample the original order
        is kept. An empty ``adata_list`` yields an empty list.
    """

    logger = logging.getLogger(__name__)

    common_genes = None

    # Step 1: Get the intersection of genes across all adata
    for adata in adata_list:
        if min_cells_pct:
            # filter_genes is applied to a copy so the caller's object stays untouched.
            filtered = adata.copy()
            min_cells = int(np.ceil(min_cells_pct * len(filtered.obs)))
            sc.pp.filter_genes(filtered, min_cells=min_cells)
            print(f"Filtering genes: kept {filtered.n_vars} genes with min_cells {min_cells}")
            curr_genes = np.array(filtered.var_names)
        else:
            # Nothing is modified here, so read the names directly and skip
            # the full copy of the sample.
            curr_genes = np.array(adata.var_names)

        if common_genes is None:
            common_genes = curr_genes
        else:
            common_genes = np.intersect1d(common_genes, curr_genes)

    # No samples at all: report an empty result instead of failing on None.
    if common_genes is None:
        common_genes = []

    # Step 2: Remove unwanted control probes (plain str keeps the result JSON-friendly)
    common_genes = [str(g) for g in common_genes if 'BLANK' not in g and 'Control' not in g]

    logger.info(f"Found {len(common_genes)} common genes")

    # Step 3: Optionally save
    if save_dir is not None:
        json_dict = {"genes": common_genes}
        with open(save_dir, "w") as json_file:
            json.dump(json_dict, json_file, indent=2)
        logger.info(f"Saved common genes to {save_dir}")

    return list(common_genes)


In [12]:
# Genes shared by every loaded sample. Called with the defaults, so no
# expression filter is applied (min_cells_pct=0) and nothing is written to disk.
common_genes = get_common_genes(adata_list)
# Displayed as the cell output: number of shared genes.
len(common_genes)

9006

In [14]:
# Drop every shared feature whose name contains one of these substrings.
excluded_substrings = ("Codeword", "Intergenic_Region", "NegControl")
common_genes_filtered = [
    gene for gene in common_genes
    if not any(tag in gene for tag in excluded_substrings)
]
len(common_genes_filtered)

5100

In [None]:
# save as json for prediction
# NOTE(review): `save_json` is defined twice in this notebook with different
# output folders; this call uses whichever definition was executed last, so
# confirm that "all_genes.json" lands in the intended folder.
save_json("all", common_genes_filtered)

# Filter the gene panels to only the genes present in the AnnData samples

In [20]:
# Sizes of the curated lists before filtering
print(len(full_panel_genes))
print(len(cell_type_genes))
print(len(conditional_genes))

# Intersection of common_genes (genes found in every sample) with each list.
# The results are sorted because iterating a set of strings can give a
# different order in each Python session, which would make the lists (and any
# JSON saved from them) differ between runs.
common_gene_set = set(common_genes)
full_panel_genes_present = sorted(common_gene_set & set(full_panel_genes))
cell_type_genes_present = sorted(common_gene_set & set(cell_type_genes))
conditional_genes_present = sorted(common_gene_set & set(conditional_genes))

# Sizes after filtering
print(len(full_panel_genes_present))
print(len(cell_type_genes_present))
print(len(conditional_genes_present))

480
50
29
363
47
24


In [24]:
# Check how the filtered lists overlap, working on sets.
full_panel_set = set(full_panel_genes_present)
cell_type_set = set(cell_type_genes_present)
conditional_set = set(conditional_genes_present)

# Size of each set
for label, members in (
    ("Full panel genes:", full_panel_set),
    ("Cell type genes:", cell_type_set),
    ("Conditional genes:", conditional_set),
):
    print(label, len(members))

# Pairwise intersections
overlap_full_cell = full_panel_set.intersection(cell_type_set)
overlap_full_cond = full_panel_set.intersection(conditional_set)
overlap_cell_cond = cell_type_set.intersection(conditional_set)

# Count and members of each pairwise overlap
for label, shared in (
    ("\nOverlap (Full panel & Cell type):", overlap_full_cell),
    ("Overlap (Full panel & Conditional):", overlap_full_cond),
    ("Overlap (Cell type & Conditional):", overlap_cell_cond),
):
    print(label, len(shared), shared)

Full panel genes: 363
Cell type genes: 47
Conditional genes: 24

Overlap (Full panel & Cell type): 47 {'ANGPTL4', 'CD4', 'PECAM1', 'BEST4', 'AICDA', 'GZMK', 'C3', 'RGS5', 'CD8A', 'OLFM4', 'ADAMDEC1', 'C7', 'ADAM28', 'CLEC9A', 'MADCAM1', 'CHGB', 'SELENBP1', 'F3', 'CD14', 'CCL21', 'LGR5', 'PROX1', 'SLC26A2', 'MUC2', 'SOX10', 'RET', 'SMOC2', 'CD19', 'S100B', 'CCL19', 'GZMA', 'CHGA', 'CD3D', 'EPCAM', 'CPA3', 'CXCL13', 'FOXP3', 'HEY1', 'DERL3', 'WNT5A', 'FRZB', 'LYVE1', 'CDX2', 'KIT', 'MS4A1', 'ASCL2', 'SLC26A3'}
Overlap (Full panel & Conditional): 24 {'SPP1', 'REG1A', 'IL6', 'MUC5B', 'DUOX2', 'IL26', 'LEFTY1', 'ISG20', 'MUC6', 'TNF', 'NOS2', 'IFNG', 'IL11', 'DEFA6', 'CXCL8', 'REG1B', 'ISG15', 'CXCL10', 'CXCL9', 'CCL25', 'DUOXA2', 'CXCL11', 'IDO1', 'LCN2'}
Overlap (Cell type & Conditional): 0 set()


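The cell-type-specific and conditional gene lists are disjoint (they share no genes).

Both lists are subsets of the full panel gene list: all 47 cell-type genes and all 24 conditional genes also appear in the full panel.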

In [None]:
# Save the filtered ("present in all samples") gene lists as JSON.
# NOTE(review): these prefixes are the same ones used earlier to save the
# unfiltered lists. If the `save_json` definition that writes to
# .../hest/gene_json/ is the one in effect, this overwrites those files; the
# other `save_json` definition writes to .../eval/data/XeniumPR1/ instead.
# Confirm which definition is active before running this cell.
save_json("full_panel", full_panel_genes_present)
save_json("cell_specific", cell_type_genes_present)
save_json("conditional", conditional_genes_present)