# Simulation Data Gene Program Inference with BANKSY

- **Creator**: Sebastian Birk (sebastian.birk@helmholtz-munich.de)
- **Date of Creation:** 12.07.2024
- **Date of Last Modification:** 13.08.2024

- Perform gene set enrichment analysis with Decoupler as described at https://www.sc-best-practices.org/conditions/gsea_pathway.html#cluster-level-gene-set-enrichment-analysis-with-decoupler

## 1. Setup

### 1.1 Import Libraries

In [1]:
# Load the autoreload extension and set it to mode 2 so that imported modules
# are reloaded before each cell execution (edits to local packages are picked
# up without restarting the kernel)
%load_ext autoreload
%autoreload 2

In [21]:
import decoupler
import liana as li
import numpy as np
import pandas as pd
import scanpy as sc
from sklearn.metrics import f1_score
from nichecompass.models import NicheCompass
from nichecompass.utils import (extract_gp_dict_from_mebocost_ms_interactions,
                                extract_gp_dict_from_nichenet_lrt_interactions,
                                extract_gp_dict_from_omnipath_lr_interactions,
                                filter_and_combine_gp_dict_gps_v2)

### 1.2 Define Parameters

In [23]:
# Species of the prior knowledge resources (interpolated into the NicheNet file
# paths and passed to the GP extraction utilities)
species = "mouse"
# Sample and grouping keys passed to the LIANA `rank_aggregate.by_sample` call
sample_key = "niche_type"
groupby = "cell_type"
# Key in `model.adata.uns` from which the differential GP test results are read
differential_gp_test_results_key = "nichecompass_differential_gp_test_results"
# Passed as `gp_names_key` when loading the trained NicheCompass models
gp_names_key = "nichecompass_gp_names"

### 1.3 Run Notebook Setup

In [5]:
# Set the scanpy figure parameters to a 6 x 6 figure size
sc.set_figure_params(figsize=(6, 6))

### 1.4 Configure Paths and Directories

In [6]:
# Root folders (relative to the location of this notebook)
artifacts_folder_path = "../../../artifacts"
ga_data_folder_path = "../../../datasets/ga_data"
gp_data_folder_path = "../../../datasets/gp_data"
st_data_folder_path = "../../../datasets/st_data"
st_data_gold_folder_path = f"{st_data_folder_path}/gold"

# Gene annotation files
gene_orthologs_mapping_file_path = f"{ga_data_folder_path}/human_mouse_gene_orthologs.csv"

# Prior knowledge files used to build the gene programs
omnipath_lr_network_file_path = f"{gp_data_folder_path}/omnipath_lr_network.csv"
nichenet_lr_network_file_path = f"{gp_data_folder_path}/nichenet_lr_network_v2_{species}.csv"
nichenet_ligand_target_matrix_file_path = f"{gp_data_folder_path}/nichenet_ligand_target_matrix_v2_{species}.csv"
mebocost_enzyme_sensor_interactions_folder_path = f"{gp_data_folder_path}/metabolite_enzyme_sensor_gps"

### 1.5 Define Functions

In [7]:
def compute_f1(set1, set2):
    """
    Compute the F1 score between two collections of items.

    Both collections are treated as sets and compared over their union, so
    there are no true negatives and the F1 score reduces to
    ``2 * |set1 & set2| / (|set1| + |set2|)``. This closed form is evaluated
    directly, which also gives a well-defined result for empty inputs.

    Parameters
    ----------
    set1:
        First collection of hashable items (e.g. ground truth or predicted
        gene programs). Any iterable is accepted and converted to a set.
    set2:
        Second collection of hashable items.

    Returns
    ----------
    f1:
        F1 score in [0, 1]. Returns 0.0 if both collections are empty or if
        they share no items.
    """
    set1, set2 = set(set1), set(set2)

    # Sum of both set sizes equals 2 * TP + FP + FN
    n_items = len(set1) + len(set2)
    if n_items == 0:
        # Nothing to compare; avoid a division by zero
        return 0.0

    n_shared = len(set1 & set2)
    return 2 * n_shared / n_items

## 2. GP Inference


### 2.1 Load Cell Embeddings from BANKSY



In [10]:
# Read the raw gene expression counts of the simulated dataset
adata = sc.read_h5ad(f"{st_data_gold_folder_path}/sim1_1105genes_10000locs_strongincrements.h5ad")

# Read the BANKSY results and copy the latent embeddings of all 8 runs over
adata_annotated_banksy = sc.read_h5ad(f"{artifacts_folder_path}/single_sample_method_benchmarking/sim1_1105genes_10000locs_strongincrements_banksy.h5ad")
for run_number in range(1, 9):
    banksy_latent_key = f"banksy_latent_run{run_number}"
    adata.obsm[banksy_latent_key] = adata_annotated_banksy.obsm[banksy_latent_key]

# The BANKSY object is no longer needed once the embeddings are copied
del adata_annotated_banksy

# Normalize and log-transform the counts
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)

### 2.2 Get NicheCompass GPs for Use as Prior Knowledge

In [11]:
# Retrieve OmniPath GPs (source: ligand genes; target: receptor genes)
omnipath_gp_dict = extract_gp_dict_from_omnipath_lr_interactions(
    species=species,
    load_from_disk=True,
    save_to_disk=False,
    lr_network_file_path=omnipath_lr_network_file_path,
    gene_orthologs_mapping_file_path=gene_orthologs_mapping_file_path,
    plot_gp_gene_count_distributions=False)

# Retrieve NicheNet GPs (source: ligand genes; target: receptor genes, target genes)
nichenet_gp_dict = extract_gp_dict_from_nichenet_lrt_interactions(
    species=species,
    version="v2",
    load_from_disk=True,
    save_to_disk=False,
    lr_network_file_path=nichenet_lr_network_file_path,
    ligand_target_matrix_file_path=nichenet_ligand_target_matrix_file_path,
    gene_orthologs_mapping_file_path=gene_orthologs_mapping_file_path,
    plot_gp_gene_count_distributions=False)

# Retrieve MEBOCOST GPs (source: enzyme genes; target: sensor genes)
mebocost_gp_dict = extract_gp_dict_from_mebocost_ms_interactions(
    dir_path=mebocost_enzyme_sensor_interactions_folder_path,
    species=species,
    plot_gp_gene_count_distributions=False)

# Filter and combine GPs
gp_dicts = [omnipath_gp_dict, nichenet_gp_dict, mebocost_gp_dict]
combined_gp_dict = filter_and_combine_gp_dict_gps_v2(
    gp_dicts,
    verbose=True)

# One row per GP; the "sources" and "targets" columns hold gene lists
combined_gp_df = pd.DataFrame(combined_gp_dict.values(), index=combined_gp_dict.keys())

# Filter genes to keep only genes in adata. Membership is tested against a set
# so that the filter does not scan the full gene list for every GP gene; the
# order of the genes within each GP is preserved
adata_genes = adata.var.index.tolist()
adata_gene_set = set(adata_genes)
for gene_col in ["sources", "targets"]:
    combined_gp_df[gene_col] = combined_gp_df[gene_col].apply(
        lambda genes: [gene for gene in genes if gene in adata_gene_set])



Combining ACE2_ligand_receptor_GP and Ace2_ligand_receptor_target_gene_GP.
Combining ACE2_ligand_receptor_GP and L-Phenylalanine_metabolite_enzyme_sensor_GP.
Combining ADCYAP1_ligand_receptor_GP and Adcyap1_ligand_receptor_target_gene_GP.
Combining ADGRE5_ligand_receptor_GP and Adgre5_ligand_receptor_target_gene_GP.
Combining ADIPOQ_ligand_receptor_GP and Adipoq_ligand_receptor_target_gene_GP.
Combining ADM_ligand_receptor_GP and Adm_ligand_receptor_target_gene_GP.
Combining ADM2_ligand_receptor_GP and Adm2_ligand_receptor_target_gene_GP.
Combining AGRP_ligand_receptor_GP and Agrp_ligand_receptor_target_gene_GP.
Combining AGT_ligand_receptor_GP and Agt_ligand_receptor_target_gene_GP.
Combining ALCAM_ligand_receptor_GP and Alcam_ligand_receptor_target_gene_GP.
Combining ALKAL1_ligand_receptor_GP and Alkal1_ligand_receptor_target_gene_GP.
Combining ALKAL2_ligand_receptor_GP and Alkal2_ligand_receptor_target_gene_GP.
Combining AMBN_ligand_receptor_GP and Ambn_ligand_receptor_target_gene_G

### 2.3 Get Simulation Ground Truth GPs

In [12]:
# Load the ground truth GP stats of the simulation
increment_mode = "strong"
sim_gps_stats_file_path = f"sim_gps_filtered_stats_{increment_mode}increments.csv"
sim_gps_stats = pd.read_csv(f"{gp_data_folder_path}/data_simulation/{sim_gps_stats_file_path}")

# Replace missing values with empty strings so that every `prior_gps` entry
# can be split into a list of GP names
sim_gps_stats = sim_gps_stats.applymap(lambda value: "" if pd.isna(value) else value)
sim_gps_stats["prior_gps"] = sim_gps_stats["prior_gps"].apply(lambda gps: gps.split(","))

# Keep only rows whose increment parameter differs from 1
sim_gps_stats = sim_gps_stats.loc[sim_gps_stats["increment_param"] != 1]

# Concatenate the GP lists per region and reduce them to sets of non-empty names
sim_gps_stats = sim_gps_stats.groupby("regionA")["prior_gps"].sum().reset_index()
sim_gps_stats["prior_gps"] = sim_gps_stats["prior_gps"].apply(
    lambda gps: set(gps).difference((None, '')))
sim_gps_stats = sim_gps_stats.rename(columns={'regionA': 'niche'})



### 2.4 NicheCompass Metrics

In [59]:
# Compute, for each NicheCompass run, the mean F1 score between the enriched
# prior GPs of the predicted niches and the simulation ground truth GPs
f1_scores = []

# Settings shared by all runs
latent_key = "nichecompass_latent"
load_timestamp = "26072024_145319_1"
model_label = "gatv2conv_single_sample_method_benchmarking"

for run_number in range(1,9):
    print(f"Starting run {run_number}...")
    model_folder_path = f"{artifacts_folder_path}/sim1_1105genes_10000locs_strongincrements/models/{model_label}/{load_timestamp}/run{run_number}"

    model = NicheCompass.load(dir_path=model_folder_path,
                              adata=None,
                              adata_file_name=f"sim1_1105genes_10000locs_strongincrements_{model_label}.h5ad",
                              gp_names_key=gp_names_key)

    sc.pp.neighbors(model.adata,
                    use_rep=latent_key,
                    key_added=latent_key)
    sc.tl.umap(model.adata,
               neighbors_key=latent_key)

    # Compute Leiden clustering of latent space until 8 niches are obtained (to match ground truth number)
    print("Computing Leiden clusters...")
    latent_leiden_resolution = 0.3
    leiden_resolution_increments = 0.1
    counter = 0
    while True:
        sc.tl.leiden(adata=model.adata,
                     resolution=latent_leiden_resolution,
                     key_added="pred_niche_types",
                     neighbors_key=latent_key)

        # Count the niches on `model.adata`, i.e. the object the clustering was
        # just written to (not the global `adata`, which holds other results).
        # Only clusters with at least 100 observations count as niches
        niche_counts = model.adata.obs["pred_niche_types"].value_counts()
        valid_niches = niche_counts[niche_counts >= 100].index
        n_niches = len(valid_niches)
        print(f"Current number of niches: {n_niches}")
        print(f"Cluster counter: {counter}")
        if n_niches == 8:
            break
        elif n_niches < 7 and counter < 30:
            print("Big increase of clustering resolution...")
            latent_leiden_resolution += leiden_resolution_increments
        elif n_niches < 8 and counter < 60:
            print("Slight increase of clustering resolution...")
            latent_leiden_resolution += leiden_resolution_increments/10
        elif n_niches > 9 and counter < 30:
            print("Big decrease of clustering resolution...")
            latent_leiden_resolution -= leiden_resolution_increments
        elif n_niches > 8 and counter < 60:
            print("Slight decrease of clustering resolution...")
            latent_leiden_resolution -= leiden_resolution_increments/10
        else:
            # Only reached once 60 adjustments were made without obtaining
            # 8 niches; give up and continue with the current clustering
            break
        counter += 1

    # Map each predicted cluster to the ground truth niche it overlaps most with
    cross_tab = pd.crosstab(model.adata.obs["pred_niche_types"], model.adata.obs["niche_types"])
    majority_map = cross_tab.idxmax(axis=1)
    model.adata.obs["mapped_pred_niche_types"] = model.adata.obs["pred_niche_types"].map(majority_map)

    print("Computing enriched GPs...")
    selected_cats = None
    comparison_cats = "rest"
    log_bayes_factor_thresh = 2.3
    enriched_gps = model.run_differential_gp_tests(
        cat_key="mapped_pred_niche_types",
        selected_cats=selected_cats,
        comparison_cats=comparison_cats,
        log_bayes_factor_thresh=log_bayes_factor_thresh)

    # Get enriched prior GPs (GPs whose name contains "Add-on" are excluded)
    # NOTE(review): `gp_summary_df` is not used below; the call is kept as in
    # the original, confirm whether it can be dropped
    gp_summary_df = model.get_gp_summary()
    enriched_gps = model.adata.uns[differential_gp_test_results_key]
    prior_enriched_gps = enriched_gps[~enriched_gps["gene_program"].str.contains("Add-on")]
    enriched_prior_gps_per_niche = prior_enriched_gps.groupby(
                "category")[["gene_program", "log_bayes_factor"]].agg(list)

    enriched_prior_gps_per_niche["niche"] = enriched_prior_gps_per_niche.index.tolist()
    # Copy the column subset so that adding a column does not write to a slice
    niche_enriched_df = enriched_prior_gps_per_niche[["niche", "gene_program"]].copy()
    niche_enriched_df["enriched_gps"] = niche_enriched_df["gene_program"].apply(set)

    # Compute F1 scores per niche and store the mean over niches for this run
    merged_df = pd.merge(niche_enriched_df, sim_gps_stats, on='niche', suffixes=('_df1', '_df2'))
    merged_df['f1_score'] = merged_df.apply(lambda row: compute_f1(row['enriched_gps'], row['prior_gps']), axis=1)
    f1_scores.append(np.mean(merged_df["f1_score"]))

f1_score_df = pd.DataFrame(f1_scores, columns=["f1_score"])
f1_score_df["run_number"] = range(1,9)
f1_score_df.to_csv("nichecompass_f1_scores.csv", index=False)

Starting run 1...
--- INITIALIZING NEW NETWORK MODULE: VARIATIONAL GENE PROGRAM GRAPH AUTOENCODER ---
LOSS -> include_edge_recon_loss: True, include_gene_expr_recon_loss: True, rna_recon_loss: nb
NODE LABEL METHOD -> one-hop-norm
ACTIVE GP THRESHOLD RATIO -> 0.0
LOG VARIATIONAL -> True
ONE HOP GCN NORM RNA NODE LABEL AGGREGATOR
ENCODER -> n_input: 1105, n_cat_covariates_embed_input: 0, n_hidden: 1105, n_latent: 1452, n_addon_latent: 10, n_fc_layers: 1, n_layers: 1, conv_layer: gatv2conv, n_attention_heads: 4, dropout_rate: 0.0, 




COSINE SIM GRAPH DECODER -> dropout_rate: 0.0
MASKED TARGET RNA DECODER -> n_prior_gp_input: 1452, n_addon_gp_input: 10, n_cat_covariates_embed_input: 0, n_output: 1105
MASKED SOURCE RNA DECODER -> n_prior_gp_input: 1452, n_addon_gp_input: 10, n_cat_covariates_embed_input: 0, n_output: 1105
Computing Leiden clusters...
Current number of niches: 8
Cluster counter: 0
Computing enriched GPs...




Starting run 2...




--- INITIALIZING NEW NETWORK MODULE: VARIATIONAL GENE PROGRAM GRAPH AUTOENCODER ---
LOSS -> include_edge_recon_loss: True, include_gene_expr_recon_loss: True, rna_recon_loss: nb
NODE LABEL METHOD -> one-hop-norm
ACTIVE GP THRESHOLD RATIO -> 0.0
LOG VARIATIONAL -> True
ONE HOP GCN NORM RNA NODE LABEL AGGREGATOR
ENCODER -> n_input: 1105, n_cat_covariates_embed_input: 0, n_hidden: 1105, n_latent: 1452, n_addon_latent: 10, n_fc_layers: 1, n_layers: 1, conv_layer: gatv2conv, n_attention_heads: 4, dropout_rate: 0.0, 
COSINE SIM GRAPH DECODER -> dropout_rate: 0.0
MASKED TARGET RNA DECODER -> n_prior_gp_input: 1452, n_addon_gp_input: 10, n_cat_covariates_embed_input: 0, n_output: 1105
MASKED SOURCE RNA DECODER -> n_prior_gp_input: 1452, n_addon_gp_input: 10, n_cat_covariates_embed_input: 0, n_output: 1105
Computing Leiden clusters...
Current number of niches: 8
Cluster counter: 0
Computing enriched GPs...




Starting run 3...




--- INITIALIZING NEW NETWORK MODULE: VARIATIONAL GENE PROGRAM GRAPH AUTOENCODER ---
LOSS -> include_edge_recon_loss: True, include_gene_expr_recon_loss: True, rna_recon_loss: nb
NODE LABEL METHOD -> one-hop-norm
ACTIVE GP THRESHOLD RATIO -> 0.0
LOG VARIATIONAL -> True
ONE HOP GCN NORM RNA NODE LABEL AGGREGATOR
ENCODER -> n_input: 1105, n_cat_covariates_embed_input: 0, n_hidden: 1105, n_latent: 1452, n_addon_latent: 10, n_fc_layers: 1, n_layers: 1, conv_layer: gatv2conv, n_attention_heads: 4, dropout_rate: 0.0, 
COSINE SIM GRAPH DECODER -> dropout_rate: 0.0
MASKED TARGET RNA DECODER -> n_prior_gp_input: 1452, n_addon_gp_input: 10, n_cat_covariates_embed_input: 0, n_output: 1105
MASKED SOURCE RNA DECODER -> n_prior_gp_input: 1452, n_addon_gp_input: 10, n_cat_covariates_embed_input: 0, n_output: 1105
Computing Leiden clusters...
Current number of niches: 8
Cluster counter: 0
Computing enriched GPs...




Starting run 4...




--- INITIALIZING NEW NETWORK MODULE: VARIATIONAL GENE PROGRAM GRAPH AUTOENCODER ---
LOSS -> include_edge_recon_loss: True, include_gene_expr_recon_loss: True, rna_recon_loss: nb
NODE LABEL METHOD -> one-hop-norm
ACTIVE GP THRESHOLD RATIO -> 0.0
LOG VARIATIONAL -> True
ONE HOP GCN NORM RNA NODE LABEL AGGREGATOR
ENCODER -> n_input: 1105, n_cat_covariates_embed_input: 0, n_hidden: 1105, n_latent: 1452, n_addon_latent: 10, n_fc_layers: 1, n_layers: 1, conv_layer: gatv2conv, n_attention_heads: 4, dropout_rate: 0.0, 
COSINE SIM GRAPH DECODER -> dropout_rate: 0.0
MASKED TARGET RNA DECODER -> n_prior_gp_input: 1452, n_addon_gp_input: 10, n_cat_covariates_embed_input: 0, n_output: 1105
MASKED SOURCE RNA DECODER -> n_prior_gp_input: 1452, n_addon_gp_input: 10, n_cat_covariates_embed_input: 0, n_output: 1105
Computing Leiden clusters...
Current number of niches: 8
Cluster counter: 0
Computing enriched GPs...




Starting run 5...




--- INITIALIZING NEW NETWORK MODULE: VARIATIONAL GENE PROGRAM GRAPH AUTOENCODER ---
LOSS -> include_edge_recon_loss: True, include_gene_expr_recon_loss: True, rna_recon_loss: nb
NODE LABEL METHOD -> one-hop-norm
ACTIVE GP THRESHOLD RATIO -> 0.0
LOG VARIATIONAL -> True
ONE HOP GCN NORM RNA NODE LABEL AGGREGATOR
ENCODER -> n_input: 1105, n_cat_covariates_embed_input: 0, n_hidden: 1105, n_latent: 1452, n_addon_latent: 10, n_fc_layers: 1, n_layers: 1, conv_layer: gatv2conv, n_attention_heads: 4, dropout_rate: 0.0, 
COSINE SIM GRAPH DECODER -> dropout_rate: 0.0
MASKED TARGET RNA DECODER -> n_prior_gp_input: 1452, n_addon_gp_input: 10, n_cat_covariates_embed_input: 0, n_output: 1105
MASKED SOURCE RNA DECODER -> n_prior_gp_input: 1452, n_addon_gp_input: 10, n_cat_covariates_embed_input: 0, n_output: 1105
Computing Leiden clusters...
Current number of niches: 8
Cluster counter: 0
Computing enriched GPs...




Starting run 6...




--- INITIALIZING NEW NETWORK MODULE: VARIATIONAL GENE PROGRAM GRAPH AUTOENCODER ---
LOSS -> include_edge_recon_loss: True, include_gene_expr_recon_loss: True, rna_recon_loss: nb
NODE LABEL METHOD -> one-hop-norm
ACTIVE GP THRESHOLD RATIO -> 0.0
LOG VARIATIONAL -> True
ONE HOP GCN NORM RNA NODE LABEL AGGREGATOR
ENCODER -> n_input: 1105, n_cat_covariates_embed_input: 0, n_hidden: 1105, n_latent: 1452, n_addon_latent: 10, n_fc_layers: 1, n_layers: 1, conv_layer: gatv2conv, n_attention_heads: 4, dropout_rate: 0.0, 
COSINE SIM GRAPH DECODER -> dropout_rate: 0.0
MASKED TARGET RNA DECODER -> n_prior_gp_input: 1452, n_addon_gp_input: 10, n_cat_covariates_embed_input: 0, n_output: 1105
MASKED SOURCE RNA DECODER -> n_prior_gp_input: 1452, n_addon_gp_input: 10, n_cat_covariates_embed_input: 0, n_output: 1105
Computing Leiden clusters...
Current number of niches: 8
Cluster counter: 0
Computing enriched GPs...




Starting run 7...




--- INITIALIZING NEW NETWORK MODULE: VARIATIONAL GENE PROGRAM GRAPH AUTOENCODER ---
LOSS -> include_edge_recon_loss: True, include_gene_expr_recon_loss: True, rna_recon_loss: nb
NODE LABEL METHOD -> one-hop-norm
ACTIVE GP THRESHOLD RATIO -> 0.0
LOG VARIATIONAL -> True
ONE HOP GCN NORM RNA NODE LABEL AGGREGATOR
ENCODER -> n_input: 1105, n_cat_covariates_embed_input: 0, n_hidden: 1105, n_latent: 1452, n_addon_latent: 10, n_fc_layers: 1, n_layers: 1, conv_layer: gatv2conv, n_attention_heads: 4, dropout_rate: 0.0, 
COSINE SIM GRAPH DECODER -> dropout_rate: 0.0
MASKED TARGET RNA DECODER -> n_prior_gp_input: 1452, n_addon_gp_input: 10, n_cat_covariates_embed_input: 0, n_output: 1105
MASKED SOURCE RNA DECODER -> n_prior_gp_input: 1452, n_addon_gp_input: 10, n_cat_covariates_embed_input: 0, n_output: 1105
Computing Leiden clusters...
Current number of niches: 8
Cluster counter: 0
Computing enriched GPs...




Starting run 8...




--- INITIALIZING NEW NETWORK MODULE: VARIATIONAL GENE PROGRAM GRAPH AUTOENCODER ---
LOSS -> include_edge_recon_loss: True, include_gene_expr_recon_loss: True, rna_recon_loss: nb
NODE LABEL METHOD -> one-hop-norm
ACTIVE GP THRESHOLD RATIO -> 0.0
LOG VARIATIONAL -> True
ONE HOP GCN NORM RNA NODE LABEL AGGREGATOR
ENCODER -> n_input: 1105, n_cat_covariates_embed_input: 0, n_hidden: 1105, n_latent: 1452, n_addon_latent: 10, n_fc_layers: 1, n_layers: 1, conv_layer: gatv2conv, n_attention_heads: 4, dropout_rate: 0.0, 
COSINE SIM GRAPH DECODER -> dropout_rate: 0.0
MASKED TARGET RNA DECODER -> n_prior_gp_input: 1452, n_addon_gp_input: 10, n_cat_covariates_embed_input: 0, n_output: 1105
MASKED SOURCE RNA DECODER -> n_prior_gp_input: 1452, n_addon_gp_input: 10, n_cat_covariates_embed_input: 0, n_output: 1105
Computing Leiden clusters...
Current number of niches: 8
Cluster counter: 0
Computing enriched GPs...




In [60]:
# Display the mean F1 score of each NicheCompass run
f1_score_df

Unnamed: 0,f1_score,run_number
0,0.064958,1
1,0.056861,2
2,0.097866,3
3,0.100066,4
4,0.122538,5
5,0.122261,6
6,0.139038,7
7,0.13551,8


### 2.4 Decoupler GSEA

#### 2.4.1 Perform Gene Set Enrichment Analysis with Decoupler (Based on Target Genes)

In [66]:
# Build a long-format table for GSEA with one row per (GP, target gene) pair:
# the GP name goes into "geneset" and each target gene into "genesymbol"
gp_df_gsea = (
    combined_gp_df
    .assign(geneset=combined_gp_df.index,
            genesymbol=combined_gp_df["targets"])
    [["geneset", "genesymbol"]]
    .reset_index(drop=True)
    # One row per gene; GPs with an empty target list yield a missing value
    .explode("genesymbol")
    .reset_index(drop=True)
    .dropna(subset=['genesymbol'])
    .drop_duplicates()
)

In [75]:
# Compute, for each BANKSY run, the mean F1 score between the GPs enriched in
# the predicted niches (GSEA on differential expression scores) and the
# simulation ground truth GPs
f1_scores = []

for run_number in range(1,9):
    print(f"Starting run {run_number}...")
    latent_key = f"banksy_latent_run{run_number}"
    sc.pp.neighbors(adata,
                    use_rep=latent_key,
                    key_added=latent_key)
    sc.tl.umap(adata,
               neighbors_key=latent_key)

    # Compute Leiden clustering of latent space until 8 niches are obtained (to match ground truth number)
    print("Computing Leiden clusters...")
    latent_leiden_resolution = 0.3
    leiden_resolution_increments = 0.1
    counter = 0
    while True:
        sc.tl.leiden(adata=adata,
                     resolution=latent_leiden_resolution,
                     key_added="pred_niche_types",
                     neighbors_key=latent_key)

        # Only clusters with at least 100 observations count as niches
        niche_counts = adata.obs["pred_niche_types"].value_counts()
        valid_niches = niche_counts[niche_counts >= 100].index
        n_niches = len(valid_niches)
        print(f"Current number of niches: {n_niches}")
        print(f"Cluster counter: {counter}")
        if n_niches == 8:
            break
        elif n_niches < 7 and counter < 30:
            print("Big increase of clustering resolution...")
            latent_leiden_resolution += leiden_resolution_increments
        elif n_niches < 8 and counter < 60:
            print("Slight increase of clustering resolution...")
            latent_leiden_resolution += leiden_resolution_increments/10
        elif n_niches > 9 and counter < 30:
            print("Big decrease of clustering resolution...")
            latent_leiden_resolution -= leiden_resolution_increments
        elif n_niches > 8 and counter < 60:
            print("Slight decrease of clustering resolution...")
            latent_leiden_resolution -= leiden_resolution_increments/10
        else:
            # Only reached once 60 adjustments were made without obtaining
            # 8 niches; give up and continue with the current clustering
            break
        counter += 1

    # Map each predicted cluster to the ground truth niche it overlaps most with
    cross_tab = pd.crosstab(adata.obs["pred_niche_types"], adata.obs["niche_types"])
    majority_map = cross_tab.idxmax(axis=1)
    adata.obs["mapped_pred_niche_types"] = adata.obs["pred_niche_types"].map(majority_map)

    print("Computing GSEA...")
    sc.tl.rank_genes_groups(adata, "mapped_pred_niche_types", method="t-test", key_added="t-test")

    # Get enriched GPs per niche
    enriched_dict = {}

    for niche_type in adata.obs["mapped_pred_niche_types"].unique().tolist():

        # Extract differential gene expression scores
        t_stats = (
            # Get dataframe of DE results for condition vs. rest
            sc.get.rank_genes_groups_df(adata, niche_type, key="t-test")
            .set_index("names")
            # Sort by absolute score
            .sort_values("scores", key=np.abs, ascending=False)
            # Format for decoupler
            [["scores"]]
            .rename_axis([niche_type], axis=1)
        )

        # Get gene set enrichment analysis results
        scores, norm, pvals = decoupler.run_gsea(
            t_stats.T,
            gp_df_gsea,
            min_n=0,
            source="geneset",
            target="genesymbol",
        )

        gsea_results = (
            pd.concat({"score": scores.T, "norm": norm.T, "pval": pvals.T}, axis=1)
            .droplevel(level=1, axis=1)
            .sort_values("pval")
        )

        # Keep upregulated pathways with p-value < 0.05. Both conditions are
        # combined into one mask so that no full-length boolean mask is applied
        # to an already filtered frame
        is_enriched = (gsea_results["score"] > 0) & (gsea_results["pval"] < 0.05)
        enriched_gps = gsea_results.loc[is_enriched].index.tolist()
        enriched_dict[niche_type] = enriched_gps

    enriched_df = pd.DataFrame(list(enriched_dict.items()), columns=["niche", "enriched_gps"])
    niche_enriched_df = enriched_df.groupby("niche")["enriched_gps"].sum().reset_index()
    niche_enriched_df["enriched_gps"] = niche_enriched_df["enriched_gps"].apply(set)

    # Compute F1 scores per niche and store the mean over niches for this run
    merged_df = pd.merge(niche_enriched_df, sim_gps_stats, on='niche', suffixes=('_df1', '_df2'))
    merged_df['f1_score'] = merged_df.apply(lambda row: compute_f1(row['enriched_gps'], row['prior_gps']), axis=1)
    f1_scores.append(np.mean(merged_df["f1_score"]))

f1_score_df = pd.DataFrame(f1_scores, columns=["f1_score"])
f1_score_df["run_number"] = range(1,9)
f1_score_df.to_csv("banksy_gsea_f1_scores.csv", index=False)

Starting run 1...
Computing Leiden clusters...
Current number of niches: 8
Cluster counter: 0
Computing GSEA...




Starting run 2...
Computing Leiden clusters...
Current number of niches: 8
Cluster counter: 0
Computing GSEA...




Starting run 3...
Computing Leiden clusters...
Current number of niches: 8
Cluster counter: 0
Computing GSEA...




Starting run 4...
Computing Leiden clusters...
Current number of niches: 8
Cluster counter: 0
Computing GSEA...




Starting run 5...
Computing Leiden clusters...
Current number of niches: 8
Cluster counter: 0
Computing GSEA...




Starting run 6...
Computing Leiden clusters...
Current number of niches: 8
Cluster counter: 0
Computing GSEA...




Starting run 7...
Computing Leiden clusters...
Current number of niches: 8
Cluster counter: 0
Computing GSEA...




Starting run 8...
Computing Leiden clusters...
Current number of niches: 9
Cluster counter: 0
Slight decrease of clustering resolution...
Current number of niches: 9
Cluster counter: 1
Slight decrease of clustering resolution...
Current number of niches: 9
Cluster counter: 2
Slight decrease of clustering resolution...
Current number of niches: 9
Cluster counter: 3
Slight decrease of clustering resolution...
Current number of niches: 8
Cluster counter: 4
Computing GSEA...




In [76]:
# Display the mean F1 score of each BANKSY + GSEA run
f1_score_df

Unnamed: 0,f1_score,run_number
0,0.005083,1
1,0.018298,2
2,0.005266,3
3,0.017535,4
4,0.032993,5
5,0.015353,6
6,0.023901,7
7,0.001932,8


### 2.5 LIANA CCI

In [None]:
# Presumably lists the resources that can be passed as `resource_name` to
# LIANA methods; confirm against the LIANA docs
li.rs.show_resources()

In [None]:
# Run LIANA's `rank_aggregate.by_sample` on `adata` with the 'mouseconsensus'
# resource, using `groupby` ("cell_type") and `sample_key` ("niche_type")
# NOTE(review): `sample_key` is "niche_type" whereas the other cells read
# `obs["niche_types"]`; confirm that this column exists in `adata.obs`
# NOTE(review): `n_perms=None` presumably disables permutation testing;
# confirm against the LIANA docs
li.mt.rank_aggregate.by_sample(
    adata,
    groupby=groupby,
    resource_name='mouseconsensus',
    sample_key=sample_key,
    use_raw=False,
    verbose=True,
    n_perms=None,
    return_all_lrs=True,
    )

In [None]:
# Display the gene names of `adata`
adata.var_names

In [None]:
# Display the 10 rows of `adata.uns["liana_res"]` with the lowest
# "magnitude_rank" (presumably written by the LIANA call above; confirm)
adata.uns["liana_res"].sort_values("magnitude_rank").head(10)