# NicheCompass Data Analysis

- **Creator:** Sebastian Birk (<sebastian.birk@helmholtz-munich.de>).
- **Affiliation:** Helmholtz Munich, Institute of Computational Biology (ICB), Talavera-López Lab
- **Date of Creation:** 22.01.2023
- **Date of Last Modification:** 31.08.2023

- In order to run this notebook, a trained model needs to be stored under `../artifacts/{dataset}/models/{model_label}/{load_timestamp}` (the placeholders are the parameters defined in Section 1.2).

## 1. Setup

### 1.1 Import Libraries

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append("../utils")

In [3]:
import argparse
import gc
import os
import random
import shutil
import warnings
from datetime import datetime
from matplotlib import rcParams

import anndata as ad
import matplotlib
import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd
import scanpy as sc
import scipy.sparse as sp
import scipy.stats as stats
import seaborn as sns
import squidpy as sq
import torch
from matplotlib import gridspec
from matplotlib.pyplot import rc_context
from sklearn.preprocessing import MinMaxScaler
from pywaffle import Waffle

from nichecompass.models import NicheCompass
from nichecompass.utils import (add_gps_from_gp_dict_to_adata,
                                aggregate_obsp_matrix_per_cell_type,
                                create_cell_type_chord_plot_from_df,
                                create_new_color_dict,
                                generate_enriched_gp_info_plots)

from analysis_utils import (add_cell_type_latent_cluster_emphasis,
                            add_sub_cell_type,
                            compute_cell_type_latent_clusters,
                            generate_gp_info_plots,
                            plot_physical_latent_for_cell_types,
                            plot_cell_type_latent_clusters,
                            plot_latent,
                            plot_category_in_latent_and_physical_space,
                            sankey,
                            store_top_gps_summary)

### 1.2 Define Parameters

In [4]:
# Dataset identifier; interpolated into the artifact folder paths and the model file names below
dataset = "nanostring_cosmx_human_nsclc"

#### 1.2.1 Generic Parameters

In [5]:
## Model
# AnnData keys
adj_key = "spatial_connectivities"
spatial_key = "spatial"
sub_cell_type_key = "cell_type_original"
niche_key = "niche"
# Misspelled legacy name, kept as an alias so existing references keep working
nicke_key = niche_key
gp_names_key = "nichecompass_gp_names"
active_gp_names_key = "nichecompass_active_gp_names"
# Also passed as `neighbors_key` when computing the latent Leiden clusters
latent_key = "nichecompass_latent"
mapping_entity_key = "mapping_entity"

## Analysis
# `adata.uns` key under which the differential gene program test results are read back
differential_gp_test_results_key = "nichecompass_differential_gp_test_results"

## Others
random_seed = 0

#### 1.2.2 Dataset-specific Parameters

In [6]:
# Modality and preprocessing flags
multimodal = False
log_norm_omics_features = False

# Optional group restrictions (left empty for this dataset)
cell_type_groups = []
latent_groups = []

# Trained model run to analyze
model_label = "reference_query_mapping"
load_timestamp = "03092023_001459_4"

# Annotation keys
cell_type_key = "cell_type"
condition_key = "batch"
sample_key = "batch"

# Plotting
dataset_str = "nanoString CosMx Human NSCLC"
spot_size = 30
latent_cluster_spot_size = 0.03

# Latent clustering; the resolution is part of the cluster annotation key
latent_leiden_resolution = 0.5
latent_cluster_key = f"latent_leiden_{latent_leiden_resolution}"

### 1.3 Run Notebook Setup

In [7]:
# Global plotting defaults. The seaborn style is applied after the scanpy
# figure parameters, so its `axes.grid` override is the one that remains.
sc.set_figure_params(figsize=(6, 6))
sns.set_style(style="whitegrid", rc={"axes.grid": False})

In [8]:
# Silence future, user and runtime warnings for the rest of the notebook
for warning_category in (FutureWarning, UserWarning, RuntimeWarning):
    warnings.simplefilter(action="ignore", category=warning_category)

### 1.4 Configure Paths and Create Directories

In [9]:
# Input data locations
gp_data_folder_path = "../datasets/gp_data" # gene program data
srt_data_folder_path = "../datasets/srt_data" # spatially resolved transcriptomics data
srt_data_gold_folder_path = f"{srt_data_folder_path}/gold"

# Per-run artifact locations; all follow
# "../artifacts/<dataset>/<kind>/<model_label>/<load_timestamp>"
model_folder_path = (
    f"../artifacts/{dataset}/models/{model_label}/{load_timestamp}")
figure_folder_path = (
    f"../artifacts/{dataset}/figures/{model_label}/{load_timestamp}")
result_folder_path = (
    f"../artifacts/{dataset}/results/{model_label}/{load_timestamp}")

# Only the output folders are created; the model folder has to exist already
os.makedirs(figure_folder_path, exist_ok=True)
os.makedirs(result_folder_path, exist_ok=True)

## 2. Model

### 2.1 Load Model

In [10]:
# Display the resolved model directory before loading from it
model_folder_path

'../artifacts/nanostring_cosmx_human_nsclc/models/reference_query_mapping/03092023_001459_4'

In [None]:
# Load the trained model from `model_folder_path`. No in-memory AnnData is
# passed (`adata=None`); the file named by `adata_file_name` is used instead
# (presumably resolved relative to `dir_path` — TODO confirm).
model = NicheCompass.load(dir_path=model_folder_path,
                          adata=None,
                          adata_file_name=f"{dataset}_{model_label}.h5ad",
                          gp_names_key=gp_names_key)

In [None]:
# Alternative load: same model folder, but with the "_postprocessed" AnnData
# file. That file is written by the `model.adata.write(...)` cell further
# down, so this only works once that cell has been run in an earlier session.
model = NicheCompass.load(dir_path=model_folder_path,
                          adata=None,
                          adata_file_name=f"{dataset}_{model_label}_postprocessed.h5ad",
                          gp_names_key=gp_names_key)

In [None]:
# Unique values of the sample annotation; passed to the plotting helpers
# below as the per-sample feature spaces
samples = model.adata.obs[sample_key].unique().tolist()

In [None]:
# Display the AnnData object attached to the loaded model
model.adata

## 3. Compute Leiden clusters

In [None]:
# Leiden clustering at `latent_leiden_resolution`; cluster labels are stored
# under `latent_cluster_key`. The neighbor graph registered under `latent_key`
# must already be present in the loaded AnnData (presumably computed on the
# latent representation — TODO confirm).
sc.tl.leiden(adata=model.adata,
             resolution=latent_leiden_resolution,
             key_added=latent_cluster_key,
             neighbors_key=latent_key)

In [None]:
# Display the AnnData object again to check that the cluster annotation was added
model.adata

In [None]:
# Export the full `obs` table (including the new Leiden labels) as CSV.
# NOTE(review): this is written into the model folder, not `result_folder_path` — confirm intended.
model.adata.obs.to_csv(f"{model_folder_path}/{dataset}_{model_label}_leiden.csv")

In [None]:
# UMAP overview, one panel per annotation. The panels are driven by the key
# variables from the parameter section, so the plot keeps working when e.g.
# `latent_leiden_resolution` (and with it `latent_cluster_key`) is changed.
rcParams["figure.figsize"] = (6, 4)
sc.pl.umap(
    model.adata,
    color=[mapping_entity_key,
           latent_cluster_key,
           sub_cell_type_key,
           condition_key,
           cell_type_key,
           nicke_key],
    ncols=3,
    wspace=0.5)

## 4. Get GPs

In [None]:
# Store the active gene programs on the AnnData object, then report how many
# gene programs exist in total and how many of them are active
model.adata.uns[active_gp_names_key] = model.get_active_gps()
n_total_gps = len(model.adata.uns[gp_names_key])
n_active_gps = len(model.adata.uns[active_gp_names_key])
print(f"Number of total gene programs: {n_total_gps}")
print(f"Number of active gene programs: {n_active_gps}")

In [None]:
# Gene program summary table: persist it next to the model, then preview the
# first five active gene programs
gp_summary_df = model.get_gp_summary()
gp_summary_df.to_csv(
    f"{model_folder_path}/{dataset}_{model_label}_gp_summary.csv")
gp_summary_df[gp_summary_df["gp_active"]].head(5)

In [None]:
# Add the active gene program scores to `model.adata.obs` (per the method
# name); later cells plot and test on these per-GP columns
model.add_active_gp_scores_to_obs()

In [None]:
# Persist the postprocessed AnnData; this is the "_postprocessed" file that
# the reload cells of this notebook read back
model.adata.write(f"{model_folder_path}/{dataset}_{model_label}_postprocessed.h5ad")

In [None]:
# Force a garbage collection pass before continuing with the next section
gc.collect()

## 5. Differential GPs

In [11]:
# Reload the model with the postprocessed AnnData written in the previous
# section, so this section can be run on its own in a fresh session
model = NicheCompass.load(dir_path=model_folder_path,
                          adata=None,
                          adata_file_name=f"{dataset}_{model_label}_postprocessed.h5ad",
                          gp_names_key=gp_names_key)
# Collect garbage after the load; in a notebook the returned count is shown as the cell output
gc.collect()

--- INITIALIZING NEW NETWORK MODULE: VARIATIONAL GENE PROGRAM GRAPH AUTOENCODER ---
LOSS -> include_edge_recon_loss: True, include_gene_expr_recon_loss: True, rna_recon_loss: nb
NODE LABEL METHOD -> one-hop-norm
ACTIVE GP THRESHOLD RATIO -> 0.03
LOG VARIATIONAL -> True
CATEGORICAL COVARIATES EMBEDDINGS INJECTION -> ['gene_expr_decoder']
ONE HOP GCN NORM RNA NODE LABEL AGGREGATOR
ENCODER -> n_input: 960, n_cat_covariates_embed_input: 0, n_hidden: 960, n_latent: 1494, n_addon_latent: 100, n_fc_layers: 1, n_layers: 1, conv_layer: gatv2conv, n_attention_heads: 4, dropout_rate: 0.0, use_bn: False
COSINE SIM GRAPH DECODER -> dropout_rate: 0.0
MASKED TARGET RNA DECODER -> n_prior_gp_input: 1494, n_addon_gp_input: 100, n_cat_covariates_embed_input: 38, n_output: 960
MASKED SOURCE RNA DECODER -> n_prior_gp_input: 1494, n_addon_gp_input: 100, n_cat_covariates_embed_input: 38, n_output: 960


77416

In [None]:
# Latent clusters grouped by tissue structure. Within each group, every
# cluster is tested against the remaining clusters of the same group.
# NOTE(review): cluster '13' is listed under both stroma and macrophage —
# confirm this is intended.
compare = {
    'tumor_clusters': ['2','4','5','9','10','11','12'],
    'stroma_clusters': ['0','1','13','14'],
    'neutrophil_clusters': ['6','8'],
    'macrophage_clusters': ['7','15','13']
}
# Filtered test results per "<structure>_<cluster>"
res = {}
# Run differential gp testing
log_bayes_factor_thresh = 2.3 # 2.3 strong threshold; 4.6 decisive threshold (https://en.wikipedia.org/wiki/Bayes_factor)

for structure, clusters in compare.items():
    print(structure)
    for cl in clusters:
        print(cl)
        enriched_gps = model.run_differential_gp_tests(
            cat_key=latent_cluster_key,
            selected_cats=[cl],
            comparison_cats=[x for x in clusters if x != cl],
            log_bayes_factor_thresh=log_bayes_factor_thresh)
        # The test results are read back from `adata.uns`; keep only rows with p_h1 < 0.05
        test_results = model.adata.uns[differential_gp_test_results_key]
        res[f"{structure}_{cl}"] = test_results[test_results["p_h1"] < 0.05]
        # Write into the configured result folder instead of a hard-coded
        # copy of its path, so the export follows dataset/model/timestamp
        res[f"{structure}_{cl}"].to_csv(
            f"{result_folder_path}/gpTest_{structure}_{cl}.csv")

tumor_clusters
2
4
5
9


In [None]:
# All-vs-rest configuration. `selected_cats=None` with `comparison_cats="rest"`
# presumably tests every latent cluster against all others — TODO confirm
# against the NicheCompass API.
log_bayes_factor_thresh = 2.3 # 2.3 strong threshold; 4.6 decisive threshold (https://en.wikipedia.org/wiki/Bayes_factor)
selected_cats = None
save_fig = True
title = f"NicheCompass Latent Cluster Enriched Gene Programs Log Bayes Factor {log_bayes_factor_thresh}"
# Target path of the enriched gene program heatmap
file_path = (
    f"{figure_folder_path}/res_{latent_leiden_resolution}_"
    "latent_clusters_all_vs_rest_log_bayes_factor_"
    f"{log_bayes_factor_thresh}_enriched_gps_heatmap.pdf")

# Run differential gp testing
enriched_gps = model.run_differential_gp_tests(
    cat_key=latent_cluster_key,
    selected_cats=selected_cats,
    comparison_cats="rest",
    log_bayes_factor_thresh=log_bayes_factor_thresh)

In [None]:
# Plot heatmap of enriched gps, grouped by latent cluster. The figure size
# scales with the number of clusters (width) and enriched gps (height).
ax = sc.pl.heatmap(model.adata,
                   enriched_gps,
                   show_gene_labels=True,
                   groupby=latent_cluster_key,
                   dendrogram=True,
                   swap_axes=True,
                   figsize=(model.adata.obs[latent_cluster_key].nunique() * 1.5,
                            len(enriched_gps) / 2),
                   save=save_fig)
if save_fig:
    # With `save=True` the figure lands in "figures/heatmap.pdf"; move it to
    # the configured figure path
    shutil.move("figures/heatmap.pdf", file_path)
    # Remove the temporary folder only when nothing else is stored in it;
    # `os.rmdir` raises OSError on a non-empty directory
    if not os.listdir("figures"):
        os.rmdir("figures")

In [None]:
save_file = True
file_path = f"{figure_folder_path}/res_{latent_leiden_resolution}_" \
            f"latent_clusters_all_vs_rest_log_bayes_factor_" \
            f"{log_bayes_factor_thresh}_enriched_gps_summary.csv"

# Columns of the gene program summary to keep
gp_summary_cols = ["gp_name",
                   "n_source_genes",
                   "n_non_zero_source_genes",
                   "n_target_genes",
                   "n_non_zero_target_genes",
                   "gp_source_genes",
                   "gp_target_genes",
                   "gp_source_genes_weights",
                   "gp_target_genes_weights",
                   "gp_source_genes_importances",
                   "gp_target_genes_importances"]
if multimodal:
    # Peak-related columns are only requested for multimodal models
    gp_summary_cols = gp_summary_cols + [
        "n_source_peaks",
        "n_target_peaks",
        "gp_source_peaks",
        "gp_target_peaks",
        "gp_source_peaks_weights",
        "gp_target_peaks_weights",
        "gp_source_peaks_importances",
        "gp_target_peaks_importances"]

# Get summary of enriched gene programs. The explicit copy makes the column
# assignment below operate on an independent frame rather than on a filtered
# slice of `gp_summary_df` (avoids pandas' SettingWithCopy issue).
enriched_gp_summary_df = gp_summary_df[
    gp_summary_df["gp_name"].isin(enriched_gps)].copy()
# An ordered categorical makes the sort follow the order of `enriched_gps`
# instead of alphabetical order
cat_dtype = pd.CategoricalDtype(categories=enriched_gps, ordered=True)
enriched_gp_summary_df["gp_name"] = enriched_gp_summary_df["gp_name"].astype(cat_dtype)
enriched_gp_summary_df = enriched_gp_summary_df.sort_values(by="gp_name")
enriched_gp_summary_df = enriched_gp_summary_df[gp_summary_cols]

if save_file:
    enriched_gp_summary_df.to_csv(f"{file_path}")
else:
    display(enriched_gp_summary_df)

In [None]:
save_figs = True

# Color mapping for the latent clusters; built here because the plotting call
# below requires it and no other cell of this notebook defines it
latent_cluster_colors = create_new_color_dict(
    adata=model.adata,
    cat_key=latent_cluster_key)

# Label shared by all batches of plots (does not depend on the loop index)
plot_label = f"res_{latent_leiden_resolution}_" \
             f"latent_clusters_all_vs_rest_log_bayes_factor_" \
             f"{log_bayes_factor_thresh}"

# Plot the enriched gene programs in batches of 10, collecting garbage after
# each batch
for i in range(0, len(enriched_gps), 10):
    generate_enriched_gp_info_plots(
        plot_label=plot_label,
        model=model,
        sample_key=sample_key,
        differential_gp_test_results_key=differential_gp_test_results_key,
        cat_key=latent_cluster_key,
        cat_palette=latent_cluster_colors,
        n_top_enriched_gp_start_idx=i,
        n_top_enriched_gp_end_idx=i+10,
        feature_spaces=samples, # ["latent"]
        n_top_genes_per_gp=5,
        n_top_peaks_per_gp=(0 if not multimodal else 5),
        scale_omics_ft=False,
        save_figs=save_figs,
        figure_folder_path=f"{figure_folder_path}/",
        spot_size=spot_size)

    gc.collect()

In [None]:
# Show the rows of the most recent differential test results with p_h1 < 0.05
gp_test_results_df = model.adata.uns["nichecompass_differential_gp_test_results"]
gp_test_results_df[gp_test_results_df["p_h1"] < 0.05]

In [None]:
# Differential gene program test: latent cluster 10 against clusters 4, 5 and 9
# Log Bayes factor threshold: 2.3 strong; 4.6 decisive (https://en.wikipedia.org/wiki/Bayes_factor)
log_bayes_factor_thresh = 2.3

enriched_gps = model.run_differential_gp_tests(
    cat_key=latent_cluster_key,
    selected_cats=["10"],
    comparison_cats=["4", "5", "9"],
    log_bayes_factor_thresh=log_bayes_factor_thresh,
)

In [None]:
# Show the rows of the most recent differential test results with p_h1 < 0.05
gp_test_results_df = model.adata.uns["nichecompass_differential_gp_test_results"]
gp_test_results_df[gp_test_results_df["p_h1"] < 0.05]

In [None]:
# Latent-space plot colored by the SPP1 ligand-receptor gene program score.
# Nothing is saved (`save_fig=False`), so `file_path` is only a placeholder.
# NOTE(review): `groups` receives the cluster key itself rather than a list
# of cluster labels — confirm against `plot_latent`'s signature.
plot_latent(
    model.adata,
    color_by="SPP1_ligand_receptor_GP",
    dataset_label="",
    color_palette="RdBu_r",
    save_fig=False,
    file_path="here",
    groups=latent_cluster_key)

In [None]:
# Plot the mapping entities in latent and physical space.
# NOTE(review): the opening line of this call was missing, which made the cell
# a syntax error. It is restored here as
# `plot_category_in_latent_and_physical_space`, the imported helper whose
# name matches these arguments — confirm against analysis_utils.

# Color mapping for the mapping-entity categories; no other cell of this
# notebook defines it
mapping_entity_colors = create_new_color_dict(
    adata=model.adata,
    cat_key=mapping_entity_key)

# Dedicated output path; the `file_path` left over from earlier cells points
# to a CSV summary. NOTE(review): the file name is a suggestion — adjust as needed.
file_path = f"{figure_folder_path}/mapping_entities_latent_physical_space.pdf"

plot_category_in_latent_and_physical_space(
    adata=model.adata,
    plot_label="Mapping Entities",
    cat_key=mapping_entity_key,
    groups=None,
    sample_key=sample_key,
    samples=samples,
    cat_colors=mapping_entity_colors,
    size=(720000 / len(model.adata)),
    spot_size=spot_size,
    save_fig=save_fig,
    file_path=file_path)

In [None]:
# Differential gene program test: latent cluster 10 against clusters 4, 5 and 9
# Log Bayes factor threshold: 2.3 strong; 4.6 decisive (https://en.wikipedia.org/wiki/Bayes_factor)
log_bayes_factor_thresh = 2.3

enriched_gps = model.run_differential_gp_tests(
    cat_key=latent_cluster_key,
    selected_cats=["10"],
    comparison_cats=["4", "5", "9"],
    log_bayes_factor_thresh=log_bayes_factor_thresh,
)

In [None]:
# Differential gene program test: latent cluster 10 against clusters 4, 5 and 9
# Log Bayes factor threshold: 2.3 strong; 4.6 decisive (https://en.wikipedia.org/wiki/Bayes_factor)
log_bayes_factor_thresh = 2.3

enriched_gps = model.run_differential_gp_tests(
    cat_key=latent_cluster_key,
    selected_cats=["10"],
    comparison_cats=["4", "5", "9"],
    log_bayes_factor_thresh=log_bayes_factor_thresh,
)

In [None]:
# Load trained model
# model = NicheCompass.load(dir_path=model_folder_path,
#                           adata=None,
#                           adata_file_name=f"{dataset}_{model_label}_postprocessed.h5ad",
#                           gp_names_key=gp_names_key)