# BANKSY

- **Creator**: Anamika Yadav (anamika310.yadav@gmail.com)
- **Date of Creation:** 12.07.2024
- **Date of Last Modification:** 22.07.2024 (Sebastian Birk; <sebastian.birk@helmholtz-munich.de>)

- The Banksy source code is available at https://github.com/prabhakarlab/Banksy (R) and https://github.com/prabhakarlab/Banksy_py (Python).
- The corresponding publication is "Singhal, V. et al. BANKSY unifies cell typing and tissue domain segmentation for scalable spatial omics data analysis. Nat. Genet. (2024) doi:10.1038/s41588-024-01664-3".
- The workflow of this notebook follows the tutorial from https://github.com/prabhakarlab/Banksy_py/blob/main/slideseqv2_analysis.ipynb.

- Run this notebook in the nichecompass-reproducibility environment, which can be installed from `../../../envs/environment.yaml`. In addition, the Banksy_py repository must be cloned from GitHub as follows:
    - ```cd analysis/benchmarking```
    - ```git clone https://github.com/prabhakarlab/Banksy_py.git```

## 1. Setup

### 1.1 Import Libraries

In [None]:
# Enable the IPython autoreload extension in mode 2, which reloads all imported
# modules before each cell is executed.
%load_ext autoreload
%autoreload 2

In [None]:
import sys
# Make the locally cloned Banksy_py repository importable (it provides the
# `banksy` and `banksy_utils` packages imported below). The path is relative to
# the working directory in which the notebook is run.
sys.path.append("../Banksy_py")

In [None]:
import os
import time
from datetime import datetime

import anndata as ad
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import scipy.sparse as sp
import squidpy as sq
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder

from banksy.initialize_banksy import initialize_banksy
from banksy_utils.load_data import load_adata, display_adata
from banksy_utils.filter_utils import filter_cells, filter_hvg, normalize_total, print_max_min
from banksy_utils.umap_pca import pca_umap
from banksy.embed_banksy import generate_banksy_matrix

### 1.2 Define Parameters

In [None]:
# Name of the benchmarked method; used in output file names and result keys
model_name = "banksy"
# Prefix of the `obsm` keys under which latent representations are stored
latent_key = model_name + "_latent"
# List of lambda parameters.
# NOTE: `train_banksy_models` defines its own local `lambda_list`, so this
# notebook-level value is not used by it.
lambda_list = [0.2]

### 1.3 Run Notebook Setup

In [None]:
# Timestamp of this notebook run; used to create a unique figure folder
now = datetime.now()
current_timestamp = f"{now:%d%m%Y_%H%M%S}"

# Default figure size for scanpy plots
sc.set_figure_params(figsize=(6, 6))

### 1.4 Configure Paths and Directories


In [None]:
# Folder containing the input datasets. The trailing slash is required because
# dataset file names are appended directly to this prefix.
data_folder_path = "../../../datasets/st_data/gold/"
# Folder to which the AnnData files with the benchmarking results are written
benchmarking_folder_path = "../../../artifacts/single_sample_method_benchmarking"
# Root folder for figures (dataset-specific subfolders are created below it)
figure_folder_path = "../../../figures"

## 2. Banksy Model


### 2.1 Define Training Function


In [None]:
def train_banksy_models(dataset,
                        cell_type_key,
                        niche_type_key=None,
                        adata_new=None,
                        n_start_run=1,
                        n_end_run=8,
                        n_neighbor_list=None,
                        gp_inference=False):
    """
    Train a series of Banksy models on a dataset and store the resulting latent
    representations and training durations in an AnnData file on disk.

    For each run, the dataset is read from
    ``{data_folder_path}{dataset}.h5ad``, a Banksy embedding is computed and
    the result is added to ``adata_new``, which is written to
    ``{benchmarking_folder_path}/{dataset}_{model_name}.h5ad`` (or
    ``..._gpinference.h5ad`` if ``gp_inference`` is ``True``) after every run
    and once more after all runs.

    Parameters
    ----------
    dataset:
        File name of the dataset without the ``.h5ad`` extension.
    cell_type_key:
        Column in ``adata.obs`` of the dataset whose values are copied to
        ``adata_new.obs["cell_type"]``.
    niche_type_key:
        Optional column in ``adata.obs`` of the dataset whose values are
        copied to ``adata_new.obs["niche_type"]`` if the column exists.
    adata_new:
        AnnData object to which results are added. If ``None``, an AnnData
        object with an empty sparse count matrix is created that only carries
        the labels and spatial coordinates of the dataset.
    n_start_run:
        Number of the first run (inclusive).
    n_end_run:
        Number of the last run (inclusive).
    n_neighbor_list:
        Number of neighbors for each run. Paired with the run numbers via
        ``zip``, i.e. the first entry belongs to run ``n_start_run`` and
        surplus runs or entries are ignored. Defaults to
        ``[4, 4, 8, 8, 12, 12, 16, 16]``.
    gp_inference:
        If ``True``, the results are stored in a file with the suffix
        ``_gpinference``. This flag only affects the output file name.

    Returns
    -------
    None. Results are written to disk.
    """
    if n_neighbor_list is None:
        n_neighbor_list = [4, 4, 8, 8, 12, 12, 16, 16]

    # Configure figure folder path
    dataset_figure_folder_path = f"{figure_folder_path}/{dataset}/single_sample_method_benchmarking/" \
                                 f"{model_name}/{current_timestamp}"
    os.makedirs(dataset_figure_folder_path, exist_ok=True)

    # Configure output file path and make sure the output folder exists so
    # that writing the results does not fail after a training run has finished
    os.makedirs(benchmarking_folder_path, exist_ok=True)
    output_file_suffix = "_gpinference" if gp_inference else ""
    output_file_path = (f"{benchmarking_folder_path}/{dataset}_{model_name}"
                        f"{output_file_suffix}.h5ad")

    # Create new adata to store results from training runs in storage-efficient way
    if adata_new is None:
        adata_original = sc.read_h5ad(data_folder_path + f"{dataset}.h5ad")
        adata_new = sc.AnnData(sp.csr_matrix(
            (adata_original.shape[0], adata_original.shape[1]),
            dtype=np.float32))
        adata_new.var_names = adata_original.var_names
        adata_new.obs_names = adata_original.obs_names
        adata_new.obs["cell_type"] = adata_original.obs[cell_type_key].values
        if (niche_type_key is not None
                and niche_type_key in adata_original.obs.columns):
            adata_new.obs["niche_type"] = adata_original.obs[niche_type_key].values
        adata_new.obsm["spatial"] = adata_original.obsm["spatial"]
        del adata_original

    # Set default model hyperparams (identical for all runs)
    max_m = 1 # use both mean and AGF
    nbr_weight_decay = "scaled_gaussian" # can also choose "reciprocal", "uniform" or "ranked"
    # NOTE: this local value is the one used for training; it shadows the
    # notebook-level `lambda_list`, which has no effect inside this function
    lambda_list = [0.8]
    pca_dims = [20]

    for run_number, n_neighbors in zip(range(n_start_run, n_end_run + 1), n_neighbor_list):
        # n_neighbors is here used for k_geom parameter in banksy method as well as the latent neighbor graph construction used for
        # UMAP generation and clustering

        # Load data (read anew in every run so that each run starts from the
        # unmodified dataset)
        adata = sc.read_h5ad(data_folder_path + f"{dataset}.h5ad")

        start_time = time.time()

        # Define spatial coordinates
        adata.obs["spatial_x"] = adata.obsm['spatial'][:, 0]
        adata.obs["spatial_y"] = adata.obsm['spatial'][:, 1]

        banksy_dict = initialize_banksy(
            adata,
            ("spatial_x", "spatial_y", "spatial"),
            n_neighbors,
            nbr_weight_decay=nbr_weight_decay,
            max_m=max_m,
            plt_edge_hist=False,
            plt_nbr_weights=False,
            plt_agf_angles=False, # takes long time to plot
            plt_theta=False)

        banksy_dict, banksy_matrix = generate_banksy_matrix(
            adata,
            banksy_dict,
            lambda_list,
            max_m)

        pca_umap(
            banksy_dict,
            pca_dims = pca_dims,
            add_umap = True,
            plt_remaining_var = False)

        # Use the PCA-reduced Banksy matrix as latent representation; the key
        # is derived from `pca_dims` so that both stay in sync
        adata.obsm[latent_key] = (
            banksy_dict[nbr_weight_decay][lambda_list[0]]["adata"]
            .obsm[f"reduced_pc_{pca_dims[0]}"])

        # Measure time for model training
        end_time = time.time()
        elapsed_time = end_time - start_time
        hours, rem = divmod(elapsed_time, 3600)
        minutes, seconds = divmod(rem, 60)
        print(f"Duration of model training in run {run_number}: "
              f"{int(hours)} hours, {int(minutes)} minutes and {int(seconds)} seconds.")
        adata_new.uns[f"{model_name}_model_training_duration_run{run_number}"] = (
            elapsed_time)

        # Store latent representation
        adata_new.obsm[latent_key + f"_run{run_number}"] = adata.obsm[latent_key]

        # Store intermediate adata to disk so that finished runs are not lost
        # if a later run fails
        adata_new.write(output_file_path)

    # Store final adata to disk (also covers the case that no run was executed)
    adata_new.write(output_file_path)

### 2.2 Train Models on Benchmarking Datasets


In [None]:
# Train 8 runs on the `seqfish_mouse_organogenesis_embryo2` dataset
train_banksy_models(
    dataset="seqfish_mouse_organogenesis_embryo2",
    cell_type_key="celltype_mapped_refined",
    adata_new=None,
    n_start_run=1,
    n_end_run=8,
    n_neighbor_list=[4, 4, 8, 8, 12, 12, 16, 16],
)

In [None]:
# Train 8 runs on each of the
# `seqfish_mouse_organogenesis_subsample_{pct}pct_embryo2` datasets
for subsample_pct in (50, 25, 10, 5, 1):
    train_banksy_models(
        dataset=f"seqfish_mouse_organogenesis_subsample_{subsample_pct}pct_embryo2",
        cell_type_key="celltype_mapped_refined",
        adata_new=None,
        n_start_run=1,
        n_end_run=8,
        n_neighbor_list=[4, 4, 8, 8, 12, 12, 16, 16],
    )

In [None]:
# Train 8 runs on the `nanostring_cosmx_human_nsclc_batch5` dataset
train_banksy_models(
    dataset="nanostring_cosmx_human_nsclc_batch5",
    cell_type_key="cell_type",
    adata_new=None,
    n_start_run=1,
    n_end_run=8,
    n_neighbor_list=[4, 4, 8, 8, 12, 12, 16, 16],
)

In [None]:
# Train 8 runs on each of the
# `nanostring_cosmx_human_nsclc_subsample_{pct}pct_batch5` datasets
for subsample_pct in (50, 25, 10, 5, 1):
    train_banksy_models(
        dataset=f"nanostring_cosmx_human_nsclc_subsample_{subsample_pct}pct_batch5",
        cell_type_key="cell_type",
        adata_new=None,
        n_start_run=1,
        n_end_run=8,
        n_neighbor_list=[4, 4, 8, 8, 12, 12, 16, 16],
    )

In [None]:
# Train 8 runs on the `vizgen_merfish_mouse_liver` dataset
train_banksy_models(
    dataset="vizgen_merfish_mouse_liver",
    cell_type_key="Cell_Type",
    adata_new=None,
    n_start_run=1,
    n_end_run=8,
    n_neighbor_list=[4, 4, 8, 8, 12, 12, 16, 16],
)

In [None]:
# Train 8 runs on each of the
# `vizgen_merfish_mouse_liver_subsample_{pct}pct` datasets
for subsample_pct in (50, 25, 10, 5, 1):
    train_banksy_models(
        dataset=f"vizgen_merfish_mouse_liver_subsample_{subsample_pct}pct",
        cell_type_key="Cell_Type",
        adata_new=None,
        n_start_run=1,
        n_end_run=8,
        n_neighbor_list=[4, 4, 8, 8, 12, 12, 16, 16],
    )

In [None]:
# Train 8 runs on the `slideseqv2_mouse_hippocampus` dataset
train_banksy_models(
    dataset="slideseqv2_mouse_hippocampus",
    cell_type_key="cell_type",
    adata_new=None,
    n_start_run=1,
    n_end_run=8,
    n_neighbor_list=[4, 4, 8, 8, 12, 12, 16, 16],
)

In [None]:
# Train 8 runs on each of the
# `slideseqv2_mouse_hippocampus_subsample_{pct}pct` datasets
for subsample_pct in (50, 25, 10, 5, 1):
    train_banksy_models(
        dataset=f"slideseqv2_mouse_hippocampus_subsample_{subsample_pct}pct",
        cell_type_key="cell_type",
        adata_new=None,
        n_start_run=1,
        n_end_run=8,
        n_neighbor_list=[4, 4, 8, 8, 12, 12, 16, 16],
    )

In [None]:
# Train 8 runs on the `sim1_1105genes_10000locs_strongincrements` dataset,
# additionally storing the niche labels from `niche_types`
train_banksy_models(
    dataset="sim1_1105genes_10000locs_strongincrements",
    cell_type_key="cell_types",
    niche_type_key="niche_types",
    adata_new=None,
    n_start_run=1,
    n_end_run=8,
    n_neighbor_list=[4, 4, 8, 8, 12, 12, 16, 16],
)

In [None]:
# Train 8 runs on the `sim1_1105genes_10000locs_strongincrements` dataset with
# 6 neighbors in every run; results go to the separate `_gpinference` file
train_banksy_models(
    dataset="sim1_1105genes_10000locs_strongincrements",
    cell_type_key="cell_types",
    niche_type_key="niche_types",
    adata_new=None,
    n_start_run=1,
    n_end_run=8,
    n_neighbor_list=[6, 6, 6, 6, 6, 6, 6, 6],
    gp_inference=True,
)

In [None]:
# Train 8 runs on the `starmap_mouse_mpfc` dataset, additionally storing the
# niche labels from `niche_type`
train_banksy_models(
    dataset="starmap_mouse_mpfc",
    cell_type_key="cell_type",
    niche_type_key="niche_type",
    adata_new=None,
    n_start_run=1,
    n_end_run=8,
    n_neighbor_list=[4, 4, 8, 8, 12, 12, 16, 16],
)

In [None]:
# Train 8 runs on the `stereoseq_mouse_embryo` dataset; the `leiden` column is
# stored as cell type and `niche_type` as niche label
train_banksy_models(
    dataset="stereoseq_mouse_embryo",
    cell_type_key="leiden",
    niche_type_key="niche_type",
    adata_new=None,
    n_start_run=1,
    n_end_run=8,
    n_neighbor_list=[4, 4, 8, 8, 12, 12, 16, 16],
)