# CellCharter

- **Creator**: Sebastian Birk (<sebastian.birk@helmholtz-munich.de>)
- **Date of Creation:** 11.03.2024
- **Date of Last Modification:** 22.07.2024

- The CellCharter source code is available at https://github.com/CSOgroup/cellcharter.
- The corresponding publication is "Varrone, M., Tavernari, D., Santamaria-Martínez, A., Walsh, L. A. & Ciriello, G. CellCharter reveals spatial cell niches associated with tissue remodeling and cell plasticity. Nat. Genet. 56, 74–84 (2024)".
- The workflow of this notebook follows the tutorial from https://cellcharter.readthedocs.io/en/latest/notebooks/cosmx_human_nsclc.html.

Run this notebook in the `cellcharter` environment, installable from `../../../envs/environment_cellcharter.yaml`. The required PyTorch version only supports the CUDA compute capabilities sm_37, sm_50, sm_60 and sm_70, so it will not work on newer GPUs.

## 1. Setup

### 1.1 Import Libraries

In [None]:
# Enable IPython's autoreload extension in mode 2 so that imported modules are
# reloaded automatically before code is executed
%load_ext autoreload
%autoreload 2

In [None]:
import os
import time
from datetime import datetime

import anndata as ad
import cellcharter as cc
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import scipy.sparse as sp
import squidpy as sq

### 1.2 Define Parameters

In [None]:
# Name of the benchmarked method; used to build output keys, folders and file names
model_name = "cellcharter"
# Key under which the CellCharter latent representation is stored in `obsm`
latent_key = model_name + "_latent"

### 1.3 Run Notebook Setup

In [None]:
# Set the default scanpy figure size to (6, 6)
sc.set_figure_params(figsize=(6, 6))

In [None]:
# Timestamp of this notebook execution (day, month, year, then time of day);
# used to timestamp saved artifacts
now = datetime.now()
current_timestamp = f"{now:%d%m%Y_%H%M%S}"

### 1.4 Configure Paths and Directories

In [None]:
# Folder with the input datasets (used as a string prefix, hence the trailing slash)
data_folder_path = "../../../datasets/st_data/gold/"
# Folder from which the scVI results are read and to which the CellCharter results are written
benchmarking_folder_path = "../../../artifacts/single_sample_method_benchmarking"
# Root folder for figures
figure_folder_path = "../../../figures"

## 2. CellCharter Model

### 2.1 Define Training Function

In [None]:
def train_cellcharter_models(dataset,
                             cell_type_key,
                             niche_type_key=None,
                             adata_new=None,
                             n_start_run=1,
                             n_end_run=8,
                             n_neighbor_list=None):
    """
    Compute CellCharter latent representations for multiple benchmarking runs
    of a dataset and store them in a single, storage-efficient AnnData file.

    For each run, the previously computed scVI results are loaded from
    ´{benchmarking_folder_path}/{dataset}_scvi.h5ad´, a spatial neighbor graph
    is built, long links are removed and the scVI latent representation of
    that run is aggregated over 3 neighborhood layers. The aggregated
    representation and the training duration (scVI duration + CellCharter
    duration) are stored under run-specific keys, and the result is written to
    ´{benchmarking_folder_path}/{dataset}_{model_name}.h5ad´ after every run.

    Parameters
    ----------
    dataset:
        Name of the dataset; used to build the input and output file names.
    cell_type_key:
        Key in ´adata.obs´ of the original dataset that holds the cell type
        annotations (copied to ´obs["cell_type"]´ of the result).
    niche_type_key:
        Optional key in ´adata.obs´ of the original dataset that holds niche
        type annotations. If given and present, they are copied to
        ´obs["niche_type"]´ of the result.
    adata_new:
        Existing result AnnData to which new runs are added. If ´None´, a new
        one with an empty sparse count matrix is created from the original
        dataset.
    n_start_run:
        Number of the first run (inclusive).
    n_end_run:
        Number of the last run (inclusive).
    n_neighbor_list:
        Number of spatial neighbors per run, matched by position to the runs
        ´n_start_run´, ..., ´n_end_run´. Surplus entries are ignored. If
        ´None´, defaults to ´[4, 4, 8, 8, 12, 12, 16, 16]´.

    Raises
    ------
    ValueError
        If ´n_neighbor_list´ has fewer entries than there are runs, since the
        remaining runs would otherwise be skipped silently.
    """
    # Resolve the default here instead of using a mutable default argument
    if n_neighbor_list is None:
        n_neighbor_list = [4, 4, 8, 8, 12, 12, 16, 16]

    # Fail fast before any expensive work if some runs would be dropped by zip
    run_numbers = range(n_start_run, n_end_run + 1)
    if len(n_neighbor_list) < len(run_numbers):
        raise ValueError(
            f"'n_neighbor_list' has {len(n_neighbor_list)} entries but "
            f"{len(run_numbers)} runs were requested (runs {n_start_run} to "
            f"{n_end_run}).")

    # Configure figure folder path
    dataset_figure_folder_path = f"{figure_folder_path}/{dataset}/single_sample_method_benchmarking/" \
                                 f"{model_name}/{current_timestamp}"
    os.makedirs(dataset_figure_folder_path, exist_ok=True)

    # Create new adata to store results from training runs in storage-efficient way
    if adata_new is None:
        adata_original = sc.read_h5ad(data_folder_path + f"{dataset}.h5ad")
        adata_new = sc.AnnData(sp.csr_matrix(
            (adata_original.shape[0], adata_original.shape[1]),
            dtype=np.float32))
        adata_new.var_names = adata_original.var_names
        adata_new.obs_names = adata_original.obs_names
        adata_new.obs["cell_type"] = adata_original.obs[cell_type_key].values
        if niche_type_key is not None and niche_type_key in adata_original.obs.columns:
            adata_new.obs["niche_type"] = adata_original.obs[niche_type_key].values
        adata_new.obsm["spatial"] = adata_original.obsm["spatial"]
        del adata_original

    for run_number, n_neighbors in zip(run_numbers, n_neighbor_list):
        # n_neighbors determines the size of the spatial neighborhood graph
        # over which CellCharter aggregates the scVI latent representation

        # Load trained SCVI results (reloaded in every run so that each run
        # starts from an unmodified adata)
        adata = sc.read_h5ad(f"{benchmarking_folder_path}/{dataset}_scvi.h5ad")

        start_time = time.time()

        adata.obsm["latent_scvi"] = adata.obsm[f"scvi_latent_run{run_number}"].astype(np.float32)

        # Compute spatial neighborhood graph
        sq.gr.spatial_neighbors(adata,
                                coord_type="generic",
                                spatial_key="spatial",
                                n_neighs=n_neighbors)

        # Use CellCharter as per tutorial
        cc.gr.remove_long_links(adata)
        cc.gr.aggregate_neighbors(adata, n_layers=3, use_rep="latent_scvi", out_key=latent_key)

        # Measure time for model training
        end_time = time.time()
        elapsed_time = end_time - start_time
        hours, rem = divmod(elapsed_time, 3600)
        minutes, seconds = divmod(rem, 60)
        print(f"Duration of model training in run {run_number}: "
              f"{int(hours)} hours, {int(minutes)} minutes and {int(seconds)} seconds.")
        # Total duration includes the scVI training that produced the input latent
        adata_new.uns[f"{model_name}_model_training_duration_run{run_number}"] = (
            adata.uns[f"scvi_model_training_duration_run{run_number}"] + elapsed_time)

        # Store latent representation
        adata_new.obsm[latent_key + f"_run{run_number}"] = adata.obsm[latent_key]

        # Store intermediate adata to disk
        adata_new.write(f"{benchmarking_folder_path}/{dataset}_{model_name}.h5ad")

    # Store final adata to disk
    adata_new.write(f"{benchmarking_folder_path}/{dataset}_{model_name}.h5ad")

### 2.2 Train Models on Benchmarking Datasets

In [None]:
# Runs 1-8 on the `seqfish_mouse_organogenesis_embryo2` dataset
train_cellcharter_models(
    dataset="seqfish_mouse_organogenesis_embryo2",
    cell_type_key="celltype_mapped_refined",
    adata_new=None,
    n_start_run=1,
    n_end_run=8,
    n_neighbor_list=[4, 4, 8, 8, 12, 12, 16, 16],
)

In [None]:
# Runs 1-8 on each `seqfish_mouse_organogenesis_subsample_{pct}pct_embryo2` dataset
for subsample_pct in [50, 25, 10, 5, 1]:
    train_cellcharter_models(
        dataset=f"seqfish_mouse_organogenesis_subsample_{subsample_pct}pct_embryo2",
        cell_type_key="celltype_mapped_refined",
        adata_new=None,
        n_start_run=1,
        n_end_run=8,
        n_neighbor_list=[4, 4, 8, 8, 12, 12, 16, 16],
    )

In [None]:
# Runs 1-8 on the `nanostring_cosmx_human_nsclc_batch5` dataset
train_cellcharter_models(
    dataset="nanostring_cosmx_human_nsclc_batch5",
    cell_type_key="cell_type",
    adata_new=None,
    n_start_run=1,
    n_end_run=8,
    n_neighbor_list=[4, 4, 8, 8, 12, 12, 16, 16],
)

In [None]:
# Runs 1-8 on each `nanostring_cosmx_human_nsclc_subsample_{pct}pct_batch5` dataset
for subsample_pct in [50, 25, 10, 5, 1]:
    train_cellcharter_models(
        dataset=f"nanostring_cosmx_human_nsclc_subsample_{subsample_pct}pct_batch5",
        cell_type_key="cell_type",
        adata_new=None,
        n_start_run=1,
        n_end_run=8,
        n_neighbor_list=[4, 4, 8, 8, 12, 12, 16, 16],
    )

In [None]:
# Runs 1-8 on the `vizgen_merfish_mouse_liver` dataset
train_cellcharter_models(
    dataset="vizgen_merfish_mouse_liver",
    cell_type_key="Cell_Type",
    adata_new=None,
    n_start_run=1,
    n_end_run=8,
    n_neighbor_list=[4, 4, 8, 8, 12, 12, 16, 16],
)

In [None]:
# Runs 1-8 on each `vizgen_merfish_mouse_liver_subsample_{pct}pct` dataset
for subsample_pct in [50, 25, 10, 5, 1]:
    train_cellcharter_models(
        dataset=f"vizgen_merfish_mouse_liver_subsample_{subsample_pct}pct",
        cell_type_key="Cell_Type",
        adata_new=None,
        n_start_run=1,
        n_end_run=8,
        n_neighbor_list=[4, 4, 8, 8, 12, 12, 16, 16],
    )

In [None]:
# Runs 1-8 on the `slideseqv2_mouse_hippocampus` dataset
train_cellcharter_models(
    dataset="slideseqv2_mouse_hippocampus",
    cell_type_key="cell_type",
    adata_new=None,
    n_start_run=1,
    n_end_run=8,
    n_neighbor_list=[4, 4, 8, 8, 12, 12, 16, 16],
)

In [None]:
# Runs 1-8 on each `slideseqv2_mouse_hippocampus_subsample_{pct}pct` dataset
for subsample_pct in [50, 25, 10, 5, 1]:
    train_cellcharter_models(
        dataset=f"slideseqv2_mouse_hippocampus_subsample_{subsample_pct}pct",
        cell_type_key="cell_type",
        adata_new=None,
        n_start_run=1,
        n_end_run=8,
        n_neighbor_list=[4, 4, 8, 8, 12, 12, 16, 16],
    )

In [None]:
# Runs 1-8 on the `sim1_1105genes_10000locs_strongincrements` dataset; the
# `niche_types` annotation is carried over if the dataset contains it
train_cellcharter_models(
    dataset="sim1_1105genes_10000locs_strongincrements",
    cell_type_key="cell_types",
    niche_type_key="niche_types",
    adata_new=None,
    n_start_run=1,
    n_end_run=8,
    n_neighbor_list=[4, 4, 8, 8, 12, 12, 16, 16],
)

In [None]:
# Runs 1-8 on the `starmap_mouse_mpfc` dataset; the `niche_type` annotation is
# carried over if the dataset contains it
train_cellcharter_models(
    dataset="starmap_mouse_mpfc",
    cell_type_key="cell_type",
    niche_type_key="niche_type",
    adata_new=None,
    n_start_run=1,
    n_end_run=8,
    n_neighbor_list=[4, 4, 8, 8, 12, 12, 16, 16],
)

In [None]:
# Runs 1-8 on the `stereoseq_mouse_embryo` dataset, using the `leiden` column
# as cell type annotation; the `niche_type` annotation is carried over if the
# dataset contains it
train_cellcharter_models(
    dataset="stereoseq_mouse_embryo",
    cell_type_key="leiden",
    niche_type_key="niche_type",
    adata_new=None,
    n_start_run=1,
    n_end_run=8,
    n_neighbor_list=[4, 4, 8, 8, 12, 12, 16, 16],
)