In [1]:
# Enable IPython's autoreload extension in mode 2 so that edited modules are
# re-imported automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

In [2]:
import argparse
import os
import sys

import mlflow
import numpy as np
import scanpy as sc
import squidpy as sq

from autotalker.data import load_spatial_adata_from_csv
from autotalker.models import Autotalker
from autotalker.utils import add_gps_from_gp_dict_to_adata
from autotalker.utils import extract_gp_dict_from_nichenet_ligand_target_mx
from autotalker.utils import extract_gp_dict_from_omnipath_lr_interactions

In [11]:
# Dataset selector. The loading cell handles "deeplinc_seqfish",
# "squidpy_seqfish" and "squidpy_slideseqv2".
dataset = "squidpy_seqfish"

In [12]:
# Load the selected dataset into `adata` and set `cell_type_key`, the key used
# further down to colour plots by cell type (None for the CSV-based dataset).
print(f"Using dataset {dataset}.")

if dataset == "deeplinc_seqfish":
    # Counts and adjacency are read from local CSV files.
    adata = load_spatial_adata_from_csv("datasets/seqFISH/counts.csv",
                                        "datasets/seqFISH/adj.csv")
    cell_type_key = None
elif dataset == "squidpy_seqfish":
    adata = sq.datasets.seqfish()
    # Compute the spatial neighbour graph with a fixed radius on generic coordinates.
    sq.gr.spatial_neighbors(adata, radius=0.04, coord_type="generic")
    cell_type_key = "celltype_mapped_refined"
elif dataset == "squidpy_slideseqv2":
    adata = sq.datasets.slideseqv2()
    sq.gr.spatial_neighbors(adata, radius=30.0, coord_type="generic")
    cell_type_key = "cluster"
else:
    # Fail here with a clear message instead of a NameError on `adata` in a
    # later cell when the selector contains a typo or an unsupported name.
    raise ValueError(
        f"Unknown dataset '{dataset}'. Expected one of: 'deeplinc_seqfish', "
        "'squidpy_seqfish', 'squidpy_slideseqv2'.")

Using dataset squidpy_seqfish.


In [13]:
# Print basic statistics of the spatial graph.
# The adjacency matrix is kept sparse: converting it with `.toarray()` builds a
# dense (n_nodes x n_nodes) array, which is very large for ~20k nodes and was
# previously done twice.
adj = adata.obsp["spatial_connectivities"]

print(f"Number of nodes: {adata.X.shape[0]}")
print(f"Number of node features: {adata.X.shape[1]}")

# Column sums give the number of edges per node; average them over all nodes.
avg_edges_per_node = round(float(adj.sum(axis=0).mean()), 2)
print(f"Average number of edges per node: {avg_edges_per_node}")

# Sum the entries on or above the diagonal (row <= col), i.e. the same entries
# `np.triu` keeps, so every undirected edge is counted once.
adj_coo = adj.tocoo()
n_edges = int(adj_coo.data[adj_coo.row <= adj_coo.col].sum())
print(f"Number of edges: {n_edges}")

Number of nodes: 19416
Number of node features: 351
Average number of edges per node: 4.4
Number of edges: 42694


In [None]:
# Create the "mlruns" directory if it does not exist yet
# (presumably MLflow's local tracking directory — confirm).
os.makedirs("mlruns", exist_ok=True)

In [None]:
# Select the "autotalker" MLflow experiment and record which dataset is used.
# NOTE(review): calling `mlflow.log_param` without an explicit run starts an
# implicit run that stays active until `mlflow.end_run()` is called; confirm
# that `model.train` is meant to log into that same run.
experiment = mlflow.set_experiment("autotalker")
mlflow.log_param("dataset", dataset)

In [None]:
# Build an all-ones mask of shape (n_latent, n_output) for the unmasked model.
node_label_method = "one-hop"
n_latent = 32

# The number of output features depends on the node label method: one entry per
# gene for "self", twice that for "one-hop" (presumably own plus neighbourhood
# expression — confirm against the model implementation).
if node_label_method == "self":
    n_output = len(adata.var)
elif node_label_method == "one-hop":
    n_output = len(adata.var) * 2
else:
    # Without this branch `n_output` would be undefined and the cell would fail
    # with a NameError that hides the real cause.
    raise ValueError(
        f"Unsupported node_label_method '{node_label_method}'. "
        "Expected 'self' or 'one-hop'.")

# Mask that allows all genes
mask = np.ones((n_latent, n_output))
print(mask.shape)

In [None]:
# Instantiate the model with the all-ones mask; only the gene expression
# reconstruction loss is enabled here.
# `node_label_method` is taken from the variable that was used to size `mask`,
# so the mask width and the model's node labels cannot drift apart when the
# setting is changed in the mask cell.
model = Autotalker(adata,
                   adj_key="spatial_connectivities",
                   gp_mask_key="autotalker_gp_mask",
                   include_edge_recon_loss=False,
                   include_gene_expr_recon_loss=True,
                   node_label_method=node_label_method,
                   n_hidden_encoder=32,
                   dropout_rate_encoder=0.,
                   dropout_rate_graph_decoder=0.,
                   gp_mask=mask)

In [None]:
# Training settings for the unmasked model, collected in one mapping so each
# hyperparameter is easy to locate and change.
train_params = {
    "n_epochs": 20,
    "lr": 0.01,
    "weight_decay": 0,
    # Fractions of edges / nodes held out for validation and testing.
    "edge_val_ratio": 0.1,
    "edge_test_ratio": 0.05,
    "node_val_ratio": 0.1,
    "node_test_ratio": 0.0,
    "edge_batch_size": 64,
    # Experiment object returned by `mlflow.set_experiment` earlier in the notebook.
    "mlflow_experiment_id": experiment.experiment_id,
}

model.train(**train_params)

In [None]:
# Persist the trained model to "./model_artefacts", replacing earlier artefacts
# (`overwrite=True`) and storing the AnnData object next to it as "adata.h5ad".
model.save(dir_path="./model_artefacts",
           overwrite=True,
           save_adata=True,
           adata_file_name="adata.h5ad")

In [None]:
# Reload the model from the directory written by the save cell.
# `adata=None` presumably makes `load` read the AnnData file named by
# `adata_file_name` from that directory — confirm against `Autotalker.load`.
model = Autotalker.load(dir_path="./model_artefacts",
                        adata=None,
                        adata_file_name="adata.h5ad")

In [None]:
# Latent representation obtained without passing data explicitly (presumably
# computed on the AnnData object held by the model — confirm).
latent = model.get_latent_representation()

In [None]:
# Latent representation for an explicitly supplied AnnData object (here the
# in-memory `adata`).
latent_new_data = model.get_latent_representation(adata)

In [None]:
# Store the embedding in `adata.obsm` so scanpy can use it via `use_rep`.
adata.obsm["latent_autotalker"] = latent_new_data

## Interoperability with scanpy

In [None]:
# Visualise the Autotalker latent space: build the neighbour graph on the
# stored embedding, compute a UMAP and plot it coloured by `cell_type_key`.
# The `save` suffix makes scanpy write the figure to disk as well.
latent_rep_key = "latent_autotalker"

sc.set_figure_params(figsize=(6, 6))
sc.pp.neighbors(adata, use_rep=latent_rep_key)
sc.tl.umap(adata, min_dist=0.3)
sc.pl.umap(adata,
           color=[cell_type_key],
           frameon=False,
           save="_latent.png")

## Add Explainable Cell-Cell-Interaction Gene Programs (GPs)

In [24]:
# Locations of the gene program resource files; the folder is created up front
# so the extraction cells can read from / write to it.
gp_data_folder_path = "datasets/gp_data"
os.makedirs(gp_data_folder_path, exist_ok=True)

nichenet_ligand_target_mx_file_path = (
    f"{gp_data_folder_path}/nichenet_ligand_target_matrix.csv")
omnipath_lr_interactions_file_path = (
    f"{gp_data_folder_path}/omnipath_lr_interactions.csv")

### NicheNet

In [26]:
# Extract gene programs from the NicheNet ligand-target matrix.
# NOTE(review): `load_from_disk` and `save_to_disk` are both True; confirm how
# the helper behaves when the file at `file_path` does not exist yet (e.g. on
# the first run, right after the data folder has been created).
nichenet_gp_dict = extract_gp_dict_from_nichenet_ligand_target_mx(
    keep_target_ratio=0.1,
    load_from_disk=True,
    save_to_disk=True,
    file_path=nichenet_ligand_target_mx_file_path)

In [27]:
# Add the NicheNet gene programs to `adata` under the given `.varm` / `.uns` keys.
# NOTE(review): the OmniPath and combined cells further down call the same
# helper with identical keys, so these entries are presumably overwritten when
# those cells run — confirm this is intended.
add_gps_from_gp_dict_to_adata(
    adata=adata,
    gp_dict=nichenet_gp_dict,
    genes_uppercase=True,
    gp_targets_varm_key="autotalker_gp_targets",
    gp_sources_varm_key="autotalker_gp_sources",
    gp_names_uns_key="autotalker_gp_names",
    min_genes_per_gp=0,
    max_genes_per_gp=None)

### OmniPath

In [41]:
# Extract gene programs from OmniPath ligand-receptor interactions.
# NOTE(review): `load_from_disk` and `save_to_disk` are both True; confirm how
# the helper behaves when the file at `file_path` does not exist yet.
omnipath_gp_dict = extract_gp_dict_from_omnipath_lr_interactions(
    min_curation_effort=0,
    load_from_disk=True,
    save_to_disk=True,
    file_path=omnipath_lr_interactions_file_path)

In [65]:
# Add the OmniPath gene programs to `adata`.
# NOTE(review): uses the same `.varm` / `.uns` keys as the NicheNet cell, so
# the earlier entries are presumably replaced — confirm this is intended.
add_gps_from_gp_dict_to_adata(
    adata=adata,
    gp_dict=omnipath_gp_dict,
    genes_uppercase=True,
    gp_targets_varm_key="autotalker_gp_targets",
    gp_sources_varm_key="autotalker_gp_sources",
    gp_names_uns_key="autotalker_gp_names",
    min_genes_per_gp=0,
    max_genes_per_gp=None)

### NicheNet & OmniPath Combined

In [51]:
# Merge both resources into a new dict without modifying either input; for
# names present in both, the OmniPath entry wins because it is unpacked last.
combined_gp_dict = {**nichenet_gp_dict, **omnipath_gp_dict}

In [67]:
# Add the merged NicheNet + OmniPath gene programs to `adata`. This is the last
# call writing to these keys, so the inspection cells that follow presumably
# show the combined result.
add_gps_from_gp_dict_to_adata(
    adata=adata,
    gp_dict=combined_gp_dict,
    genes_uppercase=True,
    gp_targets_varm_key="autotalker_gp_targets",
    gp_sources_varm_key="autotalker_gp_sources",
    gp_names_uns_key="autotalker_gp_names",
    min_genes_per_gp=0,
    max_genes_per_gp=None)

In [68]:
# Inspect the array stored under the `gp_targets_varm_key` used above.
adata.varm["autotalker_gp_targets"]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int32)

In [69]:
# Inspect the array stored under the `gp_sources_varm_key` used above.
adata.varm["autotalker_gp_sources"]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int32)

In [70]:
# Show the first five gene program names stored under the `gp_names_uns_key`.
adata.uns["autotalker_gp_names"][:5]

['CXCL1_ligand_targetgenes_GP',
 'CXCL2_ligand_targetgenes_GP',
 'CXCL3_ligand_targetgenes_GP',
 'CXCL5_ligand_targetgenes_GP',
 'PPBP_ligand_targetgenes_GP']

In [None]:
# Switch MLflow to a separate experiment for the gene-program-masked model.
# A run started implicitly by an earlier `mlflow.log_param` call stays active
# until it is ended, and logging would keep going to that run (and its
# experiment). End it first so the parameter below opens a fresh run under the
# new experiment; `mlflow.end_run()` does nothing when no run is active.
mlflow.end_run()
experiment = mlflow.set_experiment("autotalker_gp_masked")
mlflow.log_param("dataset", dataset)

In [None]:
# Build the gene program mask for the masked model.
# The notebook writes the gene program arrays to
# `adata.varm["autotalker_gp_targets"]` and `adata.varm["autotalker_gp_sources"]`;
# the previously used key "autotalker_gps" is never written, so reading it
# raised a KeyError on a fresh run.
# NOTE(review): only the target-gene array is used here — confirm whether the
# model also needs the source-gene array (e.g. concatenated along the gene
# axis when one-hop node labels are used).
gp_mask = adata.varm["autotalker_gp_targets"].T
# `.varm` arrays have one row per gene, so the transpose is gene programs x genes.
print(f"Gene program mask shape (gene programs x genes): {gp_mask.shape}")
# Use half the number of gene programs for the encoder's hidden setting.
n_hidden_encoder = gp_mask.shape[0] // 2
print(f"n_hidden_encoder: {n_hidden_encoder}")

In [None]:
# Instantiate the gene-program-masked model with the mask built above.
# NOTE(review): this call uses a different keyword set than the first
# `Autotalker(...)` call in this notebook (`autotalker_module` here; no
# `adj_key`, `gp_mask_key`, `node_label_method` or loss flags) — confirm both
# match the current constructor signature.
model = Autotalker(adata,
                   autotalker_module="VGPGAE",
                   n_hidden_encoder=n_hidden_encoder,
                   dropout_rate_encoder=0.,
                   dropout_rate_graph_decoder=0.,
                   gp_mask=gp_mask)

In [None]:
# Training settings for the gene-program-masked model. Unlike the first
# training call in this notebook, the two reconstruction-loss switches are
# passed to `train` here, and both losses are enabled.
gp_train_params = {
    "n_epochs": 20,
    "lr": 0.001,
    "weight_decay": 0,
    # Fractions of edges / nodes held out for validation and testing.
    "edge_val_ratio": 0.1,
    "edge_test_ratio": 0.05,
    "node_val_ratio": 0.1,
    "node_test_ratio": 0.0,
    "edge_batch_size": 64,
    "include_edge_recon_loss": True,
    "include_gene_expr_recon_loss": True,
    "mlflow_experiment_id": experiment.experiment_id,
}

model.train(**gp_train_params)

The model can maintain the edge reconstruction performance while tweaking the latent space for better gene expression reconstruction.

In [None]:
# Latent representation of the masked model; a later cell indexes its columns
# by gene program position.
latent_gps = model.get_latent_representation()

In [None]:
# Same representation, computed for the explicitly supplied `adata`.
latent_gps_new_adata = model.get_latent_representation(adata)

In [None]:
# Store the embedding in `adata.obsm` so scanpy can use it via `use_rep`.
adata.obsm["latent_gps_autotalker"] = latent_gps_new_adata

In [None]:
# Visualise the gene-program latent space: neighbour graph on the stored
# embedding, then a UMAP coloured by `cell_type_key`.
gp_latent_rep_key = "latent_gps_autotalker"

sc.set_figure_params(figsize=(6, 6))
sc.pp.neighbors(adata, use_rep=gp_latent_rep_key)
sc.tl.umap(adata, min_dist=0.3)
sc.pl.umap(adata,
           color=[cell_type_key],
           frameon=False)

In [None]:
# Display the mask dimensions as a quick sanity check.
gp_mask.shape

In [None]:
# Resolve the positions of the gene programs to inspect.
# Gene program names are stored under `adata.uns["autotalker_gp_names"]` and
# look like "<LIGAND>_ligand_targetgenes_GP". The previous lookup read the
# never-written key "gene_programs" and searched for the bare ligand names, so
# it could not succeed on a fresh run.
# `list(...)` also covers the case where the names are held in an array, which
# has no `.index` method.
gene_programs = list(adata.uns["autotalker_gp_names"])
selected_gene_programs = ["CSF1", "IL34"]

selected_gene_programs_idx = []
for gene_program in selected_gene_programs:
    # Accept either an exact gene program name or a ligand-name prefix.
    matches = [idx for idx, gp_name in enumerate(gene_programs)
               if gp_name == gene_program
               or gp_name.startswith(f"{gene_program}_")]
    if not matches:
        raise ValueError(
            f"No gene program found for '{gene_program}' in "
            "adata.uns['autotalker_gp_names'].")
    # Several programs may share a ligand prefix; keep the first one.
    selected_gene_programs_idx.append(matches[0])

In [None]:
# Keep only the latent columns at the selected gene program positions
# (assumes latent columns are ordered like the gene program names — TODO confirm).
latent_selected_gps = latent_gps[:, selected_gene_programs_idx]

In [None]:
# Copy each selected latent column into `adata.obs` under the ligand's name so
# it can be used as a plotting variable.
for column_idx, obs_column in enumerate(("CSF1", "IL34")):
    adata.obs[obs_column] = latent_selected_gps[:, column_idx]

In [None]:
# UMAP coloured by the "IL34" column written to `adata.obs` above.
sc.pl.umap(adata, color="IL34", frameon=False, wspace=0.6)

In [None]:
# Scatter of the two selected latent columns against each other, coloured by
# `cell_type_key`.
sc.pl.scatter(adata, x="CSF1", y="IL34", color=cell_type_key, size=10)

## SCVI

In [None]:
# Imports and figure configuration for the scVI comparison.
# NOTE(review): `scanpy` is already imported as `sc` in the notebook's import
# cell, and `plt` is not used in the cells visible here; consider moving these
# imports into the top import cell.
import scvi
import scanpy as sc
import matplotlib.pyplot as plt

sc.set_figure_params(figsize=(6, 6))

# for white background of figures (only for docs rendering)
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
%config InlineBackend.figure_format='retina'

In [None]:
# Prepare `adata` for scVI: keep the current `.X` in a "counts" layer, then
# normalise and log-transform `.X` and snapshot the result in `.raw`.
# The guard keeps the cell idempotent: without it, a re-run would normalise
# and log-transform a second time and overwrite the "counts" layer with
# already-transformed values.
if "counts" not in adata.layers:
    adata.layers["counts"] = adata.X.copy()  # preserve counts
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)
    adata.raw = adata  # freeze the state in `.raw`

In [None]:
# Register `adata` with scVI, pointing it at the "counts" layer saved in the
# preprocessing cell rather than at the transformed `.X`.
scvi.model.SCVI.setup_anndata(
    adata,
    layer="counts"
)

In [None]:
# Create the scVI model.
# NOTE(review): this rebinds `model`, which referred to the Autotalker model in
# earlier cells; re-running those cells afterwards would use the scVI model.
model = scvi.model.SCVI(adata)

In [None]:
# Display the model's repr.
model

In [None]:
# Train scVI with its default settings.
model.train()

In [None]:
# Latent representation from the trained scVI model.
latent_scvi = model.get_latent_representation()

In [None]:
# Store the scVI embedding in `adata.obsm` so scanpy can use it via `use_rep`.
adata.obsm["X_scVI"] = latent_scvi

In [None]:
# run PCA then generate UMAP plots
sc.tl.pca(adata)
sc.pp.neighbors(adata, use_rep="X_scVI")
sc.tl.umap(adata, min_dist=0.3)

In [None]:
# Plot the scVI UMAP coloured by cell type. Use `cell_type_key` from the
# dataset-loading cell instead of a hard-coded column name, so the plot also
# works for datasets whose annotation lives under a different key, matching
# the other UMAP cells in this notebook.
sc.pl.umap(adata, color=[cell_type_key], frameon=False)