# Autotalker Tutorial

## 1. Setup

### 1.1 Import Libraries

In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import argparse
import os
import sys
from datetime import datetime

import mlflow
import numpy as np
import scanpy as sc
import scipy.sparse as sp
import squidpy as sq
import torch

from autotalker.data import load_spatial_adata_from_csv
from autotalker.models import Autotalker
from autotalker.utils import (add_gps_from_gp_dict_to_adata,
                              extract_gp_dict_from_nichenet_ligand_target_mx,
                              extract_gp_dict_from_omnipath_lr_interactions)

### 1.2 Configure Paths and Create Directories

In [3]:
# MLflow: make sure the local tracking directory exists.
os.makedirs("mlruns", exist_ok=True)

# Gene programs: all prior-knowledge files share one folder, which is created
# up front so later cells can read from / write to it.
gp_data_folder_path = "datasets/gp_data"
os.makedirs(gp_data_folder_path, exist_ok=True)
nichenet_ligand_target_mx_file_path = (
    f"{gp_data_folder_path}/nichenet_ligand_target_matrix.csv")
omnipath_lr_interactions_file_path = (
    f"{gp_data_folder_path}/omnipath_lr_interactions.csv")

### 1.3 Define Parameters

In [4]:
# Dataset to analyse; must be one of the names handled in the data loading
# cell ("deeplinc_seqfish", "squidpy_seqfish" or "squidpy_slideseqv2").
dataset = "squidpy_seqfish"

# How node labels are built for the gene expression decoder.
# Alternatives tried in this notebook: "self" and "one-hop-sum".
node_label_method = "one-hop-norm"

# Size of the latent space (number of latent gene programs).
n_latent = 512

# Number of edges per training batch.
edge_batch_size = 32

## 2. Load Data

In [5]:
print(f"Using dataset {dataset}.")

# Each branch produces an AnnData object and records which `.obs` column
# holds the cell type annotation used for plotting later on.
if dataset == "deeplinc_seqfish":
    # Counts, adjacency and cell types are read from local csv files.
    adata = load_spatial_adata_from_csv(x_file_path="datasets/seqFISH/counts.csv",
                                        adj_file_path="datasets/seqFISH/adj.csv",
                                        cell_type_file_path="datasets/seqFISH/cell_types.csv",
                                        cell_type_col="Cell_class_name")
    cell_type_key = "cell_type"
elif dataset == "squidpy_seqfish":
    adata = sq.datasets.seqfish()
    # Build the spatial neighborhood graph from coordinates within the radius.
    sq.gr.spatial_neighbors(adata, radius=0.04, coord_type="generic")
    cell_type_key = "celltype_mapped_refined"
elif dataset == "squidpy_slideseqv2":
    adata = sq.datasets.slideseqv2()
    sq.gr.spatial_neighbors(adata, radius=30.0, coord_type="generic")
    cell_type_key = "cluster"
else:
    # Fail here with a clear message instead of a NameError on `adata` below.
    raise ValueError(
        f"Unknown dataset '{dataset}'. Expected one of 'deeplinc_seqfish', "
        "'squidpy_seqfish' or 'squidpy_slideseqv2'.")

# Keep an untouched copy of the expression matrix under the "counts" layer.
adata.layers["counts"] = adata.X.copy()

Using dataset squidpy_seqfish.


In [6]:
# Summarise the dataset and its spatial neighborhood graph. The adjacency
# matrix stays sparse throughout: a dense copy would hold n_nodes x n_nodes
# entries, which is several GB for ~19k cells.
adjacency = adata.obsp["spatial_connectivities"]
n_nodes, n_node_features = adata.layers["counts"].shape
print(f"Number of nodes: {n_nodes}")
print(f"Number of node features: {n_node_features}")

# Column sums give the number of edges per node.
avg_edges_per_node = round(adjacency.sum(axis=0).mean(), 2)
print(f"Average number of edges per node: {avg_edges_per_node}")

# Count each undirected edge once by summing only the upper triangle
# (diagonal included, matching the previous np.triu based computation).
n_edges = int(sp.triu(adjacency).sum())
print(f"Number of edges: {n_edges}")

Number of nodes: 19416
Number of node features: 351
Average number of edges per node: 4.4
Number of edges: 42694


## 3. Autotalker Model with Fully Connected Gene Program Mask

### 3.1 Create Fully Connected Gene Program Mask

In [7]:
# Mask that allows all genes
if node_label_method == "self":
    n_output = len(adata.var)
    gp_targets_mask = np.ones((n_latent, n_output))
    print(f"gp_targets_mask shape: {gp_targets_mask.shape}")
elif node_label_method != "self":
    n_output = len(adata.var) * 2
    gp_targets_mask = np.ones((n_latent, int(n_output / 2)))
    gp_sources_mask = np.ones((n_latent, int(n_output / 2)))
    print(f"gp_targets_mask shape: {gp_targets_mask.shape}")
    print(f"gp_sources_mask shape: {gp_sources_mask.shape}")

gp_targets_mask shape: (512, 351)
gp_sources_mask shape: (512, 351)


### 3.2 Initialize, Train & Save Model

In [8]:
# The encoder's hidden layer is half as wide as the latent space.
encoder_hidden_size = int(n_latent / 2)

# With the "self" node label method no sources mask is passed.
if node_label_method == "self":
    sources_mask_arg = None
else:
    sources_mask_arg = gp_sources_mask

# Initialize the model with the fully connected masks passed in explicitly.
model = Autotalker(
    adata,
    counts_layer_key="counts",
    adj_key="spatial_connectivities",
    gp_targets_mask_key="autotalker_gp_targets",
    gp_sources_mask_key="autotalker_gp_sources",
    include_edge_recon_loss=True,
    include_gene_expr_recon_loss=True,
    log_variational=True,
    node_label_method=node_label_method,
    n_hidden_encoder=encoder_hidden_size,
    dropout_rate_encoder=0.,
    dropout_rate_graph_decoder=0.,
    gp_targets_mask=gp_targets_mask,
    gp_sources_mask=sources_mask_arg,
)

--- INITIALIZING NEW NETWORK MODULE: VGPGAE ---
LOSS -> include_edge_recon_loss: True, include_gene_expr_recon_loss: True
GCN ENCODER -> n_input: 351, n_hidden: 256, n_latent: 512, dropout_rate: 0.0
DOT PRODUCT GRAPH DECODER -> dropout_rate: 0.0
MASKED GENE EXPRESSION DECODER -> n_input: 512, n_output: 702


In [None]:
# Select the MLflow experiment for the fully connected gene program run and
# record which dataset is being used.
experiment = mlflow.set_experiment("autotalker_fc_gps")
mlflow.log_param("dataset", dataset)

# Train for 30 epochs. Part of the edges and nodes is held out for validation
# and testing; training metrics are tracked under the experiment selected above.
# NOTE(review): exact semantics of the ratio arguments are defined in
# `Autotalker.train` - confirm there.
model.train(n_epochs=30,
            lr=0.01,
            weight_decay=0,
            edge_val_ratio=0.1,
            edge_test_ratio=0.05,
            node_val_ratio=0.1,
            edge_batch_size=edge_batch_size,
            mlflow_experiment_id=experiment.experiment_id)

--- INITIALIZING TRAINER ---
Number of training nodes: 17474
Number of validation nodes: 1942
Number of training edges: 36291
Number of validation edges: 4269
Number of test edges: 2134

--- MODEL TRAINING ---
Epoch 1/30 |--------------------| 3.3% val_auroc_score: 0.9562; val_auprc_score: 0.9410; val_best_acc_score: 0.8990; val_best_f1_score: 0.9044; train_loss: 553.5989; train_edge_recon_loss: 1.2210; train_kl_loss: 0.7042; train_gene_expr_recon_loss: 551.6737; val_loss: 542.1354; val_edge_recon_loss: 0.6222; val_kl_loss: 0.7874; val_gene_expr_recon_loss: 540.7259
Epoch 2/30 |█-------------------| 6.7% val_auroc_score: 0.9601; val_auprc_score: 0.9464; val_best_acc_score: 0.9065; val_best_f1_score: 0.9106; train_loss: 539.8421; train_edge_recon_loss: 0.5570; train_kl_loss: 0.8317; train_gene_expr_recon_loss: 538.4533; val_loss: 542.8443; val_edge_recon_loss: 0.5348; val_kl_loss: 0.8858; val_gene_expr_recon_loss: 541.4237
Epoch 3/30 |██------------------| 10.0% val_auroc_score: 0.9606;

In [None]:
# Persist the trained model to ./model_artefacts, replacing any previous save,
# and store the AnnData object alongside it as adata.h5ad.
model.save(dir_path="./model_artefacts",
           overwrite=True,
           save_adata=True,
           adata_file_name="adata.h5ad")

In [None]:
# Reload the model from the directory written by the save cell.
# NOTE(review): with `adata=None` the AnnData object is presumably read from
# `adata_file_name` inside `dir_path` - confirm in `Autotalker.load`.
model = Autotalker.load(dir_path="./model_artefacts",
                        adata=None,
                        adata_file_name="adata.h5ad")

### 3.3 Retrieve Latent Gene Programs

In [None]:
# Latent representation without passing an AnnData object
# (presumably computed on the data the model holds - confirm).
latent = model.get_latent_representation()

In [None]:
# Latent representation for an explicitly passed AnnData object.
latent_new_data = model.get_latent_representation(adata)

In [None]:
# Store the latent representation in `.obsm` so scanpy can use it via `use_rep`.
adata.obsm["latent_autotalker"] = latent_new_data

### 3.4 Visualize Latent Gene Programs with scanpy

In [None]:
# Timestamp (day-month-year_hour-minute-second) used to give the saved figure
# a unique file name per run.
now = datetime.now()
current_time = now.strftime("%d%m%Y_%H%M%S")

sc.set_figure_params(figsize=(6, 6))

# Use autotalker latent space for UMAP generation: the neighbor graph is built
# on the stored latent representation, then embedded and colored by cell type.
sc.pp.neighbors(adata, use_rep="latent_autotalker")
sc.tl.umap(adata, min_dist=0.3)
sc.pl.umap(adata, color=[cell_type_key], frameon=False, save=f"_latent_{current_time}.png")

## 7. Add Explainable Cell-Cell-Interaction (CCI) Gene Programs (GPs)

### 7.1 NicheNet CCI GPs

In [None]:
# Build the NicheNet gene program dictionary from the ligand target matrix.
# NOTE(review): `keep_target_ratio` presumably controls which fraction of
# targets is kept per ligand, and `load_from_disk` / `save_to_disk` presumably
# read / write the matrix at `file_path` - confirm in the helper.
nichenet_gp_dict = extract_gp_dict_from_nichenet_ligand_target_mx(
    keep_target_ratio=0.1,
    load_from_disk=True,
    save_to_disk=True,
    file_path=nichenet_ligand_target_mx_file_path)

In [None]:
# Register the NicheNet gene programs on `adata`: target and source masks go
# to the given `.varm` keys, gene program names to the given `.uns` key.
# No gene programs are filtered by size (min 0 genes, no upper bound).
add_gps_from_gp_dict_to_adata(
    adata=adata,
    gp_dict=nichenet_gp_dict,
    genes_uppercase=True,
    gp_targets_varm_key="autotalker_gp_targets",
    gp_sources_varm_key="autotalker_gp_sources",
    gp_names_uns_key="autotalker_gp_names",
    min_genes_per_gp=0,
    max_genes_per_gp=None)

In [None]:
# New model for the NicheNet gene programs. No masks are passed directly;
# the mask keys point at the `.varm` entries written when the gene programs
# were added to `adata` (presumably the model reads the masks from there -
# confirm in `Autotalker.__init__`).
# NOTE(review): unlike the fully connected model, `counts_layer_key` and
# `log_variational` are not set here, so the constructor defaults apply.
model = Autotalker(adata,
                   adj_key="spatial_connectivities",
                   gp_targets_mask_key="autotalker_gp_targets",
                   gp_sources_mask_key="autotalker_gp_sources",
                   include_edge_recon_loss=True,
                   include_gene_expr_recon_loss=True,
                   node_label_method=node_label_method,
                   n_hidden_encoder=int(n_latent/2),
                   dropout_rate_encoder=0.,
                   dropout_rate_graph_decoder=0.,
                   gp_targets_mask=None,
                   gp_sources_mask=None)

In [None]:
# Select the MLflow experiment for the NicheNet gene program run and record
# the dataset.
# NOTE(review): if an MLflow run from an earlier section is still active,
# `log_param` may log into that run rather than into this experiment - confirm
# how `Autotalker.train` manages MLflow runs.
experiment = mlflow.set_experiment("autotalker_nichenet_gps")
mlflow.log_param("dataset", dataset)

# Same split ratios as the fully connected run, but with a higher learning
# rate (0.1) and a fixed edge batch size of 64.
model.train(n_epochs=30,
            lr=0.1,
            weight_decay=0,
            edge_val_ratio=0.1,
            edge_test_ratio=0.05,
            node_val_ratio=0.1,
            edge_batch_size=64,
            mlflow_experiment_id=experiment.experiment_id)

In [None]:
# Store the latent representation of the NicheNet gene program model under
# its own `.obsm` key.
latent = model.get_latent_representation()
adata.obsm["latent_autotalker_nichenet_gps"] = latent

# Timestamp used to give the saved figure a unique file name per run.
now = datetime.now()
current_time = now.strftime("%d%m%Y_%H%M%S")

sc.set_figure_params(figsize=(6, 6))

# Use autotalker latent space for UMAP generation
sc.pp.neighbors(adata, use_rep="latent_autotalker_nichenet_gps")
sc.tl.umap(adata, min_dist=0.3)
sc.pl.umap(adata, color=[cell_type_key], frameon=False, save=f"_latent_nichenet_gps_{current_time}.png")

### 7.2 OmniPath CCI GPs

In [None]:
# Build the OmniPath gene program dictionary from ligand-receptor interactions.
# NOTE(review): `min_curation_effort=0` presumably keeps all interactions
# regardless of curation effort, and `load_from_disk` / `save_to_disk`
# presumably read / write the interactions at `file_path` - confirm in the helper.
omnipath_gp_dict = extract_gp_dict_from_omnipath_lr_interactions(
    min_curation_effort=0,
    load_from_disk=True,
    save_to_disk=True,
    file_path=omnipath_lr_interactions_file_path)

In [None]:
# Register the OmniPath gene programs on `adata`.
# NOTE(review): the same `.varm` / `.uns` keys as for the NicheNet gene
# programs are used, so the earlier entries are presumably replaced - confirm.
add_gps_from_gp_dict_to_adata(
    adata=adata,
    gp_dict=omnipath_gp_dict,
    genes_uppercase=True,
    gp_targets_varm_key="autotalker_gp_targets",
    gp_sources_varm_key="autotalker_gp_sources",
    gp_names_uns_key="autotalker_gp_names",
    min_genes_per_gp=0,
    max_genes_per_gp=None)

### 7.3 NicheNet & OmniPath CCI GPs Combined

In [None]:
# Merge both gene program dictionaries into a new dict without modifying the
# inputs; for names present in both, the OmniPath entry takes precedence.
combined_gp_dict = {**nichenet_gp_dict, **omnipath_gp_dict}

In [None]:
# Register the combined (NicheNet + OmniPath) gene programs on `adata`.
# NOTE(review): the same `.varm` / `.uns` keys as in the previous sections are
# used, so the earlier entries are presumably replaced - confirm.
add_gps_from_gp_dict_to_adata(
    adata=adata,
    gp_dict=combined_gp_dict,
    genes_uppercase=True,
    gp_targets_varm_key="autotalker_gp_targets",
    gp_sources_varm_key="autotalker_gp_sources",
    gp_names_uns_key="autotalker_gp_names",
    min_genes_per_gp=0,
    max_genes_per_gp=None)

In [None]:
# Inspect the gene program targets mask stored on the genes.
adata.varm["autotalker_gp_targets"]

In [None]:
# Inspect the gene program sources mask stored on the genes.
adata.varm["autotalker_gp_sources"]

In [None]:
# Show the first five gene program names.
adata.uns["autotalker_gp_names"][:5]

In [None]:
# The masks are stored per gene in `.varm`; transposing puts gene programs on
# the rows and genes on the columns.
gp_targets_mask = torch.tensor(adata.varm["autotalker_gp_targets"].T, dtype=torch.float32)
gp_sources_mask = torch.tensor(adata.varm["autotalker_gp_sources"].T, dtype=torch.float32)

# Concatenate along the gene axis: target genes first, then source genes.
gp_mask = torch.cat((gp_targets_mask, gp_sources_mask), dim=1)
print(f"Gene program mask shape (gene programs x [target genes + source genes]): {gp_mask.shape}")

# Hidden layer width of the encoder: half the number of gene programs, the
# same rule as `int(n_latent / 2)` used for the fully connected model.
n_hidden_encoder = int(gp_mask.shape[0] / 2)
print(f"Number of hidden units in the encoder: {n_hidden_encoder}")

In [None]:
# New model for the combined gene programs. No masks are passed directly; the
# mask keys point at the `.varm` entries written when the gene programs were
# added to `adata` (presumably the model reads the masks from there - confirm
# in `Autotalker.__init__`). The encoder hidden size is derived from the
# number of gene programs rather than from `n_latent`.
model = Autotalker(adata,
                   adj_key="spatial_connectivities",
                   gp_targets_mask_key="autotalker_gp_targets",
                   gp_sources_mask_key="autotalker_gp_sources",
                   include_edge_recon_loss=True,
                   include_gene_expr_recon_loss=True,
                   node_label_method=node_label_method,
                   n_hidden_encoder=n_hidden_encoder,
                   dropout_rate_encoder=0.,
                   dropout_rate_graph_decoder=0.,
                   gp_targets_mask=None,
                   gp_sources_mask=None)

In [None]:
# Select the MLflow experiment for the combined gene program run and record
# the dataset.
# NOTE(review): if an MLflow run from an earlier section is still active,
# `log_param` may log into that run rather than into this experiment - confirm
# how `Autotalker.train` manages MLflow runs.
experiment = mlflow.set_experiment("autotalker_combined_gps")
mlflow.log_param("dataset", dataset)

# Same training settings as for the NicheNet gene program model.
model.train(n_epochs=30,
            lr=0.1,
            weight_decay=0,
            edge_val_ratio=0.1,
            edge_test_ratio=0.05,
            node_val_ratio=0.1,
            edge_batch_size=64,
            mlflow_experiment_id=experiment.experiment_id)

In [None]:
# Store the latent representation of the combined gene program model under
# its own `.obsm` key.
latent = model.get_latent_representation()
adata.obsm["latent_autotalker_combined_gps"] = latent

# Timestamp used to give the saved figure a unique file name per run.
now = datetime.now()
current_time = now.strftime("%d%m%Y_%H%M%S")

sc.set_figure_params(figsize=(6, 6))

# Use the combined gene program latent space for UMAP generation. The neighbor
# graph must be built on the representation stored above; building it on the
# NicheNet-only latent would silently plot the previous model.
sc.pp.neighbors(adata, use_rep="latent_autotalker_combined_gps")
sc.tl.umap(adata, min_dist=0.3)
sc.pl.umap(adata, color=[cell_type_key], frameon=False, save=f"_latent_autotalker_combined_gps_{current_time}.png")

The model can maintain the edge reconstruction performance while tweaking the latent space for better gene expression reconstruction.

In [None]:
# Latent gene program scores from the current model, one column per gene
# program (columns are indexed by gene program further below).
latent_gps = model.get_latent_representation()

In [None]:
# Latent gene program scores for an explicitly passed AnnData object.
latent_gps_new_adata = model.get_latent_representation(adata)

In [None]:
# Store the scores in `.obsm` so scanpy can use them via `use_rep`.
adata.obsm["latent_gps_autotalker"] = latent_gps_new_adata

In [None]:
sc.set_figure_params(figsize=(6, 6))

# Use autotalker latent gp space for UMAP generation. This recomputes the
# neighbor graph and the UMAP embedding on `adata`; the figure is only
# displayed, not saved.
sc.pp.neighbors(adata, use_rep="latent_gps_autotalker")
sc.tl.umap(adata, min_dist=0.3)
sc.pl.umap(adata, color=[cell_type_key], frameon=False)

In [None]:
# Display the shape of the concatenated gene program mask.
gp_mask.shape

In [None]:
# Gene program names are stored under the `.uns` key passed as
# `gp_names_uns_key` when the gene programs were added to `adata`. Convert to
# a list so `.index` is available regardless of the stored container type.
gene_programs = list(adata.uns["autotalker_gp_names"])

# Gene programs to inspect. Each name must match a stored gene program name
# exactly, otherwise `.index` raises a ValueError.
# NOTE(review): confirm that the stored names use this exact spelling.
selected_gene_programs = ["CSF1", "IL34"]
selected_gene_programs_idx = [gene_programs.index(gene_program) for gene_program in selected_gene_programs]

In [None]:
# Keep only the latent columns of the selected gene programs, in selection order.
latent_selected_gps = latent_gps[:, selected_gene_programs_idx]

In [None]:
# Expose each selected gene program score as an `.obs` column so it can be
# used for coloring and scatter plots.
for column_idx, obs_key in enumerate(("CSF1", "IL34")):
    adata.obs[obs_key] = latent_selected_gps[:, column_idx]

In [None]:
# Color the current UMAP embedding by the IL34 gene program score.
sc.pl.umap(adata, color="IL34", frameon=False, wspace=0.6)

In [None]:
# Plot the two gene program scores against each other, colored by cell type.
sc.pl.scatter(adata, x="CSF1", y="IL34", color=cell_type_key, size=10)

## SCVI

In [None]:
# Imports for the scVI section.
# NOTE(review): `scanpy` is already imported at the top of the notebook and
# `plt` is not used in the cells visible here; consider moving `import scvi`
# to the import cell at the top.
import scvi
import scanpy as sc
import matplotlib.pyplot as plt

sc.set_figure_params(figsize=(6, 6))

# for white background of figures (only for docs rendering)
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
%config InlineBackend.figure_format='retina'

In [None]:
# Preserve the counts only if they are not stored yet. Re-running this cell
# must not overwrite the "counts" layer with already normalized values.
if "counts" not in adata.layers:
    adata.layers["counts"] = adata.X.copy()

# Always start from the preserved counts so the cell is idempotent.
adata.X = adata.layers["counts"].copy()
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
adata.raw = adata # freeze the state in `.raw`

In [None]:
# Register `adata` with scVI, pointing it at the "counts" layer rather than
# the normalized, log-transformed `.X`.
scvi.model.SCVI.setup_anndata(
    adata,
    layer="counts"
)

In [None]:
# Create the scVI model. NOTE: this rebinds `model`, so the Autotalker model
# from the previous sections is no longer reachable under that name.
model = scvi.model.SCVI(adata)

In [None]:
# Display the model summary.
model

In [None]:
# Train scVI with its default settings.
model.train()

In [None]:
# Latent representation learned by scVI.
latent_scvi = model.get_latent_representation()

In [None]:
# Store the scVI latent representation in `.obsm` for use via `use_rep`.
adata.obsm["X_scVI"] = latent_scvi

In [None]:
# run PCA then generate UMAP plots
sc.tl.pca(adata)
sc.pp.neighbors(adata, use_rep="X_scVI")
sc.tl.umap(adata, min_dist=0.3)

In [None]:
# Color by the cell type column chosen when the dataset was loaded, so the
# plot also works for datasets other than squidpy_seqfish.
sc.pl.umap(adata, color=[cell_type_key], frameon=False)