In [1]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import argparse
import os
import sys

import mlflow
import numpy as np
import scanpy as sc
import squidpy as sq

from autotalker.data import load_spatial_adata_from_csv
from autotalker.models import Autotalker
from autotalker.utils import download_nichenet_ligand_target_mx
from autotalker.utils import extract_gps_from_ligand_target_mx
from autotalker.utils import mask_adata_with_gp_dict

In [3]:
# Dataset selector read by the loading cell below. Values handled there:
# "deeplinc_seqfish", "squidpy_seqfish", "squidpy_slideseqv2".
dataset = "deeplinc_seqfish"

In [4]:
# Load the selected dataset into `adata` and record which `.obs` column (if
# any) holds cell type labels in `cell_type_key`.
print(f"Using dataset {dataset}.")

if dataset == "deeplinc_seqfish":
    # Counts and adjacency are read from local CSV files; no cell type column
    # is recorded for this dataset.
    adata = load_spatial_adata_from_csv("datasets/seqFISH/counts.csv",
                                        "datasets/seqFISH/adj.csv")
    cell_type_key = None
elif dataset == "squidpy_seqfish":
    adata = sq.datasets.seqfish()
    # Build the spatial neighbor graph with a fixed radius on generic coords.
    sq.gr.spatial_neighbors(adata, radius=0.04, coord_type="generic")
    cell_type_key = "celltype_mapped_refined"
elif dataset == "squidpy_slideseqv2":
    adata = sq.datasets.slideseqv2()
    sq.gr.spatial_neighbors(adata, radius=30.0, coord_type="generic")
    cell_type_key = "cluster"
else:
    # Fail fast: without this branch `adata` and `cell_type_key` would stay
    # undefined and a later cell would fail with an unrelated NameError.
    raise ValueError(
        f"Unknown dataset '{dataset}'. Expected one of 'deeplinc_seqfish', "
        "'squidpy_seqfish' or 'squidpy_slideseqv2'.")

Using dataset deeplinc_seqfish.


In [5]:
# Summary statistics of the expression matrix and the spatial neighbor graph.
# The adjacency matrix is kept sparse throughout: densifying it with
# `.toarray()` needs O(n_nodes^2) memory, which does not scale to the larger
# dataset options.
adjacency = adata.obsp['spatial_connectivities']

print(f"Number of nodes: {adata.X.shape[0]}")
print(f"Number of node features: {adata.X.shape[1]}")

# Column sums give the per-node edge count; np.asarray normalises the result
# type returned by the sparse `.sum(axis=0)` before averaging.
avg_edges_per_node = round(np.asarray(adjacency.sum(axis=0)).mean(), 2)
print(f"Average number of edges per node: {avg_edges_per_node}")

# Sum only entries on or above the diagonal (row <= col), i.e. the same
# entries np.triu would keep, so each undirected edge is counted once.
adjacency_coo = adjacency.tocoo()
upper_triangle = adjacency_coo.row <= adjacency_coo.col
n_edges = int(adjacency_coo.data[upper_triangle].sum())
print(f"Number of edges: {n_edges}")

Number of nodes: 1597
Number of node features: 125
Average number of edges per node: 3.59
Number of edges: 2863


In [6]:
# Create the local "mlruns" directory if it does not exist yet
# (presumably used as the MLflow tracking directory — confirm).
os.makedirs("mlruns", exist_ok=True)

In [7]:
# Select the "autotalker" MLflow experiment; its id is passed to
# `model.train` further down.
experiment = mlflow.set_experiment("autotalker")
# NOTE(review): log_param is called without an explicit mlflow.start_run();
# confirm which run this parameter is attached to relative to the runs
# created during training.
mlflow.log_param("dataset", dataset)

In [8]:
# Placeholder gene program mask filled with ones, so no gene is masked out.
# Rows: 16 (matches the n_latent / decoder n_input reported at model init);
# columns: one per gene in `adata.var`.
n_mask_rows = 16
n_mask_cols = len(adata.var)
mask = np.ones((n_mask_rows, n_mask_cols))

In [9]:
# Build the Autotalker model on `adata` with the VGPGAE module. Dropout is
# disabled in both the encoder and the graph decoder, and the gene program
# mask defined above is handed to the model.
model_kwargs = {
    "autotalker_module": "VGPGAE",
    "n_hidden_encoder": 32,
    "dropout_rate_encoder": 0.,
    "dropout_rate_graph_decoder": 0.,
    "gp_mask": mask,
}
model = Autotalker(adata, **model_kwargs)

--- INITIALIZING NEW NETWORK MODULE: VGPGAE ---
GCN ENCODER -> n_input: 125, n_hidden: 32, n_latent: 16, dropout_rate: 0.0
DOT PRODUCT GRAPH DECODER -> dropout_rate: 0.0
MASKED GENE EXPRESSION DECODER -> n_input: 16, n_output: 125


In [10]:
# First training call: 5 epochs with the edge reconstruction loss enabled and
# the gene expression reconstruction loss disabled. 10% of edges and nodes are
# held out for validation and 5% of edges for testing.
edge_only_train_kwargs = {
    "n_epochs": 5,
    "lr": 0.01,
    "weight_decay": 0,
    "edge_val_ratio": 0.1,
    "edge_test_ratio": 0.05,
    "node_val_ratio": 0.1,
    "node_test_ratio": 0.0,
    "edge_batch_size": 64,
    "include_edge_recon_loss": True,
    "include_gene_expr_recon_loss": False,
    "mlflow_experiment_id": experiment.experiment_id,
}
model.train(**edge_only_train_kwargs)

--- INITIALIZING TRAINER ---
Number of training nodes: 1437
Number of validation nodes: 160
Number of test nodes: 0
Number of training edges: 2434
Number of validation edges: 286
Number of test edges: 143
--- MODEL TRAINING ---
 |████████████████████| 100.0%  - val_auroc_score: 0.8645471661 - val_auprc_score: 0.8598313430 - val_best_acc_score: 0.7954545455 - val_best_f1_score: 0.7931034483 - train_loss: 0.2731847171 - train_edge_recon_loss: 0.2462116912 - train_kl_loss: 0.0269730246 - train_gene_expr_recon_loss:     nan - val_loss: 0.2934200048 - val_edge_recon_loss: 0.2723386377 - val_kl_loss: 0.0210813668 - val_gene_expr_recon_loss:     nan
Model training finished after 0 min 2 sec.
Saving best model state, which was in epoch 2.
--- MODEL EVALUATION ---
Test AUROC score: 0.8717
Test AUPRC score: 0.8586
Test best acc score: 0.8077
Test best f1 score: 0.8137


In [None]:
# Second training call on the same `model` object: 20 epochs with both the
# edge reconstruction loss and the gene expression reconstruction loss
# enabled. Split ratios, learning rate and batch size match the first call.
joint_train_kwargs = {
    "n_epochs": 20,
    "lr": 0.01,
    "weight_decay": 0,
    "edge_val_ratio": 0.1,
    "edge_test_ratio": 0.05,
    "node_val_ratio": 0.1,
    "node_test_ratio": 0.0,
    "edge_batch_size": 64,
    "include_edge_recon_loss": True,
    "include_gene_expr_recon_loss": True,
    "mlflow_experiment_id": experiment.experiment_id,
}
model.train(**joint_train_kwargs)

In [None]:
# Persist the model to ./model_artefacts, overwriting any previous contents,
# and store the AnnData object alongside it as "adata.h5ad".
model.save(dir_path="./model_artefacts",
           overwrite=True,
           save_adata=True,
           adata_file_name="adata.h5ad")

In [None]:
# Reload the model from ./model_artefacts. No AnnData is passed in
# (adata=None); presumably it is read back from "adata.h5ad" in that
# directory — confirm against Autotalker.load.
model = Autotalker.load(dir_path="./model_artefacts",
                        adata=None,
                        adata_file_name="adata.h5ad")

In [None]:
# Latent representation without passing data explicitly (presumably computed
# on the AnnData attached to the loaded model — confirm).
latent = model.get_latent_representation()

In [None]:
# Latent representation for an explicitly supplied AnnData object.
latent_new_data = model.get_latent_representation(adata)

In [None]:
# Store the embedding in `adata.obsm` so scanpy can use it via `use_rep`.
adata.obsm["latent_autotalker"] = latent_new_data

## Interoperability with scanpy

In [None]:
sc.set_figure_params(figsize=(6, 6))

# Neighbor graph on the autotalker embedding (instead of the default
# representation), then a UMAP coloured by the dataset's cell type column.
sc.pp.neighbors(adata, use_rep="latent_autotalker")
sc.tl.umap(adata, min_dist=0.3)
sc.pl.umap(
    adata,
    color=[cell_type_key],
    frameon=False,
)

## NicheNet Gene Programs (GPs)

In [None]:
# Location of the gene program data: the folder is created on demand and the
# ligand-target matrix CSV lives directly inside it.
gp_data_folder_path = "datasets/gp_data"
os.makedirs(gp_data_folder_path, exist_ok=True)
gp_data_file_path = f"{gp_data_folder_path}/ligand_target_matrix.csv"

In [None]:
# Fetch the NicheNet ligand-target matrix and write it to `gp_data_file_path`.
# NOTE(review): presumably hits the network on every run — confirm whether the
# helper skips the download when the file already exists.
download_nichenet_ligand_target_mx(
    save_path=gp_data_file_path)

In [None]:
# Derive gene programs from the saved ligand-target matrix. The result is used
# below as a dict keyed by gene program name.
gp_dict = extract_gps_from_ligand_target_mx(
    path=gp_data_file_path)

In [None]:
# Annotate `adata` with the gene programs (presumably writes the membership
# matrix read from `adata.varm["I"]` in the next cell — confirm).
mask_adata_with_gp_dict(adata, gp_dict)

In [None]:
# Rebind `mask` to the transposed gene program matrix stored on `adata`; this
# replaces the all-ones placeholder mask defined earlier.
mask = adata.varm["I"].T

In [None]:
# Sanity check of the mask dimensions after the transpose.
mask.shape

In [None]:
# NOTE(review): exact duplicate of the previous cell; candidate for deletion.
mask.shape

In [None]:
# Binary membership matrix built by hand: one row per gene, one column per
# gene program, with 1 where `gene in gp` holds.
# `adata_genes` was never assigned anywhere in this notebook, so this cell
# only ran on leftover kernel state; it is now derived from the gene names of
# `adata`.
# NOTE(review): confirm whether gene names need case normalisation to match
# the gene program entries.
adata_genes = adata.var_names
I = [[int(gene in gp) for gp in gp_dict.values()] for gene in adata_genes]
I = np.asarray(I, dtype="int32")

In [None]:
# NOTE(review): displays the whole dictionary; consider showing only a few
# entries to keep the notebook output small.
gp_dict

In [None]:
# Total number of (gene, gene program) memberships in the matrix.
I.sum()

In [None]:
# Show the name and content of the first gene program only; prints nothing
# when `gp_dict` is empty.
first_gp_item = next(iter(gp_dict.items()), None)
if first_gp_item is not None:
    gp_name, gp = first_gp_item
    print(gp_name)
    print(gp)

## scVI

In [None]:
# NOTE(review): imports in the middle of the notebook. `scanpy` is already
# imported in the top import cell and `plt` is not used in the cells visible
# here; consider moving `scvi` to the top import cell.
import scvi
import scanpy as sc
import matplotlib.pyplot as plt

# Same figure size as set for the autotalker UMAP above.
sc.set_figure_params(figsize=(6, 6))

# for white background of figures (only for docs rendering)
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
%config InlineBackend.figure_format='retina'

In [None]:
# Keep a copy of the current `adata.X` in the "counts" layer, then normalise
# each cell to a total of 1e4 and log1p-transform `adata.X` in place.
# NOTE(review): not idempotent — running this cell a second time copies the
# already-transformed `adata.X` into "counts" and normalises again. Also
# assumes `adata.X` still holds raw counts at this point — confirm per dataset.
adata.layers["counts"] = adata.X.copy() # preserve counts
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
adata.raw = adata # freeze the state in `.raw`

In [None]:
# Register `adata` with scVI, pointing it at the "counts" layer saved above
# rather than the normalised `adata.X`.
scvi.model.SCVI.setup_anndata(
    adata,
    layer="counts"
)

In [None]:
# NOTE(review): this rebinds `model`, which until here referred to the
# Autotalker model; a distinct name would avoid confusion on partial re-runs.
model = scvi.model.SCVI(adata)

In [None]:
# Display the scVI model summary.
model

In [None]:
# Train scVI with its default settings.
# NOTE(review): no random seed is set in this notebook, so results may differ
# between runs.
model.train()

In [None]:
# Latent representation from the trained scVI model.
latent_scvi = model.get_latent_representation()

In [None]:
# Store the scVI embedding in `adata.obsm` for use via `use_rep` below.
adata.obsm["X_scVI"] = latent_scvi

In [None]:
# Compute PCA, then build the neighbor graph and the UMAP embedding. The
# neighbor graph uses the scVI latent space (use_rep="X_scVI"), not the PCA
# result. The plot itself is drawn in the next cell.
# NOTE(review): this recomputes neighbors/UMAP on the same `adata` that the
# autotalker section above already ran them on.
sc.tl.pca(adata)
sc.pp.neighbors(adata, use_rep="X_scVI")
sc.tl.umap(adata, min_dist=0.3)

In [None]:
# Plot the scVI-based UMAP coloured by the cell type column of the selected
# dataset. `cell_type_key` is used instead of a hard-coded column name because
# "celltype_mapped_refined" is only set for the "squidpy_seqfish" dataset;
# this also matches the autotalker UMAP call earlier in the notebook.
sc.pl.umap(
    adata,
    color=[cell_type_key],
    frameon=False,
)