In [7]:
import os

import leidenalg
import numpy as np
import pandas as pd
import phate
import scanpy
import scprep
from scipy.spatial.distance import cdist

import gspa
from DiffusionEMD import DiffusionCheb

No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
2024-03-21 18:10:01.626160: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:268] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
from tensorflow.python.ops.numpy_ops import np_config
np_config.enable_numpy_behavior()
  register_backend(TensorflowBackend())


In [12]:
from types import SimpleNamespace

# Run configuration table: option name -> (default value, description).
# The descriptions are documentation only; just the defaults are used below.
config_spec = {
    'lr': (0.001, 'learning rate'),
    'dropout': (0.0, 'dropout probability'),
    'epochs': (100, 'maximum number of epochs to train for'),
    'weight_decay': (0., 'l2 regularization strength'),
    'patience': (10, 'patience for early stopping'),
    'min_epochs': (50, 'do not early stop before min_epochs'),
    'seed': (1234, 'seed for training'),
    'walk_length': (80, 'walk length for Node2Vec'),
    'num_walks': (10, 'number of walks for Node2Vec'),
    'task': ('localization', 'task for paper, can be any of [coexpression, localization]'),
    'verbose': (1, 'verbosity, of [0, 1, 2]'),
    'add_pseudotime': (0, 'whether to add time signal (1) or not (0)'),
    'model': ('GSPA', 'which model to use, can be one of [Signals, DiffusionEMD, GFMMD, GSPA, GSPA_QR, MAGIC, Node2Vec_Gcell, GAE_noatt_Gcell, GAE_att_Gcell, Node2Vec_Ggene, GAE_noatt_Ggene, GAE_att_Ggene]'),
    'dim': (128, 'embedding dimension'),
    'num_layers': (2, 'number of hidden layers in encoder'),
    'bias': (1, 'whether to use bias (1) or not (0)'),
    'act': ('relu', 'which activation function to use of [relu, tanh, None]'),
    'k_neighbors_gene': (5, 'default number of neighbors k for kNN gene graph construction'),
    'device': ('cpu', 'Device for model'),
    'val_prop': (0.05, 'proportion of validation'),
    'test_prop': (0.0, 'proportion of test'),
    'split_seed': (1234, 'seed for data splits (train/test/val)'),
    'save_as': ('0', 'name for embedding iteration'),
    'k_neighbors_cell': (5, 'default number of neighbors k for kNN cell graph construction'),
    'kernel': ('adaptive', 'which kernel to use, can be one of [adaptive, kNN, SNN]'),
    'J': (-1, 'maximum scale for GSPA or GSPA_QR, by default int(log(N))'),
}

# Flat mapping of option name -> default value. A plain dict comprehension
# keeps every value as the exact Python object written above, with no detour
# through a pandas DataFrame.
config_args = {name: default for name, (default, _description) in config_spec.items()}

# Attribute-style view of the same defaults (e.g. `args.lr`), passed to
# gspa.embedding.run_ae later in the notebook.
args = SimpleNamespace(**config_args)

In [2]:
# Load the acute and chronic T-cell datasets (paths are relative to the
# notebook's working directory) and concatenate them into a single AnnData.
acute = scanpy.read_h5ad('data/acute_tcells.h5ad')
chronic = scanpy.read_h5ad('data/chronic_tcells.h5ad')
adata = scanpy.concat((acute,chronic))
# Re-store the `batch` annotation as a categorical column.
# NOTE(review): assumes both input files already carry `obs['batch']`;
# `concat` is not asked to create a batch label here -- confirm.
adata.obs['batch'] = adata.obs['batch'].astype('category')

In [3]:
# Fit PHATE on the full cell-by-gene matrix and keep the returned coordinates
# in `adata.obsm['X_phate']`. The fitted operator's graph (`phate_op.graph`)
# is reused later as the cell graph for DiffusionEMD.
# Expensive step: the recorded run took about 668 s on 39704 cells.
phate_op = phate.PHATE(random_state=42, n_jobs=-1, knn=30)
adata.obsm['X_phate'] = phate_op.fit_transform(adata.to_df())

Calculating PHATE...
  Running PHATE on 39704 observations and 14152 variables.
  Calculating graph and diffusion operator...
    Calculating PCA...
    Calculated PCA in 52.29 seconds.
    Calculating KNN search...
    Calculated KNN search in 438.76 seconds.
    Calculating affinities...
    Calculated affinities in 53.38 seconds.
  Calculated graph and diffusion operator in 548.31 seconds.
  Calculating landmark operator...
    Calculating SVD...
    Calculated SVD in 68.45 seconds.
    Calculating KMeans...
    Calculated KMeans in 20.17 seconds.
  Calculated landmark operator in 94.37 seconds.
  Calculating optimal t...
    Automatically selected t = 20
  Calculated optimal t in 7.96 seconds.
  Calculating diffusion potential...
  Calculated diffusion potential in 2.38 seconds.
  Calculating metric MDS...
  Calculated metric MDS in 14.67 seconds.
Calculated PHATE in 667.75 seconds.


In [4]:
# Select highly variable genes with scprep; the gene names are passed as extra
# data so the matching subset of names comes back alongside the matrix.
# NOTE(review): percentile=90 presumably keeps genes above the 90th
# percentile of variability -- confirm against the scprep documentation.
data, data_hvgs = scprep.select.highly_variable_genes(
    adata.to_df(),
    adata.var_names,
    percentile=90,
)

# Restrict to the selected genes, then scale every gene (column) to unit
# Euclidean norm across cells.
data_hvg = data[data_hvgs]
gene_norms = np.linalg.norm(data_hvg, axis=0)
data_hvg = data_hvg / gene_norms

In [5]:
# Reference signal: one constant value per cell, stored as a single row and
# scaled to unit Euclidean norm.
row_of_ones = np.ones((1, adata.n_obs))
uniform_signal = row_of_ones / np.linalg.norm(row_of_ones, axis=1, keepdims=True)

In [6]:
# Turn each signal into a distribution over cells by dividing every column by
# its column sum.
# NOTE(review): this yields probabilities only if the values are non-negative
# and no column sums to zero -- confirm for the expression matrix used here.
signals_t = data_hvg.values  # cells x genes array taken from the DataFrame
signal_prob = signals_t / signals_t.sum(axis=0)

# Same treatment for the uniform reference, reshaped to a single column so it
# can be stacked next to the gene signals.
uniform_signal_t = uniform_signal.reshape(-1, 1)
uniform_prob = uniform_signal_t / uniform_signal_t.sum(axis=0)

In [8]:
results = {}

# Embed all gene distributions plus the uniform reference (appended as the
# last column) with DiffusionEMD's Chebyshev operator on the PHATE cell graph.
dc_op = DiffusionCheb()
signal_representation = dc_op.fit_transform(phate_op.graph.to_pygsp().A, np.hstack((signal_prob, uniform_prob)))

# The last row is treated as the uniform reference. It is kept as a 1 x d
# matrix because cdist only accepts 2-D inputs; a bare `[-1]` on an ndarray
# gives a 1-D vector and makes cdist raise ValueError.
uniform_representation = np.atleast_2d(signal_representation[-1])
signal_representation = signal_representation[:-1]

# Localization score: L1 (cityblock) distance between each gene's embedding
# and the uniform reference's embedding, flattened to one value per gene.
results['localization_score'] = cdist(uniform_representation, signal_representation, metric='cityblock').reshape(-1,)

In [None]:
# Reduce the per-gene diffusion representation with gspa's SVD helper, then
# pass the reduced matrix and the run configuration (`args`) to gspa's
# autoencoder routine; its output is stored as the gene embedding.
# NOTE(review): which fields of `args` run_ae reads is not visible here.
signal_reduced = gspa.embedding.svd(signal_representation)
results['signal_embedding'] = gspa.embedding.run_ae(signal_reduced, args)

### cluster analysis

In [None]:
# Fit a second PHATE operator, this time on the gene embedding, with a fixed
# seed. The fitted operator's graph is what the clustering step partitions;
# `data_phate` holds the coordinates returned by fit_transform.
gene_phate_op = phate.PHATE(random_state=0)
data_phate = gene_phate_op.fit_transform(results['signal_embedding'])

In [None]:
# Partition the gene PHATE graph with Leiden (modularity objective, fixed
# seed) and label each selected gene with its cluster id.
gene_graph = gene_phate_op.graph.to_igraph()
gene_partition = leidenalg.find_partition(
    gene_graph,
    leidenalg.ModularityVertexPartition,
    seed=0,
)
gene_clusters = pd.Series(
    np.array(gene_partition.membership),
    index=data_hvgs.values,
)

In [None]:
# Persist the gene embedding, localization scores, gene names and cluster
# labels in one .npz archive. The output directory is created first so a
# fresh checkout does not fail with FileNotFoundError.
# NOTE(review): if the gene-name array has object dtype, np.savez pickles it
# and readers will need np.load(..., allow_pickle=True) -- confirm.
os.makedirs('./results', exist_ok=True)
np.savez('./results/DiffusionEMD_gene_embedding.npz', 
         signal_embedding=results['signal_embedding'],
         localization_score=results['localization_score'],
         genes=data_hvgs.values, clusters=gene_clusters.values)