In [2]:
import os

import numpy as np
import pandas as pd
import phate
import pygsp, leidenalg
import scanpy, scprep
import sklearn.neighbors as neighbors
from scipy.spatial.distance import cdist

from run.run_gae import run_gae
from run.run_node2vec import run_node2vec

ModuleNotFoundError: No module named 'run'

In [None]:
from types import SimpleNamespace

# Each entry maps an option name to a (default value, help text) pair.
config_args = {
    'lr': (0.001, 'learning rate'),
    'dropout': (0.0, 'dropout probability'),
    'epochs': (100, 'maximum number of epochs to train for'),
    'weight_decay': (0., 'l2 regularization strength'),
    'patience': (10, 'patience for early stopping'),
    'min_epochs': (50, 'do not early stop before min_epochs'),
    'seed': (1234, 'seed for training'),
    'walk_length': (80, 'walk length for Node2Vec'),
    'num_walks': (10, 'number of walks for Node2Vec'),
    'task': ('localization', 'task for paper, can be any of [coexpression, localization]'),
    'verbose': (1, 'verbosity, of [0, 1, 2]'),
    'add_pseudotime': (0, 'whether to add time signal (1) or not (0)'),
    'model': ('GSPA', 'which model to use, can be one of [Signals, DiffusionEMD, GFMMD, GSPA, GSPA_QR, MAGIC, Node2Vec_Gcell, GAE_noatt_Gcell, GAE_att_Gcell, Node2Vec_Ggene, GAE_noatt_Ggene, GAE_att_Ggene]'),
    'dim': (128, 'embedding dimension'),
    'num_layers': (2, 'number of hidden layers in encoder'),
    'bias': (1, 'whether to use bias (1) or not (0)'),
    'act': ('relu', 'which activation function to use of [relu, tanh, None]'),
    'k_neighbors_gene': (5, 'default number of neighbors k for kNN gene graph construction'),
    'device': ('cpu', 'Device for model'),
    'val_prop': (0.05, 'proportion of validation'),
    'test_prop': (0.0, 'proportion of test'),
    'split_seed': (1234, 'seed for data splits (train/test/val)'),
    'save_as': ('0', 'name for embedding iteration'),
    'k_neighbors_cell': (5, 'default number of neighbors k for kNN cell graph construction'),
    'kernel': ('adaptive', 'which kernel to use, can be one of [adaptive, kNN, SNN]'),
    'J': (-1, 'maximum scale for GSPA or GSPA_QR, by default int(log(N))'),
}

# Keep only the default values. A plain dict comprehension preserves each
# default's Python type and avoids a round-trip through a pandas DataFrame.
config_args = {name: default for name, (default, _help) in config_args.items()}

# Expose the defaults as attributes (args.lr, args.dim, ...), which is how the
# run_* helpers below receive their configuration.
args = SimpleNamespace(**config_args)

In [4]:
# Read the acute and chronic T-cell datasets and concatenate them into one
# AnnData object.
tcell_paths = ('data/acute_tcells.h5ad', 'data/chronic_tcells.h5ad')
acute, chronic = (scanpy.read_h5ad(path) for path in tcell_paths)
adata = scanpy.concat((acute, chronic))

# Store the batch label as a categorical column.
batch_labels = adata.obs['batch'].astype('category')
adata.obs['batch'] = batch_labels

In [13]:
# Select highly variable genes (percentile=90) from the cells-by-genes frame.
expression = adata.to_df()
data, data_hvgs = scprep.select.highly_variable_genes(
    expression, adata.var_names, percentile=90
)

# Restrict to the selected gene columns and scale each column to unit L2 norm.
hvg_expression = data[data_hvgs]
gene_norms = np.linalg.norm(hvg_expression, axis=0)
data_hvg = hvg_expression / gene_norms

In [14]:
# A single constant row with one entry per observation, rescaled so the row
# has unit L2 norm.
uniform_signal = np.ones((1, adata.n_obs))
row_norms = np.linalg.norm(uniform_signal, axis=1, keepdims=True)
uniform_signal = uniform_signal / row_norms

In [16]:
# 5-nearest-neighbour graph over the rows of data_hvg.T (one row per gene).
gene_knn = neighbors.kneighbors_graph(data_hvg.T, n_neighbors=5)
signal_graph = pygsp.graphs.Graph(gene_knn)

# Replace the edge weights with the graph's adjacency matrix attribute `A`.
signal_graph.W = signal_graph.A

In [17]:
# localization calculation requires separate graph with uniform signal
# The uniform signal is appended as the LAST row, after all gene signals.
signals_with_uniform = np.vstack((data_hvg.T, uniform_signal))

# Use the `neighbors` alias: `import sklearn.neighbors as neighbors` does not
# bind the bare name `sklearn`, so `sklearn.neighbors...` raises NameError on
# a fresh kernel.
signals_with_uniform_graph = neighbors.kneighbors_graph(signals_with_uniform, n_neighbors=5)
signals_with_uniform_graph = pygsp.graphs.Graph(signals_with_uniform_graph)

# Replace the edge weights with the graph's adjacency matrix attribute `A`.
signals_with_uniform_graph.W = signals_with_uniform_graph.A

#### Node2Vec_Ggene

In [27]:
# Embed the gene graph, and separately the graph that also contains the
# uniform signal as its last node.
results = {}
results['signal_embedding'] = run_node2vec(signal_graph, args=args)
signals_with_uniform_embedding = run_node2vec(signals_with_uniform_graph, args=args)

# Localization score = distance from each gene embedding to the uniform-signal
# embedding (last row). cdist only accepts 2-D inputs, so the single row is
# promoted to shape (1, dim) before the call.
# NOTE(review): assumes the embedding rows follow graph node order — confirm.
uniform_embedding = np.atleast_2d(signals_with_uniform_embedding[-1])
results['localization_score'] = cdist(uniform_embedding, signals_with_uniform_embedding[:-1]).reshape(-1,)

Computing transition probabilities:   0%|          | 0/1416 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 10/10 [00:27<00:00,  2.75s/it]


Computing transition probabilities:   0%|          | 0/1417 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 10/10 [00:27<00:00,  2.78s/it]


In [30]:
# Fit PHATE on the gene embedding. The fitted operator is kept because its
# graph is reused for Leiden clustering in the next cell.
signal_embedding = results['signal_embedding']
gene_phate_op = phate.PHATE(random_state=0)
data_phate = gene_phate_op.fit_transform(signal_embedding)

Calculating PHATE...
  Running PHATE on 1416 observations and 128 variables.
  Calculating graph and diffusion operator...
    Calculating PCA...
    Calculated PCA in 0.30 seconds.
    Calculating KNN search...
    Calculated KNN search in 0.70 seconds.
    Calculating affinities...
    Calculated affinities in 0.03 seconds.
  Calculated graph and diffusion operator in 1.05 seconds.
  Calculating optimal t...
    Automatically selected t = 20
  Calculated optimal t in 3.29 seconds.
  Calculating diffusion potential...
  Calculated diffusion potential in 0.77 seconds.
  Calculating metric MDS...
  Calculated metric MDS in 7.09 seconds.
Calculated PHATE in 12.27 seconds.


In [33]:
# Leiden partition (modularity objective, fixed seed) of the graph built by
# the fitted PHATE operator.
phate_igraph = gene_phate_op.graph.to_igraph()
partition = leidenalg.find_partition(
    phate_igraph,
    leidenalg.ModularityVertexPartition,
    seed=0,
)

# One cluster label per selected gene, indexed by gene name.
gene_clusters = pd.Series(np.array(partition.membership), index=data_hvgs.values)

In [34]:
# np.savez does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs('./results', exist_ok=True)

# Persist the embedding, localization scores, gene names and cluster labels.
np.savez('./results/Node2Vec_Ggene_gene_embedding.npz',
         signal_embedding=results['signal_embedding'],
         localization_score=results['localization_score'],
         genes=data_hvgs.values, clusters=gene_clusters.values)

#### GAE_noatt_Ggene

In [94]:
# Graph autoencoder without attention: embed the gene graph, and separately
# the graph that also contains the uniform signal as its last node.
results = {}
args.attention = False
results['signal_embedding'] = run_gae(signal_graph, args)
signals_with_uniform_embedding = run_gae(signals_with_uniform_graph, args)

# Localization score = distance from each gene embedding to the uniform-signal
# embedding (last row). cdist only accepts 2-D inputs, so the single row is
# promoted to shape (1, dim) before the call.
# NOTE(review): assumes the embedding rows follow graph node order — confirm.
uniform_embedding = np.atleast_2d(signals_with_uniform_embedding[-1])
results['localization_score'] = cdist(uniform_embedding, signals_with_uniform_embedding[:-1]).reshape(-1,)

In [95]:
# Fit PHATE on the gene embedding. The fitted operator is kept because its
# graph is reused for Leiden clustering in the next cell.
signal_embedding = results['signal_embedding']
gene_phate_op = phate.PHATE(random_state=0)
data_phate = gene_phate_op.fit_transform(signal_embedding)

Calculating PHATE...
  Running PHATE on 1416 observations and 128 variables.
  Calculating graph and diffusion operator...
    Calculating PCA...
    Calculated PCA in 0.37 seconds.
    Calculating KNN search...
    Calculated KNN search in 0.56 seconds.
    Calculating affinities...
    Calculated affinities in 0.04 seconds.
  Calculated graph and diffusion operator in 1.00 seconds.
  Calculating optimal t...
    Automatically selected t = 27
  Calculated optimal t in 5.00 seconds.
  Calculating diffusion potential...
  Calculated diffusion potential in 0.99 seconds.
  Calculating metric MDS...
  Calculated metric MDS in 4.04 seconds.
Calculated PHATE in 11.05 seconds.


In [96]:
# Leiden partition (modularity objective, fixed seed) of the graph built by
# the fitted PHATE operator.
phate_igraph = gene_phate_op.graph.to_igraph()
partition = leidenalg.find_partition(
    phate_igraph,
    leidenalg.ModularityVertexPartition,
    seed=0,
)

# One cluster label per selected gene, indexed by gene name.
gene_clusters = pd.Series(np.array(partition.membership), index=data_hvgs.values)

In [97]:
# np.savez does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs('./results', exist_ok=True)

# Persist the embedding, localization scores, gene names and cluster labels.
np.savez('./results/GAE_noatt_Ggene_gene_embedding.npz',
         signal_embedding=results['signal_embedding'],
         localization_score=results['localization_score'],
         genes=data_hvgs.values, clusters=gene_clusters.values)

#### GAE_att_Ggene

In [98]:
# Graph autoencoder with attention: embed the gene graph, and separately the
# graph that also contains the uniform signal as its last node.
results = {}
args.attention = True
results['signal_embedding'] = run_gae(signal_graph, args)
signals_with_uniform_embedding = run_gae(signals_with_uniform_graph, args)

# Localization score = distance from each gene embedding to the uniform-signal
# embedding (last row). cdist only accepts 2-D inputs, so the single row is
# promoted to shape (1, dim) before the call.
# NOTE(review): assumes the embedding rows follow graph node order — confirm.
uniform_embedding = np.atleast_2d(signals_with_uniform_embedding[-1])
results['localization_score'] = cdist(uniform_embedding, signals_with_uniform_embedding[:-1]).reshape(-1,)

In [99]:
# Fit PHATE on the gene embedding. The fitted operator is kept because its
# graph is reused for Leiden clustering in the next cell.
signal_embedding = results['signal_embedding']
gene_phate_op = phate.PHATE(random_state=0)
data_phate = gene_phate_op.fit_transform(signal_embedding)

Calculating PHATE...
  Running PHATE on 1416 observations and 128 variables.
  Calculating graph and diffusion operator...
    Calculating PCA...
    Calculated PCA in 0.21 seconds.
    Calculating KNN search...
    Calculated KNN search in 0.48 seconds.
    Calculating affinities...
  Calculated graph and diffusion operator in 0.70 seconds.
  Calculating optimal t...




    Automatically selected t = 26
  Calculated optimal t in 2.43 seconds.
  Calculating diffusion potential...
  Calculated diffusion potential in 0.88 seconds.
  Calculating metric MDS...
  Calculated metric MDS in 5.08 seconds.
Calculated PHATE in 9.11 seconds.


In [100]:
# Leiden partition (modularity objective, fixed seed) of the graph built by
# the fitted PHATE operator.
phate_igraph = gene_phate_op.graph.to_igraph()
partition = leidenalg.find_partition(
    phate_igraph,
    leidenalg.ModularityVertexPartition,
    seed=0,
)

# One cluster label per selected gene, indexed by gene name.
gene_clusters = pd.Series(np.array(partition.membership), index=data_hvgs.values)

In [101]:
# np.savez does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs('./results', exist_ok=True)

# Persist the embedding, localization scores, gene names and cluster labels.
np.savez('./results/GAE_att_Ggene_gene_embedding.npz',
         signal_embedding=results['signal_embedding'],
         localization_score=results['localization_score'],
         genes=data_hvgs.values, clusters=gene_clusters.values)