In [4]:
import os

import leidenalg
import scanpy
import phate
import numpy as np
import scprep
import matplotlib.pyplot as plt
import pandas as pd
import gspa
from scipy.spatial.distance import cdist

In [7]:
# Read the two .h5ad datasets from disk.
acute = scanpy.read_h5ad('data/acute_tcells.h5ad')
chronic = scanpy.read_h5ad('data/chronic_tcells.h5ad')

# Combine both datasets into a single AnnData object.
adata = scanpy.concat([acute, chronic])

# Store the batch label as a categorical column.
# NOTE(review): assumes both inputs already carry an obs['batch'] column,
# since concat is called without a label argument — confirm.
batch_labels = adata.obs['batch']
adata.obs['batch'] = batch_labels.astype('category')

In [8]:
# Select highly variable genes using a percentile=90 cutoff; the gene names
# (adata.var_names) are filtered alongside the expression matrix.
data, data_hvgs = scprep.select.highly_variable_genes(
    adata.to_df(),
    adata.var_names,
    percentile=90,
)
data_hvg = data[data_hvgs]

# Scale every column (gene) to unit L2 norm so each gene is a normalised
# signal over the cells.
gene_norms = np.linalg.norm(data_hvg, axis=0)
data_hvg = data_hvg.div(gene_norms, axis='columns')

In [9]:
# Reference signal: a constant row vector with one entry per cell,
# rescaled to unit L2 norm so it is comparable with the normalised genes.
uniform_signal = np.ones((1, adata.n_obs))
uniform_norm = np.linalg.norm(uniform_signal, axis=1, keepdims=True)
uniform_signal = uniform_signal / uniform_norm

In [10]:
# Transpose so that rows are genes (signals) and columns are cells, then
# reduce with gspa's SVD helper.
gene_signals = data_hvg.transpose()
signal_reduced = gspa.embedding.svd(gene_signals)

In [12]:
# Collect the per-gene outputs in one dictionary:
#   signal_embedding   - output of gspa's autoencoder on the reduced signals
#   localization_score - distance (cdist default metric: Euclidean) from the
#                        uniform signal to each gene signal, flattened to 1-D
results = {
    'signal_embedding': gspa.embedding.run_ae(signal_reduced),
    'localization_score': cdist(uniform_signal, data_hvg.T).ravel(),
}

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100


### Cluster analysis: gene modules

In [13]:
# Fit PHATE on the gene signal embedding with a fixed seed for
# reproducibility. The fitted operator is kept because its graph is used
# for clustering in the next cell.
signal_embedding = results['signal_embedding']
gene_phate_op = phate.PHATE(random_state=0)
data_phate = gene_phate_op.fit_transform(signal_embedding)

Calculating PHATE...
  Running PHATE on 1416 observations and 128 variables.
  Calculating graph and diffusion operator...
    Calculating PCA...
    Calculated PCA in 0.25 seconds.
    Calculating KNN search...
    Calculated KNN search in 0.24 seconds.
    Calculating affinities...
    Calculated affinities in 0.09 seconds.
  Calculated graph and diffusion operator in 0.59 seconds.
  Calculating optimal t...
    Automatically selected t = 10
  Calculated optimal t in 0.52 seconds.
  Calculating diffusion potential...
  Calculated diffusion potential in 0.14 seconds.
  Calculating metric MDS...
  Calculated metric MDS in 1.84 seconds.
Calculated PHATE in 3.09 seconds.


In [14]:
# Cluster genes by running Leiden (modularity objective, fixed seed) on the
# graph built by the PHATE operator.
gene_graph = gene_phate_op.graph.to_igraph()
gene_partition = leidenalg.find_partition(
    gene_graph,
    leidenalg.ModularityVertexPartition,
    seed=0,
)

# One integer cluster label per gene, indexed by gene name.
gene_clusters = pd.Series(
    np.array(gene_partition.membership),
    index=data_hvgs.values,
)

In [15]:
# Score every gene module per cell; each call stores its result on `adata`
# under the name 'GM_<module>_score'.
hvg_pool = data_hvgs.values
# The bin count is a quarter of the number of highly variable genes.
n_score_bins = int(len(hvg_pool) * 0.25)
n_modules = gene_clusters.max() + 1

for module in range(n_modules):
    in_module = (gene_clusters == module).to_numpy()
    module_genes = gene_clusters.index[in_module]

    scanpy.tl.score_genes(
        adata,
        gene_list=module_genes,
        gene_pool=hvg_pool,
        n_bins=n_score_bins,
        random_state=0,
        score_name=f'GM_{module}_score',
    )

In [None]:
# Persist the gene-level results (embedding, localization score, gene names
# and cluster labels) to a single .npz archive.
# np.savez does not create missing parent directories, so make sure the
# output folder exists first; exist_ok keeps re-runs of this cell harmless.
os.makedirs('./results', exist_ok=True)

# NOTE(review): `genes` is presumably an object-dtype array of strings; if so,
# loading it back needs np.load(..., allow_pickle=True) — confirm.
np.savez(
    './results/Signals_gene_embedding.npz',
    signal_embedding=results['signal_embedding'],
    localization_score=results['localization_score'],
    genes=data_hvgs.values,
    clusters=gene_clusters.values,
)