# Reference-Guided RNA Inference from ATAC

In [1]:
# Import packages
import os
import pandas as pd
import numpy as np
import scanpy as sc
import nmslib
from scipy.sparse import csr_matrix, vstack

In [2]:
# Load reference
# refA: ATAC cell embeddings of the reference atlas (indexed for the kNN search below).
# refC: cell names of the reference; used below to translate kNN row indices into names.
refA, refC = map(np.load, (
    "scM_human_RNA_and_ATAC_bs16_directOut0/atac_cell_embs.npy",
    "scM_human_RNA_and_ATAC_bs16_directOut0/cell_names.npy",
))

In [3]:
# Reference h5ad
# Build a table with one row per reference sample: its name and the path of its h5ad file.
# os.listdir returns every directory entry (hidden files, caches, sub-directories, ...) in
# arbitrary order, so keep only *.h5ad entries and sort them: this prevents stray files from
# being passed to sc.read_h5ad and makes the row order of the reference expression reproducible.
h5ad_names = sorted(name for name in os.listdir("scM_human_h5ad") if name.endswith(".h5ad"))
files = pd.DataFrame({
    # Sample name = file name up to the first dot; it is matched against the part of the
    # reference cell names that precedes "#".
    "sample": [name.split(".")[0] for name in h5ad_names],
    "file": [f"scM_human_h5ad/{name}" for name in h5ad_names],
})

In [4]:
# Load data
# atac: ATAC cell embeddings of the query dataset (one row per query cell).
# cell: query cell names; later used as obs_names of the prediction.
atac, cell = map(np.load, (
    "hPbmc_RNA_and_ATAC_bs16_directOut0/atac_cell_embs.npy",
    "hPbmc_RNA_and_ATAC_bs16_directOut0/cell_names.npy",
))
print(f"ATAC: {atac.shape}")

ATAC: (9631, 512)


In [5]:
# Find neighbors
# Build an HNSW index (cosine similarity space) over the reference embeddings and query
# the 128 nearest reference cells for every query cell.
index = nmslib.init(method = "hnsw", space = "cosinesimil", data_type = nmslib.DataType.DENSE_VECTOR)
index.addDataPointBatch(refA)

index_params = {"M": 64, "efConstruction": 128}
index.createIndex(index_params, print_progress = False)

query_params = {"efSearch": 256}
index.setQueryTimeParams(query_params)

# knnQueryBatch returns one (ids, distances) pair per query row.
neighbours = index.knnQueryBatch(atac, k = 128)
hit_ids, hit_distances = zip(*neighbours)
indices = np.vstack(hit_ids).astype(np.int32)
distances = np.vstack(hit_distances)
print(f"Indices: {indices.shape}")

Indices: (9631, 128)


In [6]:
# Neighbors' metadata
# Collect every distinct reference cell that is a neighbor of at least one query cell and
# record which sample it belongs to (the part of the cell name before "#").
unique_cells = list(set(refC[indices].ravel()))
neighbors = pd.DataFrame({
    "sample": [name.partition("#")[0] for name in unique_cells],
    "cell": unique_cells,
})
neighbors = neighbors.drop_duplicates()
neighbors = neighbors.sort_values(by = ["sample", "cell"])
neighbors = neighbors.reset_index(drop = True)
print(neighbors.head())

   sample                       cell
0  M00007  M00007#AAACCGGCACATAGCC-1
1  M00007  M00007#AAAGCCCGTTCATCTA-1
2  M00007  M00007#AAAGCTTGTGGCTTCC-1
3  M00007  M00007#AACAGCAAGTTAGGCT-1
4  M00007  M00007#AACCGCTCAGCACGTT-1


In [7]:
# Read expression
# For every reference sample that contributes at least one neighbor cell, load its h5ad,
# keep only those neighbor cells, and stack all subsets into a single AnnData.
# The subsets are collected in a list and concatenated ONCE: concatenating inside the loop
# re-copies the whole accumulated matrix for every sample (quadratic in the number of samples).
subsets = []
for sample, path in zip(files["sample"], files["file"]):
    wanted = neighbors.loc[neighbors["sample"] == sample, "cell"].values
    if len(wanted) == 0:
        continue
    adata = sc.read_h5ad(path)
    # .copy() materialises the subset so it no longer references the full sample,
    # which can then be released before the next file is read.
    subsets.append(adata[wanted].copy())
    del adata

if not subsets:
    # Without this guard the failure would surface as an obscure error in normalize_total(None).
    raise ValueError("No reference h5ad file matches any neighbor sample; check `files` and `neighbors`.")

# merge = "same" keeps only the var annotations that are identical in all samples.
expression = sc.concat(subsets, merge = "same")
del subsets

# Every neighbor is looked up by name in `expression` when neighbors are aggregated,
# so fail early (and clearly) if a neighbor's sample had no h5ad file.
missing = neighbors.loc[~neighbors["cell"].isin(expression.obs_names), "cell"]
if len(missing) > 0:
    raise KeyError(f"{len(missing)} neighbor cells were not found in the reference h5ad files, e.g. {missing.iloc[0]}")

sc.pp.normalize_total(expression, target_sum = 1e4)
print(expression)

AnnData object with n_obs × n_vars = 87828 × 19365
    obs: 'sample', 'cell_id'
    var: 'gene_symbols', 'feature_types'


In [8]:
# Aggregate neighbors
# Predicted expression of a query cell = mean normalised expression of its k reference neighbors.
# Instead of building one label-indexed AnnData view per query cell, resolve all neighbor names
# to row positions once and compute every mean with a single sparse product W @ X, where
# W[i, j] = 1/k if reference cell j is a neighbor of query cell i.
# (A separate name is used so the kNN result stored in `neighbours` is not overwritten.)
neighbour_names = refC[indices]
n_query, k = neighbour_names.shape

positions = expression.obs_names.get_indexer(neighbour_names.ravel())
if (positions < 0).any():
    # get_indexer marks unknown labels with -1, which would silently select the last row.
    raise KeyError(f"{int((positions < 0).sum())} neighbor entries are missing from `expression`")

X = expression.X
# Keep the floating dtype of X so the prediction has the same precision as a per-cell mean;
# integer matrices are averaged in float64.
weight_dtype = X.dtype if np.issubdtype(X.dtype, np.floating) else np.float64
weights = csr_matrix(
    (
        np.full(positions.size, 1.0 / k, dtype = weight_dtype),
        (np.repeat(np.arange(n_query), k), positions),
    ),
    shape = (n_query, expression.n_obs),
)

prediction = sc.AnnData(X = csr_matrix(weights @ X), var = expression.var)
prediction.obs_names = cell
prediction.write("predict.gex.h5ad", compression = "gzip")

In [9]:
# Read truth
# Restrict the measured expression to the cells and genes that were predicted, then put the
# prediction in the same cell and gene order so both objects can be compared per cell.
# NOTE: `prediction` is overwritten here and its var table is replaced by the one of `truth`;
# re-run the aggregation cell before re-running this one.
truth = sc.read_h5ad("hPbmc.gex.h5ad")
shared_cells = truth.obs_names.isin(prediction.obs_names)
shared_genes = truth.var["gene_id"].isin(prediction.var_names)
truth = truth[shared_cells, shared_genes].copy()

gene_ids = truth.var["gene_id"].values
prediction = prediction[truth.obs_names, gene_ids].copy()
prediction.var = truth.var.copy()
print(truth)

AnnData object with n_obs × n_vars = 9631 × 19365
    obs: 'sample', 'cell_id', 'cell_type'
    var: 'gene_id', 'gene_name', 'gene_type', 'interval'


In [10]:
# Evaluate correlation
# Pearson correlation between measured and predicted expression, computed separately for
# every cell across all genes, then averaged over cells.
r = []
for c in prediction.obs_names:
    observed = truth[c].X.toarray()
    predicted = prediction[c].X.toarray()
    # corrcoef stacks the two 1 x n_genes rows and returns a 2 x 2 matrix;
    # the off-diagonal entry is the correlation between them.
    r.append(np.corrcoef(observed, predicted)[0, 1])
print(f"Pearson correlation: {np.mean(r)}")

Pearson correlation: 0.8352406711820752
