In [1]:
import anndata as ad
import networkx as nx
import numpy as np
import pandas as pd
import scglue
import seaborn as sns
from IPython import display
from matplotlib import rcParams
from networkx.algorithms.bipartite import biadjacency_matrix
from networkx.drawing.nx_agraph import graphviz_layout
import os
from itertools import chain
import itertools

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Apply scglue's publication plotting parameters first, then set the default
# figure size on top of them.
scglue.plot.set_publication_params()
rcParams["figure.figsize"] = (4, 4)

In [3]:
# Every path used below is relative, so the notebook must run from the project
# root. The root used to be a single hard-coded absolute path; it can now be
# overridden with the MIDA_COLLAB_DIR environment variable. When the variable
# is unset, the original location is used, so existing runs are unaffected.
PROJECT_DIR = os.environ.get('MIDA_COLLAB_DIR', '/home/gaojie/workspace/Mida_collab/')
os.chdir(PROJECT_DIR)

In [4]:
# Input locations, relative to the current working directory.
rna_h5ad_path = "Multi-omics_intergration/output/adata_rna_output.h5ad"
atac_h5ad_path = "Multi-omics_intergration/output/adata_atac_output.h5ad"
guidance_graphml_path = "Multi-omics_intergration/data/guidance.graphml.gz"

# Read both AnnData objects and the guidance graph.
rna = ad.read_h5ad(rna_h5ad_path)
atac = ad.read_h5ad(atac_h5ad_path)
guidance = nx.read_graphml(guidance_graphml_path)

### Add feature embeddings to `varm`

In [5]:
# Restore the saved GLUE model from disk.
glue_model_path = "Multi-omics_intergration/output/glue.dill"
glue = scglue.models.load_model(glue_model_path)

[INFO] autodevice: Using CPU as computation device.


In [6]:
# Restrict the guidance graph to features flagged "highly_variable" in either
# modality; .copy() detaches the result from the original graph.
hvf_rna_features = rna.var.query("highly_variable").index
hvf_atac_features = atac.var.query("highly_variable").index
guidance_hvf = guidance.subgraph(
    chain(hvf_rna_features, hvf_atac_features)
).copy()

In [7]:
# Encode the HVF guidance graph with the loaded model and label each embedding
# row with the matching model vertex.
feature_embeddings = pd.DataFrame(
    glue.encode_graph(guidance_hvf),
    index=glue.vertices,
)
# Preview the top-left 5 x 5 corner of the table.
feature_embeddings.iloc[:5, :5]

Unnamed: 0,0,1,2,3,4
A1CF,0.047459,0.023915,0.026625,-0.035747,0.158627
A2M,0.162114,-0.026905,-0.056977,0.045769,1.043423
AASS,0.171789,-0.02134,0.000554,-0.194266,-0.824317
AATBC,-0.056287,0.020633,-0.015293,-0.107614,0.168682
ABAT,0.23613,-0.015744,-0.002474,-0.075696,-0.422809


In [8]:
# Sanity check: (rows, columns) of the feature-embedding table.
feature_embeddings.shape

(117046, 50)

In [9]:
# Align the embedding table to each modality's variable order and store it
# under varm["X_glue"]. reindex() yields NaN rows for any variable that has no
# entry in the embedding table.
for adata in (rna, atac):
    adata.varm["X_glue"] = feature_embeddings.reindex(adata.var_names).to_numpy()

### Regulatory inference

In [10]:
# Expose the variable names as an explicit "name" column in each var table.
# NOTE(review): presumably consumed by scglue.genomics.Bed(...) further down;
# confirm.
for adata in (rna, atac):
    adata.var["name"] = adata.var_names

In [11]:
# Names of the highly variable features in each modality.
genes, peaks = (
    adata.var.query("highly_variable").index
    for adata in (rna, atac)
)

In [12]:
# Total number of highly variable features across both modalities.
len(genes) + len(peaks)

117046

In [13]:
# Stack feature names and their embeddings in the same order (RNA first, then
# ATAC) so that row i of `feature_embeddings` belongs to `features[i]`.
#
# NOTE: this rebinds `feature_embeddings` from the DataFrame created earlier
# to a plain ndarray. Re-running the earlier cell that calls
# `feature_embeddings.reindex(...)` after this one will therefore fail.
modalities = (rna, atac)
features = pd.Index(
    np.concatenate([adata.var_names for adata in modalities])
)
feature_embeddings = np.concatenate(
    [adata.varm["X_glue"] for adata in modalities]
)

In [14]:
# Keep only the guidance edges whose "type" attribute is "fwd"; the resulting
# subgraph is copied so it is independent of guidance_hvf.
fwd_edges = [
    edge
    for edge, attrs in dict(guidance_hvf.edges).items()
    if attrs["type"] == "fwd"
]
skeleton = guidance_hvf.edge_subgraph(fwd_edges).copy()

In [15]:
# Run regulatory inference over the skeleton edges using the stacked feature
# names and embeddings, with a fixed random_state.
reginf = scglue.genomics.regulatory_inference(
    features,
    feature_embeddings,
    skeleton=skeleton,
    random_state=0,
)

regulatory_inference: 100%|██████████| 116271/116271 [00:01<00:00, 114817.71it/s]


In [16]:
# Retain only the inferred edges whose q-value is below the cutoff.
qval_cutoff = 0.05
significant_edges = [
    edge
    for edge, attrs in dict(reginf.edges).items()
    if attrs["qval"] < qval_cutoff
]
gene2peak = reginf.edge_subgraph(significant_edges)

In [17]:
# Export the ATAC var table as a BED file, keeping the first three columns.
scglue.genomics.Bed(atac.var).write_bed(
    "Multi-omics_intergration/output/peaks.bed", ncols=3
)

# Save the significant gene-peak links, with their score, p-value and q-value,
# for use as CellOracle input.
gene_start_sites = scglue.genomics.Bed(rna.var).strand_specific_start_site()
peak_regions = scglue.genomics.Bed(atac.var)
scglue.genomics.write_links(
    gene2peak,
    gene_start_sites,
    peak_regions,
    "Multi-omics_intergration/output/gene2peak_all.links",
    keep_attrs=["score", 'pval', 'qval'],
)

In [18]:
# Flatten the gene2peak edges into column lists, one entry per edge.
# Each edge is (source, target, attributes): the source is recorded as the
# gene and the target as the peak.
link_edges = list(gene2peak.edges(data=True))
data = {
    'peak_id': [peak for _, peak, _ in link_edges],
    'gene_short_name': [gene for gene, _, _ in link_edges],
    'qval': [attrs['qval'] for _, _, attrs in link_edges],
    'score': [attrs['score'] for _, _, attrs in link_edges],
    'pval': [attrs['pval'] for _, _, attrs in link_edges],
}

In [19]:
# Turn the column lists into a table and preview the first rows.
gene2peak_df = pd.DataFrame.from_dict(data)
gene2peak_df.head(5)

Unnamed: 0,peak_id,gene_short_name,qval,score,pval
0,chr17:7662026-7662526,TP53,0.006082,0.873452,0.00117
1,chr17:7665809-7666309,TP53,0.006082,0.89791,0.000808
2,chr17:7666985-7667485,TP53,0.006082,0.900482,0.000757
3,chr17:7668423-7668923,TP53,0.006082,0.887318,0.000998
4,chr17:7669021-7669521,TP53,0.006082,0.873999,0.001152


In [20]:
# Persist the link table as CSV. The DataFrame index is written as the first
# (unnamed) column because to_csv defaults to index=True; readers of this file
# need to account for that extra column.
gene2peak_csv_path = 'Multi-omics_intergration/output/gene2peak_df_all.csv'
gene2peak_df.to_csv(gene2peak_csv_path)