# Converting BEL graphs into inputs for GAT2VEC.

In [3]:
import os

import bio2bel_phewascatalog
from GAT2VEC import paths as gat2vec_paths
import guilty_phewas.utils as phewas_utils
from networkx import DiGraph, write_adjlist
from networkx.relabel import convert_node_labels_to_integers
from pybel import from_url, from_path
from pybel.dsl import BaseEntity, gene, protein, rna
from pybel.struct.mutation.collapse import collapse_to_genes, collapse_all_variants
from pybel_tools.assembler.reified_graph import assembler

In [4]:
# Root folder holding one sub-folder of BEL data per subgraph.
basedir = 'C:/Users/Mauricio/Thesis/bel_data'

# Inputs and outputs for the Tau protein subgraph.
tau_dict = {
    'basedir': os.path.join(basedir, "tau"),
    'url': (
        'https://raw.githubusercontent.com/neurommsig/neurommsig-knowledge'
        '/master/neurommsig_knowledge/Tau%20protein%20subgraph.bel'
    ),
    'localfile': os.path.join(basedir, "tau/Tau_subgraph.bel"),
    'struct_file': os.path.join(basedir, "tau/tau_graph.adjlist"),
    'attr_file': os.path.join(basedir, "tau/tau_na.adjlist"),
}

# Inputs and outputs for the GSK3 subgraph.
gsk3_dict = {
    'basedir': os.path.join(basedir, "alzh"),
    'url': (
        'https://raw.githubusercontent.com/neurommsig/neurommsig-knowledge/'
        'master/neurommsig_knowledge/GSK3%20subgraph.bel'
    ),
    'localfile': os.path.join(basedir, "alzh/alzheimers.bel"),
    'struct_file': os.path.join(basedir, "alzh/alzh_graph.adjlist"),
    'attr_file': os.path.join(basedir, "alzh/alzh_na.adjlist"),
}

# The configuration the remaining cells run with.
param_dict = tau_dict

In [5]:
# Unpack the selected configuration into the names used by the later cells.
basedir = param_dict['basedir']

# Where the BEL document can be read from: remote URL or local copy.
url, local = param_dict['url'], param_dict['localfile']

# Where the structure and attribute adjacency lists are written.
struct_file, attr_file = param_dict['struct_file'], param_dict['attr_file']

## Read and reify a BEL graph.

In [6]:
# Load the BEL document from the local copy.
# Alternative source: graph = from_url(url)
graph = from_path(local)

print("Nodes before", graph.number_of_nodes())
print("Edges before", graph.number_of_edges())

# Note: collapsing all variants also strips the pmod(X) from the BaseEntity.
collapse_all_variants(graph)
# TODO collapse_to_genes removes the ptm information before converting
# collapse_to_genes(graph)

print("Nodes collapsed", graph.number_of_nodes())
print("Edges collapsed", graph.number_of_edges())

# Reify the collapsed graph's edges.
rbg = assembler.reify_bel_graph(graph)

print("Nodes after", rbg.number_of_nodes())
print("Edges after", rbg.number_of_edges())

C:/Users/Mauricio/Thesis/bel_data\tau/Tau_subgraph.bel:0:0 MissingMetadataException Missing required document metadata: Authors
C:/Users/Mauricio/Thesis/bel_data\tau/Tau_subgraph.bel:0:0 MissingMetadataException Missing required document metadata: Description


Nodes before 284
Edges before 836
Nodes collapsed 230
Edges collapsed 772




Nodes after 625
Edges after 892


## Assess the converted entity and predicate nodes.

In [7]:
# Tally the nodes of the reified graph: RNA and protein entities are counted
# separately, every non-entity node is counted under its 'label' attribute.
qty_predicate = {}
qty_prot = 0
qty_rna = 0
for node in rbg.nodes:
    if not isinstance(node, BaseEntity):
        label = rbg.nodes[node]['label']
        qty_predicate[label] = qty_predicate.get(label, 0) + 1
    elif isinstance(node, rna):
        qty_rna += 1
    elif isinstance(node, protein):
        qty_prot += 1
    # Entities that are neither RNA nor protein are not counted.

print("Predicates")
print(qty_predicate)

print(f"Proteins {qty_prot}")
print(f"RNA      {qty_rna}")

Predicates
{'abundance': 374, 'activates': 18, 'hasComponent': 45, 'translates': 3, 'degradates': 6}
Proteins 97
RNA      1


## Adding PheWAS annotation (as graph) -- Skipping it for now.

# Load the PheWAS catalog as a dictionary keyed by entity name.
phewas_manager = bio2bel_phewascatalog.Manager()
pw_dict = phewas_manager.to_dict()

# NOTE(review): this path is assigned but not read anywhere in this cell.
file = "/home/mau/Uni/Thesis/Phewas-catalog/phewas/phewas-catalog.csv"
attr_rbg = DiGraph()

# Link every protein/RNA node found in the catalog to each of its non-empty
# phenotypes, storing the odds as the edge label.
for node in rbg.nodes:
    if not isinstance(node, (protein, rna)):
        continue
    if node['name'] not in pw_dict:
        continue
    for odds, phtype in pw_dict[node['name']]:
        if phtype:
            attr_rbg.add_edge(node, phtype, label=odds)

print("before phewas", len(rbg.nodes))
print("after phewas", len(attr_rbg.nodes))

## Adding PheWAS annotation (as attributes).

In [8]:
# Build the PheWAS lookup dictionary. The following cells read its values as
# lists of (odds, phenotype) pairs.
phewas_manager = bio2bel_phewascatalog.Manager()
pw_dict = phewas_manager.to_dict()

PheWAS Catalog - generating Dict: 100%|██████████████████████████████████████| 215107/215107 [00:23<00:00, 9191.99it/s]


In [19]:
# TODO put the phewas as an attribute (like GT)

# TODO Generate disease hashing (like GT)

# Phenotype ids start right after the node count, so they cannot collide with
# node ids once the export cell relabels the nodes starting at 1.
start_ind = len(rbg.nodes)

# Sort the unique phenotypes so every run assigns the same id to the same
# phenotype: plain set iteration order changes between interpreter runs when
# string hashing is randomised. key=str keeps the ordering well defined even
# if some phenotype entries are not strings (e.g. None).
phenotypes_list = sorted(
    {phe for _list in pw_dict.values() for _odds, phe in _list},
    key=str,
)
enum_phenotypes = enumerate(phenotypes_list, start=start_ind + 1)
att_mappings = {phenot: num for num, phenot in enum_phenotypes}
len(att_mappings)

1354

In [20]:
# create python file
def add_disease_attribute(graph, att_mappings):
    """Attach a 'phenotypes' list to every matching protein, RNA or gene node.

    :param graph: graph whose ``nodes`` are iterated and updated in place.
    :param att_mappings: lookup from entity name to an iterable of 2-item
        pairs; the second item of each pair is stored as a phenotype.
    """
    for node in graph.nodes:
        if not isinstance(node, (protein, rna, gene)):
            continue
        if node.name not in att_mappings:
            continue
        pairs = att_mappings[node.name]
        graph.nodes[node]['phenotypes'] = [phenotype for _, phenotype in pairs]
    
def write_adj_file_attribute(graph, filepath: str, mapping=None):
    """Write the attribute adjacency file for the annotated graph.

    Every node carrying a 'phenotypes' attribute produces one line: the node
    followed by the space-separated ids of its phenotypes. Nodes without that
    attribute are skipped.

    :param graph: graph whose ``nodes`` can be iterated and indexed by node to
        obtain the node's attribute dictionary.
    :param filepath: path of the file to create or overwrite.
    :param mapping: lookup from phenotype to its id. When omitted, the
        notebook-level ``att_mappings`` is used, which keeps the previous
        two-argument call working.
    :raises KeyError: if a phenotype is missing from the mapping.
    """
    if mapping is None:
        # Backward-compatible fallback to the notebook global.
        mapping = att_mappings
    with open(filepath, 'w') as f:
        for node in graph.nodes:
            attributes = graph.nodes[node]
            if 'phenotypes' not in attributes:
                continue
            ids = ' '.join(str(mapping[phe]) for phe in attributes['phenotypes'])
            print(f"{node} {ids}", file=f)

In [21]:
# Annotate a copy of the reified graph with the PheWAS data.
annotated_graph = rbg.copy()
phewas_utils.add_disease_attribute(annotated_graph, pw_dict)

# Debugging aid: show the first node that received phenotypes.
# for node in annotated_graph.nodes:
#     if 'phenotypes' in annotated_graph.nodes[node]:
#         print(annotated_graph.nodes[node]['phenotypes'], node.name)
#         break


## Write adjacency list to file for GAT2VEC

In [22]:
# Structure graph: relabel the nodes as integers starting at 1, keeping the
# original node under the 'old_label' attribute, then write the adjacency list.
# NOTE(review): this cell used to pass an undefined name `test`;
# annotated_graph is the graph carrying the phenotype annotations - confirm.
out_rbg = convert_node_labels_to_integers(annotated_graph, first_label=1, label_attribute='old_label')
write_adjlist(out_rbg, struct_file)

# Attribute graph
phewas_utils.write_adj_file_attribute(out_rbg, attr_file, att_mappings)

[('HLA-DQB1', 726), ('CDKN2B-AS1', 910), ('HLA-DRA', 539)]
