In [1]:
import os
import json
from tqdm import tqdm
from collections import defaultdict

import pickle

In [2]:
# Input/output locations for the biology (Hetionet) graph.
# The defaults are the original EFS paths; set HETIONET_RAW_DATA_DIR / HETIONET_SAVE_DIR
# in the environment to run the notebook elsewhere without editing this cell.
raw_data_dir = os.environ.get(
    "HETIONET_RAW_DATA_DIR",
    "/home/ec2-user/quic-efs/user/bowenjin/llm-graph-plugin/data/raw_data/biology",
)
save_dir = os.environ.get(
    "HETIONET_SAVE_DIR",
    "/home/ec2-user/quic-efs/user/bowenjin/llm-graph-plugin/data/processed_data/biology",
)

In [3]:
## read raw data files

def read_json(file):
    """Parse the JSON document stored at ``file`` and return the decoded object.

    The file is opened explicitly as UTF-8 so decoding does not depend on the
    platform's locale default, and it is closed as soon as parsing finishes.

    Args:
        file: Path (or other value accepted by ``open``) of the JSON file.

    Returns:
        Whatever ``json.load`` produces for the document (dict, list, ...).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(file, encoding="utf-8") as f:
        return json.load(f)

# Load the Hetionet v1.0 dump through read_json so the file handle is closed
# right after parsing instead of being left open until garbage collection.
raw_data = read_json(os.path.join(raw_data_dir, 'hetionet-v1.0.json'))

In [4]:
# Show the top-level sections of the loaded dump.
raw_data.keys()

dict_keys(['metanode_kinds', 'metaedge_tuples', 'kind_to_abbrev', 'nodes', 'edges'])

In [5]:
# Show the node kinds listed in the dump; the per-kind node dictionaries built
# later use these names with spaces replaced by underscores.
raw_data['metanode_kinds']

['Anatomy',
 'Biological Process',
 'Cellular Component',
 'Compound',
 'Disease',
 'Gene',
 'Molecular Function',
 'Pathway',
 'Pharmacologic Class',
 'Side Effect',
 'Symptom']

In [6]:
# Preview only the first few node records; displaying the full list would write
# every node of the dump into the notebook output.
raw_data['nodes'][:4]

[{'kind': 'Molecular Function',
  'identifier': 'GO:0031753',
  'name': 'endothelial differentiation G-protein coupled receptor binding',
  'data': {'source': 'Gene Ontology',
   'license': 'CC BY 4.0',
   'url': 'http://purl.obolibrary.org/obo/GO_0031753'}},
 {'kind': 'Side Effect',
  'identifier': 'C0023448',
  'name': 'Lymphocytic leukaemia',
  'data': {'source': 'UMLS via SIDER 4.1',
   'license': 'CC BY-NC-SA 4.0',
   'url': 'http://identifiers.org/umls/C0023448'}},
 {'kind': 'Gene',
  'identifier': 5345,
  'name': 'SERPINF2',
  'data': {'description': 'serpin peptidase inhibitor, clade F (alpha-2 antiplasmin, pigment epithelium derived factor), member 2',
   'source': 'Entrez Gene',
   'license': 'CC0 1.0',
   'url': 'http://identifiers.org/ncbigene/5345',
   'chromosome': '17'}},
 {'kind': 'Gene',
  'identifier': 9409,
  'name': 'PEX16',
  'data': {'description': 'peroxisomal biogenesis factor 16',
   'source': 'Entrez Gene',
   'license': 'CC0 1.0',
   'url': 'http://identifier

In [7]:
# Human-readable description for every edge type in the graph.
# Keys have the form "<source kind>-<edge kind>-<target kind>" and are also used to
# validate edge types when the edges are added to the node dictionaries.
# The "Disease-associates-Gene" and "Disease-presents-Symptom" descriptions previously
# named the wrong entity (Gene / Anatomy); they now name Disease / Symptom.
metaedges = {
    "Anatomy-downregulates-Gene": "The Gene is under-expressed in the Anatomy in post-juvenile adult human samples according to Bgee",
    "Anatomy-expresses-Gene": "The Gene is over-expressed in the Anatomy according to Bgee or TISSUES",
    "Anatomy-upregulates-Gene": "The Gene is over-expressed in the Anatomy in post-juvenile adult human samples according to Bgee",
    "Compound-binds-Gene": "The Compound physically binds to the Gene's protein product according to BindingDB, DrugBank, or DrugCentral",
    "Compound-causes-Side Effect": "The Compound can cause the Side Effect according to a drug label in SIDER 4.1",
    "Compound-downregulates-Gene": "The Gene is under-expressed in samples exposed to the Compound according to a consensus signature from LINCS L1000",
    "Compound-palliates-Disease": "The Compound is a symptomatic indication for the Disease (i.e. the Compound treats a significant symptom of the Disease) according to PharmacotherapyDB",
    "Compound-resembles-Compound": "The Compounds are chemical similar to each other: Their extended connectivity fingerprints have a Dice coefficient ≥ 0.5.",
    "Compound-treats-Disease": "The Compound is a disease-modifying indication for the Disease (i.e. the Compound therapeutically changes the underlying or downstream biology of the Disease) according to PharmacotherapyDB",
    "Compound-upregulates-Gene": "The Gene is over-expressed in samples exposed to the Compound according to a consensus signature from LINCS L1000",
    "Disease-associates-Gene": "The Gene is associated with the Disease according to GWAS Catalog, DISEASES, DisGeNET, or DOAF",
    "Disease-downregulates-Gene": "The Gene is under-expressed in Disease samples according to a STARGEO meta-analysis",
    "Disease-localizes-Anatomy": "The Disease affects or is related to the Anatomy as inferred by co-occurrence in MEDLINE abstracts",
    "Disease-presents-Symptom": "The Disease presents the Symptom as inferred by co-occurrence in MEDLINE abstracts",
    "Disease-resembles-Disease": "Two Diseases are similar as inferred by co-occurrence in MEDLINE abstracts",
    "Disease-upregulates-Gene": "The Gene is over-expressed in Disease samples according to a STARGEO meta-analysis",
    "Gene-covaries-Gene": "Two genes have similar evolutionary histories in mammals, according to Evolutionary Rate Covariation ≥ 0.7 (Priedigkeit et al, 2015)",
    "Gene-interacts-Gene": "The genes produce physically-interacting proteins in humans according to the Human Reference Protein Interactome Mapping Project, the Incomplete Interactome, or other PPI databases previously used by Hetio",
    "Gene-participates-Biological Process": "The Gene's product participates in the Biological Process (or its subterms) according to Gene Ontology Annotations",
    "Gene-participates-Cellular Component": "The Gene's product participates in the Cellular Component (or its subterms) according to Gene Ontology Annotations",
    "Gene-participates-Molecular Function": "The Gene's product participates in the Molecular Function (or its subterms) according to Gene Ontology Annotations",
    "Gene-participates-Pathway": "The Gene participates in the Pathway, according WikiPathways, Reactome, or Pathway Interaction Database",
    "Gene-regulates-Gene": "Knockdown or over-expression of the source Gene significantly dysregulated the target Gene according to a consensus signature from LINCS L1000 genetic interference screens",
    "Pharmacologic Class-includes-Compound": "The Compound belongs to the Pharmacologic Class according to DrugCentral"
}

In [8]:
## construct node dictionary

# One dictionary per node kind, keyed by the stringified node identifier.
# Each entry has the shape {'features': {...}, 'neighbors': {edge_type: [neighbor ids]}}.
# The module-level names are kept because the save cell references them directly.
Anatomy_nodes = {}
Biological_Process_nodes = {}
Cellular_Component_nodes = {}
Compound_nodes = {}
Disease_nodes = {}
Gene_nodes = {}
Molecular_Function_nodes = {}
Pathway_nodes = {}
Pharmacologic_Class_nodes = {}
Side_Effect_nodes = {}
Symptom_nodes = {}

# Lookup from underscore-joined node kind to its dictionary above. The values are the
# same dict objects, so writing through this registry fills the named variables.
# This replaces building variable names with exec() for every node and edge.
nodes_by_type = {
    'Anatomy': Anatomy_nodes,
    'Biological_Process': Biological_Process_nodes,
    'Cellular_Component': Cellular_Component_nodes,
    'Compound': Compound_nodes,
    'Disease': Disease_nodes,
    'Gene': Gene_nodes,
    'Molecular_Function': Molecular_Function_nodes,
    'Pathway': Pathway_nodes,
    'Pharmacologic_Class': Pharmacologic_Class_nodes,
    'Side_Effect': Side_Effect_nodes,
    'Symptom': Symptom_nodes,
}

id_set = set()

## add nodes
for nd in tqdm(raw_data['nodes']):
    node_type = '_'.join(nd['kind'].split(' '))
    idd = str(nd['identifier'])
    # Identifiers must be unique across all node kinds once converted to strings.
    assert idd not in id_set, f"duplicate node identifier: {idd}"
    id_set.add(idd)

    # Feature order: name, then every 'data' field except provenance metadata
    # ('source', 'license'), then node_type (set last so it cannot be overridden).
    features = {'name': nd['name']}
    for k in nd['data']:
        if k not in ['source', 'license']:
            features[k] = nd['data'][k]
    features['node_type'] = node_type

    nodes_by_type[node_type][idd] = dict(features=features, neighbors=defaultdict(list))

# Per-kind node counts (same order as the dictionaries above), then the total.
node_counts = [len(kind_nodes) for kind_nodes in nodes_by_type.values()]
print(*node_counts)
print(sum(node_counts))

## add edges
for eg in tqdm(raw_data['edges']):
    # 'source_id' / 'target_id' are indexed as [node kind, node identifier].
    source_node_type = '_'.join(eg['source_id'][0].split(' '))
    source_idd = str(eg['source_id'][1])
    target_node_type = '_'.join(eg['target_id'][0].split(' '))
    target_idd = str(eg['target_id'][1])
    edge_type = eg['source_id'][0] + '-' + eg['kind'] + '-' + eg['target_id'][0]

    # Every edge type must have a description in metaedges.
    assert edge_type in metaedges, edge_type

    # 'neighbors' is a defaultdict(list), so the list for a new edge type is created on demand.
    nodes_by_type[source_node_type][source_idd]['neighbors'][edge_type].append(target_idd)
    if eg['direction'] == 'both':
        # Undirected edge: also record the source as a neighbor of the target.
        nodes_by_type[target_node_type][target_idd]['neighbors'][edge_type].append(source_idd)

  0%|          | 0/47031 [00:00<?, ?it/s]

100%|██████████| 47031/47031 [00:03<00:00, 14182.50it/s]


402 11381 1391 1552 137 20945 2884 1822 345 5734 438
47031


100%|██████████| 2250197/2250197 [01:05<00:00, 34294.42it/s]


In [9]:
## save graph

# Write every per-kind node dictionary to <save_dir>/graph.json.
# The output directory is created if missing, and the file is opened in a `with`
# block so it is flushed and closed as soon as the dump completes.
os.makedirs(save_dir, exist_ok=True)
with open(os.path.join(save_dir, 'graph.json'), "w") as f:
    json.dump({
        'Anatomy_nodes': Anatomy_nodes,
        'Biological_Process_nodes': Biological_Process_nodes,
        'Cellular_Component_nodes': Cellular_Component_nodes,
        'Compound_nodes': Compound_nodes,
        'Disease_nodes': Disease_nodes,
        'Gene_nodes': Gene_nodes,
        'Molecular_Function_nodes': Molecular_Function_nodes,
        'Pathway_nodes': Pathway_nodes,
        'Pharmacologic_Class_nodes': Pharmacologic_Class_nodes,
        'Side_Effect_nodes': Side_Effect_nodes,
        'Symptom_nodes': Symptom_nodes
    }, f, indent=4)