In [None]:
import json

# Load one locally saved gene document. The file is opened in binary mode and
# the raw bytes are handed to json.loads(), which accepts bytes directly.
with open("gene.json", "rb") as fd:
    gene_data = json.loads(fd.read())

In [1]:
# Edge templates as (subject label, predicate label, object label).
# generate_records() walks this list for every gene document and emits one
# record per subject/object pair extracted for the two node labels.
metakg_edges = [
    ("biolink:Gene", "biolink:participates_in", "biolink:BiologicalProcess"),
    # "biolink:has_participant" is flagged "reverse" in biolink_map, so these
    # records come out with the BiologicalProcess as subject and the Gene as object.
    ("biolink:Gene", "biolink:has_participant", "biolink:BiologicalProcess"),
    ("biolink:Gene", "biolink:participates_in", "biolink:Pathway"),
]

In [2]:
# Extraction config keyed by biolink label.
#
# Node labels describe how to pull records out of a gene document:
#   "path"       - JSONPath suffix appended to "$." by preprocess_data();
#                  an empty string selects the document root.
#   "identifier" - key of each matched record whose value becomes the record "id".
#   "id_prefix"  - optional; when non-empty the id is built as "<prefix>:<value>".
#   "properties" - output key -> source key; values are copied with dict.get(),
#                  so a missing source key yields None.
#
# Predicate labels may carry "reverse": True, which makes generate_records()
# swap subject and object for edges using that predicate.
biolink_map = {
    "biolink:Gene": {
        "path": "",
        "identifier": "entrezgene",
        "id_prefix": "NCBIGene",
        "properties": {
            "symbol": "symbol",
            "name": "name"
        }
    },
    "biolink:BiologicalProcess": {
        "path": "go.BP",
        "identifier": "id",
        "properties": {
            "evidence": "evidence",
            "term": "term"
        }
    },
    "biolink:has_participant": {
        "reverse": True
    },
    "biolink:Pathway": {
        "path": "pathway.reactome",
        "identifier": "id",
        "properties": {
            "name": "name",
        }
    },
}

In [3]:
from jsonpath_ng import parse

def extract_record(_id, datum, attr_dict, record_type, prefix):
    """Build one flat node record from a source mapping.

    Args:
        _id: Key in ``datum`` holding the identifier value.
        datum: Source mapping for a single entity.
        attr_dict: Output key -> source key; each value is read with
            ``datum.get``, so absent source keys become ``None``.
        record_type: Value stored under ``"type"``.
        prefix: Namespace prepended as ``"<prefix>:<id>"``; an empty prefix
            leaves the identifier value untouched.

    Returns:
        dict with ``"id"``, ``"type"`` and one entry per ``attr_dict`` key
        (in that order).

    Raises:
        KeyError: if ``datum`` has no ``_id`` key.
    """
    raw_id = datum[_id]
    if len(prefix) == 0:
        identifier = raw_id
    else:
        identifier = f"{prefix}:{raw_id}"

    properties = {out_key: datum.get(src_key) for out_key, src_key in attr_dict.items()}
    # Properties are merged last so they keep the original override semantics.
    return {"id": identifier, "type": record_type, **properties}

# Compiled JSONPath expressions keyed by the configured path, so each distinct
# path is parsed once instead of on every call (this function runs several
# times per gene document).
_JSONPATH_CACHE = {}


def preprocess_data(biolink_label: str, response: dict):
    """Locate the data for ``biolink_label`` inside one response document.

    The label's entry in the module-level ``biolink_map`` supplies an optional
    ``"path"``; it is appended to ``"$."`` and evaluated as a JSONPath against
    ``response``. A missing or empty path selects the document root.

    Args:
        biolink_label: Key into ``biolink_map``.
        response: Document to search.

    Returns:
        ``(data, data_field_map)`` where ``data`` is the single matched value
        (a dict or a list) and ``data_field_map`` is the label's config entry.

    Raises:
        KeyError: if ``biolink_label`` is not in ``biolink_map``.
        ValueError: if the path does not match exactly one location.
        TypeError: if the matched value is neither a dict nor a list.
    """
    data_field_map = biolink_map[biolink_label]
    path = data_field_map.get("path", "")

    jsonpath_expr = _JSONPATH_CACHE.get(path)
    if jsonpath_expr is None:
        jsonpath_expr = parse(f"$.{path}") if path else parse("$")
        _JSONPATH_CACHE[path] = jsonpath_expr

    matches = jsonpath_expr.find(response)
    if len(matches) != 1:
        raise ValueError(f"Matches need to be exactly 1 but are {len(matches)}, please provide a specific path {path}")

    data = matches[0].value
    if isinstance(data, (dict, list)):
        return data, data_field_map
    raise TypeError(f"Unexpected data type in JSONPath Match {type(data)}")

def get_subject_object_data(biolink_label: str, response: dict):
    """Extract every node record of type ``biolink_label`` from ``response``.

    Args:
        biolink_label: Key into the module-level ``biolink_map``.
        response: Document to extract from.

    Returns:
        A list of records built by ``extract_record``. The list is empty when
        ``preprocess_data`` raises ``ValueError`` (the configured path did not
        match exactly one location).

    Raises:
        ValueError: if the label's config has no ``"identifier"`` entry.
        TypeError: propagated from ``preprocess_data`` for non dict/list data.
    """
    try:
        data, data_field_map = preprocess_data(biolink_label, response)
    except ValueError:
        # The document simply has nothing at the configured path.
        return []

    id_field = data_field_map.get("identifier")
    id_prefix = data_field_map.get("id_prefix", "")
    if id_field is None:
        raise ValueError("Expected 'identifier' field but found None")

    # Normalise to a list so single and multiple records share one code path.
    if isinstance(data, dict):
        data_items = [data]
    elif isinstance(data, list):
        data_items = data
    else:
        data_items = []

    return [
        extract_record(id_field, datum, data_field_map["properties"], biolink_label, id_prefix)
        for datum in data_items
    ]

In [4]:
import biothings_client
from tqdm import tqdm

# Shared BioThings client for the "gene" API; generate_records() iterates over
# the results of its query() call.
client = biothings_client.get_client("gene")

In [5]:
import itertools

def generate_records(metakg_edges, biolink_map):
    """Yield subject/predicate/object records for every queried gene document.

    For each document returned by the module-level ``client`` query and each
    edge template, the subject and object node records are extracted and one
    record is yielded per (subject, object) pair.

    Args:
        metakg_edges: Iterable of (subject label, predicate label, object label).
        biolink_map: Config consulted here only for the predicate's
            ``"reverse"`` flag. Node extraction goes through
            ``get_subject_object_data``, which reads the module-level
            ``biolink_map`` rather than this argument.

    Yields:
        dict with keys ``"subject"``, ``"predicate"`` and ``"object"``; subject
        and object are swapped when the predicate is flagged as reversed.
    """
    gene_documents = client.query(q="__all__", species="human", fields="all", entrezonly=True, fetch_all=True)
    for gene_data in tqdm(gene_documents):
        for mkg_edge in metakg_edges:
            subject_nodes = get_subject_object_data(mkg_edge[0], gene_data)

            predicate_label = mkg_edge[1]
            predicate = {"type": predicate_label}
            predicate_config = biolink_map.get(predicate_label)
            reverse_edges = predicate_config.get("reverse", False) if predicate_config else False

            object_nodes = get_subject_object_data(mkg_edge[2], gene_data)

            for subject_node, object_node in itertools.product(subject_nodes, object_nodes):
                if reverse_edges is not False:
                    # Reversed predicate: the extracted object becomes the subject.
                    head, tail = object_node, subject_node
                else:
                    head, tail = subject_node, object_node
                yield {"subject": head, "predicate": predicate, "object": tail}

In [6]:
# Materialise the whole generator in memory. This is the expensive cell: the
# recorded run below took roughly 1h25m and produced 460174 records.
records = list(generate_records(metakg_edges, biolink_map))

193279it [1:25:30, 84.80it/s]No more results to return.
193285it [1:25:44, 37.57it/s]


In [None]:
# Preview only the first few records; pretty-printing the full list (hundreds
# of thousands of records) would flood the cell output and bloat the notebook.
print(json.dumps(records[:5], indent=2))

In [7]:
# Total number of subject/predicate/object records generated above.
len(records)

460174

In [9]:
import json

# Stream the records straight into the file: json.dump() writes incrementally,
# so the full serialised string is never held in memory next to `records`.
# Plain "w" is sufficient because the handle is only written to.
with open("transformed_outputs.json", "w", encoding="utf-8") as fd:
    json.dump(records, fd)

In [16]:
import random

# Spot-check a handful of records. A dedicated seeded generator keeps the
# preview reproducible across kernel restarts, and sample() draws without
# replacement, so no record is shown twice. min() guards short record lists.
random.Random(0).sample(records, k=min(10, len(records)))

[{'subject': {'id': 'GO:0043129',
   'type': 'biolink:BiologicalProcess',
   'evidence': 'IBA',
   'term': 'surfactant homeostasis'},
  'predicate': {'type': 'biolink:has_participant'},
  'object': {'id': 'NCBIGene:4153',
   'type': 'biolink:Gene',
   'symbol': 'MBL2',
   'name': 'mannose binding lectin 2'}},
 {'subject': {'id': 'NCBIGene:7942',
   'type': 'biolink:Gene',
   'symbol': 'TFEB',
   'name': 'transcription factor EB'},
  'predicate': {'type': 'biolink:participates_in'},
  'object': {'id': 'GO:0002250',
   'type': 'biolink:BiologicalProcess',
   'evidence': 'IEA',
   'term': 'adaptive immune response'}},
 {'subject': {'id': 'NCBIGene:6003',
   'type': 'biolink:Gene',
   'symbol': 'RGS13',
   'name': 'regulator of G protein signaling 13'},
  'predicate': {'type': 'biolink:participates_in'},
  'object': {'id': 'R-HSA-388396',
   'type': 'biolink:Pathway',
   'name': 'GPCR downstream signalling'}},
 {'subject': {'id': 'NCBIGene:7364',
   'type': 'biolink:Gene',
   'symbol': 'UG