In [1]:
%load_ext autoreload
%load_ext lab_black
%autoreload 2

In [2]:
import sys

# Add the parent directory to the module search path.
# NOTE(review): presumably this is what makes the project-local `funcs`
# package importable from this notebook's folder — confirm against the repo layout.
sys.path.append("..")

---

In [21]:
import json
import pickle

import pandas as pd
import networkx as nx

from funcs.utils import find_project_root

In [10]:
# Locate the EFO inputs: the data directory is a sibling of the project root
# (`root.parent / "data"`). Fail fast, naming the offending path, when the
# expected directory or ontology file is missing.
root = find_project_root()
data_dir = root.parent / "data"
efo_dir = data_dir / "efo"
assert efo_dir.exists(), f"EFO data directory not found: {efo_dir}"

efo_file = efo_dir / "efo-v3.29.1.json"
assert efo_file.exists(), f"EFO ontology file not found: {efo_file}"

In [95]:
# Destinations for the artefacts written at the end of the notebook; both are
# placed in the same directory as the EFO input file.
output_efo_graph = efo_dir.joinpath("efo_graph.gpickle")
output_efo_nodes = efo_dir.joinpath("efo_nodes_clean.csv")

In [16]:
# Parse the whole EFO JSON file into memory. The encoding is pinned to UTF-8
# so the result does not depend on the platform's default text encoding.
with efo_file.open(encoding="utf-8") as f:
    efo_data = json.load(f)

In [20]:
# Pull the raw node and edge lists out of the first entry under "graphs".
efo_nodes_raw, efo_edges_raw = (
    efo_data["graphs"][0][section] for section in ("nodes", "edges")
)

In [68]:
def _extract_record(rec):
    keys = rec.keys()
    id = rec["id"] if "id" in keys else None
    lbl = rec["lbl"] if "lbl" in keys else None
    rec_type = rec["type"] if "type" in keys else None
    res = {"id": id, "lbl": lbl, "type": rec_type}
    return res


# Build the table of ontology classes (columns: id, lbl) with unique labels.
# Order matters: records are restricted to type CLASS *before* labels are
# de-duplicated. De-duplicating first would let a non-CLASS record that
# happens to share a label evict the CLASS record, after which the type
# filter would remove the survivor and the label would be lost entirely.
efo_nodes = (
    pd.DataFrame.from_records([_extract_record(rec) for rec in efo_nodes_raw])
    .dropna()
    .query("type == 'CLASS'")
    .drop_duplicates(["lbl"])
    .drop(columns=["type"])
    .reset_index(drop=True)
)
# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; wrapping it in print() would append a stray "None".
efo_nodes.info()
efo_nodes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28402 entries, 0 to 28401
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      28402 non-null  object
 1   lbl     28402 non-null  object
dtypes: object(2)
memory usage: 443.9+ KB
None


Unnamed: 0,id,lbl
0,http://www.orpha.net/ORDO/Orphanet_199315,Familial clubfoot with or without associated l...
1,http://purl.obolibrary.org/obo/MONDO_0002974,cervical cancer
2,http://www.orpha.net/ORDO/Orphanet_209203,Qualitative or quantitative defects of glucosa...
3,http://www.orpha.net/ORDO/Orphanet_199318,15q13.3 microdeletion syndrome
4,http://purl.obolibrary.org/obo/MONDO_0002977,autoimmune disease of the nervous system
...,...,...
28397,http://www.orpha.net/ORDO/Orphanet_289548,Inherited isolated adrenal insufficiency due t...
28398,http://purl.obolibrary.org/obo/NCBITaxon_8128,Oreochromis niloticus
28399,http://purl.obolibrary.org/obo/NCBITaxon_44397,Melospiza melodia
28400,http://purl.obolibrary.org/obo/UBERON_0002137,aortic valve


In [72]:
# Edge direction: source -[child_of]-> target.
# Only edges whose predicate is `is_a` are kept; `sub`/`obj` are renamed to
# `source`/`target`.
efo_edges = (
    pd.DataFrame(efo_edges_raw)
    .query("pred == 'is_a'")
    .drop(columns=["pred"])
    .rename(columns={"sub": "source", "obj": "target"})
)
# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; wrapping it in print() would append a stray "None".
efo_edges.info()
efo_edges

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45884 entries, 0 to 53335
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   source  45884 non-null  object
 1   target  45884 non-null  object
dtypes: object(2)
memory usage: 1.1+ MB
None


Unnamed: 0,source,target
0,http://www.orpha.net/ORDO/Orphanet_168606,http://purl.obolibrary.org/obo/MONDO_0024255
1,http://www.ebi.ac.uk/efo/EFO_1000407,http://purl.obolibrary.org/obo/MONDO_0002722
3,http://purl.obolibrary.org/obo/MONDO_0016629,http://purl.obolibrary.org/obo/MONDO_0002243
4,http://purl.obolibrary.org/obo/NCBITaxon_2173,http://purl.obolibrary.org/obo/NCBITaxon_2157
5,http://www.ebi.ac.uk/efo/EFO_0002097,http://www.ebi.ac.uk/efo/EFO_0002888
...,...,...
53330,http://www.orpha.net/ORDO/Orphanet_238459,http://www.orpha.net/ORDO/Orphanet_371047
53332,http://www.ebi.ac.uk/efo/EFO_0008118,http://www.ebi.ac.uk/efo/EFO_0007937
53333,http://www.ebi.ac.uk/efo/EFO_0008766,http://www.ebi.ac.uk/efo/EFO_0001457
53334,http://purl.obolibrary.org/obo/CLO_0014025,http://www.ebi.ac.uk/efo/EFO_0001640


In [91]:
# Attach a label to both endpoints of every edge. The inner joins keep only
# edges whose source id and target id are both present in `efo_nodes`.
efo_graph_edgelist = (
    efo_edges.merge(
        efo_nodes.rename(columns={"id": "source", "lbl": "source_label"}),
        on="source",
        how="inner",
    )
    .merge(
        efo_nodes.rename(columns={"id": "target", "lbl": "target_label"}),
        on="target",
        how="inner",
    )
    .rename(columns={"source": "source_id", "target": "target_id"})
)

# Node attributes keyed by label, shaped as {label: {"efo_id": id}}.
efo_nodes_dict = (
    efo_nodes.rename(columns={"id": "efo_id", "lbl": "label"})
    .set_index("label")
    .to_dict(orient="index")
)

# Directed graph whose nodes are labels and whose edges run from
# `source_label` to `target_label`; each node then receives its `efo_id`.
efo_graph = nx.from_pandas_edgelist(
    efo_graph_edgelist,
    create_using=nx.DiGraph,
    source="source_label",
    target="target_label",
)
nx.set_node_attributes(efo_graph, efo_nodes_dict)

In [92]:
# Spot check: neighbours reached by the node's outgoing edges in the directed
# graph, i.e. the targets of the edges for which this label is the source.
efo_graph["body mass index"]

AtlasView({'body weights and measures': {}})

In [93]:
# Spot check: attributes stored on the node (populated from `efo_nodes_dict`
# when the graph was built).
efo_graph.nodes["body mass index"]

{'efo_id': 'http://www.ebi.ac.uk/efo/EFO_0004340'}

In [94]:
# Number of edges on the shortest path between two labels, measured on an
# undirected copy of the graph so that edge direction is ignored.
nx.shortest_path_length(
    efo_graph.to_undirected(),
    source="body mass index",
    target="coronary artery disease",
)

11

---

In [96]:
# `nx.write_gpickle` was deprecated in NetworkX 2.6 and removed in 3.0.
# Pickling the graph through the standard library produces the same kind of
# file and works regardless of the installed NetworkX version.
# NOTE: the resulting file must only ever be loaded (pickle.load) from a
# trusted location, since unpickling can execute arbitrary code.
with output_efo_graph.open("wb") as f:
    pickle.dump(efo_graph, f, pickle.HIGHEST_PROTOCOL)

In [97]:
# Persist the cleaned node table; index=False keeps the positional row index
# out of the CSV so only the data columns are written.
efo_nodes.to_csv(output_efo_nodes, index=False)