# Query EpiGraphDB for retrieval of entities

To investigate the retrieval of information from EpiGraphDB using direct search queries, as a demonstration here we show retrieval of entities from EpiGraphDB directly using the query terms themselves.

In [7]:
# Load the combined evidence data
path_combined_evidence = experiment_data_dir / "evidence" / "combined_score.json"
assert path_combined_evidence.exists(), path_combined_evidence

with path_combined_evidence.open() as f:
    combined_evidence = json.load(f)
print(len(combined_evidence))
print(combined_evidence[0].keys())

413
dict_keys(['doi', 'triple', 'efo_ents', 'umls_ents', 'trait_ents', 'subject_term', 'object_term', 'pred_term', 'triple_evidence', 'assoc_evidence'])


In [8]:
query_templates = {
    "efo": """
        MATCH (n:Efo) where n._name = "{term_name}"
        RETURN n
    """,
    "literature_term": """
        MATCH (n:LiteratureTerm) where n._name = "{term_name}"
        RETURN n
    """,
    "gwas": """
        MATCH (n:Gwas) where n._name = "{term_name}"
        RETURN n
    """,
}


def query_epigraphdb_node(term_name: str, query_template: str) -> List[Dict[str, Any]]:
    url = config.epigraphdb_api_url
    query = query_template.format(term_name=term_name)
    payload = {"query": query}
    res = []
    try:
        r = requests.post("https://api.epigraphdb.org/cypher", json=payload)
        r.raise_for_status()
        res = r.json()["results"]
    except:
        print(f"Error with term_name {term_name}")
    return res


_res = query_epigraphdb_node(
    term_name="Body mass index", query_template=query_templates["gwas"]
)
print(len(_res))
print(_res[0])

12
{'n': {'note': 'Dominance model? If so then not necessarily of value for MR; Results from interim Biobank release enriched for smokers; could lead to bias through collider issues in MR', '_name': 'Body mass index', 'year': '2016.0', 'mr': '0', 'author': 'Wood', 'sex': 'Males and Females', 'pmid': '26961502.0', 'population': 'European', 'sample_size': '120286.0', 'nsnp': '8654252', 'build': 'HG19/GRCh37', 'trait': 'Body mass index', '_source': ['OpenGWAS-2020-10-13'], 'id': 'ieu-a-1089', '_id': 'ieu-a-1089', 'subcategory': 'Anthropometric', 'category': 'Risk factor'}}


In [9]:
def _match_epigraphdb(e) -> Dict[str, Any]:
    subject_term = e["subject_term"]
    object_term = e["object_term"]
    subject_term_match = {
        "efo": query_epigraphdb_node(subject_term, query_templates["efo"]),
        "gwas": query_epigraphdb_node(subject_term, query_templates["gwas"]),
        "literature_term": query_epigraphdb_node(
            subject_term, query_templates["literature_term"]
        ),
    }
    object_term_match = {
        "efo": query_epigraphdb_node(object_term, query_templates["efo"]),
        "gwas": query_epigraphdb_node(object_term, query_templates["gwas"]),
        "literature_term": query_epigraphdb_node(
            object_term, query_templates["literature_term"]
        ),
    }
    res = {
        "doi": e["doi"],
        "subject_term": subject_term,
        "object_term": object_term,
        "subject_term_match": subject_term_match,
        "object_term_match": object_term_match,
    }
    return res


terms_match_epigraphdb = py_.chain(combined_evidence).map(_match_epigraphdb).value()
print(len(terms_match_epigraphdb))
print(terms_match_epigraphdb[0])

413
{'doi': '10.1101/19001719', 'subject_term': 'Antihypertensive Agents', 'object_term': 'Mood Disorders', 'subject_term_match': {'efo': [], 'gwas': [], 'literature_term': [{'n': {'_name': 'Antihypertensive Agents', 'name': 'Antihypertensive Agents', '_source': ['SemMedDB_VER42_2020_R', 'bioRxiv-2020-10-06', 'medRxiv-2020-10-06'], 'id': 'C0003364', '_id': 'C0003364', 'type': ['phsu']}}]}, 'object_term_match': {'efo': [], 'gwas': [], 'literature_term': []}}


In [10]:
_output_path = analysis_dir / "terms_match_epigraphdb.json"
with _output_path.open("w") as _f:
    json.dump(terms_match_epigraphdb, _f)

----

In [12]:
_input_path = analysis_dir / "terms_match_epigraphdb.json"
with _input_path.open() as _f:
    terms_match_epigraphdb = json.load(_f)
print(len(terms_match_epigraphdb))

413


In [13]:
terms_summary_epigraphdb = (
    py_.chain(terms_match_epigraphdb)
    .map(
        lambda e: [
            {
                "term": e["subject_term"],
                "gwas": e["subject_term_match"]["gwas"].__len__() > 0,
                "literature_term": e["subject_term_match"]["literature_term"].__len__()
                > 0,
            },
            {
                "term": e["object_term"],
                "gwas": e["object_term_match"]["gwas"].__len__() > 0,
                "literature_term": e["subject_term_match"]["literature_term"].__len__()
                > 0,
            },
        ]
    )
    .flatten()
    .uniq_by(lambda e: e["term"])
    .map(lambda e: assign(e, {"all": e["gwas"] & e["literature_term"]}))
    .apply(pd.DataFrame)
    .value()
)

print(terms_summary_epigraphdb.__len__())
terms_summary_epigraphdb[0:5]

275


Unnamed: 0,term,gwas,literature_term,all
0,Antihypertensive Agents,False,True,False
1,Mood Disorders,False,True,False
2,Mental disorders,False,False,False
3,Diabetes,False,False,False
4,Urate,True,True,True


In [14]:
terms_count_epigraphdb = (
    terms_summary_epigraphdb.drop(columns=["term"])
    .agg("sum")
    .to_frame()
    .transpose()
    .assign(total=len(terms_summary_epigraphdb), method="epigraphdb_direct_match")
)
terms_count_epigraphdb

Unnamed: 0,gwas,literature_term,all,total,method
0,36,170,26,275,epigraphdb_direct_match
