# Query EpiGraphDB for retrieval of entities

To investigate the retrieval of information from EpiGraphDB using direct search queries, as a demonstration here we show retrieval of entities from EpiGraphDB directly using the query terms themselves.

In [7]:
# Load the combined evidence data
path_combined_evidence = experiment_data_dir / "evidence" / "combined_score.json"
assert path_combined_evidence.exists(), path_combined_evidence

with path_combined_evidence.open() as f:
    combined_evidence = json.load(f)
print(len(combined_evidence))
print(combined_evidence[0].keys())

413
dict_keys(['doi', 'triple', 'efo_ents', 'umls_ents', 'trait_ents', 'subject_term', 'object_term', 'pred_term', 'triple_evidence', 'assoc_evidence'])


In [8]:
query_templates = {
    "efo": """
        MATCH (n:Efo) where n._name = "{term_name}"
        RETURN n
    """,
    "literature_term": """
        MATCH (n:LiteratureTerm) where n._name = "{term_name}"
        RETURN n
    """,
    "gwas": """
        MATCH (n:Gwas) where n._name = "{term_name}"
        RETURN n
    """,
}


def query_epigraphdb_node(term_name: str, query_template: str) -> List[Dict[str, Any]]:
    url = config.epigraphdb_api_url
    query = query_template.format(term_name=term_name)
    payload = {"query": query}
    res = []
    try:
        r = requests.post("https://api.epigraphdb.org/cypher", json=payload)
        r.raise_for_status()
        res = r.json()["results"]
    except:
        print(f"Error with term_name {term_name}")
    return res


_res = query_epigraphdb_node(
    term_name="Body mass index", query_template=query_templates["gwas"]
)
print(len(_res))
print(_res[0])

12
{'n': {'note': 'Dominance model? If so then not necessarily of value for MR; Results from interim Biobank release enriched for smokers; could lead to bias through collider issues in MR', '_name': 'Body mass index', 'year': '2016.0', 'mr': '0', 'author': 'Wood', 'sex': 'Males and Females', 'pmid': '26961502.0', 'population': 'European', 'sample_size': '120286.0', 'nsnp': '8654252', 'build': 'HG19/GRCh37', 'trait': 'Body mass index', '_source': ['OpenGWAS-2020-10-13'], 'id': 'ieu-a-1089', '_id': 'ieu-a-1089', 'subcategory': 'Anthropometric', 'category': 'Risk factor'}}


In [9]:
def _match_epigraphdb(e) -> Dict[str, Any]:
    subject_term = e["subject_term"]
    object_term = e["object_term"]
    subject_term_match = {
        "efo": query_epigraphdb_node(subject_term, query_templates["efo"]),
        "gwas": query_epigraphdb_node(subject_term, query_templates["gwas"]),
        "literature_term": query_epigraphdb_node(
            subject_term, query_templates["literature_term"]
        ),
    }
    object_term_match = {
        "efo": query_epigraphdb_node(object_term, query_templates["efo"]),
        "gwas": query_epigraphdb_node(object_term, query_templates["gwas"]),
        "literature_term": query_epigraphdb_node(
            object_term, query_templates["literature_term"]
        ),
    }
    res = {
        "doi": e["doi"],
        "subject_term": subject_term,
        "object_term": object_term,
        "subject_term_match": subject_term_match,
        "object_term_match": object_term_match,
    }
    return res


terms_match_epigraphdb = py_.chain(combined_evidence).map(_match_epigraphdb).value()
print(len(terms_match_epigraphdb))
print(terms_match_epigraphdb[0])

413
{'doi': '10.1101/19001719', 'subject_term': 'Antihypertensive Agents', 'object_term': 'Mood Disorders', 'subject_term_match': {'efo': [], 'gwas': [], 'literature_term': [{'n': {'_name': 'Antihypertensive Agents', 'name': 'Antihypertensive Agents', '_source': ['SemMedDB_VER42_2020_R', 'bioRxiv-2020-10-06', 'medRxiv-2020-10-06'], 'id': 'C0003364', '_id': 'C0003364', 'type': ['phsu']}}]}, 'object_term_match': {'efo': [], 'gwas': [], 'literature_term': []}}


In [10]:
_output_path = analysis_dir / "terms_match_epigraphdb.json"
with _output_path.open("w") as _f:
    json.dump(terms_match_epigraphdb, _f)

----

In [12]:
_input_path = analysis_dir / "terms_match_epigraphdb.json"
with _input_path.open() as _f:
    terms_match_epigraphdb = json.load(_f)
print(len(terms_match_epigraphdb))

413


In [13]:
terms_summary_epigraphdb = (
    py_.chain(terms_match_epigraphdb)
    .map(
        lambda e: [
            {
                "term": e["subject_term"],
                "gwas": e["subject_term_match"]["gwas"].__len__() > 0,
                "literature_term": e["subject_term_match"]["literature_term"].__len__()
                > 0,
            },
            {
                "term": e["object_term"],
                "gwas": e["object_term_match"]["gwas"].__len__() > 0,
                "literature_term": e["subject_term_match"]["literature_term"].__len__()
                > 0,
            },
        ]
    )
    .flatten()
    .uniq_by(lambda e: e["term"])
    .map(lambda e: assign(e, {"all": e["gwas"] & e["literature_term"]}))
    .apply(pd.DataFrame)
    .value()
)

print(terms_summary_epigraphdb.__len__())
terms_summary_epigraphdb[0:5]

275


Unnamed: 0,term,gwas,literature_term,all
0,Antihypertensive Agents,False,True,False
1,Mood Disorders,False,True,False
2,Mental disorders,False,False,False
3,Diabetes,False,False,False
4,Urate,True,True,True


In [14]:
terms_count_epigraphdb = (
    terms_summary_epigraphdb.drop(columns=["term"])
    .agg("sum")
    .to_frame()
    .transpose()
    .assign(total=len(terms_summary_epigraphdb), method="epigraphdb_direct_match")
)
terms_count_epigraphdb

Unnamed: 0,gwas,literature_term,all,total,method
0,36,170,26,275,epigraphdb_direct_match


In [16]:
def _check_similarity_match(e: Dict[str, Any], threshold: Union[float, str]):
    def _check(items: Dict[str, Any], threshold: Union[float, str]):
        if isinstance(threshold, str):
            res = len(e) > 0
        else:
            items_pass = [_ for _ in items if _["similarity_score"] >= threshold]
            res = len(items_pass) > 0
        return res

    res = [
        {
            "term": e["subject_term"],
            "gwas": _check(e["trait_ents"]["subject_ents"], threshold),
            "literature_term": _check(e["umls_ents"]["subject_ents"], threshold),
        },
        {
            "term": e["object_term"],
            "gwas": _check(e["trait_ents"]["object_ents"], threshold),
            "literature_term": _check(e["umls_ents"]["object_ents"], threshold),
        },
    ]
    return res


thresholds = [0.99, 0.95, 0.9, 0.8, "default"]
terms_summary_asq = (
    py_.chain(thresholds)
    .map(
        lambda threshold: py_.chain(combined_evidence)
        .map(lambda e: _check_similarity_match(e, threshold))
        .flatten()
        .uniq_by(lambda e: e["term"])
        .map(lambda e: assign(e, {"all": e["gwas"] & e["literature_term"]}))
        .apply(lambda coll: pd.DataFrame(coll).assign(threshold=threshold))
        .value()
    )
    .apply(lambda coll: pd.concat(coll))
    .value()
)
terms_summary_asq.info()
terms_summary_asq

<class 'pandas.core.frame.DataFrame'>
Index: 1375 entries, 0 to 274
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   term             1375 non-null   object
 1   gwas             1375 non-null   bool  
 2   literature_term  1375 non-null   bool  
 3   all              1375 non-null   bool  
 4   threshold        1375 non-null   object
dtypes: bool(3), object(2)
memory usage: 36.3+ KB


Unnamed: 0,term,gwas,literature_term,all,threshold
0,Antihypertensive Agents,False,True,False,0.99
1,Mood Disorders,True,False,False,0.99
2,Mental disorders,False,False,False,0.99
3,Diabetes,False,True,False,0.99
4,Urate,False,True,False,0.99
...,...,...,...,...,...
270,daily energy intakes,True,True,True,default
271,Obsessive compulsive behavior,True,True,True,default
272,Dizziness,True,True,True,default
273,Cardioembolic stroke,True,True,True,default


In [17]:
terms_count_asq = (
    terms_summary_asq.drop(columns=["term"])
    .groupby("threshold")
    .agg("sum")
    .reset_index(drop=False)
    .assign(
        total=len(terms_summary_asq) / len(thresholds),
        method=lambda df: df.apply(
            lambda row: "asq_{threshold}".format(threshold=row["threshold"]), axis=1
        ),
    )
    .drop(columns=["threshold"])
)

In [18]:
terms_count = pd.concat([terms_count_epigraphdb, terms_count_asq])
terms_count

Unnamed: 0,gwas,literature_term,all,total,method
0,36,170,26,275.0,epigraphdb_direct_match
0,193,257,182,275.0,asq_0.8
1,108,201,87,275.0,asq_0.9
2,68,176,51,275.0,asq_0.95
3,49,173,39,275.0,asq_0.99
4,275,275,275,275.0,asq_default


In [19]:
def _gwas_match(e: Dict[str, Any]):
    def _check(items: str) -> List[str]:
        res = [_["ent_term"] for _ in items if _["similarity_score"] >= 0.99]
        return res

    res = [
        {
            "term": e["subject_term"],
            "asq_gwas_terms": _check(e["trait_ents"]["subject_ents"]),
        },
        {
            "term": e["object_term"],
            "asq_gwas_terms": _check(e["trait_ents"]["object_ents"]),
        },
    ]
    return res


asq_099_gwas_terms = (
    py_.chain(combined_evidence)
    .map(_gwas_match)
    .flatten()
    .uniq_by(lambda e: e["term"])
    .apply(pd.DataFrame)
    .value()
)
asq_099_gwas_terms.head()

Unnamed: 0,term,asq_gwas_terms
0,Antihypertensive Agents,[]
1,Mood Disorders,[Mood disorders]
2,Mental disorders,[]
3,Diabetes,[]
4,Urate,[]


In [21]:
_res = (
    terms_summary_epigraphdb[["term", "gwas"]]
    .rename(columns={"gwas": "epigraphdb_match"})
    .merge(
        terms_summary_asq[terms_summary_asq["threshold"] == 0.99][
            ["term", "gwas"]
        ].rename(columns={"gwas": "asq_match"}),
        on=["term"],
    )
    .loc[lambda df: ~df["epigraphdb_match"] & df["asq_match"], :]
    .merge(asq_099_gwas_terms, on=["term"])
)

for idx, row in _res.iterrows():
    print(f"# ---- {idx:02}")
    print(f"term: {row['term']}")
    print(f"asq match: {row['asq_gwas_terms']}")
    print("\n")

# ---- 00
term: Mood Disorders
asq match: ['Mood disorders']


# ---- 01
term: Heart Diseases
asq match: ['Heart failure', 'Coronary heart disease', 'Coronary heart disease', 'Coronary heart disease', 'Coronary heart disease']


# ---- 02
term: Depressive disorder
asq match: ['Mood disorders', 'Major Depressive Disorder', 'Major depressive disorder', 'Major depressive disorder', 'Major Depressive Disorder']


# ---- 03
term: Malignant Glioma
asq match: ['Glioma']


# ---- 04
term: Myocardial Infarction
asq match: ['Myocardial infarction', 'Myocardial infarction']


# ---- 05
term: Alzheimer's Disease
asq match: ["Alzheimer's disease", "Alzheimer's disease", "Alzheimer's disease"]


# ---- 06
term: Bipolar Disorder
asq match: ['Bipolar disorder', 'Bipolar disorder', 'Bipolar disorder']


# ---- 07
term: Arthritis
asq match: ['Rheumatoid arthritis', 'Rheumatoid arthritis', 'Rheumatoid arthritis', 'Rheumatoid arthritis', 'Rheumatoid arthritis']


# ---- 08
term: Hypertensive disease
asq m