# Query EpiGraphDB for retrieval of information

To investigate the retrieval of information from EpiGraphDB using direct search queries, as a demonstration here we show retrieval of entities from EpiGraphDB directly using the query terms themselves.

In [124]:
# Load the combined ents data
path_combined_ents = experiment_data_dir / "ents" / "combined_ents.json"
assert path_combined_ents.exists(), path_combined_ents

with path_combined_ents.open() as f:
    combined_ents = json.load(f)
print(len(combined_ents))

415


In [125]:
query_terms = (
    py_.chain(combined_ents)
    .map(
        lambda e: {
            "doi": e["doi"],
            "subject_term": e["subject_term"],
            "object_term": e["object_term"],
        }
    )
    .value()
)
print(query_terms[:5])

[{'doi': '10.1101/19001719', 'subject_term': 'Antihypertensive Agents', 'object_term': 'Mood Disorders'}, {'doi': '10.1101/19006189', 'subject_term': 'Mental disorders', 'object_term': 'Diabetes'}, {'doi': '10.1101/2019.12.11.19014472', 'subject_term': 'Urate', 'object_term': 'Blood Pressure'}, {'doi': '10.1101/2020.01.03.19015602', 'subject_term': 'Heart Diseases', 'object_term': 'Neoplasm Metastasis'}, {'doi': '10.1101/2020.01.03.19015602', 'subject_term': 'Disease', 'object_term': 'Left ventricular noncompaction cardiomyopathy'}]


In [33]:
query_templates = {
    "efo": """
        MATCH (n:Efo) where n._name = "{term_name}"
        RETURN n
    """,
    "literature_term": """
        MATCH (n:LiteratureTerm) where n._name = "{term_name}"
        RETURN n
    """,
    "gwas": """
        MATCH (n:Gwas) where n._name = "{term_name}"
        RETURN n
    """,
}


def query_epigraphdb_node(term_name: str, query_template: str) -> List[Dict[str, Any]]:
    url = config.epigraphdb_api_url
    query = query_template.format(term_name=term_name)
    payload = {"query": query}
    res = []
    try:
        r = requests.post("https://api.epigraphdb.org/cypher", json=payload)
        r.raise_for_status()
        res = r.json()["results"]
    except:
        print(f"Error with term_name {term_name}")
    return res


_res = query_epigraphdb_node(
    term_name="Body mass index", query_template=query_templates["gwas"]
)
print(len(_res))
print(_res[0])

12
{'n': {'note': 'Dominance model? If so then not necessarily of value for MR; Results from interim Biobank release enriched for smokers; could lead to bias through collider issues in MR', '_name': 'Body mass index', 'year': '2016.0', 'mr': '0', 'author': 'Wood', 'sex': 'Males and Females', 'pmid': '26961502.0', 'population': 'European', 'sample_size': '120286.0', 'nsnp': '8654252', 'build': 'HG19/GRCh37', 'trait': 'Body mass index', '_source': ['OpenGWAS-2020-10-13'], 'id': 'ieu-a-1089', '_id': 'ieu-a-1089', 'subcategory': 'Anthropometric', 'category': 'Risk factor'}}


In [29]:
query_epigraphdb_node("Body mass index", query_templates["efo"])

[]

In [34]:
def _match_epigraphdb(e) -> Dict[str, Any]:
    subject_term = e["subject_term"]
    object_term = e["object_term"]
    subject_term_match = {
        "efo": query_epigraphdb_node(subject_term, query_templates["efo"]),
        "gwas": query_epigraphdb_node(subject_term, query_templates["gwas"]),
        "literature_term": query_epigraphdb_node(
            subject_term, query_templates["literature_term"]
        ),
    }
    object_term_match = {
        "efo": query_epigraphdb_node(object_term, query_templates["efo"]),
        "gwas": query_epigraphdb_node(object_term, query_templates["gwas"]),
        "literature_term": query_epigraphdb_node(
            object_term, query_templates["literature_term"]
        ),
    }
    res = {
        "doi": e["doi"],
        "subject_term": subject_term,
        "object_term": object_term,
        "subject_term_match": subject_term_match,
        "object_term_match": object_term_match,
    }
    return res


terms_match = py_.chain(query_terms).map(_match_epigraphdb).value()

In [42]:
terms_match[0]

{'doi': '10.1101/19001719',
 'subject_term': 'Antihypertensive Agents',
 'object_term': 'Mood Disorders',
 'subject_term_match': {'efo': [],
  'gwas': [],
  'literature_term': [{'n': {'_name': 'Antihypertensive Agents',
     'name': 'Antihypertensive Agents',
     '_source': ['SemMedDB_VER42_2020_R',
      'bioRxiv-2020-10-06',
      'medRxiv-2020-10-06'],
     'id': 'C0003364',
     '_id': 'C0003364',
     'type': ['phsu']}}]},
 'object_term_match': {'efo': [], 'gwas': [], 'literature_term': []}}

In [43]:
output_path = analysis_dir / "query_term_match_to_epigraphdb.json"
with output_path.open("w") as f:
    json.dump(terms_match, f)

--------

In [126]:
input_path = analysis_dir / "query_term_match_to_epigraphdb.json"
with input_path.open() as f:
    terms_match = json.load(f)

In [127]:
terms_match_summary = (
    py_.chain(terms_match)
    .map(
        lambda e: [
            {
                "term": e["subject_term"],
                "gwas": e["subject_term_match"]["gwas"].__len__() > 0,
                "literature_term": e["subject_term_match"]["literature_term"].__len__()
                > 0,
            },
            {
                "term": e["object_term"],
                "gwas": e["object_term_match"]["gwas"].__len__() > 0,
                "literature_term": e["subject_term_match"]["literature_term"].__len__()
                > 0,
            },
        ]
    )
    .flatten()
    .uniq_by(lambda e: e["term"])
    .map(lambda e: assign(e, {"all": e["gwas"] & e["literature_term"]}))
    .value()
)

print(terms_match_summary.__len__())
print(terms_match_summary[0:5])

275
[{'term': 'Antihypertensive Agents', 'gwas': False, 'literature_term': True, 'all': False}, {'term': 'Mood Disorders', 'gwas': False, 'literature_term': True, 'all': False}, {'term': 'Mental disorders', 'gwas': False, 'literature_term': False, 'all': False}, {'term': 'Diabetes', 'gwas': False, 'literature_term': False, 'all': False}, {'term': 'Urate', 'gwas': True, 'literature_term': True, 'all': True}]


In [128]:
epigraphdb_match = pd.DataFrame(terms_match_summary)
epigraphdb_match

Unnamed: 0,term,gwas,literature_term,all
0,Antihypertensive Agents,False,True,False
1,Mood Disorders,False,True,False
2,Mental disorders,False,False,False
3,Diabetes,False,False,False
4,Urate,True,True,True
...,...,...,...,...
270,daily energy intakes,False,True,False
271,Obsessive compulsive behavior,False,False,False
272,Dizziness,False,True,False
273,Cardioembolic stroke,True,True,True


In [129]:
epigraphdb_match_summary = (
    epigraphdb_match.drop(columns=["term"])
    .agg("sum")
    .to_frame()
    .transpose()
    .assign(total=len(df_epigraphdb_match), method="epigraphdb_direct_match")
)
epigraphdb_match_summary

Unnamed: 0,gwas,literature_term,all,total,method
0,36,170,26,275,epigraphdb_direct_match


--------

In [130]:
def _check_similarity_match(e: Dict[str, Any], threshold: Union[float, str]):
    def _check(items: Dict[str, Any], threshold: Union[float, str]):
        if isinstance(threshold, str):
            res = len(e) > 0
        else:
            items_pass = [_ for _ in items if _["similarity_score"] >= threshold]
            res = len(items_pass) > 0
        return res

    res = [
        {
            "term": e["subject_term"],
            "gwas": _check(e["trait_ents"]["subject_ents"], threshold),
            "literature_term": _check(e["umls_ents"]["subject_ents"], threshold),
        },
        {
            "term": e["object_term"],
            "gwas": _check(e["trait_ents"]["object_ents"], threshold),
            "literature_term": _check(e["umls_ents"]["object_ents"], threshold),
        },
    ]
    return res


thresholds = [0.99, 0.95, 0.9, 0.8, "default"]
asq_match = (
    py_.chain(thresholds)
    .map(
        lambda threshold: py_.chain(combined_ents)
        .map(lambda e: _check_similarity_match(e, threshold))
        .flatten()
        .uniq_by(lambda e: e["term"])
        .map(lambda e: assign(e, {"all": e["gwas"] & e["literature_term"]}))
        .apply(lambda coll: pd.DataFrame(coll).assign(threshold=threshold))
        .value()
    )
    .apply(lambda coll: pd.concat(coll))
    .value()
)

In [131]:
asq_match

Unnamed: 0,term,gwas,literature_term,all,threshold
0,Antihypertensive Agents,False,True,False,0.99
1,Mood Disorders,True,False,False,0.99
2,Mental disorders,False,False,False,0.99
3,Diabetes,False,True,False,0.99
4,Urate,False,True,False,0.99
...,...,...,...,...,...
270,daily energy intakes,True,True,True,default
271,Obsessive compulsive behavior,True,True,True,default
272,Dizziness,True,True,True,default
273,Cardioembolic stroke,True,True,True,default


In [132]:
asq_match_summary = (
    asq_match.drop(columns=["term"])
    .groupby("threshold")
    .agg("sum")
    .reset_index(drop=False)
    .assign(
        total=len(asq_match) / len(thresholds),
        method=lambda df: df.apply(
            lambda row: "asq_{threshold}".format(threshold=row["threshold"]), axis=1
        ),
    )
    .drop(columns=["threshold"])
)

In [133]:
match_summary = pd.concat([epigraphdb_match_summary, asq_match_summary])
match_summary

Unnamed: 0,gwas,literature_term,all,total,method
0,36,170,26,275.0,epigraphdb_direct_match
0,193,257,182,275.0,asq_0.8
1,108,201,87,275.0,asq_0.9
2,68,176,51,275.0,asq_0.95
3,49,173,39,275.0,asq_0.99
4,275,275,275,275.0,asq_default


--------

In [None]:
# cases where direct match missed but 0.95 fetched, using gwas as example

In [136]:
def _gwas_match(e: Dict[str, Any]):
    def _check(items: str) -> List[str]:
        res = [_["ent_term"] for _ in items if _["similarity_score"] >= 0.99]
        return res

    res = [
        {
            "term": e["subject_term"],
            "asq_gwas_terms": _check(e["trait_ents"]["subject_ents"]),
        },
        {
            "term": e["object_term"],
            "asq_gwas_terms": _check(e["trait_ents"]["object_ents"]),
        },
    ]
    return res


asq_099_gwas_terms = (
    py_.chain(combined_ents)
    .map(_gwas_match)
    .flatten()
    .uniq_by(lambda e: e["term"])
    .apply(pd.DataFrame)
    .value()
)
asq_099_gwas_terms.head()

Unnamed: 0,term,asq_gwas_terms
0,Antihypertensive Agents,[]
1,Mood Disorders,[Mood disorders]
2,Mental disorders,[]
3,Diabetes,[]
4,Urate,[]


In [142]:
_res = (
    epigraphdb_match[["term", "gwas"]]
    .rename(columns={"gwas": "epigraphdb_match"})
    .merge(
        asq_match[asq_match["threshold"] == 0.99][["term", "gwas"]].rename(
            columns={"gwas": "asq_match"}
        ),
        on=["term"],
    )
    .loc[lambda df: ~df["epigraphdb_match"] & df["asq_match"], :]
    .merge(asq_099_gwas_terms, on=["term"])
)

for idx, row in _res.iterrows():
    print(f"# ---- {idx:02}")
    print(f"term: {row['term']}")
    print(f"asq match: {row['asq_gwas_terms']}")
    print("\n")

# ---- 00
term: Mood Disorders
asq match: ['Mood disorders']


# ---- 01
term: Heart Diseases
asq match: ['Heart failure', 'Coronary heart disease', 'Coronary heart disease', 'Coronary heart disease', 'Coronary heart disease']


# ---- 02
term: Depressive disorder
asq match: ['Mood disorders', 'Major Depressive Disorder', 'Major depressive disorder', 'Major depressive disorder', 'Major Depressive Disorder']


# ---- 03
term: Malignant Glioma
asq match: ['Glioma']


# ---- 04
term: Myocardial Infarction
asq match: ['Myocardial infarction', 'Myocardial infarction']


# ---- 05
term: Alzheimer's Disease
asq match: ["Alzheimer's disease", "Alzheimer's disease", "Alzheimer's disease"]


# ---- 06
term: Bipolar Disorder
asq match: ['Bipolar disorder', 'Bipolar disorder', 'Bipolar disorder']


# ---- 07
term: Arthritis
asq match: ['Rheumatoid arthritis', 'Rheumatoid arthritis', 'Rheumatoid arthritis', 'Rheumatoid arthritis', 'Rheumatoid arthritis']


# ---- 08
term: Hypertensive disease
asq m