# Empirical analysis on comparison of EpiGraphDB and ASQ for retrieval of evidence

In [23]:
# Read the combined evidence records from the experiment data directory and
# report how many entries were loaded.
_input_path = experiment_data_dir.joinpath("evidence", "combined_score.json")
with _input_path.open() as f:
    combined_evidence = json.loads(f.read())
print(len(combined_evidence))

413


In [24]:
# Read the EpiGraphDB term-match results, then show the number of entries
# and the first entry as a sample of the structure.
_input_path = analysis_dir.joinpath("terms_match_epigraphdb.json")
with _input_path.open() as f:
    terms_match_epigraphdb = json.loads(f.read())
print(len(terms_match_epigraphdb))
print(terms_match_epigraphdb[0])

413
{'doi': '10.1101/19001719', 'subject_term': 'Antihypertensive Agents', 'object_term': 'Mood Disorders', 'subject_term_match': {'efo': [], 'gwas': [], 'literature_term': [{'n': {'_name': 'Antihypertensive Agents', 'name': 'Antihypertensive Agents', '_source': ['SemMedDB_VER42_2020_R', 'bioRxiv-2020-10-06', 'medRxiv-2020-10-06'], 'id': 'C0003364', '_id': 'C0003364', 'type': ['phsu']}}]}, 'object_term_match': {'efo': [], 'gwas': [], 'literature_term': []}}


----

# Filter evidence with EpiGraphDB matched entities
Keep only the evidence whose associated subject and object entities
have both been identified by an EpiGraphDB direct search.

In [37]:
def _filter_assoc_evidence(evidence_list, subject_term_matches, object_term_matches):
    """Keep association evidence whose subject and object ids were both matched.

    An item is retained only when its ``"subject_id"`` is contained in
    ``subject_term_matches`` and its ``"object_id"`` is contained in
    ``object_term_matches``.
    """

    def _both_ends_matched(item):
        # Reject early when the subject side is not among the matched ids.
        if item["subject_id"] not in subject_term_matches:
            return False
        return item["object_id"] in object_term_matches

    return py_.chain(evidence_list).filter(_both_ends_matched).value()


def _filter_triple_evidence(evidence_list, subject_term_matches, object_term_matches):
    """Keep triple evidence whose subject and object ids were both matched.

    An item is retained only when its ``"triple_subject_id"`` is contained in
    ``subject_term_matches`` and its ``"triple_object_id"`` is contained in
    ``object_term_matches``.
    """

    def _both_ends_matched(item):
        # Reject early when the subject side is not among the matched ids.
        if item["triple_subject_id"] not in subject_term_matches:
            return False
        return item["triple_object_id"] in object_term_matches

    return py_.chain(evidence_list).filter(_both_ends_matched).value()


def _filter_combined_evidence(terms_match_entry, evidence_entry):
    """Restrict one evidence entry to the entities found in its term-match entry.

    Association evidence is filtered against the ``"gwas"`` matches and triple
    evidence against the ``"literature_term"`` matches of ``terms_match_entry``.

    Args:
        terms_match_entry: dict with ``"subject_term_match"`` and
            ``"object_term_match"``; each holds ``"gwas"`` and
            ``"literature_term"`` collections whose items expose ``["n"]["_id"]``.
        evidence_entry: dict with ``"doi"``, ``"subject_term"``,
            ``"object_term"``, ``"pred_term"``, ``"assoc_evidence"`` and
            ``"triple_evidence"``; the two evidence values are dicts mapping a
            key to a collection of evidence items.

    Returns:
        A dict carrying over ``doi``, ``subject_term``, ``object_term`` and
        ``pred_term``, plus ``assoc_evidence`` / ``triple_evidence`` with the
        same keys as the input but only the items that passed the filter.
    """

    def _matched_ids(term_side, entity_type):
        # Collect the ``_id`` of every matched node for one side / entity type.
        return (
            py_.chain(terms_match_entry[term_side][entity_type])
            .map(lambda e: e["n"]["_id"])
            .value()
        )

    # gwas terms and assoc evidence
    gwas_subject_term_matches = _matched_ids("subject_term_match", "gwas")
    gwas_object_term_matches = _matched_ids("object_term_match", "gwas")
    # Call the helper defined alongside this function (leading underscore);
    # the un-prefixed name is not defined in this notebook section.
    assoc_evidence_filter = {
        key: _filter_assoc_evidence(
            items, gwas_subject_term_matches, gwas_object_term_matches
        )
        for key, items in evidence_entry["assoc_evidence"].items()
    }

    # literature terms and literature triple evidence
    lit_subject_term_matches = _matched_ids("subject_term_match", "literature_term")
    lit_object_term_matches = _matched_ids("object_term_match", "literature_term")
    triple_evidence_filter = {
        key: _filter_triple_evidence(
            items, lit_subject_term_matches, lit_object_term_matches
        )
        for key, items in evidence_entry["triple_evidence"].items()
    }

    # combined res
    return {
        "doi": evidence_entry["doi"],
        "subject_term": evidence_entry["subject_term"],
        "object_term": evidence_entry["object_term"],
        "pred_term": evidence_entry["pred_term"],
        "assoc_evidence": assoc_evidence_filter,
        "triple_evidence": triple_evidence_filter,
    }


# Pair each EpiGraphDB term-match entry with the evidence entry at the same
# position and filter that evidence down to the matched entities.
combined_evidence_filter_epigraphdb = [
    _filter_combined_evidence(
        terms_match_entry=terms_match_entry,
        evidence_entry=combined_evidence[position],
    )
    for position, terms_match_entry in enumerate(terms_match_epigraphdb)
]

In [38]:
# Persist the EpiGraphDB-filtered evidence as JSON in the analysis directory.
_output_path = analysis_dir.joinpath("combined_evidence_filter_epigraphdb.json")
with _output_path.open(mode="w") as _f:
    json.dump(obj=combined_evidence_filter_epigraphdb, fp=_f)

----

----

# Filter evidence with ASQ matched entities

In [80]:
# Threshold setting passed to `filter_combined_evidence_asq` for the
# single-threshold run later in this cell.
_threshold = "default"


def filter_combined_evidence_asq(evidence_entry, threshold):
    """Restrict one evidence entry to the entities matched via ASQ.

    The matched terms are obtained from
    ``asq_find_similarity_match(evidence_entry, threshold)``; element ``0`` of
    its result is used for the subject side and element ``1`` for the object
    side. Association evidence is filtered against the ``"gwas"`` matches and
    triple evidence against the ``"literature_term"`` matches.

    Args:
        evidence_entry: dict with ``"doi"``, ``"subject_term"``,
            ``"object_term"``, ``"pred_term"``, ``"assoc_evidence"`` and
            ``"triple_evidence"``; the two evidence values are dicts mapping a
            key to a collection of evidence items.
        threshold: forwarded unchanged to ``asq_find_similarity_match``.

    Returns:
        A dict carrying over ``doi``, ``subject_term``, ``object_term`` and
        ``pred_term``, plus ``assoc_evidence`` / ``triple_evidence`` with the
        same keys as the input but only the items that passed the filter.
    """

    def _ent_ids(matches):
        # Collect the ``ent_id`` of every matched entity.
        return py_.chain(matches).map(lambda e: e["ent_id"]).value()

    # get match_terms
    matched_terms = asq_find_similarity_match(evidence_entry, threshold)
    subject_matches, object_matches = matched_terms[0], matched_terms[1]

    ## gwas_terms
    gwas_subject_term_matches = _ent_ids(subject_matches["gwas"])
    gwas_object_term_matches = _ent_ids(object_matches["gwas"])
    ## umls_terms
    lit_subject_term_matches = _ent_ids(subject_matches["literature_term"])
    lit_object_term_matches = _ent_ids(object_matches["literature_term"])

    # assoc evidence
    # Use the underscore-prefixed helpers that this notebook actually defines;
    # the un-prefixed names are not defined in this notebook section.
    assoc_evidence_filter = {
        key: _filter_assoc_evidence(
            items, gwas_subject_term_matches, gwas_object_term_matches
        )
        for key, items in evidence_entry["assoc_evidence"].items()
    }
    # literature evidence
    triple_evidence_filter = {
        key: _filter_triple_evidence(
            items, lit_subject_term_matches, lit_object_term_matches
        )
        for key, items in evidence_entry["triple_evidence"].items()
    }

    # combined back
    return {
        "doi": evidence_entry["doi"],
        "subject_term": evidence_entry["subject_term"],
        "object_term": evidence_entry["object_term"],
        "pred_term": evidence_entry["pred_term"],
        "assoc_evidence": assoc_evidence_filter,
        "triple_evidence": triple_evidence_filter,
    }


# Apply the ASQ-based filter to every evidence entry using the single
# `_threshold` setting.
_combined_evidence_filter_asq_step = [
    filter_combined_evidence_asq(evidence_entry=entry, threshold=_threshold)
    for entry in combined_evidence
]

print(len(_combined_evidence_filter_asq_step))

# Store the result under a `tmp` folder inside the analysis directory,
# creating that folder first if it does not exist yet.
_input_dir = analysis_dir.joinpath("tmp")
_input_dir.mkdir(exist_ok=True)
_input_path = _input_dir.joinpath("asq_threshold_filter.json")
with _input_path.open(mode="w") as f:
    json.dump(obj=_combined_evidence_filter_asq_step, fp=f)

413


In [82]:
# Run the ASQ-based filter once per entry of ASQ_THRESHOLDS, keeping each
# threshold next to the evidence it produced.
combined_evidence_filter_asq = (
    py_.chain(ASQ_THRESHOLDS)
    .map(
        lambda threshold: {
            "threshold": threshold,
            # Pass the threshold of the current iteration. Using the global
            # `_threshold` here would compute identical "default" results for
            # every entry while labelling them with different thresholds.
            "evidence": [
                filter_combined_evidence_asq(
                    evidence_entry=entry,
                    threshold=threshold,
                )
                for entry in combined_evidence
            ],
        }
    )
    .value()
)
print(len(combined_evidence_filter_asq))

# Persist the per-threshold results as JSON in the analysis directory.
_output_path = analysis_dir / "combined_evidence_filter_asq.json"
with _output_path.open("w") as f:
    json.dump(combined_evidence_filter_asq, f)

5


----

In [126]:
# Gather every filtering strategy into one list: one record per ASQ
# threshold, followed by the EpiGraphDB direct-search result.
combined_evidence_filter = [
    {
        "filter_type": "asq_{threshold}".format(threshold=asq_result["threshold"]),
        "evidence": asq_result["evidence"],
    }
    for asq_result in combined_evidence_filter_asq
]
combined_evidence_filter.append(
    {"filter_type": "epigraphdb", "evidence": combined_evidence_filter_epigraphdb}
)
print(len(combined_evidence_filter))
# Show the label of each collected filter (displayed as the cell output).
[coll["filter_type"] for coll in combined_evidence_filter]

6


['asq_0.99', 'asq_0.95', 'asq_0.9', 'asq_0.8', 'asq_default', 'epigraphdb']

In [127]:
# Persist the combined (ASQ + EpiGraphDB) filtered evidence as JSON.
_output_path = analysis_dir / "combined_evidence_filter.json"
with _output_path.open("w") as f:
    # Write to the handle opened on the line above. The previous code dumped
    # to `_f`, a handle from an earlier cell that is already closed, so the
    # dump failed and this file was left empty.
    json.dump(combined_evidence_filter, f)