# Triple parsing

In [11]:
# Summary counts for the claim-parsing stage. Each element of `triples` is
# read through its "triples" (valid) and "invalid_triples" lists.
print("number of abstracts", len(triples))

valid_counts = [len(abstract["triples"]) for abstract in triples]
print("number of valid triples", sum(valid_counts))

invalid_counts = [len(abstract["invalid_triples"]) for abstract in triples]
print("number of invalid triples", sum(invalid_counts))

# Pair up the per-abstract counts so each category below is a simple filter.
count_pairs = list(zip(valid_counts, invalid_counts))
print(
    "number of abstracts no triples",
    sum(1 for n_valid, n_invalid in count_pairs if n_valid == 0 and n_invalid == 0),
)
print(
    "number of abstracts with valid triples",
    sum(1 for n_valid in valid_counts if n_valid > 0),
)
print(
    "number of abstracts with only invalid triples",
    sum(1 for n_valid, n_invalid in count_pairs if n_valid == 0 and n_invalid > 0),
)

number of abstracts 26846
number of valid triples 14436
number of invalid triples 54916
number of abstracts no triples 12847
number of abstracts with valid triples 6870
number of abstracts with only invalid triples 7129


# Entity harmonization

## EFO

In [12]:
# Summary counts for the EFO entity-harmonization results.
print("number of abstracts with generated efo ents", len(efo_ents))

# One entry per triple, flattened across all abstracts.
efo_triple_ents = [triple_ent for doc in efo_ents for triple_ent in doc["ents"]]
print("number of triples with generated efo ents", len(efo_triple_ents))
print(
    "number of subject efo ents",
    sum(len(triple_ent["subject_ents"]) for triple_ent in efo_triple_ents),
)
print(
    "number of object efo ents",
    sum(len(triple_ent["object_ents"]) for triple_ent in efo_triple_ents),
)

# Distinct (doi, triple text) pairs that have EFO entities.
efo_pair_records = [
    {"doi": doc["doi"], "triple": triple_ent["triple_text"]}
    for doc in efo_ents
    for triple_ent in doc["ents"]
]
efo_doi_triple_pairs = pd.DataFrame(efo_pair_records).drop_duplicates()
efo_doi_triple_pairs = efo_doi_triple_pairs.reset_index(drop=True)
print("Number of DOI-Triple pairs, EFO", len(efo_doi_triple_pairs))

number of abstracts with generated efo ents 1096
number of triples with generated efo ents 1537
number of subject efo ents 2773
number of object efo ents 3921
Number of DOI-Triple pairs, EFO 1446


## UMLS

In [13]:
# Summary counts for the UMLS entity-harmonization results.
print("Num dois with umls ents", len(umls_ents))

# One entry per triple, flattened across all DOIs.
umls_triple_ents = [triple_ent for doc in umls_ents for triple_ent in doc["ents"]]
print("num triples with umls ents", len(umls_triple_ents))
print(
    "number of subject umls ents",
    sum(len(triple_ent["subject_ents"]) for triple_ent in umls_triple_ents),
)
print(
    "number of object umls ents",
    sum(len(triple_ent["object_ents"]) for triple_ent in umls_triple_ents),
)

# Distinct (doi, triple text) pairs that have UMLS entities.
umls_pair_records = [
    {"doi": doc["doi"], "triple": triple_ent["triple_text"]}
    for doc in umls_ents
    for triple_ent in doc["ents"]
]
umls_doi_triple_pairs = pd.DataFrame(umls_pair_records).drop_duplicates()
umls_doi_triple_pairs = umls_doi_triple_pairs.reset_index(drop=True)
print("Number of DOI-Triple pairs, umls", len(umls_doi_triple_pairs))

# Pairs present in both the UMLS and the EFO results.
umls_efo_shared_pairs = umls_doi_triple_pairs.merge(
    efo_doi_triple_pairs, how="inner", on=["doi", "triple"]
)
print(
    "Number of DOI-Triple pairs intersect of efo and umls",
    len(umls_efo_shared_pairs),
)

Num dois with umls ents 1096
num triples with umls ents 1289
number of subject umls ents 61549
number of object umls ents 77757
Number of DOI-Triple pairs, umls 1289
Number of DOI-Triple pairs intersect of efo and umls 1289


## Traits

In [14]:
# Summary counts for the trait entity-harmonization results.
print("Num dois with trait ents", len(trait_ents))

# One entry per triple, flattened across all DOIs.
trait_triple_ents = [triple_ent for doc in trait_ents for triple_ent in doc["ents"]]
print("num triples with trait ents", len(trait_triple_ents))
print(
    "number of subject trait ents",
    sum(len(triple_ent["subject_ents"]) for triple_ent in trait_triple_ents),
)
print(
    "number of object trait ents",
    sum(len(triple_ent["object_ents"]) for triple_ent in trait_triple_ents),
)

# Distinct (doi, triple text) pairs that have trait entities.
trait_pair_records = [
    {"doi": doc["doi"], "triple": triple_ent["triple_text"]}
    for doc in trait_ents
    for triple_ent in doc["ents"]
]
trait_doi_triple_pairs = pd.DataFrame(trait_pair_records).drop_duplicates()
trait_doi_triple_pairs = trait_doi_triple_pairs.reset_index(drop=True)
print("Number of DOI-Triple pairs, traits", len(trait_doi_triple_pairs))

# Pairs shared by trait and EFO results; reused below for the three-way overlap.
pair_keys = ["doi", "triple"]
trait_efo_shared_pairs = trait_doi_triple_pairs.merge(
    efo_doi_triple_pairs, how="inner", on=pair_keys
)
print(
    "Number of DOI-Triple pairs intersect of efo and trait",
    len(trait_efo_shared_pairs),
)

trait_efo_umls_shared_pairs = trait_efo_shared_pairs.merge(
    umls_doi_triple_pairs, how="inner", on=pair_keys
)
print(
    "Number of DOI-Triple pairs intersect of efo, umls, and trait",
    len(trait_efo_umls_shared_pairs),
)

Num dois with trait ents 348
num triples with trait ents 427
number of subject trait ents 4612
number of object trait ents 6469
Number of DOI-Triple pairs, traits 427
Number of DOI-Triple pairs intersect of efo and trait 427
Number of DOI-Triple pairs intersect of efo, umls, and trait 415


# Evidence

In [15]:
# One summary row per item of `combined_evidence`: identifying fields, the
# predicate's group, and evidence counts ("supporting" only vs. all types)
# for both the triple-level and the association-level evidence.
evidence_summary_records = []
for evidence_item in combined_evidence:
    pred_term = evidence_item["pred_term"]
    triple_evidence = evidence_item["triple_evidence"]
    assoc_evidence = evidence_item["assoc_evidence"]
    evidence_summary_records.append(
        {
            "doi": evidence_item["doi"],
            "triple": evidence_item["triple"],
            "pred": pred_term,
            "subject_term": evidence_item["subject_term"],
            "object_term": evidence_item["object_term"],
            "pred_group": epigraphdb.PRED_DIRECTIONAL_MAPPING[pred_term],
            # A missing "supporting" key counts as zero supporting items.
            "triple_support": len(triple_evidence.get("supporting", [])),
            "triple_all": sum(len(entries) for entries in triple_evidence.values()),
            "assoc_support": len(assoc_evidence.get("supporting", [])),
            "assoc_all": sum(len(entries) for entries in assoc_evidence.values()),
        }
    )
evidence_summary_df = pd.DataFrame(evidence_summary_records)
evidence_summary_df

Unnamed: 0,doi,triple,pred,subject_term,object_term,pred_group,triple_support,triple_all,assoc_support,assoc_all
0,10.1101/19001719,Antihypertensive Agents:TREATS:Mood Disorders,TREATS,Antihypertensive Agents,Mood Disorders,directional,18,18,0,1
1,10.1101/19006189,Mental disorders:COEXISTS_WITH:Diabetes,COEXISTS_WITH,Mental disorders,Diabetes,undirectional,9,9,3,104
2,10.1101/2019.12.11.19014472,Urate:AFFECTS:Blood Pressure,AFFECTS,Urate,Blood Pressure,directional,0,0,25,303
3,10.1101/2020.01.03.19015602,Heart Diseases:COEXISTS_WITH:Neoplasm Metastasis,COEXISTS_WITH,Heart Diseases,Neoplasm Metastasis,undirectional,2,2,9,152
4,10.1101/2020.01.03.19015602,Disease:COEXISTS_WITH:Left ventricular noncomp...,COEXISTS_WITH,Disease,Left ventricular noncompaction cardiomyopathy,undirectional,10,10,0,5
...,...,...,...,...,...,...,...,...,...,...
408,10.1101/2021.12.13.21267749,"Age:ASSOCIATED_WITH:Diabetes Mellitus, Non-Ins...",ASSOCIATED_WITH,Age,"Diabetes Mellitus, Non-Insulin-Dependent",undirectional,0,0,3,52
409,10.1101/2021.12.17.21267964,"Zinc:TREATS:Diabetes Mellitus, Non-Insulin-Dep...",TREATS,Zinc,"Diabetes Mellitus, Non-Insulin-Dependent",directional,3,3,0,10
410,10.1101/2021.12.21.21267285,"Pregnancy:ASSOCIATED_WITH:Lupus Erythematosus,...",ASSOCIATED_WITH,Pregnancy,"Lupus Erythematosus, Systemic",undirectional,0,0,0,8
411,10.1101/2021.12.23.21268279,Dyspnea:COEXISTS_WITH:Hypertensive disease,COEXISTS_WITH,Dyspnea,Hypertensive disease,undirectional,1,1,0,4


In [16]:
# Number of distinct DOIs and distinct triples in the evidence summary.
unique_dois = evidence_summary_df["doi"].drop_duplicates()
unique_triples = evidence_summary_df["triple"].drop_duplicates()
print("Num uniq doi", len(unique_dois))
print("Num uniq triple", len(unique_triples))

Num uniq doi 337
Num uniq triple 386


In [17]:
# Total evidence counts per predicate group (last expression is displayed).
evidence_count_columns = ["triple_support", "triple_all", "assoc_support", "assoc_all"]
evidence_summary_df.groupby("pred_group")[evidence_count_columns].sum()

Unnamed: 0_level_0,triple_support,triple_all,assoc_support,assoc_all
pred_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
directional,1924,3169,867,9157
undirectional,2035,2035,5553,18314


In [18]:
# Unique subject/object terms, in order of first appearance.
evidence_records = evidence_summary_df.to_dict(orient="records")
term_pairs = [
    [record["subject_term"], record["object_term"]] for record in evidence_records
]
items = py_.uniq(py_.flatten(term_pairs))
print(items[:5])
print("Num uniq terms")
print(len(items))

['Antihypertensive Agents', 'Mood Disorders', 'Mental disorders', 'Diabetes', 'Urate']
Num uniq terms
275


In [19]:
def _evidence_type_records(evidence_by_type, pred_term, evidence_group):
    """Build one record per evidence type in ``evidence_by_type``.

    Each record holds the evidence type, the number of evidence entries, the
    sum of their "evidence_score" values, and the given ``pred_term`` and
    ``evidence_group`` labels.
    """
    return [
        {
            "evidence_type": evidence_type,
            "evidence_count": len(entries),
            "evidence_score": sum(entry["evidence_score"] for entry in entries),
            "pred_term": pred_term,
            "evidence_group": evidence_group,
        }
        for evidence_type, entries in evidence_by_type.items()
    ]


def _row_evidence_frame(row):
    """Return a per-row DataFrame of assoc records followed by triple records."""
    return pd.DataFrame(
        _evidence_type_records(row["assoc_evidence"], row["pred_term"], "assoc")
        + _evidence_type_records(row["triple_evidence"], row["pred_term"], "triple")
    )


# Each row gets a small DataFrame of per-evidence-type counts and scores.
combined_summary = evidence_df[
    ["pred_term", "assoc_evidence", "triple_evidence"]
].assign(evidence_count=lambda df: df.apply(_row_evidence_frame, axis=1))

# Stack the per-row frames and attach the predicate group.
per_row_frames = combined_summary["evidence_count"].tolist()
evidence_count_df = pd.concat(per_row_frames).assign(
    pred_group=lambda df: df["pred_term"].apply(
        lambda pred_term: epigraphdb.PRED_DIRECTIONAL_MAPPING[pred_term]
    )
)

# Totals per (pred_group, pred_term, evidence_group, evidence_type), plus the
# mean score per evidence entry.
evidence_totals = evidence_count_df.groupby(
    ["pred_group", "pred_term", "evidence_group", "evidence_type"]
).sum()
combined_summary_df = evidence_totals.assign(
    avg_evidence_score=evidence_totals["evidence_score"]
    / evidence_totals["evidence_count"]
)
with pd.option_context("display.width", 300):
    print(combined_summary_df)

                                                                              evidence_count  evidence_score  avg_evidence_score
pred_group    pred_term       evidence_group evidence_type                                                                      
directional   AFFECTS         assoc          contradictory_directional_type1             343      382.348749            1.114719
                                             contradictory_directional_type2            3139     1532.544364            0.488227
                                             generic_directional                         289      183.410770            0.634639
                                             supporting                                  372      387.397418            1.041391
                              triple         contradictory                               291      285.214016            0.980117
                                             supporting                                  375     

## Distribution of DOI-triples across evidence types

In [20]:
def _observed_evidence_types(evidence_by_type):
    """Return the string form of the list of evidence types that have entries."""
    return str(
        [
            evidence_type
            for evidence_type, entries in evidence_by_type.items()
            if len(entries) > 0
        ]
    )


# One record per row of `evidence_df`: which assoc / triple evidence types are
# non-empty, together with the predicate and its group.
profile_records = [
    {
        "assoc_evidence_profile": _observed_evidence_types(assoc_evidence),
        "triple_evidence_profile": _observed_evidence_types(triple_evidence),
        "pred_term": pred_term,
        "pred_group": epigraphdb.PRED_DIRECTIONAL_MAPPING[pred_term],
    }
    for pred_term, assoc_evidence, triple_evidence in zip(
        evidence_df["pred_term"],
        evidence_df["assoc_evidence"],
        evidence_df["triple_evidence"],
    )
]

# Count how often each evidence profile occurs, most frequent first.
profile_counts = (
    pd.DataFrame(profile_records)
    .groupby(
        [
            "pred_group",
            "pred_term",
            "assoc_evidence_profile",
            "triple_evidence_profile",
        ]
    )
    .size()
)
profile_summary = profile_counts.to_frame(name="count").sort_values(
    "count", ascending=False
)

with pd.option_context("display.width", 300, "display.max_colwidth", 120):
    # Keep at most the 15 most frequent profiles per predicate.
    top_profiles = profile_summary.groupby(["pred_term", "pred_group"]).head(15)
    print(
        top_profiles.sort_values(["pred_group", "pred_term", "count"], ascending=False)
    )

                                                                                                                                                                           count
pred_group    pred_term       assoc_evidence_profile                                                                                      triple_evidence_profile               
undirectional INTERACTS_WITH  ['contradictory_undirectional']                                                                             ['supporting']                       2
                              ['supporting', 'contradictory_undirectional']                                                               []                                   2
              COEXISTS_WITH   ['supporting', 'contradictory_undirectional']                                                               ['supporting']                      92
                              ['contradictory_undirectional']                                                      

## Distribution of DOI-triples across pipeline stages

In [21]:
def _count_doi_triples_by_pred(doi_triple_df):
    """Count the rows of ``doi_triple_df`` per (pred_group, pred_term).

    Args:
        doi_triple_df: DataFrame with a ``pred_term`` column whose values are
            keys of ``epigraphdb.PRED_DIRECTIONAL_MAPPING``.

    Returns:
        DataFrame indexed by (pred_group, pred_term) with a single
        ``count_doi_triple`` column, sorted by the two index levels.

    Raises:
        KeyError: if a ``pred_term`` value is not in the mapping.
    """
    return (
        doi_triple_df[["pred_term"]]
        .assign(
            pred_group=lambda df: df["pred_term"].apply(
                lambda pred_term: epigraphdb.PRED_DIRECTIONAL_MAPPING[pred_term]
            )
        )
        .groupby(["pred_group", "pred_term"])
        .size()
        .to_frame(name="count_doi_triple")
        .sort_values(["pred_group", "pred_term"])
    )


print("\nAfter claim parsing ")
# The comprehension already yields a flat list of dicts, and abstracts with an
# empty "triples" list contribute nothing, so no flattening or emptiness
# filter is needed.
doi_triples = pd.DataFrame(
    [
        {
            "doi": abstract["doi"],
            "triple_text": triple["triple_text"],
            "sub_term": triple["sub_term"],
            # Parsed triples store the predicate under "pred".
            "pred_term": triple["pred"],
            "obj_term": triple["obj_term"],
        }
        for abstract in triples
        for triple in abstract["triples"]
    ]
).drop_duplicates()
doi_triple_summary = _count_doi_triples_by_pred(doi_triples)
print(doi_triple_summary)

print("\nAfter ents normalization, efo")
doi_triple_efo = pd.DataFrame(
    [
        {
            "doi": doc["doi"],
            "triple_text": triple_ent["triple_text"],
            "pred_term": triple_ent["pred_term"],
        }
        for doc in efo_ents
        for triple_ent in doc["ents"]
    ]
).drop_duplicates()
doi_triple_summary_efo = _count_doi_triples_by_pred(doi_triple_efo)
print(doi_triple_summary_efo)

print("\nAfter ents normalization, umls and traits")
doi_triple_summary_post_ent_norm = _count_doi_triples_by_pred(combined_ents)
print(doi_triple_summary_post_ent_norm)

print("\nAfter evidence retrieval")
doi_triple_summary_post_evidence = _count_doi_triples_by_pred(evidence_df)
print(doi_triple_summary_post_evidence)


After claim parsing 
                               count_doi_triple
pred_group    pred_term                        
directional   AFFECTS                      2392
              CAUSES                       1054
              TREATS                       5164
undirectional ASSOCIATED_WITH              1554
              COEXISTS_WITH                2553
              INTERACTS_WITH                578

After ents normalization, efo
                               count_doi_triple
pred_group    pred_term                        
directional   AFFECTS                       233
              CAUSES                        224
              TREATS                        253
undirectional ASSOCIATED_WITH               221
              COEXISTS_WITH                 461
              INTERACTS_WITH                 54

After ents normalization, umls and traits
                               count_doi_triple
pred_group    pred_term                        
directional   AFFECTS                   