# Make the case DataFrame (one row per triple)

In [355]:
# Raw evidence-type keys, grouped under the label they are reported as.
# The mapping is many-to-one: some labels collect more than one raw key.
_label_to_raw_keys = {
    "supporting": ["supporting"],
    "reversal": [
        "contradictory",  # this is triple
        "contradictory_directional_type1",
    ],
    "insufficient": [
        "contradictory_directional_type2",
        "contradictory_undirectional",
    ],
    "additional": ["generic_directional"],
}
# Inverted view (raw key -> label) used when relabelling evidence dicts.
key_labels = {
    raw_key: label
    for label, raw_keys in _label_to_raw_keys.items()
    for raw_key in raw_keys
}


def _summarise_evidence(v):
    score = sum([_["evidence_score"] for _ in v])
    count = len(v)
    res = {
        "score": score,
        "count": count,
        "display": f"{score:.2f} ({count})",
    }
    return res


def _summarise_by_label(evidence):
    """Summarise an evidence mapping under its reporting labels.

    ``evidence`` maps raw evidence-type keys to collections of evidence items.
    ``key_labels`` is many-to-one, so two raw keys present in the same mapping
    can resolve to the same label.  Their items are pooled before summarising;
    building the result with a plain dict comprehension would instead let the
    later key silently overwrite the earlier one.

    Returns a dict of label -> ``_summarise_evidence`` result, with labels in
    order of first appearance.
    """
    pooled = {}
    for raw_key, items in evidence.items():
        pooled.setdefault(key_labels[raw_key], []).extend(items)
    return {label: _summarise_evidence(items) for label, items in pooled.items()}


# One row per triple: term metadata plus summarised triple / assoc evidence.
case_df_init = (
    py_.chain(combined_evidence)
    .map(
        lambda e: {
            "triple": e["triple"],
            "subject_term": e["subject_term"],
            "object_term": e["object_term"],
            "pred_term": e["pred_term"],
            "directional": epigraphdb.PRED_DIRECTIONAL_MAPPING[e["pred_term"]],
            "triple_evidence": _summarise_by_label(e["triple_evidence"]),
            "assoc_evidence": _summarise_by_label(e["assoc_evidence"]),
        }
    )
    # Keep only the first record seen for each triple.
    .uniq_by(lambda e: e["triple"])
    .thru(pd.DataFrame)
    .value()
)

In [359]:
# DOI -> title lookup table, read from a CSV one level above `data_dir`.
doi_details_file = data_dir / ".." / "medrxiv_abstracts_processed.csv"
assert doi_details_file.exists(), f"Missing DOI details file: {doi_details_file}"

cols = ["doi", "title"]

# `usecols` stops the parser from loading columns that would be discarded
# anyway.  It ignores the order of the names it is given, so the trailing
# `[cols]` is kept to pin the column order.
doi_details = pd.read_csv(doi_details_file, usecols=cols)[cols]
print(doi_details.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26846 entries, 0 to 26845
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   doi     26846 non-null  object
 1   title   26846 non-null  object
dtypes: object(2)
memory usage: 419.6+ KB
None


In [360]:
def _doi_row(evidence):
    """Pull the triple, DOI and EFO context text out of one evidence record."""
    return {
        "triple": evidence["triple"],
        "doi": evidence["doi"],
        "context": evidence["efo_ents"]["context_text"],
    }


def _nest_dois(rows, triple):
    """Fold all rows of one triple into a single record holding a DOI list."""
    return {
        "triple": triple,
        "doi": [
            {"doi": row["doi"], "title": row["title"], "context": row["context"]}
            for row in rows
        ],
    }


# One row per evidence record, with the paper title joined in by DOI.
# NOTE(review): this is the default (inner) merge, so records whose DOI has no
# match in `doi_details` are dropped here -- confirm that is intended.
doi_df_init = (
    py_.chain(combined_evidence).map(_doi_row).thru(pd.DataFrame).value()
).merge(doi_details, on=["doi"])

# Number of rows per triple (a row count, not a count of distinct DOIs).
doi_count_df = doi_df_init.groupby("triple").size().reset_index(name="doi_count")

# One row per triple, carrying the list of its {doi, title, context} entries.
doi_df_nest = (
    py_.chain(doi_df_init.to_dict(orient="records"))
    .group_by("triple")
    .map(_nest_dois)
    .values()
    .thru(pd.DataFrame)
    .value()
)

# Nested DOI lists plus their counts, largest count first.
doi_df = (
    pd.merge(doi_df_nest, doi_count_df, on=["triple"])
    .sort_values(by="doi_count", ascending=False)
    .reset_index(drop=True)
)

In [361]:
# Attach the nested DOI list and DOI count to every triple (default inner
# merge), then order by DOI count, largest first.
case_df = pd.merge(case_df_init, doi_df, on="triple").sort_values(
    "doi_count", ascending=False
)
# Record-oriented copy: one dict per row.
case_data = case_df.to_dict(orient="records")
print(case_df.info())
case_df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 386 entries, 21 to 385
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   triple           386 non-null    object
 1   subject_term     386 non-null    object
 2   object_term      386 non-null    object
 3   pred_term        386 non-null    object
 4   directional      386 non-null    object
 5   triple_evidence  386 non-null    object
 6   assoc_evidence   386 non-null    object
 7   doi              386 non-null    object
 8   doi_count        386 non-null    int64 
dtypes: int64(1), object(8)
memory usage: 30.2+ KB
None


Unnamed: 0,triple,subject_term,object_term,pred_term,directional,triple_evidence,assoc_evidence,doi,doi_count
21,Coronavirus Infections:CAUSES:Disease,Coronavirus Infections,Disease,CAUSES,directional,"{'supporting': {'score': 21.191155450542308, '...","{'supporting': {'score': 0, 'count': 0, 'displ...","[{'doi': '10.1101/2020.03.04.20030395', 'title...",9
202,C-reactive protein:ASSOCIATED_WITH:Depressive ...,C-reactive protein,Depressive disorder,ASSOCIATED_WITH,undirectional,"{'supporting': {'score': 4.771643097467928, 'c...","{'supporting': {'score': 4.440802960006434, 'c...","[{'doi': '10.1101/2020.11.25.20238436', 'title...",2
51,Hepatic impairment:COEXISTS_WITH:Disease,Hepatic impairment,Disease,COEXISTS_WITH,undirectional,"{'supporting': {'score': 27.93201372432616, 'c...","{'supporting': {'score': 0.9296977718583019, '...","[{'doi': '10.1101/2020.04.24.20074179', 'title...",2
60,Malignant Neoplasms:COEXISTS_WITH:Disease,Malignant Neoplasms,Disease,COEXISTS_WITH,undirectional,"{'supporting': {'score': 4.895704378840772, 'c...","{'supporting': {'score': 6.172954240982912, 'c...","[{'doi': '10.1101/2020.05.12.20098996', 'title...",2
69,Percutaneous Coronary Intervention:TREATS:Myoc...,Percutaneous Coronary Intervention,Myocardial Infarction,TREATS,directional,"{'supporting': {'score': 0, 'count': 0, 'displ...","{'supporting': {'score': 21.179385633157793, '...","[{'doi': '10.1101/2020.05.29.20116665', 'title...",2


In [363]:
# Write the nested per-triple records into the analysis assets directory.
output_file = analysis_assets_dir / "case_data.json"
with open(output_file, mode="w") as handle:
    json.dump(case_data, fp=handle)

## Flat version (nested evidence expanded into columns)

In [364]:
# Expand the nested evidence dicts into dotted columns
# (e.g. "triple_evidence.supporting.score"), then swap NaN for None.
case_df_normalized = pd.json_normalize(case_data)
case_df_flat = case_df_normalized.replace({np.nan: None})
print(case_df_flat.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 386 entries, 0 to 385
Data columns (total 25 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   triple                               386 non-null    object 
 1   subject_term                         386 non-null    object 
 2   object_term                          386 non-null    object 
 3   pred_term                            386 non-null    object 
 4   directional                          386 non-null    object 
 5   doi                                  386 non-null    object 
 6   doi_count                            386 non-null    int64  
 7   triple_evidence.supporting.score     386 non-null    float64
 8   triple_evidence.supporting.count     386 non-null    int64  
 9   triple_evidence.supporting.display   386 non-null    object 
 10  triple_evidence.reversal.score       157 non-null    object 
 11  triple_evidence.reversal.count  

In [365]:
def _evidence_cols(source, labels):
    """Dotted score/count column names for ``source``, label by label."""
    return [
        f"{source}.{label}.{stat}" for label in labels for stat in ("score", "count")
    ]


# Identifier columns first, then score/count pairs for triple and assoc evidence.
cols = [
    "triple",
    "subject_term",
    "object_term",
    "pred_term",
    "directional",
    "doi",
    "doi_count",
    *_evidence_cols("triple_evidence", ["supporting", "reversal"]),
    *_evidence_cols(
        "assoc_evidence", ["supporting", "reversal", "insufficient", "additional"]
    ),
]

output_file = analysis_assets_dir / "case_data_flat.json"
# Only the evidence columns contain dots; the exported keys use underscores
# in their place.  All other column names are left untouched.
flat_export = case_df_flat[cols].rename(
    columns={col: col.replace(".", "_") for col in cols if "." in col}
)
output_data = flat_export.to_dict(orient="records")

with open(output_file, "w") as handle:
    json.dump(output_data, handle)

# Display tables

In [161]:
def make_sort_score(row, hype_doi_count=2):
    """Ranking score for one case row: the mean of its two supporting scores.

    Parameters
    ----------
    row : mapping
        Must provide ``"triple_evidence.supporting.score"``,
        ``"assoc_evidence.supporting.score"`` and ``"doi_count"``.
    hype_doi_count : int, default 2
        A row with a zero supporting score is still kept (scored 0) when its
        ``doi_count`` is strictly greater than this value.

    Returns
    -------
    number
        * the mean of the two supporting scores when both are non-zero;
        * ``0`` when either score is zero and ``doi_count > hype_doi_count``;
        * ``np.nan`` when either score is zero otherwise, or when a score is
          missing (``None``) -- NaN marks the row for removal by a later
          ``dropna``.
    """
    triple_score = row["triple_evidence.supporting.score"]
    assoc_score = row["assoc_evidence.supporting.score"]
    scores = (triple_score, assoc_score)
    # `None == 0` is False, so a missing score never counts as a zero score.
    if any(score is not None and score == 0 for score in scores):
        if row["doi_count"] > hype_doi_count:  # hype
            return 0
        return np.nan
    # Missing values are stored as None; averaging them would raise.
    if any(score is None for score in scores):
        return np.nan
    return (triple_score + assoc_score) / 2


# Table columns: identifiers plus one "display" string per evidence label.
cols1 = [
    "directional",
    "triple",
    "doi_count",
    *(f"triple_evidence.{label}.display" for label in ("supporting", "reversal")),
    *(
        f"assoc_evidence.{label}.display"
        for label in ("supporting", "reversal", "insufficient", "additional")
    ),
]
# The same columns with the ranking score appended.
cols = cols1 + ["sort_score"]

In [165]:
# Score every row and drop those whose score came back NaN.
scored_cases = case_df_flat.assign(
    sort_score=case_df_flat.apply(make_sort_score, axis=1)
).dropna(subset=["sort_score"])
# Within each `directional` value: highest DOI count first, then highest score.
ranked_cases = scored_cases.sort_values(
    by=["directional", "doi_count", "sort_score"],
    ascending=[True, False, False],
)
# Keep the first 10 rows of each `directional` group.
display_tbl = ranked_cases.groupby("directional").head(10).reset_index(drop=True)[cols]
display_tbl

Unnamed: 0,directional,triple,doi_count,triple_evidence.supporting.display,triple_evidence.reversal.display,assoc_evidence.supporting.display,assoc_evidence.reversal.display,assoc_evidence.insufficient.display,assoc_evidence.additional.display,sort_score
0,directional,Coronavirus Infections:CAUSES:Disease,9,21.19 (17),8.25 (8),0.00 (0),1.65 (2),5.85 (13),0.00 (0),0.0
1,directional,"Blood Glucose:AFFECTS:Diabetes Mellitus, Non-I...",2,3.89 (4),2.49 (3),9.55 (10),9.55 (9),0.54 (3),0.00 (0),6.720228
2,directional,"Diabetes Mellitus, Non-Insulin-Dependent:AFFEC...",2,4.61 (4),5.77 (6),1.97 (2),0.96 (1),59.28 (114),0.00 (0),3.286866
3,directional,Low Back Pain:CAUSES:Chronic pain,1,2.42 (4),2.42 (4),156.09 (222),162.14 (225),389.53 (1056),742.17 (850),79.251282
4,directional,Valvular disease:CAUSES:Heart failure,1,74.78 (74),41.35 (49),32.68 (32),25.39 (24),32.04 (58),8.27 (10),53.729107
5,directional,Metabolic Diseases:CAUSES:Liver diseases,1,50.48 (57),50.10 (58),2.22 (2),0.00 (0),5.79 (13),0.00 (0),26.351514
6,directional,Heart Diseases:CAUSES:Pulmonary Hypertension,1,39.48 (35),34.72 (39),6.20 (8),2.84 (3),48.43 (102),5.89 (8),22.841643
7,directional,Myocardial Infarction:CAUSES:Acute myocardial ...,1,11.67 (11),10.76 (10),31.11 (20),29.84 (20),14.16 (18),7.01 (6),21.389927
8,directional,Ulcerative Colitis:CAUSES:Pouchitis,1,5.60 (7),6.97 (8),36.28 (24),46.13 (31),14.53 (24),0.00 (0),20.943009
9,directional,Myocardial Infarction:AFFECTS:Coronary Circula...,1,11.26 (11),9.59 (8),30.22 (21),64.68 (53),11.44 (22),8.14 (5),20.73741


In [166]:
# Restrict the display table to the `cols1` columns.
tbl = display_tbl.loc[:, cols1]
tbl

Unnamed: 0,directional,triple,doi_count,triple_evidence.supporting.display,triple_evidence.reversal.display,assoc_evidence.supporting.display,assoc_evidence.reversal.display,assoc_evidence.insufficient.display,assoc_evidence.additional.display
0,directional,Coronavirus Infections:CAUSES:Disease,9,21.19 (17),8.25 (8),0.00 (0),1.65 (2),5.85 (13),0.00 (0)
1,directional,"Blood Glucose:AFFECTS:Diabetes Mellitus, Non-I...",2,3.89 (4),2.49 (3),9.55 (10),9.55 (9),0.54 (3),0.00 (0)
2,directional,"Diabetes Mellitus, Non-Insulin-Dependent:AFFEC...",2,4.61 (4),5.77 (6),1.97 (2),0.96 (1),59.28 (114),0.00 (0)
3,directional,Low Back Pain:CAUSES:Chronic pain,1,2.42 (4),2.42 (4),156.09 (222),162.14 (225),389.53 (1056),742.17 (850)
4,directional,Valvular disease:CAUSES:Heart failure,1,74.78 (74),41.35 (49),32.68 (32),25.39 (24),32.04 (58),8.27 (10)
5,directional,Metabolic Diseases:CAUSES:Liver diseases,1,50.48 (57),50.10 (58),2.22 (2),0.00 (0),5.79 (13),0.00 (0)
6,directional,Heart Diseases:CAUSES:Pulmonary Hypertension,1,39.48 (35),34.72 (39),6.20 (8),2.84 (3),48.43 (102),5.89 (8)
7,directional,Myocardial Infarction:CAUSES:Acute myocardial ...,1,11.67 (11),10.76 (10),31.11 (20),29.84 (20),14.16 (18),7.01 (6)
8,directional,Ulcerative Colitis:CAUSES:Pouchitis,1,5.60 (7),6.97 (8),36.28 (24),46.13 (31),14.53 (24),0.00 (0)
9,directional,Myocardial Infarction:AFFECTS:Coronary Circula...,1,11.26 (11),9.59 (8),30.22 (21),64.68 (53),11.44 (22),8.14 (5)


In [167]:
# Export the table as LaTeX through the pandas Styler.
output_file = analysis_assets_dir / "case_tbl.tex"
tbl.style.to_latex(buf=output_file)