# Cases

# Init

In [53]:
import networkx as nx
from pyvis.network import Network

# Cases meta overview

In [207]:
from typing import List


def make_word_freq(terms: List[str]) -> pd.DataFrame:
    word_freq = {}
    corpus = (
        py_.chain(terms).map(lambda e: {"term": e, "token": e.strip().lower()}).value()
    )
    tokens = py_.chain(corpus).map(lambda e: e["token"]).uniq().value()
    for item in corpus:
        token = item["token"]
        term = item["term"]
        if token not in word_freq.keys():
            word_freq[token] = {
                "token": token,
                "terms": {term},
                "count": 1,
            }
        else:
            word_freq[token]["count"] += 1
            word_freq[token]["terms"] = word_freq[token]["terms"].union({term})
    word_freq_df = (
        pd.DataFrame.from_dict(word_freq, orient="index")
        .sort_values(by=["count"], ascending=False)
        .assign(total_num_tokens=len(tokens))
        .assign(term=lambda df: df["terms"].apply(lambda e: list(e)))
        .explode("term")
    )
    return word_freq_df


df = make_word_freq(terms=["Apple", "Apple ", "apple", "Orange"])
df

Unnamed: 0,token,terms,count,total_num_tokens,term
apple,apple,"{Apple , Apple, apple}",3,2,Apple
apple,apple,"{Apple , Apple, apple}",3,2,Apple
apple,apple,"{Apple , Apple, apple}",3,2,apple
orange,orange,{Orange},1,2,Orange


In [208]:
# terms count all cases
terms_count_all = (
    py_.chain(combined_evidence)
    .map(lambda e: [e["subject_term"], e["object_term"]])
    .flatten()
    .thru(lambda o: make_word_freq(o))
    .value()
)
terms_count_all.head(20)

Unnamed: 0,token,terms,count,total_num_tokens,term
disease,disease,{Disease},77,275,Disease
obesity,obesity,{Obesity},30,275,Obesity
depressive disorder,depressive disorder,{Depressive disorder},26,275,Depressive disorder
coronavirus infections,coronavirus infections,{Coronavirus Infections},21,275,Coronavirus Infections
diabetes,diabetes,{Diabetes},20,275,Diabetes
malignant neoplasms,malignant neoplasms,{Malignant Neoplasms},19,275,Malignant Neoplasms
"diabetes mellitus, non-insulin-dependent","diabetes mellitus, non-insulin-dependent","{Diabetes Mellitus, Non-Insulin-Dependent}",15,275,"Diabetes Mellitus, Non-Insulin-Dependent"
hypertensive disease,hypertensive disease,{Hypertensive disease},13,275,Hypertensive disease
parkinson disease,parkinson disease,{Parkinson Disease},13,275,Parkinson Disease
schizophrenia,schizophrenia,{Schizophrenia},11,275,Schizophrenia


In [209]:
# term count, with both triple support and evidence support
def _has_supporting_evidence(e):
    triple = (
        "supporting" in e["triple_evidence"].keys()
        and len(e["triple_evidence"]["supporting"]) > 0
    )
    assoc = (
        "supporting" in e["assoc_evidence"].keys()
        and len(e["assoc_evidence"]["supporting"]) > 0
    )
    return triple and assoc


terms_count_support = (
    py_.chain(combined_evidence)
    .filter(_has_supporting_evidence)
    .map(lambda e: [e["subject_term"], e["object_term"]])
    .flatten()
    .thru(lambda o: make_word_freq(o))
    .value()
)
terms_count_support.head(20)

Unnamed: 0,token,terms,count,total_num_tokens,term
disease,disease,{Disease},41,144,Disease
obesity,obesity,{Obesity},20,144,Obesity
diabetes,diabetes,{Diabetes},17,144,Diabetes
depressive disorder,depressive disorder,{Depressive disorder},14,144,Depressive disorder
parkinson disease,parkinson disease,{Parkinson Disease},13,144,Parkinson Disease
"diabetes mellitus, non-insulin-dependent","diabetes mellitus, non-insulin-dependent","{Diabetes Mellitus, Non-Insulin-Dependent}",10,144,"Diabetes Mellitus, Non-Insulin-Dependent"
alzheimer's disease,alzheimer's disease,{Alzheimer's Disease},8,144,Alzheimer's Disease
schizophrenia,schizophrenia,{Schizophrenia},8,144,Schizophrenia
c-reactive protein,c-reactive protein,{C-reactive protein},7,144,C-reactive protein
malignant neoplasms,malignant neoplasms,{Malignant Neoplasms},7,144,Malignant Neoplasms


In [214]:
terms_count_init = (
    py_.chain(triples)
    .filter(lambda e: len(e["triples"]) > 0)
    .map(lambda e: [[_["sub_term"], _["obj_term"]] for _ in e["triples"]])
    .flatten_deep()
    .thru(lambda o: make_word_freq(o))
    .value()
)
terms_count_init

Unnamed: 0,token,terms,count,total_num_tokens,term
patients,patients,{Patients},1896,5564,Patients
disease,disease,{Disease},715,5564,Disease
vaccines,vaccines,{Vaccines},328,5564,Vaccines
persons,persons,{Persons},315,5564,Persons
symptoms,symptoms,{Symptoms},279,5564,Symptoms
...,...,...,...,...,...
galr1 gene,galr1 gene,{GALR1 gene},1,5564,GALR1 gene
or2b3 gene,or2b3 gene,{OR2B3 gene},1,5564,OR2B3 gene
tead4 gene,tead4 gene,{TEAD4 gene},1,5564,TEAD4 gene
tsr3 gene,tsr3 gene,{TSR3 gene},1,5564,TSR3 gene


In [215]:
terms_count_assoc_support = (
    py_.chain(combined_evidence)
    .filter(
        lambda e: "supporting" in e["assoc_evidence"].keys()
        and len(e["assoc_evidence"]["supporting"]) > 0
    )
    .map(lambda e: [e["subject_term"], e["object_term"]])
    .flatten()
    .thru(lambda o: make_word_freq(o))
    .value()
)
terms_count_triple_support = (
    py_.chain(combined_evidence)
    .filter(
        lambda e: "supporting" in e["triple_evidence"].keys()
        and len(e["triple_evidence"]["supporting"]) > 0
    )
    .map(lambda e: [e["subject_term"], e["object_term"]])
    .flatten()
    .thru(lambda o: make_word_freq(o))
    .value()
)

terms_count_tbl = (
    terms_count_support[["term", "count"]]
    .rename(columns={"count": "count_support"})
    .merge(
        terms_count_all[["term", "count"]].rename(columns={"count": "count_any"}),
        on=["term"],
    )
    .merge(
        terms_count_triple_support[["term", "count"]].rename(
            columns={"count": "count_triple_support"}
        ),
        on=["term"],
    )
    .merge(
        terms_count_assoc_support[["term", "count"]].rename(
            columns={"count": "count_assoc_support"}
        ),
        on=["term"],
    )
    .merge(
        terms_count_init[["term", "count"]].rename(columns={"count": "count_init"}),
        on=["term"],
    )
    .sort_values(by=["count_support"], ascending=False)
    .reset_index(drop=True)
)
terms_count_tbl.head(20)

Unnamed: 0,term,count_support,count_any,count_triple_support,count_assoc_support,count_init
0,Disease,41,77,74,44,715
1,Obesity,20,30,25,25,125
2,Diabetes,17,20,19,18,87
3,Depressive disorder,14,26,20,16,100
4,Parkinson Disease,13,13,13,13,111
5,"Diabetes Mellitus, Non-Insulin-Dependent",10,15,12,12,84
6,Alzheimer's Disease,8,10,10,8,111
7,Schizophrenia,8,11,11,8,32
8,C-reactive protein,7,10,7,9,24
9,Malignant Neoplasms,7,19,8,15,100


In [216]:
output_file = analysis_assets_dir / "case_term_count.tex"
terms_count_tbl.style.to_latex(output_file)

# Individual

In [58]:
PRIMARY_TERM = "Obesity"
# PRIMARY_TERM = "Depressive disorder"

In [59]:
def _make_score(e):
    res = sum([_["evidence_score"] for _ in e])
    return res


def _extract_data(e):
    triple_supporting = (
        _make_score(e["triple_evidence"]["supporting"])
        if "supporting" in e["triple_evidence"].keys()
        else 0
    )
    assoc_supporting = (
        _make_score(e["assoc_evidence"]["supporting"])
        if "supporting" in e["assoc_evidence"].keys()
        else 0
    )
    res = {
        "doi": e["doi"],
        "triple": e["triple"],
        "subject_term": e["subject_term"],
        "object_term": e["object_term"],
        "pred_term": e["pred_term"],
        "pred_group": epigraphdb.PRED_DIRECTIONAL_MAPPING[e["pred_term"]],
        "triple_supporting": triple_supporting,
        "assoc_supporting": assoc_supporting,
        "score_supporting": triple_supporting + assoc_supporting,
    }
    return res


def make_case(primary_term, combined_evidence):
    _is_term_first = lambda e, primary_term: (
        e["subject_term"] == primary_term or e["object_term"] == primary_term
    )
    _is_term_second = lambda e, terms: (
        e["subject_term"] in terms and e["object_term"] in terms
    )

    valid_terms = (
        py_.chain(combined_evidence)
        .filter(_is_term_first)
        .filter(lambda e: _is_term_first(e, primary_term))
        .map(lambda e: [e["subject_term"], e["object_term"]])
        .flatten()
        .uniq()
        .thru(lambda o: [_ for _ in o if _ != primary_term])
        .value()
    )

    case_df = (
        py_.chain(combined_evidence)
        .filter(
            lambda e: _is_term_first(e, primary_term) or _is_term_second(e, valid_terms)
        )
        .map(_extract_data)
        .thru(lambda o: pd.DataFrame(o))
        .value()
    )
    return case_df


case_df = make_case(primary_term=PRIMARY_TERM, combined_evidence=combined_evidence)
print(case_df.info())
case_df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   doi                30 non-null     object 
 1   triple             30 non-null     object 
 2   subject_term       30 non-null     object 
 3   object_term        30 non-null     object 
 4   pred_term          30 non-null     object 
 5   pred_group         30 non-null     object 
 6   triple_supporting  30 non-null     float64
 7   assoc_supporting   30 non-null     float64
 8   score_supporting   30 non-null     float64
dtypes: float64(3), object(6)
memory usage: 2.2+ KB
None


Unnamed: 0,doi,triple,subject_term,object_term,pred_term,pred_group,triple_supporting,assoc_supporting,score_supporting
0,10.1101/2020.01.08.20016980,Diabetes Mellitus:COEXISTS_WITH:Obesity,Diabetes Mellitus,Obesity,COEXISTS_WITH,undirectional,18.79926,10.794803,29.594063
1,10.1101/2020.03.19.20031138,Obesity:AFFECTS:Colorectal Carcinoma,Obesity,Colorectal Carcinoma,AFFECTS,directional,0.0,11.951282,11.951282
2,10.1101/2020.03.19.20031138,Colorectal Carcinoma:COEXISTS_WITH:Obesity,Colorectal Carcinoma,Obesity,COEXISTS_WITH,undirectional,0.0,8.321694,8.321694
3,10.1101/2020.04.02.20049031,Obesity:AFFECTS:Malignant neoplasm of prostate,Obesity,Malignant neoplasm of prostate,AFFECTS,directional,0.680812,1.219657,1.900469
4,10.1101/2020.04.20.20016337,Hypertensive disease:AFFECTS:Obesity,Hypertensive disease,Obesity,AFFECTS,directional,5.3467,0.0,5.3467
5,10.1101/2020.04.20.20072223,Obesity:AFFECTS:Diabetes,Obesity,Diabetes,AFFECTS,directional,15.905478,18.528986,34.434464
6,10.1101/2020.05.11.20097873,Chronic disease:COEXISTS_WITH:Obesity,Chronic disease,Obesity,COEXISTS_WITH,undirectional,17.796887,1.041043,18.83793
7,10.1101/2020.06.22.20137182,Chronic disease:COEXISTS_WITH:Obesity,Chronic disease,Obesity,COEXISTS_WITH,undirectional,17.796887,1.041043,18.83793
8,10.1101/2020.08.20.20176214,Obesity:COEXISTS_WITH:Fatty Liver,Obesity,Fatty Liver,COEXISTS_WITH,undirectional,17.349626,0.0,17.349626
9,10.1101/2020.09.02.20186619,Diabetes:AFFECTS:Obesity,Diabetes,Obesity,AFFECTS,directional,14.666161,3.97632,18.642481


## network plot, theme specific

In [60]:
obesity_theme_terms = [
    "Obesity",
    "Diabetes",
    "Diabetes Mellitus, Non-Insulin-Dependent",
    "Chronic Kidney Diseases",
]
mental_theme_terms = [
    "Depressive disorder",
    "Parkinson Disease",
    "Alzheimer's Disease",
    "Schizophrenia",
]

In [61]:
def make_theme_df(terms: List[str]):
    res = (
        pd.concat(
            [
                make_case(primary_term=_, combined_evidence=combined_evidence).assign(
                    primary_term=_
                )
                for _ in terms
            ]
        )
        .reset_index(drop=True)[
            [
                "subject_term",
                "object_term",
                "primary_term",
                "score_supporting",
            ]
        ]
        .rename(columns={"score_supporting": "value"})
        .groupby(["subject_term", "object_term", "primary_term"])
        .sum()
        .reset_index(drop=False)
    )
    return res


obesity_theme_df = make_theme_df(terms=obesity_theme_terms)
mental_theme_df = make_theme_df(terms=mental_theme_terms)
obesity_theme_df

Unnamed: 0,subject_term,object_term,primary_term,value
0,Age,"Diabetes Mellitus, Non-Insulin-Dependent","Diabetes Mellitus, Non-Insulin-Dependent",2.435558
1,Age,Obesity,Obesity,12.136559
2,Albuminuria,Diabetes,Diabetes,18.127536
3,Antihypertensive Agents,Diabetes,Diabetes,43.673428
4,Blood Glucose,Chronic Kidney Diseases,Chronic Kidney Diseases,4.217776
...,...,...,...,...
58,Parkinson Disease,"Diabetes Mellitus, Non-Insulin-Dependent","Diabetes Mellitus, Non-Insulin-Dependent",21.993378
59,Sleep,"Diabetes Mellitus, Non-Insulin-Dependent","Diabetes Mellitus, Non-Insulin-Dependent",9.179310
60,Sleep disturbances,"Diabetes Mellitus, Non-Insulin-Dependent","Diabetes Mellitus, Non-Insulin-Dependent",0.000000
61,Triglycerides,Diabetes,Diabetes,75.793859


In [62]:
def make_network(theme_df, primary_terms):

    nodes_df = (
        py_.chain(theme_df.to_dict(orient="records"))
        .map(lambda e: [e["subject_term"], e["object_term"]])
        .flatten()
        .uniq()
        .thru(
            lambda o: (
                pd.Series(o)
                .to_frame(name="term")
                .merge(
                    terms_count_tbl[["term", "count_support"]], on=["term"], how="left"
                )
                .fillna(1)
            )
        )
        .value()
        .rename(columns={"term": "node", "count_support": "value"})
    )
    edges_df = theme_df.rename(
        columns={"subject_term": "source", "object_term": "target"}
    )[["source", "target", "value"]]

    nt = Network("768", "1024")

    for idx, _ in nodes_df.iterrows():
        color = "red" if _["node"] in primary_terms else "lightblue"
        nt.add_node(n_id=_["node"], label=_["node"], value=_["value"], color=color)

    for idx, _ in edges_df.iterrows():
        nt.add_edge(source=_["source"], to=_["target"], value=_["value"], color="grey")

    return nt


nt = make_network(obesity_theme_df, obesity_theme_terms)
nt.show("nt_obesity.html")

                                                                               


In [63]:
nt = make_network(mental_theme_df, mental_theme_terms)
nt.show("nt_mental.html")

                                                                               
