# Init

In [1]:
%load_ext autoreload
%load_ext lab_black

%autoreload 2

In [2]:
import json

import pandas as pd
from pydash import py_

from analysis import utils
from analysis.funcs.generic import interval_str
from common_processing.resources import epigraphdb

In [3]:
INTERVAL = "2020-01-01/2021-12-31"
interval_fmt = interval_str(INTERVAL)
DATA_ROOT = utils.find_data_root()
data_dir = DATA_ROOT / "medrxiv_experiments" / interval_fmt / "default"
print(data_dir)
assert data_dir.exists()

/data/ik18445_cache/projects/evidence-retrieval-base/data/medrxiv_experiments/2020-01-01__2021-12-31/default


## datasets

In [5]:
abstracts_file = data_dir.parent / "medrxiv_abstracts_processed.csv"
assert abstracts_file.exists()
abstracts_df = pd.read_csv(abstracts_file)
print(abstracts_df.info())
print(abstracts_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26846 entries, 0 to 26845
Data columns (total 14 columns):
 #   Column                            Non-Null Count  Dtype 
---  ------                            --------------  ----- 
 0   doi                               26846 non-null  object
 1   title                             26846 non-null  object
 2   authors                           26846 non-null  object
 3   author_corresponding              26846 non-null  object
 4   author_corresponding_institution  26837 non-null  object
 5   date                              26846 non-null  object
 6   version                           26846 non-null  int64 
 7   type                              26846 non-null  object
 8   license                           26833 non-null  object
 9   category                          26846 non-null  object
 10  jatsxml                           26846 non-null  object
 11  abstract                          26846 non-null  object
 12  published         

In [6]:
combined_evidence_file = data_dir / "evidence" / "combined_score.json"
assert combined_evidence_file.exists()
with combined_evidence_file.open() as f:
    combined_evidence = json.load(f)
print(len(combined_evidence))
evidence_df = pd.DataFrame(combined_evidence)
print(evidence_df.info())

413
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 413 entries, 0 to 412
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   doi              413 non-null    object
 1   triple           413 non-null    object
 2   efo_ents         413 non-null    object
 3   umls_ents        413 non-null    object
 4   trait_ents       413 non-null    object
 5   subject_term     413 non-null    object
 6   object_term      413 non-null    object
 7   pred_term        413 non-null    object
 8   triple_evidence  413 non-null    object
 9   assoc_evidence   413 non-null    object
dtypes: object(10)
memory usage: 32.4+ KB
None


# Subset

In [8]:
def _print_subset(df):

    for idx, _ in enumerate(df[:20]):
        url = EVIDENCE_URL.format(
            subject=_["subject_term"].replace(" ", "%20"),
            predicate=_["pred_term"],
            object=_["object_term"].replace(" ", "%20"),
        )
        print(f"#{idx} triple: {_['triple']}")
        print(f"    doi: https://doi.org/{_['doi']}")
        print(f"    evidence: {url}")
        print("Context: ----------")
        print(_["efo_ents"]["context_text"])
        print("Context: ----------")
        print("\n")
    print("\n============\n")

## directional subset

In [9]:
EVIDENCE_URL = "http://ieu-db-interface.epi.bris.ac.uk:8626/triple?subject={subject}&predicate={predicate}&object={object}&analysis"
for pred in ["CAUSES", "TREATS", "AFFECTS", "PRODUCES"]:
    evidence_subset = [
        _
        for _ in combined_evidence
        if len([k for k, v in _["triple_evidence"].items() if len(v) > 0]) == 2
        and len([k for k, v in _["assoc_evidence"].items() if len(v) > 0]) == 4
        and _["pred_term"] == pred
    ]

    print(
        f"# Predicate {pred}; total number with full evidence types {len(evidence_subset)}\n"
    )

    _print_subset(evidence_subset)

# Predicate CAUSES; total number with full evidence types 8

#0 triple: Valvular disease:CAUSES:Heart failure
    doi: https://doi.org/10.1101/2020.06.04.20122010
    evidence: http://ieu-db-interface.epi.bris.ac.uk:8626/triple?subject=Valvular%20disease&predicate=CAUSES&object=Heart%20failure&analysis
Context: ----------
RationaleMitral valve prolapse (MVP) is a common valve disease that leads to mitral insufficiency, heart failure and sudden death.
Context: ----------


#1 triple: Myocardial Infarction:CAUSES:Acute myocardial infarction
    doi: https://doi.org/10.1101/2020.07.03.20145987
    evidence: http://ieu-db-interface.epi.bris.ac.uk:8626/triple?subject=Myocardial%20Infarction&predicate=CAUSES&object=Acute%20myocardial%20infarction&analysis
Context: ----------
ConclusionIn addition to classic risk factors of myocardial infarction, health care systems must pay more attention to triggers that may induce an acute myocardial infarction in people with predisposing factors especiall

## undirectional subset

In [10]:
EVIDENCE_URL = "http://ieu-db-interface.epi.bris.ac.uk:8626/triple?subject={subject}&predicate={predicate}&object={object}&analysis"
for pred in ["ASSOCIATED_WITH", "COEXISTS_WITH", "INTERACTS_WITH"]:
    evidence_subset = [
        _
        for _ in combined_evidence
        if len([k for k, v in _["triple_evidence"].items() if len(v) > 0]) == 1
        and len([k for k, v in _["assoc_evidence"].items() if len(v) > 0]) == 2
        and _["pred_term"] == pred
    ]

    print(
        f"# Predicate {pred}; total number with full evidence types {len(evidence_subset)}\n"
    )

    _print_subset(evidence_subset)

# Predicate ASSOCIATED_WITH; total number with full evidence types 33

#0 triple: Sleep:ASSOCIATED_WITH:Alzheimer's Disease
    doi: https://doi.org/10.1101/2020.02.26.20027912
    evidence: http://ieu-db-interface.epi.bris.ac.uk:8626/triple?subject=Sleep&predicate=ASSOCIATED_WITH&object=Alzheimer's%20Disease&analysis
Context: ----------
Our goal was to assess whether genome-wide polygenic risk scores (PRS) for AD associate with sleep phenotypes in young adults, decades before typical AD symptom onset.
Context: ----------


#1 triple: Testosterone:ASSOCIATED_WITH:Malignant neoplasm of prostate
    doi: https://doi.org/10.1101/2020.03.27.20044941
    evidence: http://ieu-db-interface.epi.bris.ac.uk:8626/triple?subject=Testosterone&predicate=ASSOCIATED_WITH&object=Malignant%20neoplasm%20of%20prostate&analysis
Context: ----------
Total testosterone was not associated with prostate cancer.
Context: ----------


#2 triple: C-reactive protein:ASSOCIATED_WITH:Vitamin D Deficiency
    doi: htt

# term frequency

In [12]:
evidence_subset = [
    _
    for _ in combined_evidence
    if len([k for k, v in _["triple_evidence"].items() if len(v) > 0]) == 2
    and len([k for k, v in _["assoc_evidence"].items() if len(v) > 0]) == 4
    and _["pred_term"] == "CAUSES"
]
subset = (
    py_.chain(evidence_subset)
    .map(lambda _: [_["subject_term"], _["object_term"]])
    .flatten()
    .value()
)
print(subset)

['Valvular disease', 'Heart failure', 'Myocardial Infarction', 'Acute myocardial infarction', 'Heart Diseases', 'Pulmonary Hypertension', 'Heat Stress Disorders', 'Malaise', 'Low Back Pain', 'Chronic pain', 'Ankylosing spondylitis', 'Arthritis', 'Migraine Disorders', 'Dizziness', 'Atrial Fibrillation', 'Cardioembolic stroke']


In [13]:
from typing import List


def make_word_freq(terms: List[str]) -> pd.DataFrame():
    word_freq = {}
    tokens = (
        py_.chain(terms)
        .map(lambda term: [_.strip().lower() for _ in term.split(",")])
        .flatten()
        .value()
    )
    for token in tokens:
        if token not in word_freq.keys():
            word_freq[token] = 1
        else:
            word_freq[token] += 1
    word_freq_df = (
        pd.Series(word_freq)
        .sort_values(ascending=False)
        .to_frame(name="count")
        .assign(total_num_tokens=len(tokens))
    )
    return word_freq_df


df = make_word_freq(terms=subset)
print(df[:10])

                             count  total_num_tokens
valvular disease                 1                16
heart failure                    1                16
myocardial infarction            1                16
acute myocardial infarction      1                16
heart diseases                   1                16
pulmonary hypertension           1                16
heat stress disorders            1                16
malaise                          1                16
low back pain                    1                16
chronic pain                     1                16


## cases with full evidence groups

In [14]:
pred_groups = [
    {
        "preds": ["CAUSES", "TREATS", "AFFECTS", "PRODUCES"],
        "num_triple_evidence": 2,
        "num_assoc_evidence": 4,
    },
    {
        "preds": ["ASSOCIATED_WITH", "COEXISTS_WITH", "INTERACTS_WITH"],
        "num_triple_evidence": 1,
        "num_assoc_evidence": 2,
    },
]

for pred_group in pred_groups:
    print(f"preds {pred_group['preds']}")
    evidence_subset = [
        _
        for _ in combined_evidence
        if len([k for k, v in _["triple_evidence"].items() if len(v) > 0])
        == pred_group["num_triple_evidence"]
        and len([k for k, v in _["assoc_evidence"].items() if len(v) > 0])
        == pred_group["num_assoc_evidence"]
        and _["pred_term"] in pred_group["preds"]
    ]
    terms = (
        py_.chain(evidence_subset)
        .map(lambda _: [_["subject_term"], _["object_term"]])
        .flatten()
        .value()
    )
    word_freq_df = make_word_freq(terms=terms)
    print(word_freq_df[:20])

preds ['CAUSES', 'TREATS', 'AFFECTS', 'PRODUCES']
                        count  total_num_tokens
diabetes                    5                37
myocardial infarction       3                37
atrial fibrillation         2                37
carotid stenosis            2                37
obesity                     2                37
valvular disease            1                37
sleep                       1                37
dizziness                   1                37
migraine disorders          1                37
arthritis                   1                37
ankylosing spondylitis      1                37
coronary circulation        1                37
chronic pain                1                37
low back pain               1                37
non-insulin-dependent       1                37
diabetes mellitus           1                37
infarction                  1                37
heart failure               1                37
cerebral infarction         1         

## cases with at least "supporting"

In [15]:
for pred_group in pred_groups:
    print(f"preds {pred_group['preds']}")
    evidence_subset = [
        _
        for _ in combined_evidence
        if "supporting" in _["triple_evidence"].keys()
        and len(_["triple_evidence"]["supporting"]) > 0
        and "supporting" in _["assoc_evidence"].keys()
        and len(_["assoc_evidence"]["supporting"]) > 0
        and _["pred_term"] in pred_group["preds"]
    ]
    terms = (
        py_.chain(evidence_subset)
        .map(lambda _: [_["subject_term"], _["object_term"]])
        .flatten()
        .value()
    )
    word_freq_df = make_word_freq(terms=terms)
    print(word_freq_df[:20])

preds ['CAUSES', 'TREATS', 'AFFECTS', 'PRODUCES']
                         count  total_num_tokens
diabetes mellitus            9               134
obesity                      8               134
diabetes                     8               134
non-insulin-dependent        8               134
disease                      7               134
parkinson disease            4               134
blood glucose                4               134
glucose                      3               134
myocardial infarction        3               134
behavior                     3               134
depressive disorder          3               134
chronic kidney diseases      3               134
gout                         2               134
carotid stenosis             2               134
asthma                       2               134
atrial fibrillation          2               134
alzheimer's disease          2               134
heart failure                2               134
depressive symptoms