# Analysis: general initialisation

# Init

In [1]:
# Enable IPython's autoreload extension (its mode is set further down).
%load_ext autoreload
# Load the lab_black extension (presumably nb_black's cell auto-formatter for
# JupyterLab; confirm against the environment).
%load_ext lab_black

# Mode 2: reload all imported modules before each cell is executed, so edits to
# project code are picked up without restarting the kernel.
%autoreload 2

In [126]:
import json

import numpy as np
import pandas as pd
from pydash import py_
import seaborn as sns
import networkx as nx
import matplotlib.pyplot as plt

from analysis import utils
from analysis.funcs.generic import interval_str
from common_processing.resources import epigraphdb, EVIDENCE_LABELS

In [3]:
# Interval this analysis run targets; `interval_str` turns it into the
# directory-name form used on disk (see the printed path below).
INTERVAL = "2020-01-01/2021-12-31"
interval_fmt = interval_str(INTERVAL)

# Experiment outputs: <data root>/medrxiv_experiments/<interval>/default
DATA_ROOT = utils.find_data_root()
data_dir = DATA_ROOT.joinpath("medrxiv_experiments", interval_fmt, "default")
print(data_dir)
assert data_dir.exists()

# Directory located by the project helper for analysis artifacts.
analysis_assets_dir = utils.find_analysis_artifacts_dir()
print(analysis_assets_dir)

/data/ik18445_cache/projects/epigraphdb-asq/data/medrxiv_experiments/2020-01-01__2021-12-31/default
/data/ik18445_cache/projects/epigraphdb-asq/data/analysis-artifacts


# Load files

In [4]:
# Load triples.json. Note that this file sits one level ABOVE `data_dir`
# (next to the "default" directory), unlike the other inputs in this notebook.
triple_file = data_dir.parent / "triples.json"
assert triple_file.exists(), f"Missing input file: {triple_file}"
# Decode explicitly as UTF-8 rather than relying on the platform/locale
# default encoding, which can differ between machines.
with triple_file.open("r", encoding="utf-8") as f:
    triples = json.load(f)
# Number of top-level entries in the loaded JSON.
print(len(triples))

26846


In [5]:
# Load ents/efo_ents.json from the experiment directory.
efo_ents_file = data_dir / "ents" / "efo_ents.json"
assert efo_ents_file.exists(), f"Missing input file: {efo_ents_file}"
# Decode explicitly as UTF-8 rather than relying on the platform/locale
# default encoding, which can differ between machines.
with efo_ents_file.open("r", encoding="utf-8") as f:
    efo_ents = json.load(f)
# Number of top-level entries in the loaded JSON.
print(len(efo_ents))

1096


In [6]:
# Load ents/umls_ents.json from the experiment directory.
umls_ents_file = data_dir / "ents" / "umls_ents.json"
assert umls_ents_file.exists(), f"Missing input file: {umls_ents_file}"
# Decode explicitly as UTF-8 rather than relying on the platform/locale
# default encoding, which can differ between machines.
with umls_ents_file.open("r", encoding="utf-8") as f:
    umls_ents = json.load(f)
# Number of top-level entries in the loaded JSON.
print(len(umls_ents))

1096


In [7]:
# Load ents/trait_ents.json from the experiment directory.
trait_ents_file = data_dir / "ents" / "trait_ents.json"
assert trait_ents_file.exists(), f"Missing input file: {trait_ents_file}"
# Decode explicitly as UTF-8 rather than relying on the platform/locale
# default encoding, which can differ between machines.
with trait_ents_file.open("r", encoding="utf-8") as f:
    trait_ents = json.load(f)
# Number of top-level entries in the loaded JSON.
print(len(trait_ents))

348


In [8]:
# Load ents/combined_ents.json straight into a DataFrame.
combined_ents_file = data_dir / "ents" / "combined_ents.json"
assert combined_ents_file.exists(), f"Missing input file: {combined_ents_file}"
# Decode explicitly as UTF-8 rather than relying on the platform/locale
# default encoding, which can differ between machines.
with combined_ents_file.open("r", encoding="utf-8") as f:
    combined_ents = pd.DataFrame(json.load(f))

# Row counts per predicate, annotated with the group that
# epigraphdb.PRED_DIRECTIONAL_MAPPING assigns to each predicate term.
# The lookup is a plain subscript, so an unmapped predicate raises instead of
# being silently dropped from the summary.
print(
    combined_ents[["pred_term"]]
    .assign(
        pred_group=lambda df: df["pred_term"].apply(
            lambda x: epigraphdb.PRED_DIRECTIONAL_MAPPING[x]
        )
    )
    .groupby(["pred_term", "pred_group"])
    .size()
)

pred_term        pred_group   
AFFECTS          directional       85
ASSOCIATED_WITH  undirectional     66
CAUSES           directional       67
COEXISTS_WITH    undirectional    171
INTERACTS_WITH   undirectional      4
TREATS           directional       22
dtype: int64


In [9]:
# Each of the three evidence files below is read the same way: check that it
# exists, decode it explicitly as UTF-8 (rather than relying on the
# platform/locale default encoding), wrap the parsed JSON in a DataFrame and
# report its row count.

# evidence/triple_evidence.json
triple_evidence_file = data_dir / "evidence" / "triple_evidence.json"
assert triple_evidence_file.exists(), f"Missing input file: {triple_evidence_file}"
with triple_evidence_file.open("r", encoding="utf-8") as f:
    triple_evidence = json.load(f)
triple_evidence_df = pd.DataFrame(triple_evidence)
print(len(triple_evidence_df))

# evidence/assoc_evidence.json
assoc_evidence_file = data_dir / "evidence" / "assoc_evidence.json"
assert assoc_evidence_file.exists(), f"Missing input file: {assoc_evidence_file}"
with assoc_evidence_file.open("r", encoding="utf-8") as f:
    assoc_evidence = json.load(f)
assoc_evidence_df = pd.DataFrame(assoc_evidence)
print(len(assoc_evidence_df))

# evidence/combined_score.json (kept under the shorter name `evidence_df`)
combined_evidence_file = data_dir / "evidence" / "combined_score.json"
assert combined_evidence_file.exists(), f"Missing input file: {combined_evidence_file}"
with combined_evidence_file.open("r", encoding="utf-8") as f:
    combined_evidence = json.load(f)
evidence_df = pd.DataFrame(combined_evidence)
print(len(evidence_df))

# Row counts per predicate in the combined evidence, annotated with the group
# that epigraphdb.PRED_DIRECTIONAL_MAPPING assigns to each predicate term.
# The lookup is a plain subscript, so an unmapped predicate raises instead of
# being silently dropped from the summary.
print(
    evidence_df[["pred_term"]]
    .assign(
        pred_group=lambda df: df["pred_term"].apply(
            lambda x: epigraphdb.PRED_DIRECTIONAL_MAPPING[x]
        )
    )
    .groupby(["pred_term", "pred_group"])
    .size()
)

415
414
413
pred_term        pred_group   
AFFECTS          directional       85
ASSOCIATED_WITH  undirectional     66
CAUSES           directional       67
COEXISTS_WITH    undirectional    170
INTERACTS_WITH   undirectional      4
TREATS           directional       21
dtype: int64
