In [1]:
%load_ext autoreload
%load_ext lab_black

%autoreload 2

In [7]:
import json

import requests
import pandas as pd

from analysis import utils

In [3]:
# Resolve the directory that receives the files written by this notebook
# and print it so the location is visible in the cell output.
output_dir = utils.find_analysis_artifacts_dir()
print(output_dir)

/data/ik18445_cache/projects/evidence-retrieval-base/manuscript/assets/artifacts


# Triple count

In [4]:
# Count matched subject-triple-object paths for each combination of
# (subject type, predicate, object type), restricted to the listed predicates,
# and keep the 50 combinations with the highest count.
query = """
MATCH
    (sub:LiteratureTerm)-[r:SEMMEDDB_SUB]-
    (triple:LiteratureTriple)-
    [r1:SEMMEDDB_OBJ]-(obj:LiteratureTerm)
WHERE
    triple.predicate IN [
        "INTERACTS_WITH", "COEXISTS_WITH",
        "ASSOCIATED_WITH", "CAUSES", "TREATS",
        "PRODUCES", "AFFECTS"
    ]
RETURN
    sub.type as sub_type,
    triple.predicate as pred,
    obj.type as obj_type,
    count(*) as num_ents
ORDER BY num_ents DESC LIMIT 50
"""
payload = {"query": query}
# Without a timeout a stalled connection would block the kernel indefinitely.
# The value is deliberately generous because the aggregation may be slow.
r = requests.post("https://api.epigraphdb.org/cypher", json=payload, timeout=600)
if not r.ok:
    # Show the server's error body, then stop here: the following cell parses
    # r.json()["results"] and would otherwise fail with a less helpful error.
    print(r.text)
    r.raise_for_status()

In [6]:
# Turn the rows under the "results" key of the response into a DataFrame.
data = r.json()["results"]
df = pd.json_normalize(data)
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the cell output.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sub_type  50 non-null     object
 1   pred      50 non-null     object
 2   obj_type  50 non-null     object
 3   num_ents  50 non-null     int64 
dtypes: int64(1), object(3)
memory usage: 1.7+ KB
None


Unnamed: 0,sub_type,pred,obj_type,num_ents
0,"[aapp, gngm]",INTERACTS_WITH,"[aapp, gngm]",393759
1,"[aapp, gngm]",COEXISTS_WITH,"[aapp, gngm]",224098
2,"[aapp, gngm]",ASSOCIATED_WITH,[dsyn],188961
3,[dsyn],COEXISTS_WITH,[dsyn],150166
4,[dsyn],CAUSES,[dsyn],85231


In [9]:
# Store the triple counts as a JSON array holding one object per row.
triple_count_file = output_dir / "triple-category-count.json"
triple_records = df.to_dict(orient="records")
with triple_count_file.open("w") as fh:
    fh.write(json.dumps(triple_records))

In [10]:
# Load the stored triple counts back from disk; later cells work from this
# frame rather than from the live query result.
triple_count_df = pd.DataFrame(json.loads(triple_count_file.read_text()))
triple_count_df

Unnamed: 0,sub_type,pred,obj_type,num_ents
0,"[aapp, gngm]",INTERACTS_WITH,"[aapp, gngm]",393759
1,"[aapp, gngm]",COEXISTS_WITH,"[aapp, gngm]",224098
2,"[aapp, gngm]",ASSOCIATED_WITH,[dsyn],188961
3,[dsyn],COEXISTS_WITH,[dsyn],150166
4,[dsyn],CAUSES,[dsyn],85231
5,"[orch, phsu]",TREATS,[dsyn],82263
6,"[aapp, gngm]",INTERACTS_WITH,"[aapp, gngm, enzy]",72194
7,"[aapp, gngm]",PRODUCES,"[aapp, gngm]",69691
8,"[aapp, gngm, enzy]",INTERACTS_WITH,"[aapp, gngm]",66742
9,"[aapp, gngm]",CAUSES,[dsyn],49178


# Literature count

In [11]:
# Same category selection as the triple count, but each matched triple is
# additionally expanded over its SEMMEDDB_TO_LIT relationships to Literature
# nodes, so count(*) counts one row per linked literature item.
query = """
MATCH
    (sub:LiteratureTerm)-[r:SEMMEDDB_SUB]-
    (triple:LiteratureTriple)-
    [r1:SEMMEDDB_OBJ]-(obj:LiteratureTerm)
WHERE
    triple.predicate IN [
        "INTERACTS_WITH", "COEXISTS_WITH",
        "ASSOCIATED_WITH", "CAUSES", "TREATS",
        "PRODUCES", "AFFECTS"
    ]
WITH
  sub, triple, obj
MATCH
  (triple)-[r:SEMMEDDB_TO_LIT]->(lit:Literature)
RETURN
    sub.type as sub_type,
    triple.predicate as pred,
    obj.type as obj_type,
    count(*) as num_ents
ORDER BY num_ents DESC LIMIT 50
"""
payload = {"query": query}
# Without a timeout a stalled connection would block the kernel indefinitely.
# The value is deliberately generous because the aggregation may be slow.
r = requests.post("https://api.epigraphdb.org/cypher", json=payload, timeout=600)
if not r.ok:
    # Show the server's error body, then stop instead of failing further down
    # on r.json()["results"] with a less helpful error.
    print(r.text)
    r.raise_for_status()
data = r.json()["results"]
df = pd.json_normalize(data)
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the cell output.
df.info()
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sub_type  50 non-null     object
 1   pred      50 non-null     object
 2   obj_type  50 non-null     object
 3   num_ents  50 non-null     int64 
dtypes: int64(1), object(3)
memory usage: 1.7+ KB
None


Unnamed: 0,sub_type,pred,obj_type,num_ents
0,"[aapp, gngm]",INTERACTS_WITH,"[aapp, gngm]",673470
1,"[aapp, gngm]",ASSOCIATED_WITH,[dsyn],423727
2,[dsyn],COEXISTS_WITH,[dsyn],385349
3,"[aapp, gngm]",COEXISTS_WITH,"[aapp, gngm]",332834
4,"[orch, phsu]",TREATS,[dsyn],274589
5,[phsu],TREATS,[dsyn],238636
6,[dsyn],CAUSES,[dsyn],222462
7,"[aapp, gngm]",INTERACTS_WITH,"[aapp, gngm, enzy]",140836
8,"[aapp, gngm, enzy]",INTERACTS_WITH,"[aapp, gngm]",133378
9,"[aapp, gngm]",PRODUCES,"[aapp, gngm]",106862


In [12]:
# Store the literature counts as a JSON array holding one object per row.
literature_count_file = output_dir / "literature-category-count.json"
literature_records = df.to_dict(orient="records")
with literature_count_file.open("w") as fh:
    fh.write(json.dumps(literature_records))

In [14]:
# Load the stored literature counts back from disk; later cells work from this
# frame rather than from the live query result.
literature_count_df = pd.DataFrame(json.loads(literature_count_file.read_text()))
literature_count_df

Unnamed: 0,sub_type,pred,obj_type,num_ents
0,"[aapp, gngm]",INTERACTS_WITH,"[aapp, gngm]",673470
1,"[aapp, gngm]",ASSOCIATED_WITH,[dsyn],423727
2,[dsyn],COEXISTS_WITH,[dsyn],385349
3,"[aapp, gngm]",COEXISTS_WITH,"[aapp, gngm]",332834
4,"[orch, phsu]",TREATS,[dsyn],274589
5,[phsu],TREATS,[dsyn],238636
6,[dsyn],CAUSES,[dsyn],222462
7,"[aapp, gngm]",INTERACTS_WITH,"[aapp, gngm, enzy]",140836
8,"[aapp, gngm, enzy]",INTERACTS_WITH,"[aapp, gngm]",133378
9,"[aapp, gngm]",PRODUCES,"[aapp, gngm]",106862


# Formatting

In [21]:
def _make_idx(df):
    res = df.apply(
        lambda row: str(row["sub_type"]) + row["pred"] + str(row["obj_type"]), axis=1
    )
    return res


# Put the triple count and the literature count of each category side by side.
# The frames are joined on the string key produced by _make_idx (presumably
# because the list-valued type columns cannot serve as merge keys directly);
# the key column is dropped again afterwards.
# NOTE: this is an inner join, so a category that appears in only one of the
# two inputs (each is limited to its own top 50) is absent from the result.
combined_count = (
    triple_count_df.assign(idx=_make_idx)
    .rename(columns={"num_ents": "num_triple_ent"})
    .merge(
        literature_count_df.assign(idx=_make_idx)
        .rename(columns={"num_ents": "num_literature_ent"})
        .drop(columns=["sub_type", "pred", "obj_type"]),
        on=["idx"],
        how="inner",
        # Every category should occur once per side; fail loudly instead of
        # silently multiplying rows if that ever stops being true.
        validate="one_to_one",
    )
    .drop(columns=["idx"])
)
combined_count

Unnamed: 0,sub_type,pred,obj_type,num_triple_ent,num_literature_ent
0,"[aapp, gngm]",INTERACTS_WITH,"[aapp, gngm]",393759,673470
1,"[aapp, gngm]",COEXISTS_WITH,"[aapp, gngm]",224098,332834
2,"[aapp, gngm]",ASSOCIATED_WITH,[dsyn],188961,423727
3,[dsyn],COEXISTS_WITH,[dsyn],150166,385349
4,[dsyn],CAUSES,[dsyn],85231,222462
5,"[orch, phsu]",TREATS,[dsyn],82263,274589
6,"[aapp, gngm]",INTERACTS_WITH,"[aapp, gngm, enzy]",72194,140836
7,"[aapp, gngm]",PRODUCES,"[aapp, gngm]",69691,106862
8,"[aapp, gngm, enzy]",INTERACTS_WITH,"[aapp, gngm]",66742,133378
9,"[aapp, gngm]",CAUSES,[dsyn],49178,100681


In [30]:
# For each predicate keep its two categories with the most triples.
# Subject and object types are merged into one set per row, replacing the two
# separate type columns.
# NOTE(review): sets have no defined order, so the text form of
# umls_ent_types may differ between runs when this frame is exported.
combined_count_select = (
    combined_count.assign(
        umls_ent_types=lambda frame: frame.apply(
            lambda row: {*row["sub_type"], *row["obj_type"]}, axis=1
        )
    )
    .drop(columns=["sub_type", "obj_type"])
    .sort_values(by="num_triple_ent", ascending=False)
    .groupby("pred")
    .head(2)
    .reset_index(drop=True)
)
combined_count_select

Unnamed: 0,pred,num_triple_ent,num_literature_ent,umls_ent_types
0,INTERACTS_WITH,393759,673470,"{aapp, gngm}"
1,COEXISTS_WITH,224098,332834,"{aapp, gngm}"
2,ASSOCIATED_WITH,188961,423727,"{aapp, dsyn, gngm}"
3,COEXISTS_WITH,150166,385349,{dsyn}
4,CAUSES,85231,222462,{dsyn}
5,TREATS,82263,274589,"{phsu, dsyn, orch}"
6,INTERACTS_WITH,72194,140836,"{aapp, enzy, gngm}"
7,PRODUCES,69691,106862,"{aapp, gngm}"
8,CAUSES,49178,100681,"{aapp, dsyn, gngm}"
9,TREATS,47416,238636,"{phsu, dsyn}"


In [31]:
# Write the selected per-predicate counts to a CSV file in the output
# directory, leaving out the row index.
combined_triple_count_file = output_dir / "report-triple-count.csv"
combined_count_select.to_csv(combined_triple_count_file, index=False)