# Benchmarks

## Initialize

In [None]:
import os
import math
import pathlib
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from IPython.display import clear_output

import warnings
from lifelines.utils import CensoringType
from lifelines.utils import concordance_index

In [None]:
# Capture the machine's hostname via an IPython shell escape; `node[0]` is the
# first line of the command output.
node = !hostname
# Pick the storage root by hostname: hosts whose name contains "sc" use the
# /sc-projects tree, every other host uses the /data/analysis share.
# NOTE(review): this is a plain substring test, so any hostname containing
# "sc" anywhere matches -- confirm that is intended.
if "sc" in node[0]:
    base_path = "/sc-projects/sc-proj-ukb-cvd"
else: 
    base_path = "/data/analysis/ag-reils/ag-reils-shared/cardioRS"
print(base_path)

# Project directories derived from the storage root.
project_label = "22_medical_records"
project_path = f"{base_path}/results/projects/{project_label}"
figure_path = f"{project_path}/figures"
output_path = f"{project_path}/data"

# Per-experiment output directory; created (with parents) if it is missing.
experiment = 220627
experiment_path = f"{output_path}/{experiment}"
pathlib.Path(experiment_path).mkdir(parents=True, exist_ok=True)

In [None]:
# Endpoint definition table, ordered by its "endpoint" column.
endpoint_defs = pd.read_feather(
    f"{output_path}/phecode_defs_220306.feather"
).sort_values(by="endpoint")

In [None]:
# Endpoints to analyse. Only the OMOP concept is active; the phecode entries
# are disabled and kept here for reference.
# To derive the list from the outcome columns instead:
#   endpoints = [e[:-6] for e in data_outcomes.columns if "_event" in e]
endpoints = [
    # "phecode_008",
    # "phecode_092-2",
    # "phecode_105",
    # "phecode_107-2",
    # "phecode_164",
    # "phecode_202-2",
    # "phecode_284",
    # "phecode_292",
    # "phecode_324-11",
    # "phecode_328",
    # "phecode_371",
    # "phecode_401",
    # "phecode_404",
    # "phecode_424",
    # "phecode_440-11",
    # "phecode_468",
    # "phecode_474",
    # "phecode_522-1",
    # "phecode_542-1",
    # "phecode_581-1",
    # "phecode_583",
    # "phecode_665",
    # "phecode_705-1",
    "OMOP_4306655",
]

In [None]:
# Outcome table indexed by "eid", reduced to the "<endpoint>_event" columns
# of the selected endpoints (the last six characters of the column name are
# stripped before the lookup in `endpoints`).
data_outcomes = pd.read_feather(
    f"{output_path}/baseline_outcomes_220627.feather"
).set_index("eid")
data_outcomes = data_outcomes.loc[
    :,
    [col for col in data_outcomes.columns if "_event" in col and col[:-6] in endpoints],
]

In [None]:
# Baseline record table, indexed by "eid".
data_records = pd.read_feather(
    f"{output_path}/baseline_records_220627.feather"
).set_index("eid")

In [None]:
# Keep only the columns whose name contains "OMOP_" (tqdm shows progress
# while the column names are scanned).
data_records = data_records.loc[
    :, [col for col in tqdm(data_records.columns.to_list()) if "OMOP_" in col]
]

In [None]:
# Plain list of the remaining record column names.
records = data_records.columns.tolist()

In [None]:
# Attach the outcome columns to the record table by index; the left join
# keeps every row of `data_records`.
data_all = pd.merge(
    data_records,
    data_outcomes,
    how="left",
    left_index=True,
    right_index=True,
)

In [None]:
# Eligible eids per endpoint, plus an endpoint -> eid_list lookup built from it.
eligable_eids = pd.read_feather(
    f"{output_path}/eligable_eids_220627.feather"
)
eids_dict = eligable_eids.set_index("endpoint").loc[:, "eid_list"].to_dict()

In [None]:
# Column sums of the record table, largest first; only records whose sum is
# at least 50 are kept.
record_freqs = data_records.sum().sort_values(ascending=False)
record_freqs = record_freqs[record_freqs >= 50]
record_freqs

In [None]:
import ray

# Initialise Ray with 24 CPUs and the dashboard switched off.
# Dashboard variant, if needed:
#   include_dashboard=True, dashboard_port=24763, dashboard_host="0.0.0.0"
ray.init(
    num_cpus=24,
    include_dashboard=False,
)

In [None]:
@ray.remote
def calc_ratio(data_all, eids_dict, record, eids_record, eids_nonrecord, endpoints):
    """Compare endpoint event frequencies for eids with and without a record.

    For each endpoint, the endpoint's eligible eids (``eids_dict[endpoint]``)
    are split into those also present in ``eids_record`` and those present in
    ``eids_nonrecord``. For each of the two groups the ``{endpoint}_event``
    column of ``data_all`` is summed (event count) and divided by the group
    size (event frequency).

    Args:
        data_all: DataFrame whose index holds eids and which contains one
            ``{endpoint}_event`` column per requested endpoint.
        eids_dict: Mapping from endpoint name to a 1-D collection of eligible
            eids (treated as unique).
        record: Label of the record under evaluation; copied into every
            output row.
        eids_record: eids for which the record is present (treated as unique).
        eids_nonrecord: eids for which the record is absent (treated as
            unique).
        endpoints: Iterable of endpoint names to evaluate.

    Returns:
        A list with one dict per endpoint, with the keys ``endpoint``,
        ``n_eligable``, ``record``, ``n_records``, ``n_events_record``,
        ``freq_events_record``, ``n_events_nonrecord`` and
        ``freq_events_nonrecord``. A frequency is NaN when its group is empty.
    """
    rows = []
    groups = (("record", eids_record), ("nonrecord", eids_nonrecord))

    for endpoint in endpoints:
        # asarray is a no-op for ndarrays and makes boolean masking work for
        # plain sequences as well.
        eids_endpoint = np.asarray(eids_dict[endpoint])
        events = data_all[f"{endpoint}_event"]

        stats = {}
        for group, eids_group in groups:
            # Eligible eids of this endpoint that fall into the current group.
            in_group = np.isin(eids_endpoint, eids_group, assume_unique=True)
            group_events = events.loc[eids_endpoint[in_group]]

            n_events = group_events.sum()
            n_group = len(group_events)
            # An empty group has no defined frequency; report NaN explicitly
            # rather than dividing by zero.
            freq = n_events / n_group if n_group else np.nan
            stats[group] = (n_events, freq)

        n_record, freq_record = stats["record"]
        n_nonrecord, freq_nonrecord = stats["nonrecord"]

        rows.append(
            {
                "endpoint": endpoint,
                "n_eligable": len(eids_endpoint),
                "record": record,
                "n_records": len(eids_record),
                "n_events_record": n_record,
                "freq_events_record": freq_record,
                "n_events_nonrecord": n_nonrecord,
                "freq_events_nonrecord": freq_nonrecord,
            }
        )
    return rows

In [None]:
# Put the shared inputs into the Ray object store once; each task is handed
# the resulting references.
ref_data_all = ray.put(data_all)
ref_eids_dict = ray.put(eids_dict)

# Submit one remote task per retained record.
d_nested = []
for record in tqdm(record_freqs.index):
    s_record = data_all[record]
    # eids where the record flag is True / False respectively.
    eids_record = s_record.loc[s_record == True].index.values
    eids_nonrecord = s_record.loc[s_record == False].index.values
    ref_results = calc_ratio.remote(
        ref_data_all,
        ref_eids_dict,
        record,
        eids_record,
        eids_nonrecord,
        endpoints,
    )
    d_nested.append(ref_results)

# Resolve the task references one by one, in submission order.
d_nested = list(map(ray.get, tqdm(d_nested)))

# Drop the local handles to the shared objects.
del ref_data_all
del ref_eids_dict

In [None]:
from itertools import chain

# Flatten the per-record result lists into one list of row dicts.
d = list(chain.from_iterable(d_nested))

In [None]:
# One row per (record, endpoint) result dict.
endpoints_freqs = pd.DataFrame(d)

In [None]:
# Write the frequency table into this experiment's output directory.
endpoints_freqs.to_feather(
    f"{experiment_path}/records_inc_disease_freq.feather"
)