# Benchmarks

## Initialize

In [1]:
%load_ext autoreload
%autoreload 2

import os
import math
import pathlib
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from IPython.display import clear_output

import warnings
from lifelines.utils import CensoringType
from lifelines.utils import concordance_index

In [2]:
# Filesystem layout: all inputs live below base_path, and the outputs of this
# run are written to <project_path>/<experiment>.
base_path = "/home/jakobs"
project_path = f"{base_path}/data"

experiment = '231117'
experiment_path = f"{project_path}/{experiment}"

# Make sure the experiment directory exists (missing parents are created,
# an already existing directory is not an error).
pathlib.Path(experiment_path).mkdir(parents=True, exist_ok=True)

# Partition ids 0..9.
partitions = list(range(10))

In [3]:
# Tag embedded in the output file names written further down; set equal to the experiment id.
today = experiment

In [28]:
# Phecode lookup reduced to the two columns needed here: the join key
# (phecode_string) and the sex each phecode applies to.
phecode = pd.read_csv(f"{base_path}/mapping/phecode_strings_V2.csv")[["phecode_string", "sex"]]

# Endpoint metadata with the phecode sex attached, indexed by endpoint id.
# The left join keeps endpoints that have no matching phecode_string.
endpoints_md = (
    pd.read_feather(f"{base_path}/data/endpoints_epic_md.feather")
    .merge(phecode, how="left", on="phecode_string")
    .set_index("endpoint")
)

# Force the first row's sex to "Both". In the displayed output that row is
# OMOP_4306655 (Death).
# NOTE(review): this patch is positional, so it depends on the row order of the
# feather file; presumably the row has no phecode match and would otherwise be
# NaN - confirm.
sex_position = endpoints_md.columns.get_loc('sex')
endpoints_md.iloc[0, sex_position] = "Both"

endpoints = endpoints_md.index.tolist()
endpoints_md

Unnamed: 0_level_0,n_epic,eligable,n,freq,phecode_string,phecode_category,sex
endpoint,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
OMOP_4306655,111,61213.0,3471.0,0.056704,Death,Event,Both
phecode_052,31,61142.0,197.0,0.003222,Herpesvirus,ID,Both
phecode_052-3,28,61175.0,121.0,0.001978,Varicella zoster virus,ID,Both
phecode_052-32,27,61184.0,117.0,0.001912,Herpes zoster,ID,Both
phecode_061,40,61194.0,185.0,0.003023,Influenza virus,ID,Both
...,...,...,...,...,...,...,...
phecode_829,129,60763.0,1980.0,0.032586,Nonspecific findings on examination of blood,Signs/Symptoms,Both
phecode_848,85,61196.0,143.0,0.002337,Nonspecific abnormal findings of other body st...,Signs/Symptoms,Both
phecode_979,21,61120.0,218.0,0.003567,Transplated organ,Stat,Both
phecode_981,122,60297.0,1966.0,0.032605,Family history of malignant neoplasm,Stat,Both


In [29]:
# Earlier covariate source, kept for reference:
# data_covariates = pd.read_feather("/sc-projects/sc-proj-ukb-cvd/data/2_datasets_pre/211110_anewbeginning/baseline_covariates_220503.feather")[["eid", "sex_f31_0_0"]].set_index("eid")
# data_covariates.head()

# Only the participant id and sex are kept; indexing by eid lets the frame be
# joined onto the outcomes by participant.
covariates_file = f"{experiment_path}/data_covariates_full.feather"
data_covariates = pd.read_feather(covariates_file)[["eid", "sex"]].set_index("eid")

In [30]:
# Long-format outcomes (columns shown below: endpoint, prev, event, time),
# indexed by participant id.
outcomes_file = f"{base_path}/data/data_outcomes_long_230320.feather"
data_outcomes = pd.read_feather(outcomes_file).set_index("eid")
data_outcomes.head()

Unnamed: 0_level_0,endpoint,prev,event,time
eid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
RID00001,OMOP_4306655,0.0,0.0,11.86037
RID00002,OMOP_4306655,0.0,0.0,8.941821
RID00003,OMOP_4306655,0.0,0.0,12.142368
RID00004,OMOP_4306655,0.0,0.0,12.87885
RID00006,OMOP_4306655,0.0,0.0,11.937029


In [31]:
# Attach each participant's sex to every outcome row (left join on the eid
# index), then move eid back to a regular column and index by endpoint instead.
data_all = (
    data_outcomes
    .merge(data_covariates, how="left", left_index=True, right_index=True)
    .reset_index()
    .set_index("endpoint")
)

In [32]:
# Preview the merged outcome/covariate rows, indexed by endpoint.
data_all.head()

Unnamed: 0_level_0,eid,prev,event,time,sex
endpoint,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
OMOP_4306655,RID00001,0.0,0.0,11.86037,Female
phecode_001,RID00001,0.0,0.0,11.86037,Female
phecode_004,RID00001,0.0,0.0,11.86037,Female
phecode_004-1,RID00001,0.0,0.0,11.86037,Female
phecode_005,RID00001,0.0,0.0,11.86037,Female


In [33]:
# Split the combined table into one frame per endpoint. Within each group the
# endpoint index is dropped and eid becomes the index instead.
data_dict = {
    endpoint_key: endpoint_rows.reset_index(drop=True).set_index("eid")
    for endpoint_key, endpoint_rows in data_all.groupby('endpoint')
}
#data_dict.keys()

In [34]:
# Show the endpoint metadata again; its "sex" column is what get_eligible_eids reads below.
endpoints_md

Unnamed: 0_level_0,n_epic,eligable,n,freq,phecode_string,phecode_category,sex
endpoint,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
OMOP_4306655,111,61213.0,3471.0,0.056704,Death,Event,Both
phecode_052,31,61142.0,197.0,0.003222,Herpesvirus,ID,Both
phecode_052-3,28,61175.0,121.0,0.001978,Varicella zoster virus,ID,Both
phecode_052-32,27,61184.0,117.0,0.001912,Herpes zoster,ID,Both
phecode_061,40,61194.0,185.0,0.003023,Influenza virus,ID,Both
...,...,...,...,...,...,...,...
phecode_829,129,60763.0,1980.0,0.032586,Nonspecific findings on examination of blood,Signs/Symptoms,Both
phecode_848,85,61196.0,143.0,0.002337,Nonspecific abnormal findings of other body st...,Signs/Symptoms,Both
phecode_979,21,61120.0,218.0,0.003567,Transplated organ,Stat,Both
phecode_981,122,60297.0,1966.0,0.032605,Family history of malignant neoplasm,Stat,Both


In [35]:
def get_eligible_eids(data_dict, endpoint, endpoints_metadata=None):
    """Collect the participant ids that are eligible for one endpoint.

    A participant is eligible when its ``prev`` value for the endpoint is 0
    and, if the endpoint is restricted to one sex, its ``sex`` value equals
    that restriction.

    Parameters
    ----------
    data_dict : dict
        Maps endpoint id -> DataFrame indexed by eid. The frame must provide
        the columns ``prev`` and ``sex``.
    endpoint : str
        Endpoint id; must be a key of ``data_dict`` and a label in the index
        of the metadata frame.
    endpoints_metadata : pandas.DataFrame, optional
        Frame indexed by endpoint id with a ``sex`` column. When omitted, the
        notebook-level ``endpoints_md`` frame is used, which keeps existing
        two-argument calls working unchanged.

    Returns
    -------
    dict
        ``{"endpoint": endpoint, "n_eids": <count>, "eid_list": <list of eids>}``.

    Raises
    ------
    KeyError
        If ``endpoint`` is missing from ``data_dict`` or from the metadata index.
    """
    if endpoints_metadata is None:
        # Fall back to the notebook global so existing callers are unaffected.
        endpoints_metadata = endpoints_md

    data_temp = data_dict[endpoint]
    eligibility = endpoints_metadata.loc[endpoint, "sex"]

    eligible = data_temp["prev"] == 0

    # A missing sex entry is treated like "Both". Filtering on NaN would match
    # no row at all and silently return an empty eligibility list.
    sex_restricted = not (pd.isna(eligibility) or eligibility == "Both")
    if sex_restricted:
        eligible = eligible & (data_temp["sex"] == eligibility)

    eids_incl = data_temp.loc[eligible].index.to_list()

    return {"endpoint": endpoint,
            "n_eids": len(eids_incl),
            "eid_list": eids_incl}

In [36]:
# Disabled alternative: restrict the endpoints to those present in the long outcomes table.
#long_endpoints_unique = data_outcomes['endpoint'].unique()
#endpoints_in_long = [e for e in endpoints if e in long_endpoints_unique]

# One record (endpoint, n_eids, eid_list) per endpoint, collected with a progress bar.
d_list = [
    get_eligible_eids(data_dict, endpoint_id)
    for endpoint_id in tqdm(endpoints)
]
# A list of dicts goes straight into the constructor: one row per record.
eid_df = pd.DataFrame(d_list)

  0%|          | 0/367 [00:00<?, ?it/s]

In [37]:
#eid_df.set_index("endpoint")["eid_list"].to_dict()

In [38]:
# Persist the per-endpoint eligibility table (one row per endpoint, eids kept as a list column).
eligible_eids_file = f"{experiment_path}/eligible_eids_{today}.feather"
eid_df.to_feather(eligible_eids_file)

In [39]:
# Echo the path of the per-endpoint eligibility file.
f"{experiment_path}/eligible_eids_{today}.feather"

'/home/jakobs/data/231117/eligible_eids_231117.feather'

In [40]:
# Long format: one row per (endpoint, eid) pair instead of one list per endpoint.
eid_df_long = (
    eid_df[["endpoint", "eid_list"]]
    .explode("eid_list")
    .reset_index(drop=True)
)
eid_df_long.columns = ["endpoint", "eid"]

# Both columns repeat a limited set of values, so store them as categoricals.
eid_df_long = eid_df_long.astype({"endpoint": "category", "eid": "category"})

In [41]:
# Persist the long-format eligibility table next to the per-endpoint one.
eligible_eids_long_file = f"{experiment_path}/eligible_eids_long_{today}.feather"
eid_df_long.to_feather(eligible_eids_long_file)

In [42]:
# Echo the experiment output directory.
experiment_path

'/home/jakobs/data/231117'

In [43]:
# Display the long-format eligibility table (one row per endpoint/eid pair).
eid_df_long

Unnamed: 0,endpoint,eid
0,OMOP_4306655,RID00001
1,OMOP_4306655,RID00002
2,OMOP_4306655,RID00003
3,OMOP_4306655,RID00004
4,OMOP_4306655,RID00006
...,...,...
2573258,phecode_997,RID07407
2573259,phecode_997,RID07408
2573260,phecode_997,RID07409
2573261,phecode_997,RID07410
