# Benchmarks

## Initialize

In [1]:
import os
import math
import pathlib
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.feather as feather
from tqdm.auto import tqdm
from IPython.display import clear_output

import warnings
from lifelines.utils import CensoringType
from lifelines.utils import concordance_index

In [2]:
base_path = "/home/jakobs"

project_path = f"{base_path}/data"

experiment = '231117'
experiment_path = f"{project_path}/{experiment}"
pathlib.Path(experiment_path).mkdir(parents=True, exist_ok=True)

partitions = [i for i in range(10)]

In [3]:
# Load the baseline table from its feather snapshot and preview it.
baseline_file = f"{base_path}/data/data_baseline_230321.feather"
data_baseline = pd.read_feather(baseline_file)
data_baseline

Unnamed: 0,eid,ml_glaucoma_gradability_grad_l,ml_glaucoma_gradability_grad_r,ml_glaucoma_gradability_wdiff_l,ml_glaucoma_gradability_wdiff_r,age3,alcohol3,cigstat3,recruitment_date,systol3,diastol3,mi3,cva3,sex,bmi3,cholesterol3,triglyceride3,hdl3,ldl3
0,RID00001,0.955591,0.760914,0.043725,0.226360,86.127310,2.0,3.0,2007-08-07,138.0,72.5,0.0,0.0,2,33.455266,6.4,1.5,1.53,4.27
1,RID00002,0.986005,0.943931,0.013981,0.054606,70.505133,2.0,3.0,2010-07-08,105.5,68.5,0.0,0.0,2,25.593164,3.9,2.3,1.50,1.40
2,RID00003,0.990022,0.981540,0.009343,0.017813,64.577687,1.0,2.0,2007-04-26,128.5,78.5,0.0,0.0,2,25.372781,3.9,2.1,1.52,1.48
3,RID00004,0.894128,0.945897,0.103611,0.052175,66.740589,1.0,3.0,2006-07-31,154.5,87.5,0.0,0.0,2,27.070312,6.0,1.3,1.80,3.70
4,RID00006,0.940763,0.959411,0.055711,0.039147,72.366872,2.0,3.0,2007-07-10,138.0,66.5,0.0,0.0,2,26.189022,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7243,RID07407,0.940662,0.962702,0.058866,0.037013,73.675565,1.0,3.0,2006-11-03,132.0,73.0,0.0,0.0,1,25.032679,3.9,1.2,1.39,2.01
7244,RID07408,0.719619,0.637359,0.276180,0.355266,64.533881,1.0,3.0,2008-12-18,147.5,81.0,0.0,0.0,2,27.957894,6.8,1.0,1.76,4.64
7245,RID07409,0.635806,0.926931,0.342445,0.072251,82.475017,,,2007-09-27,144.5,80.0,0.0,0.0,2,26.053959,6.4,1.5,1.95,3.85
7246,RID07410,0.861651,0.054378,0.133713,0.028145,82.568104,1.0,2.0,2010-07-21,109.5,63.5,0.0,0.0,1,29.726252,,,,


In [4]:
# Load the wide-format outcomes table from its feather snapshot.
outcomes_file = f"{base_path}/data/data_outcomes_wide_230320.feather"
data_outcomes = pd.read_feather(outcomes_file)

In [5]:
# Attach the outcome columns to every baseline row (left join on "eid" keeps
# all baseline records), then index the combined frame by "eid".
# NOTE: this rebinds data_baseline, so re-running the cell operates on the
# already-merged frame rather than on the freshly loaded baseline table.
data_baseline = pd.merge(
    data_baseline,
    data_outcomes,
    on="eid",
    how="left",
).set_index("eid")

In [6]:
# Endpoint metadata table, keyed by the "endpoint" column.
endpoints_md = pd.read_feather(f"{base_path}/data/endpoints_epic_md.feather")
endpoints_md = endpoints_md.set_index("endpoint")
# All endpoint identifiers as a plain Python list, in table order.
endpoints = endpoints_md.index.tolist()

In [7]:
# Display the endpoint metadata rows whose phecode_string contains 'Diabetes'.
endpoints_md.query("phecode_string.str.contains('Diabetes')")

Unnamed: 0_level_0,n_epic,eligable,n,freq,phecode_string,phecode_category
endpoint,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
phecode_202,620,59873.0,4619.0,0.077147,Diabetes mellitus,Endo


In [8]:
# Preview the outcomes table (pandas truncates the display of large frames).
data_outcomes

Unnamed: 0,eid,OMOP_4306655_event,OMOP_4306655_time,phecode_001_event,phecode_001_prev,phecode_001_time,phecode_004-1_event,phecode_004-1_prev,phecode_004-1_time,phecode_004_event,...,phecode_979_prev,phecode_979_time,phecode_981_event,phecode_981_prev,phecode_981_time,phecode_984_event,phecode_984_time,phecode_997_event,phecode_997_prev,phecode_997_time
0,RID00001,0.0,11.860370,0.0,0.0,11.860370,0.0,0.0,11.860370,0.0,...,0.0,11.860370,0.0,0.0,11.860370,0.0,11.860370,0.0,0.0,11.860370
1,RID00002,0.0,8.941821,0.0,0.0,8.941821,0.0,0.0,8.941821,0.0,...,0.0,8.941821,0.0,0.0,8.941821,0.0,8.941821,0.0,0.0,8.941821
2,RID00003,0.0,12.142368,0.0,0.0,12.142368,0.0,0.0,12.142368,0.0,...,0.0,12.142368,0.0,0.0,12.142368,0.0,12.142368,0.0,0.0,12.142368
3,RID00004,0.0,12.878850,0.0,0.0,12.878850,0.0,0.0,12.878850,0.0,...,0.0,12.878850,0.0,0.0,12.878850,0.0,12.878850,0.0,0.0,12.878850
4,RID00006,0.0,11.937029,0.0,0.0,11.937029,0.0,0.0,11.937029,0.0,...,0.0,11.937029,0.0,0.0,11.937029,0.0,11.937029,0.0,0.0,11.937029
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7243,RID07407,0.0,12.618754,0.0,0.0,12.618754,0.0,0.0,12.618754,0.0,...,0.0,12.618754,0.0,0.0,12.618754,0.0,12.618754,0.0,0.0,12.618754
7244,RID07408,0.0,10.494182,0.0,0.0,10.494182,0.0,0.0,10.494182,0.0,...,0.0,10.494182,0.0,0.0,10.494182,0.0,10.494182,0.0,0.0,10.494182
7245,RID07409,0.0,11.720739,0.0,0.0,11.720739,0.0,0.0,11.720739,0.0,...,0.0,11.720739,0.0,0.0,11.720739,0.0,11.720739,0.0,0.0,11.720739
7246,RID07410,0.0,8.906229,0.0,0.0,8.906229,0.0,0.0,8.906229,0.0,...,0.0,8.906229,0.0,0.0,8.906229,0.0,8.906229,0.0,0.0,8.906229


In [9]:
# Maps a source column of data_baseline to the covariate name used downstream.
# Built from ordered pairs so the key order (and thus the column order of any
# frame selected with these keys) is explicit.
covariate_map = dict(
    [
        ("age3", "age"),
        ("sex", "sex"),
        ("systol3", "systolic_blood_pressure"),
        ("cholesterol3", "cholesterol"),
        ("hdl3", "hdl_cholesterol"),
        ("phecode_202_prev", "diabetes"),
        ("bmi3", "bmi"),
        ("cigstat3", "smoking_status"),
    ]
)

In [10]:
# Keep only the mapped source columns and rename them to their covariate names.
# The columns are selected with an explicit list of the mapping's keys:
# indexing a DataFrame with a dict is deprecated in pandas 1.5 and is no longer
# accepted by pandas 2.x. `rename` returns a new frame, so data_baseline is
# left untouched.
data_covariates_pre = data_baseline[list(covariate_map)].rename(columns=covariate_map)

  data_covariates_pre = data_baseline[covariate_map]


In [11]:
# Lookup tables translating the raw codes of sex and smoking_status to labels.
sex_dict = {"1": "Male", "2": "Female"}
smoking_dict = {1: "Current", 2: "Former", 3: "Never"}

# Derive the analysis frame from the renamed covariates without touching the
# input: `assign` works on a copy and swaps in the recoded columns.
#   * age            -> cast to 32-bit integer
#   * sex            -> codes replaced by labels, stored as categorical
#   * smoking_status -> codes replaced by labels, stored as categorical
data_covariates = data_covariates_pre.assign(
    age=data_covariates_pre["age"].astype(np.int32),
    sex=data_covariates_pre["sex"].replace(sex_dict).astype("category"),
    smoking_status=(
        data_covariates_pre["smoking_status"]
        .replace(smoking_dict)
        .astype("category")
    ),
)

In [12]:
# Sanity check: list the distinct smoking_status values after recoding.
data_covariates["smoking_status"].unique()

['Never', 'Former', 'Current', NaN]
Categories (3, object): ['Current', 'Former', 'Never']

In [13]:
# Covariate sets required by each benchmark model.
AgeSex = ["age", "sex"]

SCORE2 = [
    "age",
    "sex",
    "smoking_status",  # current smoker
    "systolic_blood_pressure",
    "cholesterol",
    "hdl_cholesterol",
]

ASCVD = [
    "age",
    "sex",
    # "ethnic_background",
    "smoking_status",  # current smoker
    "diabetes",  # diabetes
    # "antihypertensives",
    "systolic_blood_pressure",
    "cholesterol",
    "hdl_cholesterol",
]

# Union of every covariate any model needs, in alphabetical order.
covariates_scores = sorted(set(AgeSex + SCORE2 + ASCVD))

# Assert that all required covariates are available. Stop here if any is
# missing: merely printing a message would let the following cells continue
# with (and persist) a frame that was never restricted to the score covariates.
missing_covariates = sorted(set(covariates_scores).difference(data_covariates.columns))
if missing_covariates:
    raise ValueError(
        f"Not all required covariates are prepared! {missing_covariates}"
    )

print("Success, all required covariates are prepared!")
# Restrict the frame to exactly the covariates used by the scores.
data_covariates = data_covariates[covariates_scores]

Success, all required covariates are prepared!


In [14]:
# Persist the prepared covariates into the experiment directory.
# reset_index() turns the index back into a regular column before writing,
# so it is stored in the feather file alongside the covariates.
data_covariates.reset_index().to_feather(f"{experiment_path}/data_covariates_full.feather")

In [15]:
# Inspect the column dtypes of the prepared covariates.
data_covariates.dtypes

age                           int32
cholesterol                 float64
diabetes                    float64
hdl_cholesterol             float64
sex                        category
smoking_status             category
systolic_blood_pressure     float64
dtype: object

In [16]:
# Mean of every numeric covariate per sex.
# numeric_only=True states explicitly that the categorical smoking_status
# column is excluded; relying on pandas to drop it silently is deprecated and
# raises a TypeError on pandas >= 2.0. observed=False pins the current grouping
# behaviour for the categorical key instead of depending on a changing default.
data_covariates.groupby("sex", observed=False).mean(numeric_only=True)

  data_covariates.groupby("sex").mean()


Unnamed: 0_level_0,age,cholesterol,diabetes,hdl_cholesterol,systolic_blood_pressure
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Female,67.15455,5.7053,0.019782,1.657415,135.868349
Male,68.426966,5.008701,0.031211,1.319057,136.166199


In [17]:
# Scratch check: count the entries of a pasted list of names (four covariates,
# one OMOP concept and the phecode endpoints). The list is a literal, so the
# result does not depend on any other cell; it evaluates to 360.
len(['cholesterol', 'hdl_cholesterol', 'smoking_status', 'systolic_blood_pressure', 'OMOP_4306655', 'phecode_052', 'phecode_052-32', 'phecode_061', 'phecode_070', 'phecode_089', 'phecode_089-1', 'phecode_089-2', 'phecode_089-3', 'phecode_092', 'phecode_092-2', 'phecode_098', 'phecode_100', 'phecode_101', 'phecode_101-1', 'phecode_101-2', 'phecode_101-4', 'phecode_101-41', 'phecode_101-42', 'phecode_101-8', 'phecode_102', 'phecode_102-1', 'phecode_103', 'phecode_103-1', 'phecode_103-3', 'phecode_104', 'phecode_105', 'phecode_106', 'phecode_106-2', 'phecode_106-21', 'phecode_106-3', 'phecode_107', 'phecode_107-2', 'phecode_108', 'phecode_108-4', 'phecode_108-41', 'phecode_108-5', 'phecode_112', 'phecode_116', 'phecode_116-1', 'phecode_120', 'phecode_120-1', 'phecode_120-2', 'phecode_120-21', 'phecode_121', 'phecode_122', 'phecode_122-2', 'phecode_130', 'phecode_135', 'phecode_135-5', 'phecode_136', 'phecode_138', 'phecode_138-2', 'phecode_139', 'phecode_139-5', 'phecode_139-6', 'phecode_144', 'phecode_144-2', 'phecode_144-21', 'phecode_149', 'phecode_160', 'phecode_160-1', 'phecode_164', 'phecode_164-1', 'phecode_168', 'phecode_168-1', 'phecode_170', 'phecode_170-1', 'phecode_177', 'phecode_177-2', 'phecode_181', 'phecode_200', 'phecode_200-1', 'phecode_200-3', 'phecode_202', 'phecode_202-1', 'phecode_202-2', 'phecode_208', 'phecode_211', 'phecode_232', 'phecode_232-2', 'phecode_232-4', 'phecode_236', 'phecode_244', 'phecode_247', 'phecode_247-7', 'phecode_247-72', 'phecode_256', 'phecode_256-7', 'phecode_280-1', 'phecode_281', 'phecode_281-2', 'phecode_282', 'phecode_282-1', 'phecode_283', 'phecode_283-8', 'phecode_286', 'phecode_286-1', 'phecode_286-2', 'phecode_288', 'phecode_308', 'phecode_324', 'phecode_324-1', 'phecode_325', 'phecode_325-2', 'phecode_326', 'phecode_326-1', 'phecode_327', 'phecode_328', 'phecode_328-1', 'phecode_328-7', 'phecode_328-9', 'phecode_329', 'phecode_329-9', 'phecode_330', 'phecode_330-3', 'phecode_331', 'phecode_331-6', 
'phecode_331-8', 'phecode_333', 'phecode_334', 'phecode_335', 'phecode_336', 'phecode_337', 'phecode_341', 'phecode_341-2', 'phecode_347', 'phecode_349', 'phecode_350', 'phecode_351', 'phecode_354', 'phecode_355', 'phecode_356', 'phecode_360', 'phecode_363', 'phecode_367', 'phecode_369', 'phecode_371', 'phecode_374', 'phecode_374-3', 'phecode_374-4', 'phecode_375', 'phecode_376', 'phecode_386', 'phecode_387', 'phecode_388', 'phecode_390', 'phecode_391', 'phecode_394', 'phecode_396', 'phecode_400', 'phecode_400-2', 'phecode_401', 'phecode_401-1', 'phecode_401-3', 'phecode_403', 'phecode_404', 'phecode_404-1', 'phecode_404-11', 'phecode_406', 'phecode_410', 'phecode_410-2', 'phecode_411', 'phecode_413', 'phecode_413-1', 'phecode_413-2', 'phecode_413-3', 'phecode_414', 'phecode_416', 'phecode_416-1', 'phecode_416-2', 'phecode_416-4', 'phecode_417', 'phecode_420', 'phecode_423', 'phecode_424', 'phecode_426', 'phecode_430', 'phecode_431', 'phecode_431-1', 'phecode_431-11', 'phecode_431-12', 'phecode_433', 'phecode_433-2', 'phecode_436', 'phecode_437', 'phecode_438', 'phecode_439', 'phecode_440', 'phecode_440-1', 'phecode_440-13', 'phecode_440-3', 'phecode_443', 'phecode_444', 'phecode_444-1', 'phecode_444-11', 'phecode_446', 'phecode_448', 'phecode_460', 'phecode_460-1', 'phecode_460-2', 'phecode_462', 'phecode_462-2', 'phecode_468', 'phecode_469', 'phecode_471', 'phecode_471-5', 'phecode_472', 'phecode_474', 'phecode_475', 'phecode_476', 'phecode_479', 'phecode_479-3', 'phecode_481', 'phecode_483', 'phecode_486', 'phecode_487', 'phecode_488', 'phecode_491', 'phecode_495', 'phecode_501', 'phecode_502', 'phecode_503', 'phecode_507', 'phecode_509', 'phecode_510', 'phecode_510-2', 'phecode_511', 'phecode_512', 'phecode_513', 'phecode_513-2', 'phecode_513-3', 'phecode_514', 'phecode_516', 'phecode_518', 'phecode_520', 'phecode_520-1', 'phecode_520-11', 'phecode_520-13', 'phecode_520-14', 'phecode_520-2', 'phecode_522', 'phecode_522-1', 'phecode_522-11', 'phecode_522-12', 
'phecode_522-9', 'phecode_523', 'phecode_524', 'phecode_524-1', 'phecode_525', 'phecode_526', 'phecode_526-2', 'phecode_527', 'phecode_528', 'phecode_528-1', 'phecode_528-2', 'phecode_529', 'phecode_529-2', 'phecode_529-3', 'phecode_530', 'phecode_532', 'phecode_537', 'phecode_542', 'phecode_542-1', 'phecode_546', 'phecode_550', 'phecode_550-1', 'phecode_550-2', 'phecode_552', 'phecode_554', 'phecode_554-1', 'phecode_554-11', 'phecode_555', 'phecode_556', 'phecode_580', 'phecode_581', 'phecode_581-3', 'phecode_582', 'phecode_583', 'phecode_584', 'phecode_585', 'phecode_586', 'phecode_592', 'phecode_593', 'phecode_594', 'phecode_594-1', 'phecode_594-3', 'phecode_594-4', 'phecode_596', 'phecode_597', 'phecode_597-1', 'phecode_599', 'phecode_600', 'phecode_602', 'phecode_603', 'phecode_603-1', 'phecode_604', 'phecode_619', 'phecode_622', 'phecode_627', 'phecode_660', 'phecode_664', 'phecode_665', 'phecode_670', 'phecode_673', 'phecode_678', 'phecode_679', 'phecode_679-1', 'phecode_680', 'phecode_682', 'phecode_686', 'phecode_686-1', 'phecode_686-2', 'phecode_688', 'phecode_700', 'phecode_702', 'phecode_703', 'phecode_703-1', 'phecode_703-11', 'phecode_704', 'phecode_705', 'phecode_705-1', 'phecode_706', 'phecode_707', 'phecode_708', 'phecode_708-1', 'phecode_708-7', 'phecode_709', 'phecode_711', 'phecode_713', 'phecode_714', 'phecode_714-3', 'phecode_715', 'phecode_716', 'phecode_718', 'phecode_719', 'phecode_721', 'phecode_721-1', 'phecode_722', 'phecode_726', 'phecode_726-1', 'phecode_726-2', 'phecode_727', 'phecode_800', 'phecode_801', 'phecode_805', 'phecode_807', 'phecode_808', 'phecode_812', 'phecode_815', 'phecode_819', 'phecode_829', 'phecode_848', 'phecode_979', 'phecode_981', 'phecode_997'])

360