# Benchmarks

## Initialize

In [1]:
import os
import math
import pathlib
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.feather as feather
from tqdm.auto import tqdm
from IPython.display import clear_output

import warnings
from lifelines.utils import CensoringType
from lifelines.utils import concordance_index

In [2]:
base_path = "/home/jakobs"

project_path = f"{base_path}/data"

experiment = '230323'
experiment_path = f"{project_path}/{experiment}"
pathlib.Path(experiment_path).mkdir(parents=True, exist_ok=True)

partitions = [i for i in range(10)]

In [3]:
data_baseline = pd.read_feather(f"{base_path}/data/data_baseline_230321.feather")
data_baseline

Unnamed: 0,eid,ml_glaucoma_gradability_grad_l,ml_glaucoma_gradability_grad_r,ml_glaucoma_gradability_wdiff_l,ml_glaucoma_gradability_wdiff_r,age3,alcohol3,cigstat3,recruitment_date,systol3,diastol3,mi3,cva3,sex,bmi3,cholesterol3,triglyceride3,hdl3,ldl3
0,RID00001,0.955591,0.760914,0.043725,0.226360,86.127310,2.0,3.0,2007-08-07,138.0,72.5,0.0,0.0,2,33.455266,6.4,1.5,1.53,4.27
1,RID00002,0.986005,0.943931,0.013981,0.054606,70.505133,2.0,3.0,2010-07-08,105.5,68.5,0.0,0.0,2,25.593164,3.9,2.3,1.50,1.40
2,RID00003,0.990022,0.981540,0.009343,0.017813,64.577687,1.0,2.0,2007-04-26,128.5,78.5,0.0,0.0,2,25.372781,3.9,2.1,1.52,1.48
3,RID00004,0.894128,0.945897,0.103611,0.052175,66.740589,1.0,3.0,2006-07-31,154.5,87.5,0.0,0.0,2,27.070312,6.0,1.3,1.80,3.70
4,RID00006,0.940763,0.959411,0.055711,0.039147,72.366872,2.0,3.0,2007-07-10,138.0,66.5,0.0,0.0,2,26.189022,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7243,RID07407,0.940662,0.962702,0.058866,0.037013,73.675565,1.0,3.0,2006-11-03,132.0,73.0,0.0,0.0,1,25.032679,3.9,1.2,1.39,2.01
7244,RID07408,0.719619,0.637359,0.276180,0.355266,64.533881,1.0,3.0,2008-12-18,147.5,81.0,0.0,0.0,2,27.957894,6.8,1.0,1.76,4.64
7245,RID07409,0.635806,0.926931,0.342445,0.072251,82.475017,,,2007-09-27,144.5,80.0,0.0,0.0,2,26.053959,6.4,1.5,1.95,3.85
7246,RID07410,0.861651,0.054378,0.133713,0.028145,82.568104,1.0,2.0,2010-07-21,109.5,63.5,0.0,0.0,1,29.726252,,,,


In [4]:
data_outcomes = pd.read_feather(f"{base_path}/data/data_outcomes_wide_230320.feather")

In [5]:
data_baseline = data_baseline.merge(data_outcomes, on="eid", how="left").set_index("eid")

In [6]:
endpoints_md = pd.read_feather(f"{base_path}/data/endpoints_epic_md.feather").set_index("endpoint")
endpoints = endpoints_md.index.to_list()

In [7]:
endpoints_md.query("phecode_string.str.contains('Diabetes')")

Unnamed: 0_level_0,n_epic,eligable,n,freq,phecode,phecode_string,phecode_category,sex
endpoint,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
phecode_202,620,57936.0,4259.0,0.073512,202.0,Diabetes mellitus,Endo,Both


In [8]:
data_outcomes

Unnamed: 0,eid,OMOP_4306655_event,OMOP_4306655_time,phecode_001_event,phecode_001_prev,phecode_001_time,phecode_004-1_event,phecode_004-1_prev,phecode_004-1_time,phecode_004_event,...,phecode_979_prev,phecode_979_time,phecode_981_event,phecode_981_prev,phecode_981_time,phecode_984_event,phecode_984_time,phecode_997_event,phecode_997_prev,phecode_997_time
0,RID00001,0.0,11.860370,0.0,0.0,11.860370,0.0,0.0,11.860370,0.0,...,0.0,11.860370,0.0,0.0,11.860370,0.0,11.860370,0.0,0.0,11.860370
1,RID00002,0.0,8.941821,0.0,0.0,8.941821,0.0,0.0,8.941821,0.0,...,0.0,8.941821,0.0,0.0,8.941821,0.0,8.941821,0.0,0.0,8.941821
2,RID00003,0.0,12.142368,0.0,0.0,12.142368,0.0,0.0,12.142368,0.0,...,0.0,12.142368,0.0,0.0,12.142368,0.0,12.142368,0.0,0.0,12.142368
3,RID00004,0.0,12.878850,0.0,0.0,12.878850,0.0,0.0,12.878850,0.0,...,0.0,12.878850,0.0,0.0,12.878850,0.0,12.878850,0.0,0.0,12.878850
4,RID00006,0.0,11.937029,0.0,0.0,11.937029,0.0,0.0,11.937029,0.0,...,0.0,11.937029,0.0,0.0,11.937029,0.0,11.937029,0.0,0.0,11.937029
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7243,RID07407,0.0,12.618754,0.0,0.0,12.618754,0.0,0.0,12.618754,0.0,...,0.0,12.618754,0.0,0.0,12.618754,0.0,12.618754,0.0,0.0,12.618754
7244,RID07408,0.0,10.494182,0.0,0.0,10.494182,0.0,0.0,10.494182,0.0,...,0.0,10.494182,0.0,0.0,10.494182,0.0,10.494182,0.0,0.0,10.494182
7245,RID07409,0.0,11.720739,0.0,0.0,11.720739,0.0,0.0,11.720739,0.0,...,0.0,11.720739,0.0,0.0,11.720739,0.0,11.720739,0.0,0.0,11.720739
7246,RID07410,0.0,8.906229,0.0,0.0,8.906229,0.0,0.0,8.906229,0.0,...,0.0,8.906229,0.0,0.0,8.906229,0.0,8.906229,0.0,0.0,8.906229


In [9]:
covariate_map = {
    "age3":"age", 
    "sex":"sex", 
    "systol3":"systolic_blood_pressure", 
    "cholesterol3":"cholesterol", 
    "hdl3":"hdl_cholesterol", 
    "phecode_202_prev":"diabetes",
    "bmi3":"bmi",
    "cigstat3":"smoking_status"
}

In [10]:
data_covariates_pre = data_baseline[covariate_map]
data_covariates_pre.columns = [covariate_map[c] for c in data_covariates_pre.columns]

  data_covariates_pre = data_baseline[covariate_map]


In [11]:
# Copy the original data and prepare new variables
data_covariates = data_covariates_pre.copy()
data_covariates["age"] = data_covariates["age"].astype(np.int32)

# Create dictionaries to replace sex and smoking_status values
sex_dict = {"1": "Male", "2": "Female"}
smoking_dict = {1: "Current", 2: "Former", 3: "Never"}

# Replace sex and smoking_status values with their respective dictionary values
data_covariates["sex"] = data_covariates["sex"].replace(sex_dict).astype("category")
data_covariates["smoking_status"] = data_covariates["smoking_status"].replace(smoking_dict).astype("category")

In [12]:
data_covariates["smoking_status"].unique()

['Never', 'Former', 'Current', NaN]
Categories (3, object): ['Current', 'Former', 'Never']

In [13]:
AgeSex = ["age", "sex"]

SCORE2 = [
    "age", 
    "sex",
    "smoking_status", # current smoker
    "systolic_blood_pressure",
    "cholesterol",
    "hdl_cholesterol",

] 

ASCVD = [
    "age", 
    "sex",
   # "ethnic_background",
    "smoking_status", # current smoker
    "diabetes", # diabetes
    #"antihypertensives", 
    "systolic_blood_pressure",
    "cholesterol",
    "hdl_cholesterol",
] 

# assert, that all variables are available
covariates_scores = sorted(list(set(AgeSex + SCORE2 + ASCVD)))
if not set(covariates_scores).issubset(data_covariates.columns.to_list()):
    print("Not all required covariates are prepared!", list(set(covariates_scores).difference(data_covariates.columns.to_list())))
else:
    print("Success, all required covariates are prepared!")
    data_covariates = data_covariates[covariates_scores]

Success, all required covariates are prepared!


In [14]:
data_covariates.reset_index().to_feather(f"{experiment_path}/data_covariates_full.feather")

In [16]:
data_covariates.dtypes

age                          int32
cholesterol                float64
diabetes                   float64
hdl_cholesterol            float64
sex                         object
smoking_status             float64
systolic_blood_pressure    float64
dtype: object

In [22]:
data_covariates.groupby("sex").mean()

Unnamed: 0_level_0,age,cholesterol,diabetes,hdl_cholesterol,smoking_status,systolic_blood_pressure
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,68.426966,5.008701,0.031211,1.319057,2.345166,136.166199
2,67.15455,5.7053,0.019782,1.657415,2.544999,135.868349
