# Benchmarks

## Initialize

In [None]:
import os
import math
import pathlib
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.feather as feather
from tqdm.auto import tqdm
from IPython.display import clear_output

import warnings
from lifelines.utils import CensoringType
from lifelines.utils import concordance_index

In [None]:
# Every path used in this notebook is built from this hard-coded root.
base_path = "/home/jakobs"
project_path = f"{base_path}/data"

# Per-experiment output folder; created on first run, reused afterwards
# (exist_ok=True makes re-running the cell harmless).
experiment = "230321"
experiment_path = f"{project_path}/{experiment}"
experiment_dir = pathlib.Path(experiment_path)
experiment_dir.mkdir(parents=True, exist_ok=True)

# The integers 0..9.
partitions = list(range(10))

In [None]:
# Load the baseline table from its feather file; the trailing bare expression
# renders the frame as the cell output.
baseline_file = f"{base_path}/data/data_baseline_230321.feather"
data_baseline = pd.read_feather(baseline_file)
data_baseline

In [None]:
# Load the wide-format outcomes table from its feather file.
outcomes_file = f"{base_path}/data/data_outcomes_wide_230320.feather"
data_outcomes = pd.read_feather(outcomes_file)

In [None]:
# Left join on "eid": every baseline row is kept and the outcome columns are
# attached where an eid matches.
# NOTE(review): this overwrites `data_baseline`, so re-running the cell merges
# the outcomes a second time - reload the baseline table first.
data_baseline = pd.merge(data_baseline, data_outcomes, how="left", on="eid")

In [None]:
# Endpoint metadata, indexed by the "endpoint" column.
endpoints_file = f"{base_path}/data/endpoints_epic_md.feather"
endpoints_md = pd.read_feather(endpoints_file)
endpoints_md = endpoints_md.set_index("endpoint")

# Plain Python list of the index values, in table order.
endpoints = endpoints_md.index.tolist()

In [None]:
# Show the endpoint metadata rows whose `phecode_string` contains "Diabetes".
# The result is only displayed as the cell output; it is not stored.
endpoints_md.query("phecode_string.str.contains('Diabetes')")

In [None]:
# Display the outcomes table as the cell output (inspection only).
data_outcomes

In [None]:
# Source column name -> covariate name used from here on.
# Insertion order matters: it determines the column order when the source
# columns are selected via this mapping.
covariate_map = {
    "age3": "age",
    "sex": "sex",
    "systol3": "systolic_blood_pressure",
    "cholesterol3": "cholesterol",
    "hdl3": "hdl_cholesterol",
    "phecode_202_prev": "diabetes",
    "bmi3": "bmi",
    "cigstat3": "smoking_status",
}

In [None]:
# Select the source columns named by the keys of `covariate_map` and rename
# them to the mapped covariate names.
# The keys are passed as an explicit list: using the dict itself as an indexer
# is deprecated in pandas 1.5 and raises TypeError from pandas 2.0 on.
# `rename` returns a new frame, so `data_baseline` keeps its original labels.
data_covariates_pre = data_baseline[list(covariate_map)].rename(columns=covariate_map)

In [None]:
# Prepare derived variables on a copy so `data_covariates_pre` stays untouched.
age_as_int32 = data_covariates_pre["age"].astype(np.int32)
data_covariates = data_covariates_pre.copy()
data_covariates["age"] = age_as_int32

In [None]:
# Named covariate sets; every entry is a column name of `data_covariates`.
AgeSex = ["age", "sex"]

SCORE2 = [
    "age",
    "sex",
    "smoking_status",  # current smoker
    "systolic_blood_pressure",
    "cholesterol",
    "hdl_cholesterol",
]

ASCVD = [
    "age",
    "sex",
    # "ethnic_background",
    "smoking_status",  # current smoker
    "diabetes",  # diabetes
    # "antihypertensives",
    "systolic_blood_pressure",
    "cholesterol",
    "hdl_cholesterol",
]

# Sorted, de-duplicated union of all covariates referenced by the sets above.
covariates_scores = sorted(set(AgeSex + SCORE2 + ASCVD))

# Make sure all required variables are available. Stop here when one is
# missing: only printing a message would let the later cells run on (and
# persist) a table that was never restricted to the required columns.
missing_covariates = sorted(set(covariates_scores).difference(data_covariates.columns))
if missing_covariates:
    raise ValueError(
        f"Not all required covariates are prepared! {missing_covariates}"
    )

print("Success, all required covariates are prepared!")
# Keep only the required columns, in sorted order.
data_covariates = data_covariates[covariates_scores]

In [None]:
# Persist the covariate table in the experiment folder. `reset_index()` moves
# the current index into a regular column before the frame is written.
covariates_file = f"{experiment_path}/data_covariates_full.feather"
data_covariates.reset_index().to_feather(covariates_file)