# Benchmarks

## Initialize

In [None]:
import os
import math
import pathlib
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.feather as feather
from tqdm.auto import tqdm
from IPython.display import clear_output

import warnings
from lifelines.utils import CensoringType
from lifelines.utils import concordance_index

In [None]:
node = !hostname
if "sc" in node[0]:
    base_path = "/sc-projects/sc-proj-ukb-cvd"
else: 
    base_path = "/data/analysis/ag-reils/ag-reils-shared/cardioRS"
print(base_path)

project_label = "22_medical_records"
project_path = f"{base_path}/results/projects/{project_label}"
figure_path = f"{project_path}/figures"
output_path = f"{project_path}/data"

pathlib.Path(figure_path).mkdir(parents=True, exist_ok=True)
pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)

experiment = 230425
experiment_path = f"{output_path}/{experiment}"
pathlib.Path(experiment_path).mkdir(parents=True, exist_ok=True)

In [None]:
# Load the endpoint metadata of this experiment and keep a sorted list of the
# values in its "endpoint" column.
endpoints_md = pd.read_csv(f"{experiment_path}/endpoints.csv")
endpoints = sorted(endpoints_md["endpoint"].tolist())

In [None]:
# Baseline covariates, indexed by "eid". The path is built from base_path
# (like every other path in this notebook) instead of a hard-coded
# /sc-projects prefix, so the cell follows the host-dependent project root.
baseline_covariates = pd.read_feather(
    f"{base_path}/data/2_datasets_pre/230424_medicalhistory/baseline_covariates.feather"
).set_index("eid")

In [None]:
# Baseline outcome records, indexed by "eid". The path is built from base_path
# (like every other path in this notebook) instead of a hard-coded
# /sc-projects prefix, so the cell follows the host-dependent project root.
baseline_records = pd.read_feather(
    f"{base_path}/results/projects/22_medical_records/data/baseline_outcomes_220627.feather"
).set_index("eid")

In [None]:
# Left-join the baseline records onto the covariates via the index of both
# frames, so every row of baseline_covariates is kept.
data_baseline = pd.merge(
    baseline_covariates,
    baseline_records,
    how="left",
    left_index=True,
    right_index=True,
)

In [None]:
# Inspect all columns whose name contains "fh".
fh_columns = [name for name in data_baseline.columns if "fh" in name]
data_baseline[fh_columns]

In [None]:
# Look up the endpoints whose phecode_string contains "erectile"
# (str.contains is case-sensitive by default).
is_erectile = endpoints_md["phecode_string"].str.contains("erectile")
endpoints_md[is_erectile]

In [None]:
# Look up the endpoint row(s) with phecode 705.1.
endpoints_md[endpoints_md["phecode"] == 705.1]

In [None]:
# Process the covariates for the downstream Cox models

In [None]:
# Maps raw column names of the baseline table to the short covariate names
# used from here on. Insertion order is kept as-is because it determines the
# column order when the map is used to select and rename columns.
covariate_map = {
    "ATC_C02_antihypertensives_0_0": "antihypertensives",
    "ATC_H02_corticosteroids_for_systemic_use_0_0": "corticosteroids",
    "ATC_N05_psycholeptics_0_0": "psycholeptics",
    "age_at_recruitment_f21022_0_0": "age",
    "cholesterol_f30690_0_0": "cholesterol",
    "ethnic_background_f21000_0_0": "ethnic_background",
    "fh_heart_disease_0_0": "fh_heart_disease",
    "hdl_cholesterol_f30760_0_0": "hdl_cholesterol",
    "phecode_202-1_prev": "diabetes1",
    "phecode_202-2_prev": "diabetes2",
    "phecode_202_prev": "diabetes",
    "phecode_286-1_prev": "bipolar_disorder",
    "phecode_286-2_prev": "major_depressive_disorder",
    "phecode_287-1_prev": "schizophrenia",
    "phecode_331-6_prev": "migraine",
    "phecode_416-21_prev": "atrial_fibrillation",
    "phecode_584_prev": "renal_failure",
    "phecode_605-1_prev": "male_erectile_dysfunction",
    "phecode_700-11_prev": "systemic_lupus_erythematosus",
    "phecode_705-1_prev": "rheumatoid_arthritis",
    "sex_f31_0_0": "sex",
    "smoking_status_f20116_0_0": "smoking_status",
    "standing_height_f50_0_0": "height",
    "systolic_blood_pressure_automated_reading_f4080_0_0": "systolic_blood_pressure_0",
    "systolic_blood_pressure_automated_reading_f4080_0_1": "systolic_blood_pressure_1",
    "weight_f21002_0_0": "weight",
    # comorbidities
    # ("phecode_202_prev" -> "diabetes" is already mapped above; repeating the
    # key in a dict literal would only silently overwrite the first entry)
    "phecode_057-1_prev": "aids",
    "phecode_121_prev": "leukemia",
    "phecode_122_prev": "lymphoma",
    "phecode_130_prev": "solid_tumor",
    "phecode_328_prev": "dementia",
    "phecode_436_prev": "atherosclerosis",
    "phecode_341-2_prev": "hemiplegia",
    "phecode_401_prev": "hypertension",
    "phecode_404-1_prev": "myocardial_infarction",
    "phecode_424_prev": "heart_failure",
    "phecode_431_prev": "stroke_tia",
    "phecode_448_prev": "pad",
    "phecode_474_prev": "copd",
    "phecode_513_prev": "peptic_ulcer",
    "phecode_542_prev": "chronic_liver_disease",
    "phecode_583_prev": "chronic_kidney_disease",
    "phecode_557_prev": "gi_hemorrhage",
    "phecode_700_prev": "connective_tissue_disorder",
}

In [None]:
# Select the mapped raw columns (in the order of covariate_map) and rename
# them to their short names. The dict is converted to a list of keys first:
# indexing a DataFrame with a dict was deprecated in pandas 1.5 and raises a
# TypeError from pandas 2.0 on.
data_covariates_pre = data_baseline[list(covariate_map)].rename(columns=covariate_map)

In [None]:
# Derive the model covariates from the renamed baseline columns.
data_covariates = data_covariates_pre.copy()
data_covariates["age"] = data_covariates["age"].astype(np.int32)

# BMI = weight / (height / 100)^2 (presumably weight in kg and height in cm
# -- TODO confirm units against the data dictionary).
data_covariates["bmi"] = data_covariates["weight"] / (data_covariates["height"] / 100) ** 2

# Row-wise mean of the two systolic readings. DataFrame.mean skips NaN by
# default, so a single available reading is used as-is; this is what
# `.agg(np.mean, 1)` resolved to, without the row-wise callable dispatch.
data_covariates["systolic_blood_pressure"] = data_covariates[
    ["systolic_blood_pressure_0", "systolic_blood_pressure_1"]
].mean(axis=1)

# Coarse ethnic-background groups and the raw labels assigned to each of them;
# labels listed under np.nan are treated as missing.
ethn_bg_def = {
    "White": ["White", "British", "Irish", "Any other white background"],
    "Mixed": ["Mixed", "White and Black Caribbean", "White and Black African", "White and Asian", "Any other mixed background"],
    "Asian": ["Asian or Asian British", "Indian", "Pakistani", "Bangladeshi", "Any other Asian background"],
    "Black": ["Black or Black British", "Caribbean", "African", "Any other Black background"],
    "Chinese": ["Chinese"],
    np.nan: ["Other ethnic group", "Do not know", "Prefer not to answer"],
}

# Invert the definition into a flat {raw label: group} lookup.
ethn_bg_dict = {
    label: group
    for group, labels in ethn_bg_def.items()
    for label in labels
}

# Assign the replaced column back instead of calling replace(..., inplace=True)
# on `data_covariates[...]`: the in-place call acts on a temporary Series,
# which triggers a chained-assignment warning in recent pandas and does not
# update the DataFrame at all once Copy-on-Write is active.
data_covariates["ethnic_background"] = (
    data_covariates["ethnic_background"].replace(ethn_bg_dict).astype("category")
)

In [None]:
# Covariate sets used by the benchmark models. Every name must be a column of
# the prepared covariate table.

# Minimal set: age and sex only.
AgeSex = ["age", "sex"]

# SCORE2 variables.
SCORE2 = [
    "age",
    "sex",
    "smoking_status",  # current smoker
    "systolic_blood_pressure",
    "cholesterol",
    "hdl_cholesterol",
]

# ASCVD variables.
ASCVD = [
    "age",
    "sex",
    "ethnic_background",
    "smoking_status",  # current smoker
    "diabetes",
    "antihypertensives",
    "systolic_blood_pressure",
    "cholesterol",
    "hdl_cholesterol",
]

# QRISK3 variables.
QRISK3 = [
    "age",
    "sex",
    "ethnic_background",
    "smoking_status",  # current smoker
    "bmi",
    "diabetes1",  # type 1 diabetes
    "diabetes2",  # type 2 diabetes
    "fh_heart_disease",
    "renal_failure",
    "atrial_fibrillation",
    "migraine",
    "rheumatoid_arthritis",
    "systemic_lupus_erythematosus",  # SLE
    "schizophrenia",
    "bipolar_disorder",
    "major_depressive_disorder",
    "male_erectile_dysfunction",
    "antihypertensives",
    "corticosteroids",
    "psycholeptics",
    "systolic_blood_pressure",
    "cholesterol",
    "hdl_cholesterol",
]

# Comorbidity indicators.
Comorbidities = [
    "aids",
    "leukemia",
    "lymphoma",
    "solid_tumor",
    "diabetes",
    "dementia",
    "hemiplegia",
    "myocardial_infarction",
    "heart_failure",
    "stroke_tia",
    "pad",
    "copd",
    "peptic_ulcer",
    "chronic_liver_disease",
    "chronic_kidney_disease",
    "connective_tissue_disorder",
]

# Check that every covariate required by any of the variable sets is a column
# of data_covariates. On success the frame is reduced to exactly those columns
# in sorted order; otherwise the missing names are printed and the frame is
# left unchanged.
covariates_scores = sorted({*AgeSex, *SCORE2, *ASCVD, *QRISK3, *Comorbidities})
missing_covariates = set(covariates_scores).difference(data_covariates.columns.to_list())
if missing_covariates:
    print("Not all required covariates are prepared!", list(missing_covariates))
else:
    print("Success, all required covariates are prepared!")
    data_covariates = data_covariates[covariates_scores]

In [None]:
# Downcast every float64 column to float32; columns of other dtypes are not touched.
float64_columns = data_covariates.select_dtypes(np.float64).columns
data_covariates[float64_columns] = data_covariates[float64_columns].astype(np.float32)

In [None]:
# Display the prepared covariate table for a visual check.
data_covariates

In [None]:
# Write the covariates to the experiment directory. The index is moved back
# into a regular column first so that it is stored as part of the table.
covariates_export = data_covariates.reset_index()
covariates_export.to_feather(f"{experiment_path}/data_covariates_full.feather")

In [None]:
# Show the dtype of every covariate column.
data_covariates.dtypes

In [None]:
# Number of missing values per covariate column (sum runs over rows by default).
data_covariates.isna().sum()

In [None]:
# Summary statistics for all columns; include="all" also covers non-numeric columns.
data_covariates.describe(include="all")