# Benchmarks

## Initialize

In [2]:
%load_ext autoreload
%autoreload 2

import os
from tqdm.auto import tqdm
import pathlib

import numpy as np
import pandas as pd
import lifelines
import pandas as pd

from joblib import Parallel, delayed
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings("ignore")
import shutil

import plotly.express as px
import plotly.graph_objects as go
from plotly.graph_objects import Box

import matplotlib.pyplot as plt
from lifelines import CRCSplineFitter
import warnings
from lifelines.utils import CensoringType

import plotly.graph_objects as go
from plotly.subplots import make_subplots
import math

from IPython.display import clear_output
import pathlib

from lifelines.utils import concordance_index

In [None]:
from dask.distributed import Client, LocalCluster
cluster = LocalCluster(n_workers=1, threads_per_worker=10)
client = Client(cluster)

In [None]:
node = !hostname
if "sc" in node[0]:
    base_path = "/sc-projects/sc-proj-ukb-cvd"
else: base_path = "/data/analysis/ag-reils/ag-reils-shared/cardioRS"
print(base_path)

project_name = "210714_metabolomics"
#data_path = "/data/analysis/ag-reils/steinfej"
data_pre = f"{base_path}/data/2_datasets_pre/{project_name}"
data_post = f"{base_path}/data/3_datasets_post/{project_name}"

project_label = "21_metabolomics_multitask"
project_path = f"{base_path}/results/projects/{project_label}"
figures_path = f"{project_path}/figures"
data_results_path = f"{project_path}/data"
pathlib.Path(figures_path).mkdir(parents=True, exist_ok=True)
pathlib.Path(data_results_path).mkdir(parents=True, exist_ok=True)

In [5]:
endpoints = [
    #Cardiovascular
    'M_MACE',
    'M_coronary_heart_disease',
    'M_cerebral_stroke',
    'M_peripheral_arterial_disease',
    'M_atrial_fibrillation',
    'M_heart_failure',
    'M_abdominal_aortic_aneurysm',
    'M_venous_thrombosis',
    
    # General IM
    'M_type_2_diabetes',
    'M_liver_disease',
    'M_renal_disease',
    
    # Pulmological
    'M_asthma', 
    'M_chronic_obstructuve_pulmonary_disease',  
    
    # Psychiatric/Neurological
    'M_all_cause_dementia',
    'M_parkinsons_disease',   
    
    # Cancers
    "M_lung_cancer",
    "M_non_melanoma_skin_cancer",
    "M_colon_cancer",
    "M_rectal_cancer",
    "M_prostate_cancer",
    "M_breast_cancer",
    
    # Ophtalmological
    'M_cataracts', 
    'M_glaucoma',
    
    # Traumatology
    'M_fractures',
]

In [6]:
partitions = [str(p) for p in range(0, 22)]
splits = ["train", "valid", "test"]

In [7]:
run = "220126"

In [None]:
!ls {data_results_path}

### Load models

In [9]:
project="CardioRS/metabolomics"
#modules = ["CoxPH", "DeepHit", "DeepSurvivalMachine"]
task = "MultiTaskSurvivalTask"
survival_task = "DeepSurvivalMachine"
modules = ["DeepSurvivalMachine"]
datamodules = ["UKBBSurvivalDatamodule"]
#feature_sets = ["AgeSex", "Metabolomics", "AgeSexMetabolomics"]
tag="211006_ds_multitask_cancer_24"
#tag="TaskSpecificExclusion_Debug!"
#https://docs.wandb.ai/ref/app/features/panels/code
import neptune.new as neptune
project = neptune.get_project(project, api_token)
df = project.fetch_runs_table(tag=tag).to_pandas()

In [10]:
log_path = f"{base_path}/results/models/NeptuneLogger"

In [11]:
from pathlib import Path
pred_paths= [f"{log_path}/{Path(*Path(p).parts[-3:])}" for p in df["prediction_path"].to_list() if p==p]

In [None]:
import joblib
def get_df(path): return pd.read_feather(path)#return pd.read_csv(f"{path[:-8]}.csv", index_col=0)
print("Status: ", sum(x is not None for x in pred_paths)/len(pred_paths))
with joblib.parallel_backend('dask'):
    dfs = Parallel(n_jobs=80)(delayed(get_df)(path) for path in tqdm(pred_paths) if path is not None if not pd.isna(path))

In [13]:
preds_all = pd.concat(dfs, axis=0).reset_index(drop=True)

In [None]:
preds_all.query("split=='test'")

In [15]:
#preds_all.to_feather(f"{data_results_path}/preds_all.feather")

In [16]:
def get_feature_string(d):
    l = []
    for k, v in eval(d).items():
        for i, j in v.items():
            for m in j:
                if not m in l: l.extend(j)
    return repr(l)

def clean_df(df, predictions):
   #feature_df = df[[\"parameters/feature_set\", \"parameters/features_yaml\"]].drop_duplicates().dropna()\n",
   #feature_map ={row[\"parameters/features_yaml\"]: row[\"parameters/feature_set\"] for i, row in feature_df.iterrows()}\n",
    predictions["features"] = predictions["features_names"].map(feature_map)
    return predictions

In [17]:
def clean_df(df, predictions):
#feature_df = df[[\"parameters/feature_set\", \"parameters/features_yaml\"]].drop_duplicates().dropna()\n",
    feature_map ={"['age_at_recruitment', 'sex', 'NMR_3hydroxybutyrate', 'NMR_acetate', 'NMR_acetoacetate', 'NMR_acetone', 'NMR_alanine', 'NMR_albumin', 'NMR_apolipoprotein_a1', 'NMR_apolipoprotein_b', 'NMR_average_diameter_for_hdl_particles', 'NMR_average_diameter_for_ldl_particles', 'NMR_average_diameter_for_vldl_particles', 'NMR_cholesterol_in_chylomicrons_and_extremely_large_vldl', 'NMR_cholesterol_in_idl', 'NMR_cholesterol_in_large_hdl', 'NMR_cholesterol_in_large_ldl', 'NMR_cholesterol_in_large_vldl', 'NMR_cholesterol_in_medium_hdl', 'NMR_cholesterol_in_medium_ldl', 'NMR_cholesterol_in_medium_vldl', 'NMR_cholesterol_in_small_hdl', 'NMR_cholesterol_in_small_ldl', 'NMR_cholesterol_in_small_vldl', 'NMR_cholesterol_in_very_large_hdl', 'NMR_cholesterol_in_very_large_vldl', 'NMR_cholesterol_in_very_small_vldl', 'NMR_cholesteryl_esters_in_chylomicrons_and_extremely_large_vldl', 'NMR_cholesteryl_esters_in_hdl', 'NMR_cholesteryl_esters_in_idl', 'NMR_cholesteryl_esters_in_ldl', 'NMR_cholesteryl_esters_in_large_hdl', 'NMR_cholesteryl_esters_in_large_ldl', 'NMR_cholesteryl_esters_in_large_vldl', 'NMR_cholesteryl_esters_in_medium_hdl', 'NMR_cholesteryl_esters_in_medium_ldl', 'NMR_cholesteryl_esters_in_medium_vldl', 'NMR_cholesteryl_esters_in_small_hdl', 'NMR_cholesteryl_esters_in_small_ldl', 'NMR_cholesteryl_esters_in_small_vldl', 'NMR_cholesteryl_esters_in_vldl', 'NMR_cholesteryl_esters_in_very_large_hdl', 'NMR_cholesteryl_esters_in_very_large_vldl', 'NMR_cholesteryl_esters_in_very_small_vldl', 'NMR_citrate', 'NMR_clinical_ldl_cholesterol', 'NMR_concentration_of_chylomicrons_and_extremely_large_vldl_particles', 'NMR_concentration_of_hdl_particles', 'NMR_concentration_of_idl_particles', 'NMR_concentration_of_ldl_particles', 'NMR_concentration_of_large_hdl_particles', 'NMR_concentration_of_large_ldl_particles', 'NMR_concentration_of_large_vldl_particles', 'NMR_concentration_of_medium_hdl_particles', 'NMR_concentration_of_medium_ldl_particles', 'NMR_concentration_of_medium_vldl_particles', 'NMR_concentration_of_small_hdl_particles', 'NMR_concentration_of_small_ldl_particles', 'NMR_concentration_of_small_vldl_particles', 'NMR_concentration_of_vldl_particles', 'NMR_concentration_of_very_large_hdl_particles', 'NMR_concentration_of_very_large_vldl_particles', 'NMR_concentration_of_very_small_vldl_particles', 'NMR_creatinine', 'NMR_degree_of_unsaturation', 'NMR_docosahexaenoic_acid', 'NMR_free_cholesterol_in_chylomicrons_and_extremely_large_vldl', 'NMR_free_cholesterol_in_hdl', 'NMR_free_cholesterol_in_idl', 'NMR_free_cholesterol_in_ldl', 'NMR_free_cholesterol_in_large_hdl', 'NMR_free_cholesterol_in_large_ldl', 'NMR_free_cholesterol_in_large_vldl', 'NMR_free_cholesterol_in_medium_hdl', 'NMR_free_cholesterol_in_medium_ldl', 'NMR_free_cholesterol_in_medium_vldl', 'NMR_free_cholesterol_in_small_hdl', 'NMR_free_cholesterol_in_small_ldl', 'NMR_free_cholesterol_in_small_vldl', 'NMR_free_cholesterol_in_vldl', 'NMR_free_cholesterol_in_very_large_hdl', 'NMR_free_cholesterol_in_very_large_vldl', 'NMR_free_cholesterol_in_very_small_vldl', 'NMR_glucose', 'NMR_glutamine', 'NMR_glycine', 'NMR_glycoprotein_acetyls', 'NMR_hdl_cholesterol', 'NMR_histidine', 'NMR_isoleucine', 'NMR_ldl_cholesterol', 'NMR_lactate', 'NMR_leucine', 'NMR_linoleic_acid', 'NMR_monounsaturated_fatty_acids', 'NMR_omega3_fatty_acids', 'NMR_omega6_fatty_acids', 'NMR_phenylalanine', 'NMR_phosphatidylcholines', 'NMR_phosphoglycerides', 'NMR_phospholipids_in_chylomicrons_and_extremely_large_vldl', 'NMR_phospholipids_in_hdl', 'NMR_phospholipids_in_idl', 'NMR_phospholipids_in_ldl', 'NMR_phospholipids_in_large_hdl', 'NMR_phospholipids_in_large_ldl', 'NMR_phospholipids_in_large_vldl', 'NMR_phospholipids_in_medium_hdl', 'NMR_phospholipids_in_medium_ldl', 'NMR_phospholipids_in_medium_vldl', 'NMR_phospholipids_in_small_hdl', 'NMR_phospholipids_in_small_ldl', 'NMR_phospholipids_in_small_vldl', 'NMR_phospholipids_in_vldl', 'NMR_phospholipids_in_very_large_hdl', 'NMR_phospholipids_in_very_large_vldl', 'NMR_phospholipids_in_very_small_vldl', 'NMR_polyunsaturated_fatty_acids', 'NMR_pyruvate', 'NMR_remnant_cholesterol_nonhdl_nonldl_cholesterol', 'NMR_saturated_fatty_acids', 'NMR_sphingomyelins', 'NMR_total_cholesterol', 'NMR_total_cholesterol_minus_hdlc', 'NMR_total_cholines', 'NMR_total_concentration_of_branchedchain_amino_acids_leucine_isoleucine_valine', 'NMR_total_concentration_of_lipoprotein_particles', 'NMR_total_esterified_cholesterol', 'NMR_total_fatty_acids', 'NMR_total_free_cholesterol', 'NMR_total_lipids_in_chylomicrons_and_extremely_large_vldl', 'NMR_total_lipids_in_hdl', 'NMR_total_lipids_in_idl', 'NMR_total_lipids_in_ldl', 'NMR_total_lipids_in_large_hdl', 'NMR_total_lipids_in_large_ldl', 'NMR_total_lipids_in_large_vldl', 'NMR_total_lipids_in_lipoprotein_particles', 'NMR_total_lipids_in_medium_hdl', 'NMR_total_lipids_in_medium_ldl', 'NMR_total_lipids_in_medium_vldl', 'NMR_total_lipids_in_small_hdl', 'NMR_total_lipids_in_small_ldl', 'NMR_total_lipids_in_small_vldl', 'NMR_total_lipids_in_vldl', 'NMR_total_lipids_in_very_large_hdl', 'NMR_total_lipids_in_very_large_vldl', 'NMR_total_lipids_in_very_small_vldl', 'NMR_total_phospholipids_in_lipoprotein_particles', 'NMR_total_triglycerides', 'NMR_triglycerides_in_chylomicrons_and_extremely_large_vldl', 'NMR_triglycerides_in_hdl', 'NMR_triglycerides_in_idl', 'NMR_triglycerides_in_ldl', 'NMR_triglycerides_in_large_hdl', 'NMR_triglycerides_in_large_ldl', 'NMR_triglycerides_in_large_vldl', 'NMR_triglycerides_in_medium_hdl', 'NMR_triglycerides_in_medium_ldl', 'NMR_triglycerides_in_medium_vldl', 'NMR_triglycerides_in_small_hdl', 'NMR_triglycerides_in_small_ldl', 'NMR_triglycerides_in_small_vldl', 'NMR_triglycerides_in_vldl', 'NMR_triglycerides_in_very_large_hdl', 'NMR_triglycerides_in_very_large_vldl', 'NMR_triglycerides_in_very_small_vldl', 'NMR_tyrosine', 'NMR_vldl_cholesterol', 'NMR_valine']":"AgeSexMetabolomics",
                  "['NMR_3hydroxybutyrate', 'NMR_acetate', 'NMR_acetoacetate', 'NMR_acetone', 'NMR_alanine', 'NMR_albumin', 'NMR_apolipoprotein_a1', 'NMR_apolipoprotein_b', 'NMR_average_diameter_for_hdl_particles', 'NMR_average_diameter_for_ldl_particles', 'NMR_average_diameter_for_vldl_particles', 'NMR_cholesterol_in_chylomicrons_and_extremely_large_vldl', 'NMR_cholesterol_in_idl', 'NMR_cholesterol_in_large_hdl', 'NMR_cholesterol_in_large_ldl', 'NMR_cholesterol_in_large_vldl', 'NMR_cholesterol_in_medium_hdl', 'NMR_cholesterol_in_medium_ldl', 'NMR_cholesterol_in_medium_vldl', 'NMR_cholesterol_in_small_hdl', 'NMR_cholesterol_in_small_ldl', 'NMR_cholesterol_in_small_vldl', 'NMR_cholesterol_in_very_large_hdl', 'NMR_cholesterol_in_very_large_vldl', 'NMR_cholesterol_in_very_small_vldl', 'NMR_cholesteryl_esters_in_chylomicrons_and_extremely_large_vldl', 'NMR_cholesteryl_esters_in_hdl', 'NMR_cholesteryl_esters_in_idl', 'NMR_cholesteryl_esters_in_ldl', 'NMR_cholesteryl_esters_in_large_hdl', 'NMR_cholesteryl_esters_in_large_ldl', 'NMR_cholesteryl_esters_in_large_vldl', 'NMR_cholesteryl_esters_in_medium_hdl', 'NMR_cholesteryl_esters_in_medium_ldl', 'NMR_cholesteryl_esters_in_medium_vldl', 'NMR_cholesteryl_esters_in_small_hdl', 'NMR_cholesteryl_esters_in_small_ldl', 'NMR_cholesteryl_esters_in_small_vldl', 'NMR_cholesteryl_esters_in_vldl', 'NMR_cholesteryl_esters_in_very_large_hdl', 'NMR_cholesteryl_esters_in_very_large_vldl', 'NMR_cholesteryl_esters_in_very_small_vldl', 'NMR_citrate', 'NMR_clinical_ldl_cholesterol', 'NMR_concentration_of_chylomicrons_and_extremely_large_vldl_particles', 'NMR_concentration_of_hdl_particles', 'NMR_concentration_of_idl_particles', 'NMR_concentration_of_ldl_particles', 'NMR_concentration_of_large_hdl_particles', 'NMR_concentration_of_large_ldl_particles', 'NMR_concentration_of_large_vldl_particles', 'NMR_concentration_of_medium_hdl_particles', 'NMR_concentration_of_medium_ldl_particles', 'NMR_concentration_of_medium_vldl_particles', 'NMR_concentration_of_small_hdl_particles', 'NMR_concentration_of_small_ldl_particles', 'NMR_concentration_of_small_vldl_particles', 'NMR_concentration_of_vldl_particles', 'NMR_concentration_of_very_large_hdl_particles', 'NMR_concentration_of_very_large_vldl_particles', 'NMR_concentration_of_very_small_vldl_particles', 'NMR_creatinine', 'NMR_degree_of_unsaturation', 'NMR_docosahexaenoic_acid', 'NMR_free_cholesterol_in_chylomicrons_and_extremely_large_vldl', 'NMR_free_cholesterol_in_hdl', 'NMR_free_cholesterol_in_idl', 'NMR_free_cholesterol_in_ldl', 'NMR_free_cholesterol_in_large_hdl', 'NMR_free_cholesterol_in_large_ldl', 'NMR_free_cholesterol_in_large_vldl', 'NMR_free_cholesterol_in_medium_hdl', 'NMR_free_cholesterol_in_medium_ldl', 'NMR_free_cholesterol_in_medium_vldl', 'NMR_free_cholesterol_in_small_hdl', 'NMR_free_cholesterol_in_small_ldl', 'NMR_free_cholesterol_in_small_vldl', 'NMR_free_cholesterol_in_vldl', 'NMR_free_cholesterol_in_very_large_hdl', 'NMR_free_cholesterol_in_very_large_vldl', 'NMR_free_cholesterol_in_very_small_vldl', 'NMR_glucose', 'NMR_glutamine', 'NMR_glycine', 'NMR_glycoprotein_acetyls', 'NMR_hdl_cholesterol', 'NMR_histidine', 'NMR_isoleucine', 'NMR_ldl_cholesterol', 'NMR_lactate', 'NMR_leucine', 'NMR_linoleic_acid', 'NMR_monounsaturated_fatty_acids', 'NMR_omega3_fatty_acids', 'NMR_omega6_fatty_acids', 'NMR_phenylalanine', 'NMR_phosphatidylcholines', 'NMR_phosphoglycerides', 'NMR_phospholipids_in_chylomicrons_and_extremely_large_vldl', 'NMR_phospholipids_in_hdl', 'NMR_phospholipids_in_idl', 'NMR_phospholipids_in_ldl', 'NMR_phospholipids_in_large_hdl', 'NMR_phospholipids_in_large_ldl', 'NMR_phospholipids_in_large_vldl', 'NMR_phospholipids_in_medium_hdl', 'NMR_phospholipids_in_medium_ldl', 'NMR_phospholipids_in_medium_vldl', 'NMR_phospholipids_in_small_hdl', 'NMR_phospholipids_in_small_ldl', 'NMR_phospholipids_in_small_vldl', 'NMR_phospholipids_in_vldl', 'NMR_phospholipids_in_very_large_hdl', 'NMR_phospholipids_in_very_large_vldl', 'NMR_phospholipids_in_very_small_vldl', 'NMR_polyunsaturated_fatty_acids', 'NMR_pyruvate', 'NMR_remnant_cholesterol_nonhdl_nonldl_cholesterol', 'NMR_saturated_fatty_acids', 'NMR_sphingomyelins', 'NMR_total_cholesterol', 'NMR_total_cholesterol_minus_hdlc', 'NMR_total_cholines', 'NMR_total_concentration_of_branchedchain_amino_acids_leucine_isoleucine_valine', 'NMR_total_concentration_of_lipoprotein_particles', 'NMR_total_esterified_cholesterol', 'NMR_total_fatty_acids', 'NMR_total_free_cholesterol', 'NMR_total_lipids_in_chylomicrons_and_extremely_large_vldl', 'NMR_total_lipids_in_hdl', 'NMR_total_lipids_in_idl', 'NMR_total_lipids_in_ldl', 'NMR_total_lipids_in_large_hdl', 'NMR_total_lipids_in_large_ldl', 'NMR_total_lipids_in_large_vldl', 'NMR_total_lipids_in_lipoprotein_particles', 'NMR_total_lipids_in_medium_hdl', 'NMR_total_lipids_in_medium_ldl', 'NMR_total_lipids_in_medium_vldl', 'NMR_total_lipids_in_small_hdl', 'NMR_total_lipids_in_small_ldl', 'NMR_total_lipids_in_small_vldl', 'NMR_total_lipids_in_vldl', 'NMR_total_lipids_in_very_large_hdl', 'NMR_total_lipids_in_very_large_vldl', 'NMR_total_lipids_in_very_small_vldl', 'NMR_total_phospholipids_in_lipoprotein_particles', 'NMR_total_triglycerides', 'NMR_triglycerides_in_chylomicrons_and_extremely_large_vldl', 'NMR_triglycerides_in_hdl', 'NMR_triglycerides_in_idl', 'NMR_triglycerides_in_ldl', 'NMR_triglycerides_in_large_hdl', 'NMR_triglycerides_in_large_ldl', 'NMR_triglycerides_in_large_vldl', 'NMR_triglycerides_in_medium_hdl', 'NMR_triglycerides_in_medium_ldl', 'NMR_triglycerides_in_medium_vldl', 'NMR_triglycerides_in_small_hdl', 'NMR_triglycerides_in_small_ldl', 'NMR_triglycerides_in_small_vldl', 'NMR_triglycerides_in_vldl', 'NMR_triglycerides_in_very_large_hdl', 'NMR_triglycerides_in_very_large_vldl', 'NMR_triglycerides_in_very_small_vldl', 'NMR_tyrosine', 'NMR_vldl_cholesterol', 'NMR_valine']":"Metabolomics"}
    predictions["features"] = predictions["feature_names"].map(feature_map)
    return predictions

In [18]:
preds_all_cleaned = clean_df(df, preds_all)

In [19]:
def clean_df(df, predictions):
    #feature_df = df[["parameters/feature_set", "parameters/features_yaml"]].drop_duplicates().dropna()
    feature_map ={"['age_at_recruitment', 'sex', 'NMR_3hydroxybutyrate', 'NMR_acetate', 'NMR_acetoacetate', 'NMR_acetone', 'NMR_alanine', 'NMR_albumin', 'NMR_apolipoprotein_a1', 'NMR_apolipoprotein_b', 'NMR_average_diameter_for_hdl_particles', 'NMR_average_diameter_for_ldl_particles', 'NMR_average_diameter_for_vldl_particles', 'NMR_cholesterol_in_chylomicrons_and_extremely_large_vldl', 'NMR_cholesterol_in_idl', 'NMR_cholesterol_in_large_hdl', 'NMR_cholesterol_in_large_ldl', 'NMR_cholesterol_in_large_vldl', 'NMR_cholesterol_in_medium_hdl', 'NMR_cholesterol_in_medium_ldl', 'NMR_cholesterol_in_medium_vldl', 'NMR_cholesterol_in_small_hdl', 'NMR_cholesterol_in_small_ldl', 'NMR_cholesterol_in_small_vldl', 'NMR_cholesterol_in_very_large_hdl', 'NMR_cholesterol_in_very_large_vldl', 'NMR_cholesterol_in_very_small_vldl', 'NMR_cholesteryl_esters_in_chylomicrons_and_extremely_large_vldl', 'NMR_cholesteryl_esters_in_hdl', 'NMR_cholesteryl_esters_in_idl', 'NMR_cholesteryl_esters_in_ldl', 'NMR_cholesteryl_esters_in_large_hdl', 'NMR_cholesteryl_esters_in_large_ldl', 'NMR_cholesteryl_esters_in_large_vldl', 'NMR_cholesteryl_esters_in_medium_hdl', 'NMR_cholesteryl_esters_in_medium_ldl', 'NMR_cholesteryl_esters_in_medium_vldl', 'NMR_cholesteryl_esters_in_small_hdl', 'NMR_cholesteryl_esters_in_small_ldl', 'NMR_cholesteryl_esters_in_small_vldl', 'NMR_cholesteryl_esters_in_vldl', 'NMR_cholesteryl_esters_in_very_large_hdl', 'NMR_cholesteryl_esters_in_very_large_vldl', 'NMR_cholesteryl_esters_in_very_small_vldl', 'NMR_citrate', 'NMR_clinical_ldl_cholesterol', 'NMR_concentration_of_chylomicrons_and_extremely_large_vldl_particles', 'NMR_concentration_of_hdl_particles', 'NMR_concentration_of_idl_particles', 'NMR_concentration_of_ldl_particles', 'NMR_concentration_of_large_hdl_particles', 'NMR_concentration_of_large_ldl_particles', 'NMR_concentration_of_large_vldl_particles', 'NMR_concentration_of_medium_hdl_particles', 'NMR_concentration_of_medium_ldl_particles', 'NMR_concentration_of_medium_vldl_particles', 'NMR_concentration_of_small_hdl_particles', 'NMR_concentration_of_small_ldl_particles', 'NMR_concentration_of_small_vldl_particles', 'NMR_concentration_of_vldl_particles', 'NMR_concentration_of_very_large_hdl_particles', 'NMR_concentration_of_very_large_vldl_particles', 'NMR_concentration_of_very_small_vldl_particles', 'NMR_creatinine', 'NMR_degree_of_unsaturation', 'NMR_docosahexaenoic_acid', 'NMR_free_cholesterol_in_chylomicrons_and_extremely_large_vldl', 'NMR_free_cholesterol_in_hdl', 'NMR_free_cholesterol_in_idl', 'NMR_free_cholesterol_in_ldl', 'NMR_free_cholesterol_in_large_hdl', 'NMR_free_cholesterol_in_large_ldl', 'NMR_free_cholesterol_in_large_vldl', 'NMR_free_cholesterol_in_medium_hdl', 'NMR_free_cholesterol_in_medium_ldl', 'NMR_free_cholesterol_in_medium_vldl', 'NMR_free_cholesterol_in_small_hdl', 'NMR_free_cholesterol_in_small_ldl', 'NMR_free_cholesterol_in_small_vldl', 'NMR_free_cholesterol_in_vldl', 'NMR_free_cholesterol_in_very_large_hdl', 'NMR_free_cholesterol_in_very_large_vldl', 'NMR_free_cholesterol_in_very_small_vldl', 'NMR_glucose', 'NMR_glutamine', 'NMR_glycine', 'NMR_glycoprotein_acetyls', 'NMR_hdl_cholesterol', 'NMR_histidine', 'NMR_isoleucine', 'NMR_ldl_cholesterol', 'NMR_lactate', 'NMR_leucine', 'NMR_linoleic_acid', 'NMR_monounsaturated_fatty_acids', 'NMR_omega3_fatty_acids', 'NMR_omega6_fatty_acids', 'NMR_phenylalanine', 'NMR_phosphatidylcholines', 'NMR_phosphoglycerides', 'NMR_phospholipids_in_chylomicrons_and_extremely_large_vldl', 'NMR_phospholipids_in_hdl', 'NMR_phospholipids_in_idl', 'NMR_phospholipids_in_ldl', 'NMR_phospholipids_in_large_hdl', 'NMR_phospholipids_in_large_ldl', 'NMR_phospholipids_in_large_vldl', 'NMR_phospholipids_in_medium_hdl', 'NMR_phospholipids_in_medium_ldl', 'NMR_phospholipids_in_medium_vldl', 'NMR_phospholipids_in_small_hdl', 'NMR_phospholipids_in_small_ldl', 'NMR_phospholipids_in_small_vldl', 'NMR_phospholipids_in_vldl', 'NMR_phospholipids_in_very_large_hdl', 'NMR_phospholipids_in_very_large_vldl', 'NMR_phospholipids_in_very_small_vldl', 'NMR_polyunsaturated_fatty_acids', 'NMR_pyruvate', 'NMR_remnant_cholesterol_nonhdl_nonldl_cholesterol', 'NMR_saturated_fatty_acids', 'NMR_sphingomyelins', 'NMR_total_cholesterol', 'NMR_total_cholesterol_minus_hdlc', 'NMR_total_cholines', 'NMR_total_concentration_of_branchedchain_amino_acids_leucine_isoleucine_valine', 'NMR_total_concentration_of_lipoprotein_particles', 'NMR_total_esterified_cholesterol', 'NMR_total_fatty_acids', 'NMR_total_free_cholesterol', 'NMR_total_lipids_in_chylomicrons_and_extremely_large_vldl', 'NMR_total_lipids_in_hdl', 'NMR_total_lipids_in_idl', 'NMR_total_lipids_in_ldl', 'NMR_total_lipids_in_large_hdl', 'NMR_total_lipids_in_large_ldl', 'NMR_total_lipids_in_large_vldl', 'NMR_total_lipids_in_lipoprotein_particles', 'NMR_total_lipids_in_medium_hdl', 'NMR_total_lipids_in_medium_ldl', 'NMR_total_lipids_in_medium_vldl', 'NMR_total_lipids_in_small_hdl', 'NMR_total_lipids_in_small_ldl', 'NMR_total_lipids_in_small_vldl', 'NMR_total_lipids_in_vldl', 'NMR_total_lipids_in_very_large_hdl', 'NMR_total_lipids_in_very_large_vldl', 'NMR_total_lipids_in_very_small_vldl', 'NMR_total_phospholipids_in_lipoprotein_particles', 'NMR_total_triglycerides', 'NMR_triglycerides_in_chylomicrons_and_extremely_large_vldl', 'NMR_triglycerides_in_hdl', 'NMR_triglycerides_in_idl', 'NMR_triglycerides_in_ldl', 'NMR_triglycerides_in_large_hdl', 'NMR_triglycerides_in_large_ldl', 'NMR_triglycerides_in_large_vldl', 'NMR_triglycerides_in_medium_hdl', 'NMR_triglycerides_in_medium_ldl', 'NMR_triglycerides_in_medium_vldl', 'NMR_triglycerides_in_small_hdl', 'NMR_triglycerides_in_small_ldl', 'NMR_triglycerides_in_small_vldl', 'NMR_triglycerides_in_vldl', 'NMR_triglycerides_in_very_large_hdl', 'NMR_triglycerides_in_very_large_vldl', 'NMR_triglycerides_in_very_small_vldl', 'NMR_tyrosine', 'NMR_vldl_cholesterol', 'NMR_valine']":"AgeSexMetabolomics",
                  "['NMR_3hydroxybutyrate', 'NMR_acetate', 'NMR_acetoacetate', 'NMR_acetone', 'NMR_alanine', 'NMR_albumin', 'NMR_apolipoprotein_a1', 'NMR_apolipoprotein_b', 'NMR_average_diameter_for_hdl_particles', 'NMR_average_diameter_for_ldl_particles', 'NMR_average_diameter_for_vldl_particles', 'NMR_cholesterol_in_chylomicrons_and_extremely_large_vldl', 'NMR_cholesterol_in_idl', 'NMR_cholesterol_in_large_hdl', 'NMR_cholesterol_in_large_ldl', 'NMR_cholesterol_in_large_vldl', 'NMR_cholesterol_in_medium_hdl', 'NMR_cholesterol_in_medium_ldl', 'NMR_cholesterol_in_medium_vldl', 'NMR_cholesterol_in_small_hdl', 'NMR_cholesterol_in_small_ldl', 'NMR_cholesterol_in_small_vldl', 'NMR_cholesterol_in_very_large_hdl', 'NMR_cholesterol_in_very_large_vldl', 'NMR_cholesterol_in_very_small_vldl', 'NMR_cholesteryl_esters_in_chylomicrons_and_extremely_large_vldl', 'NMR_cholesteryl_esters_in_hdl', 'NMR_cholesteryl_esters_in_idl', 'NMR_cholesteryl_esters_in_ldl', 'NMR_cholesteryl_esters_in_large_hdl', 'NMR_cholesteryl_esters_in_large_ldl', 'NMR_cholesteryl_esters_in_large_vldl', 'NMR_cholesteryl_esters_in_medium_hdl', 'NMR_cholesteryl_esters_in_medium_ldl', 'NMR_cholesteryl_esters_in_medium_vldl', 'NMR_cholesteryl_esters_in_small_hdl', 'NMR_cholesteryl_esters_in_small_ldl', 'NMR_cholesteryl_esters_in_small_vldl', 'NMR_cholesteryl_esters_in_vldl', 'NMR_cholesteryl_esters_in_very_large_hdl', 'NMR_cholesteryl_esters_in_very_large_vldl', 'NMR_cholesteryl_esters_in_very_small_vldl', 'NMR_citrate', 'NMR_clinical_ldl_cholesterol', 'NMR_concentration_of_chylomicrons_and_extremely_large_vldl_particles', 'NMR_concentration_of_hdl_particles', 'NMR_concentration_of_idl_particles', 'NMR_concentration_of_ldl_particles', 'NMR_concentration_of_large_hdl_particles', 'NMR_concentration_of_large_ldl_particles', 'NMR_concentration_of_large_vldl_particles', 'NMR_concentration_of_medium_hdl_particles', 'NMR_concentration_of_medium_ldl_particles', 'NMR_concentration_of_medium_vldl_particles', 'NMR_concentration_of_small_hdl_particles', 'NMR_concentration_of_small_ldl_particles', 'NMR_concentration_of_small_vldl_particles', 'NMR_concentration_of_vldl_particles', 'NMR_concentration_of_very_large_hdl_particles', 'NMR_concentration_of_very_large_vldl_particles', 'NMR_concentration_of_very_small_vldl_particles', 'NMR_creatinine', 'NMR_degree_of_unsaturation', 'NMR_docosahexaenoic_acid', 'NMR_free_cholesterol_in_chylomicrons_and_extremely_large_vldl', 'NMR_free_cholesterol_in_hdl', 'NMR_free_cholesterol_in_idl', 'NMR_free_cholesterol_in_ldl', 'NMR_free_cholesterol_in_large_hdl', 'NMR_free_cholesterol_in_large_ldl', 'NMR_free_cholesterol_in_large_vldl', 'NMR_free_cholesterol_in_medium_hdl', 'NMR_free_cholesterol_in_medium_ldl', 'NMR_free_cholesterol_in_medium_vldl', 'NMR_free_cholesterol_in_small_hdl', 'NMR_free_cholesterol_in_small_ldl', 'NMR_free_cholesterol_in_small_vldl', 'NMR_free_cholesterol_in_vldl', 'NMR_free_cholesterol_in_very_large_hdl', 'NMR_free_cholesterol_in_very_large_vldl', 'NMR_free_cholesterol_in_very_small_vldl', 'NMR_glucose', 'NMR_glutamine', 'NMR_glycine', 'NMR_glycoprotein_acetyls', 'NMR_hdl_cholesterol', 'NMR_histidine', 'NMR_isoleucine', 'NMR_ldl_cholesterol', 'NMR_lactate', 'NMR_leucine', 'NMR_linoleic_acid', 'NMR_monounsaturated_fatty_acids', 'NMR_omega3_fatty_acids', 'NMR_omega6_fatty_acids', 'NMR_phenylalanine', 'NMR_phosphatidylcholines', 'NMR_phosphoglycerides', 'NMR_phospholipids_in_chylomicrons_and_extremely_large_vldl', 'NMR_phospholipids_in_hdl', 'NMR_phospholipids_in_idl', 'NMR_phospholipids_in_ldl', 'NMR_phospholipids_in_large_hdl', 'NMR_phospholipids_in_large_ldl', 'NMR_phospholipids_in_large_vldl', 'NMR_phospholipids_in_medium_hdl', 'NMR_phospholipids_in_medium_ldl', 'NMR_phospholipids_in_medium_vldl', 'NMR_phospholipids_in_small_hdl', 'NMR_phospholipids_in_small_ldl', 'NMR_phospholipids_in_small_vldl', 'NMR_phospholipids_in_vldl', 'NMR_phospholipids_in_very_large_hdl', 'NMR_phospholipids_in_very_large_vldl', 'NMR_phospholipids_in_very_small_vldl', 'NMR_polyunsaturated_fatty_acids', 'NMR_pyruvate', 'NMR_remnant_cholesterol_nonhdl_nonldl_cholesterol', 'NMR_saturated_fatty_acids', 'NMR_sphingomyelins', 'NMR_total_cholesterol', 'NMR_total_cholesterol_minus_hdlc', 'NMR_total_cholines', 'NMR_total_concentration_of_branchedchain_amino_acids_leucine_isoleucine_valine', 'NMR_total_concentration_of_lipoprotein_particles', 'NMR_total_esterified_cholesterol', 'NMR_total_fatty_acids', 'NMR_total_free_cholesterol', 'NMR_total_lipids_in_chylomicrons_and_extremely_large_vldl', 'NMR_total_lipids_in_hdl', 'NMR_total_lipids_in_idl', 'NMR_total_lipids_in_ldl', 'NMR_total_lipids_in_large_hdl', 'NMR_total_lipids_in_large_ldl', 'NMR_total_lipids_in_large_vldl', 'NMR_total_lipids_in_lipoprotein_particles', 'NMR_total_lipids_in_medium_hdl', 'NMR_total_lipids_in_medium_ldl', 'NMR_total_lipids_in_medium_vldl', 'NMR_total_lipids_in_small_hdl', 'NMR_total_lipids_in_small_ldl', 'NMR_total_lipids_in_small_vldl', 'NMR_total_lipids_in_vldl', 'NMR_total_lipids_in_very_large_hdl', 'NMR_total_lipids_in_very_large_vldl', 'NMR_total_lipids_in_very_small_vldl', 'NMR_total_phospholipids_in_lipoprotein_particles', 'NMR_total_triglycerides', 'NMR_triglycerides_in_chylomicrons_and_extremely_large_vldl', 'NMR_triglycerides_in_hdl', 'NMR_triglycerides_in_idl', 'NMR_triglycerides_in_ldl', 'NMR_triglycerides_in_large_hdl', 'NMR_triglycerides_in_large_ldl', 'NMR_triglycerides_in_large_vldl', 'NMR_triglycerides_in_medium_hdl', 'NMR_triglycerides_in_medium_ldl', 'NMR_triglycerides_in_medium_vldl', 'NMR_triglycerides_in_small_hdl', 'NMR_triglycerides_in_small_ldl', 'NMR_triglycerides_in_small_vldl', 'NMR_triglycerides_in_vldl', 'NMR_triglycerides_in_very_large_hdl', 'NMR_triglycerides_in_very_large_vldl', 'NMR_triglycerides_in_very_small_vldl', 'NMR_tyrosine', 'NMR_vldl_cholesterol', 'NMR_valine']":"Metabolomics"}
    predictions["features"] = predictions["feature_names"].map(feature_map)
    return predictions

In [20]:
preds_all_cleaned = clean_df(df, preds_all)

In [None]:
preds_all_cleaned

In [22]:
def categorify(preds):
    for col in tqdm(preds.columns.to_list()):
        if preds[col].dtype == "object": preds[col]= preds[col].astype("category")
    return preds
preds_all_cleaned = categorify(preds_all_cleaned)

  0%|          | 0/1856 [00:00<?, ?it/s]

In [23]:
def fix_column_names(df):
    # rename and fix time bugs!!! -> 0_11_Ft -> Ft at t=10 -> fix earlier
    time_fix_map = dict(zip([col for col in df.columns if "ft" in col], [f"ft_{col}" for col in range(1, len([col for col in df.columns if "ft" in col])+1)]))
    df = df.rename(time_fix_map, axis="columns")
    return df

def wide_to_long(df, endpoint):
    preds_endpoints = []
    for endpoint in tqdm(endpoints):
        time_cols = [f"1_1_ft__{endpoint}"]
        temp = df[["eid", 'features', 'split', 'partition', 'module', 'datamodule'] + time_cols].assign(endpoint = endpoint)[["eid", "endpoint", 'features', 'split', 'partition', 'module', 'datamodule'] + time_cols]
        preds_endpoints.append(fix_column_names(temp))
    df_long = pd.concat(preds_endpoints, axis=0).reset_index(drop=True)
    return df_long.rename(columns={"ft_1":"logh"})
    
preds = wide_to_long(preds_all_cleaned, endpoints)

  0%|          | 0/24 [00:00<?, ?it/s]

In [24]:
#endpoint = "M_MACE"; 
events=[endpoint+'_event' for endpoint in endpoints] 
times=[endpoint+'_event_time' for endpoint in endpoints]

In [25]:
preds_wide = preds.assign(endpoint=lambda x: "logh_" + x.endpoint).\
    pivot(["eid", 'features', 'split', 'partition', 'module', 'datamodule'], columns="endpoint", values="logh").reset_index(drop=False)

In [26]:
# get query features to get wide logh
from functools import reduce
def features_to_wide(predictions):
    preds_features = []
    cols_base = ['eid', 'split', 'partition', 'module', 'datamodule']
    cols_loghs = [f"logh_{endpoint}" for endpoint in endpoints]
    for feature_set in preds_wide.features.unique().to_list():
        temp = preds_wide.query("features==@feature_set")[cols_base+cols_loghs]
        temp.columns = cols_base + [f"{col}_{feature_set}" for col in cols_loghs]
        preds_features.append(temp)
    preds_final = reduce(lambda left,right: pd.merge(left,right,on=cols_base), preds_features)
    return preds_final

In [None]:
preds_wide_fs = features_to_wide(preds_wide)
preds_wide_fs

In [28]:
preds_wide_fs.to_feather(f"{data_results_path}/loghazards_model_{run}_metabolomics.feather")

In [29]:
preds_wide_fs = pd.read_feather(f"{data_results_path}/loghazards_model_{run}_metabolomics.feather")

In [None]:
data_temp = pd.read_feather(f"{data_post}/data_merged.feather")
eids_dict = {}
for endpoint in tqdm(endpoints):
    eids_incl = data_temp.query(f"NMR_FLAG==True&{endpoint}==False").eid.to_list()
    if endpoint == "M_MACE": eids_incl = data_temp.copy().query(f"NMR_FLAG==True&{endpoint}==False&statins==False").eid.to_list()
    if endpoint == "M_breast_cancer": eids_incl = data_temp.copy().query(f"NMR_FLAG==True&{endpoint}==False&sex=='Female'").eid.to_list()
    if endpoint == "M_ovarian_cancer": eids_incl = data_temp.copy().query(f"NMR_FLAG==True&{endpoint}==False&sex=='Female'").eid.to_list()
    if endpoint == "M_uterus_cancer": eids_incl = data_temp.copy().query(f"NMR_FLAG==True&{endpoint}==False&sex=='Female'").eid.to_list()
    if endpoint == "M_prostate_cancer": eids_incl = data_temp.copy().query(f"NMR_FLAG==True&{endpoint}==False&sex=='Male'").eid.to_list()
    print(endpoint, len(eids_incl))
    eids_dict[endpoint] = eids_incl

In [31]:
metabolomics = [
'NMR_3hydroxybutyrate',
 'NMR_acetate',
 'NMR_acetoacetate',
 'NMR_acetone',
 'NMR_alanine',
 'NMR_albumin',
 'NMR_apolipoprotein_a1',
 'NMR_apolipoprotein_b',
 'NMR_average_diameter_for_hdl_particles',
 'NMR_average_diameter_for_ldl_particles',
 'NMR_average_diameter_for_vldl_particles',
 'NMR_cholesterol_in_chylomicrons_and_extremely_large_vldl',
 'NMR_cholesterol_in_idl',
 'NMR_cholesterol_in_large_hdl',
 'NMR_cholesterol_in_large_ldl',
 'NMR_cholesterol_in_large_vldl',
 'NMR_cholesterol_in_medium_hdl',
 'NMR_cholesterol_in_medium_ldl',
 'NMR_cholesterol_in_medium_vldl',
 'NMR_cholesterol_in_small_hdl',
 'NMR_cholesterol_in_small_ldl',
 'NMR_cholesterol_in_small_vldl',
 'NMR_cholesterol_in_very_large_hdl',
 'NMR_cholesterol_in_very_large_vldl',
 'NMR_cholesterol_in_very_small_vldl',
 'NMR_cholesteryl_esters_in_chylomicrons_and_extremely_large_vldl',
 'NMR_cholesteryl_esters_in_hdl',
 'NMR_cholesteryl_esters_in_idl',
 'NMR_cholesteryl_esters_in_ldl',
 'NMR_cholesteryl_esters_in_large_hdl',
 'NMR_cholesteryl_esters_in_large_ldl',
 'NMR_cholesteryl_esters_in_large_vldl',
 'NMR_cholesteryl_esters_in_medium_hdl',
 'NMR_cholesteryl_esters_in_medium_ldl',
 'NMR_cholesteryl_esters_in_medium_vldl',
 'NMR_cholesteryl_esters_in_small_hdl',
 'NMR_cholesteryl_esters_in_small_ldl',
 'NMR_cholesteryl_esters_in_small_vldl',
 'NMR_cholesteryl_esters_in_vldl',
 'NMR_cholesteryl_esters_in_very_large_hdl',
 'NMR_cholesteryl_esters_in_very_large_vldl',
 'NMR_cholesteryl_esters_in_very_small_vldl',
 'NMR_citrate',
 'NMR_clinical_ldl_cholesterol',
 'NMR_concentration_of_chylomicrons_and_extremely_large_vldl_particles',
 'NMR_concentration_of_hdl_particles',
 'NMR_concentration_of_idl_particles',
 'NMR_concentration_of_ldl_particles',
 'NMR_concentration_of_large_hdl_particles',
 'NMR_concentration_of_large_ldl_particles',
 'NMR_concentration_of_large_vldl_particles',
 'NMR_concentration_of_medium_hdl_particles',
 'NMR_concentration_of_medium_ldl_particles',
 'NMR_concentration_of_medium_vldl_particles',
 'NMR_concentration_of_small_hdl_particles',
 'NMR_concentration_of_small_ldl_particles',
 'NMR_concentration_of_small_vldl_particles',
 'NMR_concentration_of_vldl_particles',
 'NMR_concentration_of_very_large_hdl_particles',
 'NMR_concentration_of_very_large_vldl_particles',
 'NMR_concentration_of_very_small_vldl_particles',
 'NMR_creatinine',
 'NMR_degree_of_unsaturation',
 'NMR_docosahexaenoic_acid',
 'NMR_free_cholesterol_in_chylomicrons_and_extremely_large_vldl',
 'NMR_free_cholesterol_in_hdl',
 'NMR_free_cholesterol_in_idl',
 'NMR_free_cholesterol_in_ldl',
 'NMR_free_cholesterol_in_large_hdl',
 'NMR_free_cholesterol_in_large_ldl',
 'NMR_free_cholesterol_in_large_vldl',
 'NMR_free_cholesterol_in_medium_hdl',
 'NMR_free_cholesterol_in_medium_ldl',
 'NMR_free_cholesterol_in_medium_vldl',
 'NMR_free_cholesterol_in_small_hdl',
 'NMR_free_cholesterol_in_small_ldl',
 'NMR_free_cholesterol_in_small_vldl',
 'NMR_free_cholesterol_in_vldl',
 'NMR_free_cholesterol_in_very_large_hdl',
 'NMR_free_cholesterol_in_very_large_vldl',
 'NMR_free_cholesterol_in_very_small_vldl',
 'NMR_glucose',
 'NMR_glutamine',
 'NMR_glycine',
 'NMR_glycoprotein_acetyls',
 'NMR_hdl_cholesterol',
 'NMR_histidine',
 'NMR_isoleucine',
 'NMR_ldl_cholesterol',
 'NMR_lactate',
 'NMR_leucine',
 'NMR_linoleic_acid',
 'NMR_monounsaturated_fatty_acids',
 'NMR_omega3_fatty_acids',
 'NMR_omega6_fatty_acids',
 'NMR_phenylalanine',
 'NMR_phosphatidylcholines',
 'NMR_phosphoglycerides',
 'NMR_phospholipids_in_chylomicrons_and_extremely_large_vldl',
 'NMR_phospholipids_in_hdl',
 'NMR_phospholipids_in_idl',
 'NMR_phospholipids_in_ldl',
 'NMR_phospholipids_in_large_hdl',
 'NMR_phospholipids_in_large_ldl',
 'NMR_phospholipids_in_large_vldl',
 'NMR_phospholipids_in_medium_hdl',
 'NMR_phospholipids_in_medium_ldl',
 'NMR_phospholipids_in_medium_vldl',
 'NMR_phospholipids_in_small_hdl',
 'NMR_phospholipids_in_small_ldl',
 'NMR_phospholipids_in_small_vldl',
 'NMR_phospholipids_in_vldl',
 'NMR_phospholipids_in_very_large_hdl',
 'NMR_phospholipids_in_very_large_vldl',
 'NMR_phospholipids_in_very_small_vldl',
 'NMR_polyunsaturated_fatty_acids',
 'NMR_pyruvate',
 'NMR_remnant_cholesterol_nonhdl_nonldl_cholesterol',
 'NMR_saturated_fatty_acids',
 'NMR_sphingomyelins',
 'NMR_total_cholesterol',
 'NMR_total_cholesterol_minus_hdlc',
 'NMR_total_cholines',
 'NMR_total_concentration_of_branchedchain_amino_acids_leucine_isoleucine_valine',
 'NMR_total_concentration_of_lipoprotein_particles',
 'NMR_total_esterified_cholesterol',
 'NMR_total_fatty_acids',
 'NMR_total_free_cholesterol',
 'NMR_total_lipids_in_chylomicrons_and_extremely_large_vldl',
 'NMR_total_lipids_in_hdl',
 'NMR_total_lipids_in_idl',
 'NMR_total_lipids_in_ldl',
 'NMR_total_lipids_in_large_hdl',
 'NMR_total_lipids_in_large_ldl',
 'NMR_total_lipids_in_large_vldl',
 'NMR_total_lipids_in_lipoprotein_particles',
 'NMR_total_lipids_in_medium_hdl',
 'NMR_total_lipids_in_medium_ldl',
 'NMR_total_lipids_in_medium_vldl',
 'NMR_total_lipids_in_small_hdl',
 'NMR_total_lipids_in_small_ldl',
 'NMR_total_lipids_in_small_vldl',
 'NMR_total_lipids_in_vldl',
 'NMR_total_lipids_in_very_large_hdl',
 'NMR_total_lipids_in_very_large_vldl',
 'NMR_total_lipids_in_very_small_vldl',
 'NMR_total_phospholipids_in_lipoprotein_particles',
 'NMR_total_triglycerides',
 'NMR_triglycerides_in_chylomicrons_and_extremely_large_vldl',
 'NMR_triglycerides_in_hdl',
 'NMR_triglycerides_in_idl',
 'NMR_triglycerides_in_ldl',
 'NMR_triglycerides_in_large_hdl',
 'NMR_triglycerides_in_large_ldl',
 'NMR_triglycerides_in_large_vldl',
 'NMR_triglycerides_in_medium_hdl',
 'NMR_triglycerides_in_medium_ldl',
 'NMR_triglycerides_in_medium_vldl',
 'NMR_triglycerides_in_small_hdl',
 'NMR_triglycerides_in_small_ldl',
 'NMR_triglycerides_in_small_vldl',
 'NMR_triglycerides_in_vldl',
 'NMR_triglycerides_in_very_large_hdl',
 'NMR_triglycerides_in_very_large_vldl',
 'NMR_triglycerides_in_very_small_vldl',
 'NMR_tyrosine',
 'NMR_vldl_cholesterol',
 'NMR_valine'
]

In [32]:
AgeSex = ["age_at_recruitment", "sex"]

ASCVDnoblood = [
    "age_at_recruitment", 
    "sex",
    #"cholesterol", 
   # "hdl_cholesterol", 
    "systolic_blood_pressure", 
    "antihypertensives", 
    "diabetes2", 
    "smoking_status_2.0" # current smoker
] 

ASCVD = [
    "age_at_recruitment", 
    "sex",
    "cholesterol", 
    "hdl_cholesterol", 
    "systolic_blood_pressure", 
    "antihypertensives", 
    "diabetes2", 
    "smoking_status_2.0" # current smoker
] 

PANELnoblood = [ # 38 clinical predictors
    # basics
    "age_at_recruitment", 
    "sex", 
    'smoking_status_2.0', # current smoker
    'alcohol_intake_frequency_2.0', # 'Daily or almost daily'
    "daily_physical_activity",
    "education_years",
    "daily_healthy_food",
    
    # family history
    "fh_diabetes",
    
    # diagnoses
    "diabetes2",
        
    # physical
    "body_mass_index_bmi", 
    'waist_hip_ratio',
    "waist_circumference",
    "weight", 
    "standing_height", 
    "systolic_blood_pressure", 
    
    # medications
    'antihypertensives'
        ] 

PANELjustbloodcount = [ # 38 clinical predictors
    # basics
    "age_at_recruitment", 
    "sex", 
    'smoking_status_2.0', # current smoker
    'alcohol_intake_frequency_2.0', # 'Daily or almost daily'
    "daily_physical_activity",
    "education_years",
    "daily_healthy_food",
    
    # family history
    "fh_diabetes",
    
    # diagnoses
    "diabetes2",
        
    # physical
    "body_mass_index_bmi", 
    'waist_hip_ratio',
    "waist_circumference",
    "weight", 
    "standing_height", 
    "systolic_blood_pressure", 
        
    # Blood counts
    'red_blood_cell_erythrocyte_count',
    'white_blood_cell_leukocyte_count',
    'platelet_count',
    'haemoglobin_concentration',
    'haematocrit_percentage',
    'mean_corpuscular_haemoglobin',
    'mean_corpuscular_volume',
    'mean_corpuscular_haemoglobin_concentration',
    
    # medications
    'antihypertensives'
        ] 

PANEL = [ # 38 clinical predictors
    # basics
    "age_at_recruitment", 
    "sex", 
    'smoking_status_2.0', # current smoker
    'alcohol_intake_frequency_2.0', # 'Daily or almost daily'
    "daily_physical_activity",
    "education_years",
    "daily_healthy_food",
    
    # family history
    "fh_diabetes",
    
    # diagnoses
    "diabetes2",
        
    # physical
    "body_mass_index_bmi", 
    'waist_hip_ratio',
    "waist_circumference",
    "weight", 
    "standing_height", 
    "systolic_blood_pressure", 
    
    # lipids
    "cholesterol", 
    "ldl_direct", 
    "hdl_cholesterol",
    "triglycerides",
    
    # diabetes
    'glucose',
    'glycated_haemoglobin_hba1c',
    
    # kidney
    'creatinine',
    'cystatin_c',
    'urea',
    'urate',
        
    # liver
    'aspartate_aminotransferase',
    'alanine_aminotransferase',
    'alkaline_phosphatase',
    'albumin',
    
    # inflammation
    'creactive_protein',
    
    # Blood counts
    'red_blood_cell_erythrocyte_count',
    'white_blood_cell_leukocyte_count',
    'platelet_count',
    'haemoglobin_concentration',
    'haematocrit_percentage',
    'mean_corpuscular_haemoglobin',
    'mean_corpuscular_volume',
    'mean_corpuscular_haemoglobin_concentration',
    
    # medications
    'antihypertensives'
        ] 



In [33]:
# COX + logh(Metabolomics) + (logh(AgeSexMetabolomics))

In [34]:
%env MKL_NUM_THREADS=1
%env NUMEXPR_NUM_THREADS=1
%env OMP_NUM_THREADS=1

env: MKL_NUM_THREADS=1
env: NUMEXPR_NUM_THREADS=1
env: OMP_NUM_THREADS=1


In [41]:
ray.shutdown()

NameError: name 'ray' is not defined

In [None]:
import ray

ray.init(num_cpus=30)#, dashboard_port=24762, dashboard_host="0.0.0.0", include_dashboard=True)#, webui_url="0.0.0.0"))

In [37]:
@ray.remote
def get_feather(data_post, partition, split):
    return pd.read_feather(f"{data_post}/partition_{partition}/{split}/data_imputed_normalized.feather").assign(partition=partition, split=split)
    
def get_data():
    return [get_feather.remote(data_post, partition, split) for split in ["train", "valid", "test"] for partition in tqdm(range(22))]

In [None]:
%time
data_raw = pd.concat(ray.get(get_data()), axis=0)

## Add metabolomics PCA

In [39]:
def pca_transform(data_split, split, pca, pca_cols, metabolomics):
    pca_X = pca.transform(data_split[metabolomics].to_numpy())
    return pd.DataFrame(pca_X, columns=pca_cols, index=data_split.eid).assign(split=split)
    
@ray.remote
def fit_pca_partition(data_partition, partition, metabolomics, n_components):
    pca_cols = [f"NMR_PCA{component}" for component in range(n_components)]
    pca = PCA(n_components=n_components)
    x_train = data_partition.query("split=='train'")[metabolomics].to_numpy()
    pca.fit(x_train)
    print(pca.explained_variance_ratio_)
    
    pcas_splits = []
    for split in ["train", "valid", "test"]:
        data_split = data_partition.query("split==@split")
        pcas_splits.append(pca_transform(data_split, split, pca, pca_cols, metabolomics).assign(partition=partition))

    return pd.concat(pcas_splits, axis=0)

In [40]:
from sklearn.decomposition import PCA
n_components = 10
pcas_partitions = []
for partition in tqdm([int(p) for p in partitions]):
    data_partition = data_raw.query("partition==@partition")
    pcas_partitions.append(fit_pca_partition.remote(data_partition, partition, metabolomics, n_components))

  0%|          | 0/22 [00:00<?, ?it/s]

In [None]:
pca_data = pd.concat(ray.get(pcas_partitions), axis=0)

In [42]:
pca_data = pca_data.reset_index().set_index(["partition", "split", "eid"])
data_raw = data_raw.set_index(["partition", "split", "eid"])
preds_wide_fs = preds_wide_fs.set_index(["partition", "split", "eid"])

In [43]:
data_raw_pca = data_raw.merge(pca_data, left_index=True, right_index=True, how="left")

In [44]:
data_raw_us = data_raw_pca.merge(preds_wide_fs, left_index=True, right_index=True, how="left").reset_index()

## Add extra data

In [45]:
decoded = pd.read_feather(f"{base_path}/data/1_decoded/ukb_data_210517.feather")

In [46]:
from sklearn.preprocessing import StandardScaler
extra_data = decoded[["eid", 'waist_circumference_f48_0_0', 'hip_circumference_f49_0_0', 'duration_of_moderate_activity_f894_0_0', 'duration_of_vigorous_activity_f914_0_0']]

extra_data['waist_circumference_f48_0_0'] = extra_data['waist_circumference_f48_0_0'].fillna(extra_data['waist_circumference_f48_0_0'].mean())
extra_data['hip_circumference_f49_0_0'] = extra_data['hip_circumference_f49_0_0'].fillna(extra_data['hip_circumference_f49_0_0'].mean())

extra_data['duration_of_moderate_activity_f894_0_0'] = extra_data['duration_of_moderate_activity_f894_0_0'].fillna(0).replace(-1, 0).replace(-3, 0)
extra_data['duration_of_vigorous_activity_f914_0_0'] = extra_data['duration_of_vigorous_activity_f914_0_0'].fillna(0).replace(-1, 0).replace(-3, 0)

#extra_data["waist_circumference"] = extra_data['waist_circumference_f48_0_0']
extra_data["waist_hip_ratio"] = extra_data['waist_circumference_f48_0_0']/extra_data['hip_circumference_f49_0_0']                                           
extra_data["daily_physical_activity"] = extra_data['duration_of_moderate_activity_f894_0_0']+extra_data['duration_of_vigorous_activity_f914_0_0']

In [47]:
score_data = pd.read_feather(f"{data_results_path}/add_data_211005.feather")
extra_data = extra_data.set_index("eid").merge(score_data.set_index("eid"), left_index=True, right_index=True, how="left").reset_index()

In [None]:
extra_data = extra_data[["eid"]+["waist_hip_ratio", "daily_physical_activity", "education_years", "daily_healthy_food"]]
extra_data

In [49]:
extra_data.to_feather(f"{data_results_path}/extra_data_220126.feather")

In [50]:
data_raw_us = data_raw_us.set_index("eid").merge(extra_data.set_index("eid"), left_index=True, right_index=True, how="left").reset_index()

## Standardize logh and extra data

In [51]:
def norm_variables(data_partition, variables):
    norm = StandardScaler(with_mean=True, with_std=True, copy=True).fit(data_partition.query("split=='train'")[variables].values)

    data_partition[variables] = norm.transform(data_partition[variables].values)
    
    return data_partition

def norm_logh_and_extra(data_all, variables):
    data_new = data_all.copy()
    dfs = []
    for partition in tqdm([int(p) for p in partitions]):
        data_partition = data_new.query("partition==@partition")
        dfs.append(norm_variables(data_partition, variables))
    return pd.concat(dfs, axis=0)

In [52]:
variables_to_norm = ["waist_hip_ratio", "daily_physical_activity", "education_years"] + [col for col in data_raw_us.columns if "logh" and "_Metabolomics" in col]
data_all = norm_logh_and_extra(data_raw_us, variables_to_norm)

  0%|          | 0/22 [00:00<?, ?it/s]

In [53]:
data_all = data_all.reset_index()

In [54]:
data_all.to_feather(f"{data_results_path}/data_all_COX_{run}_metabolomics.feather")

# Train COX

In [55]:
modules = ["PCA", "COX", "DS"]

In [56]:
partitions=data_all.partition.unique().tolist()

In [57]:
data_train = data_all.query("split=='train'") 
data_test = data_all.query("split=='test'")

In [58]:
version=f"COX_{run}"
dump_path = f"{data_post}/{version}"
pathlib.Path(dump_path).mkdir(parents=True, exist_ok=True)

In [76]:
from lifelines import CoxPHFitter
from dask.diagnostics import ProgressBar
from lifelines.exceptions import ConvergenceError
import pickle

def flatten_dict(features):
    df = pd.json_normalize(d, sep='_')
    t = list(df.to_dict(orient='records')[0].values())
    flat_list = [item for sublist in t for item in sublist]
    return list(set(flat_list))

def flatten_list(l):
    return [item for sublist in l for item in sublist]

def get_all_features(metabolomics, endpoints):
    f_dicts = flatten_list([[ds for ds in get_features(metabolomics, endpoint).values()] for endpoint in endpoints])
    fs = list(sorted(set(flatten_list(flatten_list([list(p.values()) for p in f_dicts])))))
    return fs + [f"{endpoint}_event" for endpoint in endpoints] + [f"{endpoint}_event_time" for endpoint in endpoints]

def get_features(metabolomics, endpoint):
    features = {"COX": {"Metabolomics": metabolomics,
                        "Age+Sex": AgeSex, 
                        "ASCVDnoblood": ASCVDnoblood,
                        "ASCVD": ASCVD,
                        "PANELnoblood": PANELnoblood,
                        "PANELjustbloodcount": PANELjustbloodcount,
                        "PANEL": PANEL,
                       },
                "PCA": {"Metabolomics": [f'NMR_PCA{i}' for i in range(10)]},
                
                "DS":  {"Metabolomics": [f"logh_{endpoint}_Metabolomics"], 
                        "Age+Sex+Metabolomics": AgeSex+[f"logh_{endpoint}_Metabolomics"], 
                        "ASCVDnoblood+Metabolomics": ASCVDnoblood+[f"logh_{endpoint}_Metabolomics"],
                         "ASCVD+Metabolomics": ASCVD+[f"logh_{endpoint}_Metabolomics"],
                        "PANELnoblood+Metabolomics": PANELnoblood+[f"logh_{endpoint}_Metabolomics"],
                        "PANELjustbloodcount+Metabolomics": PANELjustbloodcount+[f"logh_{endpoint}_Metabolomics"],
                        "PANEL+Metabolomics": PANEL+[f"logh_{endpoint}_Metabolomics"],
                       }
               }
    
    return features
    
def fit_cox(train, endpoint, penalizer, step_size):
    cph = CoxPHFitter(penalizer=penalizer)
    cph.fit(train, f"{endpoint}_event_time", f"{endpoint}_event", step_size=step_size, show_progress=True)
    return cph

@ray.remote
def fit_partition(train_partition, endpoint, module, features, partition):
    for feature_set, covariates in tqdm(features[module].items()):
        cph_path = f"{dump_path}/{module}_{endpoint}_{feature_set}_{partition}.p"
        if endpoint == "M_type_2_diabetes":
            if "diabetes1" in covariates: covariates = [c for c in covariates if c!="diabetes1"]
            if "diabetes2" in covariates: covariates = [c for c in covariates if c!="diabetes2"]
        if endpoint in ["M_prostate_cancer", "M_ovarian_cancer", "M_breast_cancer", "M_uterus_cancer"]:
            if "sex" in covariates: covariates = [c for c in covariates if c!="sex"]
        if not os.path.isfile(cph_path):
            try:
                cph = fit_cox(train_partition[covariates + [f"{endpoint}_event", f"{endpoint}_event_time"]], endpoint, penalizer=0.0, step_size=0.5)
                pickle.dump(cph, open(cph_path, "wb"))
            except ConvergenceError:
                print("ConvergenceError", module, endpoint, feature_set, partition, "problem: reduce step size")
                try:
                    cph = fit_cox(train_partition[covariates + [f"{endpoint}_event", f"{endpoint}_event_time"]], endpoint, penalizer=0.0, step_size=0.1)
                    pickle.dump(cph, open(cph_path, "wb"))
                    print("ConvergenceError", module, endpoint, feature_set, partition, "trying with reduced step size ... successfull")
                except:
                    print("ConvergenceError", module, endpoint, feature_set, partition, "trying with reduced step size ... failed")
    return True

def train_cox(endpoints, modules, partitions, data_train):
    frame_dict = {}
    features_all = get_all_features(metabolomics, endpoints)
    data_train = data_train[["eid", "partition", "split"]+features_all]
    with ProgressBar():   
        for module in tqdm(modules):
            for endpoint in tqdm(endpoints):
                features = get_features(metabolomics, endpoint)
                eids_incl = eids_dict[endpoint] 
                train_endpoint = data_train.query("eid==@eids_incl")
                for partition in partitions:
                    train_partition = train_endpoint.query("partition==@partition")
                    frame_dict[f"{module}_{endpoint}_{partition}"] = fit_partition.remote(train_partition, endpoint, module, features, partition)
                    del train_partition
    return frame_dict

In [56]:
ray.shutdown()

In [None]:
endpoints

In [None]:
frame_dict = train_cox(endpoints, modules, partitions, data_train)

In [None]:
ray.shutdown()

## Apply COX to data

In [91]:
import joblib
@ray.remote
def get_cph(path): 
    with open(path,'rb') as f:
        cph = pickle.load(f)
    return cph#return pd.read_csv(f"{path[:-8]}.csv", index_col=0)

In [211]:
ray.shutdown()

In [None]:
import glob, os
cph_dict = {}
for path in tqdm(glob.glob(f"{dump_path}/*.p")):
    cph_dict[pathlib.Path(path).stem] = get_cph.remote(path)

In [80]:
for path in tqdm(glob.glob(f"{dump_path}/*.p")):
    cph_dict[pathlib.Path(path).stem] = ray.get(cph_dict[pathlib.Path(path).stem])

  0%|          | 0/9504 [00:00<?, ?it/s]

In [81]:
def predict_cox(endpoints, modules, partitions, data_test, cph_dict):
    times = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
    time_cols = {t: f"Ft_{t}" for t in times}
    
    results = []
    test_preds = []
    for module in tqdm(modules):
        for endpoint in tqdm(endpoints):
            features = get_features(metabolomics, endpoint)
            eids_incl = eids_dict[endpoint] 
            test_endpoint = data_test.query("eid==@eids_incl")
            for partition in partitions:
                test_partition = test_endpoint.query("partition==@partition")
                for feature_set, covariates in features[module].items():
                    #print(endpoint, feature_set)
                    if endpoint == "M_type_2_diabetes":
                        if "diabetes1" in covariates: covariates = [c for c in covariates if c!="diabetes1"]
                        if "diabetes2" in covariates: covariates = [c for c in covariates if c!="diabetes2"]
                    if endpoint in ["M_prostate_cancer", "M_ovarian_cancer", "M_breast_cancer", "M_uterus_cancer"]:
                        if "sex" in covariates: covariates = [c for c in covariates if c!="sex"]
                    cph = cph_dict[f"{module}_{endpoint}_{feature_set}_{partition}"]
                    surv_test = 1-cph.predict_survival_function(test_partition[covariates+ [f"{endpoint}_event", f"{endpoint}_event_time"]], times=times) 
                    temp_pred = test_partition.reset_index()[["eid"]].assign(endpoint=endpoint, module=module, features=feature_set, partition=partition)
                    for t, col in time_cols.items(): temp_pred[col] = surv_test.T[t].to_list()
                    test_preds.append(temp_pred)
                    results.append({"endpoint": endpoint,
                                    "module": module,
                                    "features": feature_set,
                                    "HR_dict": cph.hazard_ratios_.to_dict(),
                                    "partition": partition,
                                    "cph": cph,
                                   })
    return results, test_preds

In [None]:
results, test_preds = predict_cox(endpoints, modules, partitions, data_test, cph_dict)

## Save Predictions

In [83]:
predictions = pd.concat(test_preds, axis=0).reset_index()

In [84]:
predictions.to_feather(f"{data_results_path}/predictions_{run}_metabolomics.feather")

In [None]:
predictions.value_counts(["module", "features"])

In [None]:
predictions.value_counts(["endpoint"])

## Save COX models

In [87]:
results_df = pd.DataFrame().append(results, ignore_index=True)

In [None]:
results_df

In [89]:
def get_hr(hr_dict, endpoint): 
    if f"logh_{endpoint}_Metabolomics" in hr_dict: 
        hr = hr_dict[f"logh_{endpoint}_Metabolomics"]
    else:
        hr = np.nan
    return hr

In [90]:
import pickle
results_df["HR_metabolomics"] = [get_hr(hr_dict, endpoint) for endpoint, hr_dict in zip(results_df["endpoint"], results_df["HR_dict"])]
results_df.drop(columns=["cph"]).to_feather(f"{data_results_path}/cox_{run}_metabolomics.feather")

In [87]:
import glob
import pickle
import re

In [88]:
def _extract_logh(d):
    for k, v in d.items():
        if k.startswith('logh_'):
            return v

In [91]:
# load models and put HRs into a df:
df_dict = {"endpoint": [],
           "partition": [],
           "features": [],
           "HR_Metabolomics": []
          }

for fp in glob.glob(os.path.join(dump_path,"DS_*.p")):
    try:
        res = re.search("(M_[a-zA-Z_]+[2a-zA-Z_]+)_([3a-zA-Z+]+)_(\d+)", fp).groups()

        df_dict['endpoint'].append(res[0])
        df_dict['features'].append(res[1])
        df_dict['partition'].append(res[2])

        cph = pickle.load(open(fp, "rb"))
        HR = _extract_logh(cph.hazard_ratios_.to_dict())
    #     HR = cph.hazard_ratios_.to_dict()[f'logh_{res[0]}_Metabolomics']
        df_dict['HR_Metabolomics'].append(HR)
    except:
        print(fp)

In [93]:
HR_df = pd.DataFrame.from_dict(df_dict, orient="columns")
HR_df['module'] = 'DS'

In [75]:
HR_df.to_feather(f"{data_results_path}/MET_HRs_210819_metabolomics.feather")

In [44]:
results_df = results_df.assign(HR_logh=results_df['hazard_ratio'].apply(_extract_logh))

In [45]:
results_df['features'].unique()

array(['Age+Sex', 'Metabolomics', 'Age+Sex+Metabolomics',
       'AgeSexMetabolomics'], dtype=object)

In [37]:
results_df[["endpoint", "module", "features", 'HR_logh']].reset_index().to_feather(f"{data_results_path}/MET_HRs_{run}_metabolomics.feather")

In [None]:
raise NotImplementedError()

In [38]:
# Other stuff

In [None]:
cph = results_df.set_index(["endpoint", "module", "features"]).loc[('M_MACE', "DS", "Age+Sex+Metabolomics"), "cph"].iloc[0]

In [None]:
import plotly.express as px
scores_plot = ["COX_AgeSex", "DeepSurv_AgeSexMetabolomics"]
temp = results_df.assign(score = lambda x: x.module + "_" + x.features).query("score==@scores_plot")
fig = px.scatter(temp, y="cindex", x="score", color="features", facet_col="endpoint", facet_col_wrap=12, template="plotly_dark",
               category_orders={"features": ["COX_Metabolomics", "DeepSurv_Metabolomics", 
                                             "COX_AgeSex", 
                                             "COX_AgeSexMetabolomics", "DeepSurv_AgeSexMetabolomics"]})
fig.update_xaxes(matches=None)
#fig.update_yaxes(matches=None)
fig.show("png", width=1500, height=1000)

In [None]:
import plotly.express as px
temp = results_df.assign(score = lambda x: x.module + "_" + x.features)
fig = px.scatter(temp, y="cindex", x="score", color="features", facet_col="endpoint", facet_col_wrap=5, template="plotly_dark",
               category_orders={"features": ["COX_Metabolomics", "DeepSurv_Metabolomics", 
                                             "COX_AgeSex", 
                                             "COX_AgeSexMetabolomics", "DeepSurv_AgeSexMetabolomics"]})
fig.update_xaxes(matches=None)
#fig.update_yaxes(matches=None)
fig.show("png", width=1500, height=1000)

In [None]:
import plotly.express as px
temp = results_df.assign(score = lambda x: x.module + "_" + x.features)
fig = px.scatter(temp, y="cindex", x="score", color="features", facet_col="endpoint", facet_col_wrap=5, template="plotly_dark",
               category_orders={"features": ["COX_Metabolomics", "DeepSurv_Metabolomics", 
                                             "COX_AgeSex", "DeepSurv_AgeSex",
                                             "COX_AgeSexMetabolomics", "DeepSurv_AgeSexMetabolomics"]})
fig.update_xaxes(matches=None)
#fig.update_yaxes(matches=None)
fig.show("png", width=1500, height=1000)

In [None]:
dpath = "/data/analysis/ag-reils/steinfej/data/3_datasets_post/210730_imaging_visit"
for partition in range(5):
    for split in ["train", "valid", "test"]:
        ddf = pd.read_feather(f"{dpath}/partition_{partition}/{split}/data.feather")
        print(partition, split, len(ddf))

In [None]:
data_X = pd.read_feather("/data/analysis/ag-reils/ag-reils-shared/cardioRS/data/3_datasets_post/210730_imaging_visit/data_merged.feather")

In [None]:
data_X.set_index("uk_biobank_assessment_centre_2_0").index.value_counts().index.to_list()

In [None]:
data_X.set_index("uk_biobank_assessment_centre_2_0").index.value_counts()

In [None]:
import plotly.express as px
temp = results_df.assign(score = lambda x: x.module + "_" + x.features)
fig = px.scatter(temp, y="cindex", x="score", color="features", facet_col="endpoint", facet_col_wrap=5, template="plotly_dark",
               category_orders={"features": ["COX_Metabolomics", "DeepSurv_Metabolomics", 
                                             "COX_AgeSex", "DeepSurv_AgeSex",
                                             "COX_AgeSexMetabolomics", "DeepSurv_AgeSexMetabolomics"]})
fig.update_xaxes(matches=None)
#fig.update_yaxes(matches=None)
fig.show("png", width=1500, height=1000)

In [None]:
import plotly.express as px
temp = results_df.assign(score = lambda x: x.module + "_" + x.features)
fig = px.scatter(temp, y="cindex", x="score", color="features", facet_col="endpoint", facet_col_wrap=5, template="plotly_dark",
               category_orders={"features": ["COX_Metabolomics", "DeepSurv_Metabolomics", 
                                             "COX_AgeSex", "DeepSurv_AgeSex",
                                             "COX_AgeSexMetabolomics", "DeepSurv_AgeSexMetabolomics"]})
fig.update_xaxes(matches=None)
#fig.update_yaxes(matches=None)
fig.show("png", width=1500, height=1000)

In [None]:
import plotly.express as px
temp = results_df.assign(score = lambda x: x.module + "_" + x.features)
fig = px.scatter(temp, y="cindex", x="score", color="features", facet_col="endpoint", facet_col_wrap=5, template="plotly_dark",
               category_orders={"features": ["COX_Metabolomics", "DeepSurv_Metabolomics", 
                                             "COX_AgeSex", "DeepSurv_AgeSex",
                                             "COX_AgeSexMetabolomics", "DeepSurv_AgeSexMetabolomics"]})
fig.update_xaxes(matches=None)
fig.update_yaxes(matches=None)
fig.show("png", width=1500, height=1000)

In [None]:
import plotly.express as px
temp = results_df.assign(score = lambda x: x.module + "_" + x.features)
fig = px.scatter(temp, y="cindex", x="score", color="features", facet_col="endpoint", facet_col_wrap=5, template="plotly_dark",
               category_orders={"features": ["COX_Metabolomics", "DeepSurv_Metabolomics", 
                                             "COX_AgeSex", "DeepSurv_AgeSex",
                                             "COX_AgeSexMetabolomics", "DeepSurv_AgeSexMetabolomics"]})
fig.update_xaxes(matches=None)
fig.update_yaxes(matches=None)
fig.show("png", width=1500, height=1000)

In [None]:
import plotly.express as px
temp = results_df.assign(score = lambda x: x.module + "_" + x.features)
fig = px.scatter(temp, y="cindex", x="score", color="features", facet_col="endpoint", facet_col_wrap=5, template="plotly_dark",
               category_orders={"features": ["COX_Metabolomics", "DeepSurv_Metabolomics", 
                                             "COX_AgeSex", "DeepSurv_AgeSex",
                                             "COX_AgeSexMetabolomics", "DeepSurv_AgeSexMetabolomics"]})
fig.update_xaxes(matches=None)
fig.update_yaxes(matches=None)
fig.show("png", width=1500, height=1000)

In [None]:
import plotly.express as px
temp = results_df.assign(score = lambda x: x.module + "_" + x.features)
fig = px.scatter(temp, y="cindex", x="score", color="features", facet_col="endpoint", facet_col_wrap=5, template="plotly_dark",
               category_orders={"features": ["COX_Metabolomics", "DeepSurv_Metabolomics", 
                                             "COX_AgeSex", "DeepSurv_AgeSex",
                                             "COX_AgeSexMetabolomics", "DeepSurv_AgeSexMetabolomics"]})
fig.update_xaxes(matches=None)
fig.update_yaxes(matches=None)
fig.show("png", width=1500, height=1000)

In [None]:
import plotly.express as px
temp = results_df.assign(score = lambda x: x.module + "_" + x.features)
fig = px.scatter(temp, y="cindex", x="score", color="features", facet_col="endpoint", facet_col_wrap=5, template="plotly_dark",
               category_orders={"features": ["COX_Metabolomics", "DeepSurv_Metabolomics", 
                                             "COX_AgeSex", "DeepSurv_AgeSex",
                                             "COX_AgeSexMetabolomics", "DeepSurv_AgeSexMetabolomics"]})
fig.update_xaxes(matches=None)
#fig.update_yaxes(matches=None)
fig.show("png", width=1500, height=1000)

In [None]:
import plotly.express as px
temp = results_df.assign(score = lambda x: x.module + "_" + x.features)
fig = px.violin(temp, y="cindex", x="features", color="features", box=True, points="all", facet_col="endpoint", facet_col_wrap=4,
               category_orders={"features": ["Metabolomics", "AgeSex", "AgeSexMetabolomics"]})
fig.update_xaxes(matches=None)
fig.show("png", width=1500, height=1000)

In [None]:
preds.to_feather(f"{data_results_path}/predictions_model_210720.feather")

In [None]:
from lifelines import RegressionFitter

In [None]:
self._central_values = self._compute_central_values_of_raw_training_data(df, self.strata)
self.baseline_survival_ = self._compute_baseline_survival()
baseline_hazard_ = self._compute_baseline_hazards(predicted_partial_hazards_)
baseline_cumulative_hazard_ = self._compute_baseline_cumulative_hazard(baseline_hazard_)

In [None]:
cph_semi = CoxPHFitter().fit(rossi, 'week', event_col='arrest')
cph_piecewise = CoxPHFitter(baseline_estimation_method="piecewise", breakpoints=[20, 35]).fit(rossi, 'week', event_col='arrest')

ax = cph_spline.baseline_cumulative_hazard_.plot()