# Benchmarks

## Initialize

In [None]:
%load_ext autoreload
%autoreload 2

import os
import math
import pathlib
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from IPython.display import clear_output

import warnings
from lifelines.utils import CensoringType
from lifelines.utils import concordance_index

In [None]:
# Root of the user's workspace; all data lives below it.
base_path = "/home/jakobs"
project_path = base_path + "/data"

# The experiment tag doubles as the name of the experiment's output directory.
experiment = "230321"
experiment_path = "/".join([project_path, experiment])

# Make sure the experiment directory exists before anything is written to it.
pathlib.Path(experiment_path).mkdir(parents=True, exist_ok=True)

partitions = list(range(10))

In [None]:
# Date tag embedded in the output file names; when set to None, a later cell
# replaces it with the current date.
today = '230321'

In [None]:
# Endpoint metadata, indexed by endpoint name.
endpoints_md = (
    pd.read_feather(f"{base_path}/data/endpoints_epic_md.feather")
    .set_index("endpoint")
)

# Plain list of every endpoint name found in the metadata.
endpoints = endpoints_md.index.tolist()

In [None]:
# Earlier source of the covariates, kept for reference:
# data_covariates = pd.read_feather("/sc-projects/sc-proj-ukb-cvd/data/2_datasets_pre/211110_anewbeginning/baseline_covariates_220503.feather")[["eid", "sex_f31_0_0"]].set_index("eid")
# data_covariates.head()

# Only the "sex" column is kept, indexed by eid.
data_covariates = (
    pd.read_feather(f"{experiment_path}/data_covariates_full.feather")
    .loc[:, ["eid", "sex"]]
    .set_index("eid")
)

In [None]:
# Long-format outcomes table, indexed by eid.
data_outcomes = (
    pd.read_feather(f"{base_path}/data/data_outcomes_long_230320.feather")
    .set_index("eid")
)
data_outcomes.head()

In [None]:
# Attach the sex covariate to every outcome row (left join on the eid index),
# then re-key the combined table by endpoint while keeping eid as a column.
data_all = (
    data_outcomes
    .merge(data_covariates, how="left", left_index=True, right_index=True)
    .reset_index()
    .set_index("endpoint")
)

In [None]:
# Preview the merged outcomes/covariates table (indexed by endpoint).
data_all.head()

In [None]:
# Split the combined table into one eid-indexed frame per endpoint.
data_dict = {
    name: rows.reset_index(drop=True).set_index("eid")
    for name, rows in data_all.groupby('endpoint')
}
# data_dict.keys()

In [None]:
#endpoints

In [None]:
# Show the distinct values of the metadata "sex" column; get_eligible_eids
# treats the value "Both" as "no sex restriction".
endpoints_md.sex.unique()

In [None]:
def get_eligible_eids(data_dict, endpoint, endpoints_metadata=None):
    """Collect the eids that are eligible for ``endpoint``.

    A row is eligible when its ``prev`` value equals 0 and, unless the
    endpoint's ``sex`` entry in the metadata is ``"Both"``, its ``sex`` value
    equals that metadata entry.

    Args:
        data_dict: Mapping from endpoint name to a DataFrame indexed by eid
            that has at least the columns ``prev`` and ``sex``.
        endpoint: Endpoint name; must be a key of ``data_dict`` and a label of
            the metadata index.
        endpoints_metadata: DataFrame indexed by endpoint name with a ``sex``
            column. When omitted, the notebook-level ``endpoints_md`` is used,
            which keeps existing two-argument calls working unchanged.

    Returns:
        dict with the keys ``"endpoint"`` (the name passed in), ``"n_eids"``
        (number of eligible eids) and ``"eid_list"`` (the eids, in the row
        order of the endpoint's frame).

    Raises:
        KeyError: If ``endpoint`` is missing from ``data_dict`` or from the
            metadata index.
    """
    if endpoints_metadata is None:
        endpoints_metadata = endpoints_md

    data_temp = data_dict[endpoint]
    # Single .loc lookup instead of chained indexing.
    eligibility = endpoints_metadata.loc[endpoint, "sex"]

    # Boolean masks avoid copying the whole frame and do not depend on
    # query-string parsing or on resolving a local variable via "@".
    eligible = data_temp["prev"] == 0
    if eligibility != "Both":
        eligible = eligible & (data_temp["sex"] == eligibility)

    eids_incl = data_temp.index[eligible.to_numpy()].to_list()

    return {"endpoint": endpoint,
            "n_eids": len(eids_incl),
            "eid_list": eids_incl}

In [None]:
# Optional pre-filter to the endpoints that occur in the outcomes table
# (currently disabled):
# long_endpoints_unique = data_outcomes['endpoint'].unique()
# endpoints_in_long = [e for e in endpoints if e in long_endpoints_unique]

# One eligibility record per endpoint, then one DataFrame row per record.
d_list = [
    get_eligible_eids(data_dict, name)
    for name in tqdm(endpoints)
]
eid_df = pd.DataFrame(d_list)

In [None]:
#eid_df.set_index("endpoint")["eid_list"].to_dict()

In [None]:
from datetime import date

# Fall back to the current date when no explicit tag was configured.
if today is None:
    today = str(date.today())

In [None]:
# Preview the path that the eligible-eid table is written to in the next cell.
# (Previously used the undefined name `output_path` and a misspelled file name.)
f"{experiment_path}/eligible_eids_{today}.feather"

In [None]:
# Write the per-endpoint table (columns endpoint, n_eids, eid_list) into the
# experiment directory, tagged with `today`.
eid_df.to_feather(f"{experiment_path}/eligible_eids_{today}.feather") 

In [None]:
# Long format: one row per (endpoint, eid) pair, both stored as categoricals.
eid_df_long = (
    eid_df[["endpoint", "eid_list"]]
    .explode("eid_list")
    .reset_index(drop=True)
    .rename(columns={"eid_list": "eid"})
    .astype({"endpoint": "category", "eid": "category"})
)

In [None]:
# Write the long-format (endpoint, eid) table into the experiment directory,
# tagged with `today`.
eid_df_long.to_feather(f"{experiment_path}/eligible_eids_long_{today}.feather")

In [None]:
# Display the long-format table for a quick visual check.
eid_df_long