# Benchmarks

## Initialize

In [None]:
import os
import math
import pathlib
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.feather as feather
from tqdm.auto import tqdm
from IPython.display import clear_output

import warnings
from lifelines.utils import CensoringType
from lifelines.utils import concordance_index

In [None]:
import ray
# Drop any Ray connection this kernel still holds; the notebook calls ray.init again further down
# (presumably so that re-running the notebook starts from a clean state).
ray.shutdown()

In [None]:
import glob
import os

# Find the entry of the CoxPH models directory with the newest os.path.getctime timestamp.
# The trailing "*" matches every entry; use a pattern such as "*.csv" to restrict the search to one format.
list_of_files = glob.glob("/sc-projects/sc-proj-ukb-cvd/results/projects/22_medical_records/data/220531/coxph/models/*")
# default=None keeps the cell from raising ValueError when the directory is empty or does not exist.
latest_file = max(list_of_files, key=os.path.getctime, default=None)
print(latest_file)

In [None]:
!ls -t | head -n 5 "/sc-projects/sc-proj-ukb-cvd/results/projects/22_medical_records/data/220531/coxph/models"

In [None]:
node = !hostname
if "sc" in node[0]:
    base_path = "/sc-projects/sc-proj-ukb-cvd"
else: 
    base_path = "/data/analysis/ag-reils/ag-reils-shared/cardioRS"
print(base_path)

project_label = "22_medical_records"
project_path = f"{base_path}/results/projects/{project_label}"
figure_path = f"{project_path}/figures"
output_path = f"{project_path}/data"

pathlib.Path(figure_path).mkdir(parents=True, exist_ok=True)
pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)

experiment = 220531
experiment_path = f"{output_path}/{experiment}"
pathlib.Path(experiment_path).mkdir(parents=True, exist_ok=True)

In [None]:
import ray
# Connect with address='auto'; per the original note a Ray head node is started beforehand in a terminal:
#   ray start --head --port=6379 --num-cpus 64
# Options that were tried and left disabled:
#   dashboard_port=24762, dashboard_host="0.0.0.0", include_dashboard=True, webui_url="0.0.0.0"
ray.init(address='auto')
# Disabled alternative: initialise Ray directly with a fixed CPU count instead of address='auto'.
#ray.init(num_cpus=32)

In [None]:
# Load the endpoint metadata of this experiment and keep the values of its
# `endpoint` column as a sorted list.
endpoints_md = pd.read_csv(f"{experiment_path}/endpoints.csv")
endpoints = endpoints_md.endpoint.to_list()
endpoints.sort()

In [None]:
# Directory the CoxPH prediction files of this experiment are read from (see read_predictions).
out_path = f"{experiment_path}/coxph/predictions"
# Create it, including missing parents; an already existing directory is not an error.
pathlib.Path(out_path).mkdir(parents=True, exist_ok=True)

In [None]:
from sklearn.preprocessing import StandardScaler
import pickle
import zstandard

def read_data(fp_in):
    """Read a feather file and return its contents indexed by ``eid``.

    Args:
        fp_in: Location of the feather file; it is formatted into a string
            before being handed to ``pd.read_feather``.

    Returns:
        The loaded DataFrame with the ``eid`` column moved into the index.
    """
    frame = pd.read_feather(f"{fp_in}")
    return frame.set_index("eid")
    
def save_pickle(data, data_path):
    """Pickle ``data`` and write it zstandard-compressed to ``data_path``.

    Args:
        data: Object to serialise; it is pickled with the highest protocol
            the running interpreter supports.
        data_path: Destination file, opened in binary write mode (an existing
            file is overwritten).
    """
    with open(data_path, "wb") as sink:
        with zstandard.ZstdCompressor().stream_writer(sink) as zwriter:
            # Serialise in one go, then push the whole payload through the compressor.
            payload = pickle.dumps(data, protocol=pickle.HIGHEST_PROTOCOL)
            zwriter.write(payload)
    
def read_predictions(endpoint, feature_set, partition):
    """Load the prediction file of one endpoint / feature set / partition.

    The file is looked up in the module-level ``out_path`` directory under the
    name ``{endpoint}_{feature_set}_{partition}.feather`` and loaded with
    ``read_data``.

    Args:
        endpoint: Endpoint part of the file name.
        feature_set: Feature-set part of the file name.
        partition: Partition part of the file name.

    Returns:
        Whatever ``read_data`` returns for that file.
    """
    return read_data(f"{out_path}/{endpoint}_{feature_set}_{partition}.feather")

In [None]:
# Probe every endpoint / feature set / partition combination and record whether its
# prediction file can be loaded. Each combination contributes one row to `d`.
d = []
for endpoint in tqdm(endpoints):
    for feature_set in ["Age+Sex", "MedicalHistory", "Age+Sex+MedicalHistory", "Age+Sex+MedicalHistory+I(Age*MH)"]:
        for partition in range(22):
            try:
                # Only success or failure matters here; the loaded frame is discarded.
                read_predictions(endpoint, feature_set, partition)
            except Exception:
                # A missing or unreadable file counts as "not available". Catching Exception
                # rather than using a bare `except` lets KeyboardInterrupt stop this long loop.
                available = False
            else:
                available = True
            d.append({"endpoint": endpoint, "features": feature_set, "partition": partition, "available": available})

In [None]:
# Count, per feature set, how many prediction files were flagged as available.
availability = pd.DataFrame(d)
availability.groupby("features")["available"].sum().to_frame()