# Benchmark Models

## Initialize

In [7]:
%load_ext autoreload
%autoreload 2

import os
from tqdm.auto import tqdm
import pathlib
import datetime
import subprocess
import numpy as np
import pandas as pd
import lifelines

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
# Root directory; all project data lives in its `data` sub-directory.
base_path = "/home/jakobs"
project_path = f"{base_path}/data"

# Experiment identifier; it also names the experiment's output directory.
experiment = '230323'
experiment_path = f"{project_path}/{experiment}"
os.makedirs(experiment_path, exist_ok=True)

partitions = list(range(10))

# The date tag is pinned to the experiment identifier.
today = experiment

In [9]:
endpoint_defs = pd.read_feather(f"{base_path}/data/endpoints_epic_md.feather").set_index("endpoint")
endpoints = endpoint_defs.index.to_list()

In [10]:
from scripts.benchmarks_iteration import load_data, calculate_iteration

In [11]:
# Set the MKL / NumExpr / OpenMP thread-count environment variables to "1".
os.environ.update(
    dict.fromkeys(('MKL_NUM_THREADS', 'NUMEXPR_NUM_THREADS', 'OMP_NUM_THREADS'), "1")
)

In [12]:
# Serial benchmark run: one calculate_iteration() call per (iteration, endpoint).
rows_ray = []
iterations = list(range(100))
model = "RetinaUKB"
t_eval = 10
# Only the first ten entries of `iterations` are processed here.
for iteration in tqdm(iterations[:10]):
    # load_data() is called anew for every iteration and rebinds these
    # module-level names (including `experiment_path` and `endpoints`).
    (
        output_path,
        experiment_path,
        in_path,
        out_path,
        endpoints,
        scores,
        prediction_paths,
        eids_dict,
    ) = load_data()
    for endpoint in tqdm(endpoints):
        eids_e = eids_dict[endpoint]

        # Ray variant (disabled): calculate_iteration.remote(...) with an
        # additional `partition` argument.

        # Direct, non-Ray call.
        ds = calculate_iteration(
            in_path,
            prediction_paths,
            endpoint,
            scores,
            model,
            t_eval,
            iteration,
            eids_e,
            output_path,
        )
        rows_ray.append(ds)

        del eids_e

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/368 [00:00<?, ?it/s]

  0%|          | 0/368 [00:00<?, ?it/s]

  0%|          | 0/368 [00:00<?, ?it/s]

  0%|          | 0/368 [00:00<?, ?it/s]

  0%|          | 0/368 [00:00<?, ?it/s]

  0%|          | 0/368 [00:00<?, ?it/s]

  0%|          | 0/368 [00:00<?, ?it/s]

  0%|          | 0/368 [00:00<?, ?it/s]

  0%|          | 0/368 [00:00<?, ?it/s]

  0%|          | 0/368 [00:00<?, ?it/s]

In [13]:
# The results were computed without Ray, so no futures need resolving
# (the Ray variant would call ray.get on every entry).
rows = rows_ray
# Flatten one nesting level: every entry of `rows` is iterated and its items
# are appended to a single list.
rows_finished = []
for sublist in rows:
    rows_finished.extend(sublist)
benchmark_endpoints = pd.DataFrame(rows_finished)

benchmark_endpoints.to_feather(f"{experiment_path}/benchmarks_230323.feather")

In [14]:
benchmark_endpoints.groupby("score")["cindex"].mean().to_frame("cindex").sort_values("cindex")

Unnamed: 0_level_0,cindex
score,Unnamed: 1_level_1
Retina,0.558892
SCORE2,0.59172
Age+Sex,0.592812
ASCVD,0.593736
Age+Sex+Retina,0.600319
SCORE2+Retina,0.600747
ASCVD+Retina,0.602278


In [14]:
benchmark_endpoints.groupby("score")["cindex"].mean().to_frame("cindex").sort_values("cindex")

Unnamed: 0_level_0,cindex
score,Unnamed: 1_level_1
Retina,0.550327
Age+Sex,0.592258
SCORE2,0.593263
ASCVD,0.595019
Age+Sex+Retina,0.59846
SCORE2+Retina,0.600798
ASCVD+Retina,0.602217


## Submit Benchmark jobs

In [None]:
# Create the per-experiment directories for job scripts, configs and outputs.
for _job_subdir in ("job_submissions", "job_configs", "job_outputs"):
    os.makedirs(f"/home/{USER}/tmp/{EXPERIMENT_NAME}/{_job_subdir}", exist_ok=True)

In [None]:
def make_job_script(user, job_name, iteration, model, partition):
    """Render the text of the SLURM batch script for one benchmark job.

    The script sources conda, runs the module-level ``ACTIVATE_ENV_CMD`` and
    then invokes the module-level ``TRAIN_SCRIPT`` with ``--iteration``,
    ``--model`` and ``--partition``.

    Args:
        user: Accepted for interface compatibility; not referenced in the body.
        job_name: Value for the ``#SBATCH --job-name`` directive.
        iteration: Forwarded to the training script as ``--iteration``.
        model: Forwarded to the training script as ``--model``.
        partition: Forwarded to the training script as ``--partition``.

    Returns:
        The script as a single string without a trailing newline.
    """
    script_lines = [
        "#!/bin/bash",
        "",
        f"#SBATCH --job-name={job_name}  # Specify job name",
        "#SBATCH --ntasks 1 ",
        "#SBATCH --cpus-per-task 16",
        "#SBATCH --mem=75G              # Specify number of nodes",
        "#SBATCH --time=5:00:00        # Set a limit on the total run time",
        "",
        "source ~/miniconda3/etc/profile.d/conda.sh",
        f"{ACTIVATE_ENV_CMD}",
        "",
        "#ray start --head --port=6378 --num-cpus 16",
        f"python {TRAIN_SCRIPT} --iteration {iteration} --model {model} --partition {partition}",
    ]
    return "\n".join(script_lines)


In [None]:
def submit(path, job_name, job_script, time_stamp=None):
    """Write a job script to disk and submit it with ``sbatch``.

    The script is saved as ``{path}/{job_name}_{time_stamp}.sh`` and the
    symlink ``{path}/{job_name}.sh`` is re-pointed at it. The symlink is then
    fed to ``sbatch`` on stdin, with stderr/stdout files placed under the
    ``job_outputs`` directory derived from the module-level ``USER`` and
    ``EXPERIMENT_NAME``.

    Args:
        path: Directory in which the script and its symlink are created.
        job_name: Base name for the script, the symlink and the output files.
        job_script: Full text of the batch script.
        time_stamp: Suffix for the script file name; defaults to the current
            local time formatted as ``%Y-%m-%d_%H:%M:%S``.

    Returns:
        The job id parsed from sbatch's "Submitted batch job <id>" message as
        an ``int``, or ``None`` when no such message was printed (e.g. the
        submission failed).
    """
    import re  # local import: only needed to parse sbatch's reply

    if not time_stamp:
        time_stamp = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")

    script_path_long = f"{path}/{job_name}_{time_stamp}.sh"

    with open(script_path_long, "w") as outfile:
        outfile.write(job_script)
    script_path = f"{path}/{job_name}.sh"
    try:
        os.unlink(script_path)
    except FileNotFoundError:  # because we cannot overwrite symlinks directly
        pass
    os.symlink(os.path.realpath(script_path_long), script_path)

    output_path = f"/home/{USER}/tmp/{EXPERIMENT_NAME}/job_outputs/{job_name}"

    print(job_script)
    print("\n\nSubmission:\n===========\n")
    # Shell-style rendering of the submission, printed for the log only.
    sub_cmd = (
        f"sbatch --error={output_path}_%j_stderr.out --output={output_path}_%j_stdout.out <"
        f" {script_path}"
    )
    print(sub_cmd)

    # Run sbatch from an argument list with the script on stdin. This matches
    # the `sbatch ... < script` form without passing job names (which contain
    # characters such as `[]`) through a shell.
    sbatch_args = [
        "sbatch",
        f"--error={output_path}_%j_stderr.out",
        f"--output={output_path}_%j_stdout.out",
    ]
    with open(script_path, "rb") as script_file:
        ret = subprocess.run(
            sbatch_args, stdin=script_file, cwd=os.getcwd(), capture_output=True
        )
    stdout_text = ret.stdout.decode()
    print(stdout_text)
    if ret.returncode != 0:
        # Surface the failure reason instead of silently dropping it.
        print(ret.stderr.decode())

    # Callers collect the return value as a job id, so hand it back.
    submitted = re.search(r"Submitted batch job (\d+)", stdout_text)
    return int(submitted.group(1)) if submitted else None

In [None]:
# CHANGE HERE:
# Thore: range(0,10) + range(10,25)
# Lukas: range(25,50)
# Ben: range(50, 75)
# Jakob: range(75, 100)

# Original inline note listed the bounds "10,100" and "100,1000".
iterations = list(range(100, 1000))
# iterations = [82, 84, 86, 88, 92]
partitions = list(range(22))
models = [
    'ImageTraining_[]_ConvNeXt_MLPHead_predictions_cropratio0.66',
    # 'ImageTraining_[]_ConvNeXt_MLPHead_predictions_cropratio0.5',
    # 'ImageTraining_[]_ConvNeXt_MLPHead_predictions_cropratio0.8',
]

In [None]:
import time

# Generate and submit one job per (iteration, model, partition) combination,
# collecting whatever submit() returns for each of them.
jobids = []
for iteration in iterations:
    for model in models:
        for partition in partitions:
            job_name = f"{iteration}_{model}_{partition}_{JOBNAME}"

            # Original author note: partition currently not used in script.
            job_script = make_job_script(
                user=USER,
                job_name=job_name,
                iteration=iteration,
                model=model,
                partition=partition,
            )

            jobid = submit(
                path=f"/home/{USER}/tmp/{EXPERIMENT_NAME}/job_submissions",
                job_name=job_name,
                job_script=job_script,
            )

            jobids.append(jobid)

print(jobids)

## Check progress

In [4]:
base_path = "/sc-projects/sc-proj-ukb-cvd"
print(base_path)

project_label = "22_retina_phewas"
project_path = f"{base_path}/results/projects/{project_label}"
figure_path = f"{project_path}/figures"
output_path = f"{project_path}/data"

experiment = '221108'
experiment_path = f"{output_path}/{experiment}"
experiment_path

/sc-projects/sc-proj-ukb-cvd


'/sc-projects/sc-proj-ukb-cvd/results/projects/22_retina_phewas/data/221108'

In [5]:
today = '221109'

In [6]:
from datetime import date
today = str(date.today()) if today is None else today

In [7]:
from pathlib import Path
benchmark_paths = list(Path(f"{experiment_path}/benchmarks/{today}").rglob('*.feather'))

benchmarks_df = pd.concat([pd.read_feather(p) for p in benchmark_paths], axis=0)

In [13]:
benchmarks_df.value_counts(["iteration"]).to_frame().sort_values("iteration")

Unnamed: 0_level_0,0
iteration,Unnamed: 1_level_1
0,3513
1,3513
2,3513
3,3513
4,3513
...,...
124,3513
125,3513
126,3513
127,3513


In [14]:
# cap the iterations to 100
its = [i for i in range(0,100)]

benchmarks_df = benchmarks_df.query('iteration==@its')

In [15]:
# Iterations in [0, 100) that have no rows in benchmarks_df.
# The observed iterations are collected once into a set, so unique() is not
# re-evaluated for every candidate.
observed_iterations = set(benchmarks_df["iteration"].unique())
missing = [i for i in range(0, 100) if i not in observed_iterations]
missing

[]

In [16]:
Path(f"{experiment_path}/benchmarks/{today}")

PosixPath('/sc-projects/sc-proj-ukb-cvd/results/projects/22_retina_phewas/data/221108/benchmarks/221109')

In [17]:
benchmarks_df.reset_index(drop=True).to_feather(f"{experiment_path}/benchmarks_cindex_{today}.feather")

In [18]:
# Per-score mean of the numeric columns, ordered by mean cindex. The option is
# spelled out as numeric_only=True; the earlier positional "cindex" argument
# did not select a column, it was taken as the numeric_only flag.
benchmarks_df.groupby(["score"]).mean(numeric_only=True).sort_values("cindex")

Unnamed: 0_level_0,iteration,time,cindex
score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Age+Sex,49.5,10.0,0.604202
Retina,49.5,10.0,0.605547
Age+Sex+Retina,49.5,10.0,0.617725


# old stuff

In [None]:
%env MKL_NUM_THREADS=1
%env NUMEXPR_NUM_THREADS=1
%env OMP_NUM_THREADS=1

In [None]:
ray.shutdown()

In [None]:
import ray
ray.init(num_cpus=24)#, webui_url="0.0.0.0")

In [None]:
in_path = f"{experiment_path}/coxph/predictions"

In [None]:
prediction_paths = !ls $in_path
# Build one row per prediction file by splitting the file name on "_":
# fields 0 and 1 form the endpoint, field 2 is the score, and field 3 is the
# partition number followed by ".feather".
predictions = (
    pd.Series(prediction_paths).str.split("_", expand=True)
    .assign(path=prediction_paths)
    .assign(endpoint=lambda x: x[0] + "_" + x[1])
    .assign(score=lambda x: x[2])
    # Literal replacement: with regex=True the "." would match any character.
    .assign(partition=lambda x: x[3].str.replace(".feather", "", regex=False).astype(int))
    [["endpoint", "score", "partition", "path"]]
    .sort_values(["endpoint", "score", "partition"])
    .reset_index(drop=True)
)
predictions

In [None]:
import pandas as pd

# Endpoint names from the 220602 endpoints table, with every '_prevalent'
# substring removed, in sorted order.
all_endpoints = sorted(
    name.replace('_prevalent', '')
    for name in pd.read_csv('/sc-projects/sc-proj-ukb-cvd/results/projects/22_retinal_risk/data/220602/endpoints.csv').endpoint.values
)
print(len(all_endpoints))

# Exclusion list. It is empty here; a commented-out variant populated it from
# f"{experiment_path}/endpoints_not_overlapping.csv".
endpoints_not_overlapping_with_preds = []

# Keep every endpoint that is not on the exclusion list.
endpoints = [c for c in all_endpoints if c not in endpoints_not_overlapping_with_preds]
print(len(endpoints))

In [None]:
scores = ['Age+Sex', 'Retina', 'Age+Sex+Retina']
partitions = sorted(predictions.partition.unique().tolist())

In [None]:
from datetime import date
today = str(date.today())

In [None]:
eligable_eids = pd.read_feather(f"{output_path}/eligable_eids_{today}.feather")
eids_dict = eligable_eids.set_index("endpoint")["eid_list"].to_dict()

In [None]:
data_outcomes = pd.read_feather(
    f"{output_path}/baseline_outcomes_220531.feather", 
    columns= ["eid"] + [f"{e}_event" for e in endpoints] + [f"{e}_time" for e in endpoints])\
    .set_index("eid")

In [None]:
eids = data_outcomes.index.values

In [None]:
def read_partitions(endpoint, score, time):
    """Load the predictions for one endpoint/score pair across all its files.

    Looks up the matching file names in the module-level ``predictions``
    table, reads the ``eid`` and ``Ft_{time}`` columns of each file from the
    module-level ``in_path`` and stacks them.

    Args:
        endpoint: Value matched against the ``endpoint`` column.
        score: Value matched against the ``score`` column.
        time: Suffix selecting the ``Ft_{time}`` column to read.

    Returns:
        DataFrame indexed by ``eid`` (sorted) with a single column ``Ft``.
    """
    selected_paths = predictions.query("endpoint==@endpoint and score==@score").path.to_list()
    wanted_columns = ["eid", f"Ft_{time}"]
    frames = [
        pd.read_feather(f"{in_path}/{path}", columns=wanted_columns)
        for path in selected_paths
    ]
    data_preds = pd.concat(frames, axis=0).set_index("eid").sort_index()
    # Drop the time suffix so callers can always refer to "Ft".
    data_preds.columns = ["Ft"]
    return data_preds

In [None]:
def prepare_data(data_outcomes, endpoint, score, t_eval):
    """Join predictions with outcomes and censor the outcomes at ``t_eval``.

    Args:
        data_outcomes: DataFrame holding ``{endpoint}_event`` and
            ``{endpoint}_time`` columns, aligned with the predictions by index.
        endpoint: Endpoint whose predictions and outcomes are combined.
        score: Score whose predictions are loaded via ``read_partitions``.
        t_eval: Evaluation horizon; also selects the prediction column.

    Returns:
        DataFrame with columns ``Ft``, ``event`` and ``time``. Rows with
        ``event == 0`` or ``time > t_eval`` get ``event = 0`` and
        ``time = t_eval``; all other rows get ``event = 1`` and keep their time.
    """
    preds = read_partitions(endpoint, score, t_eval)

    event_col, time_col = f"{endpoint}_event", f"{endpoint}_time"
    outcomes = data_outcomes[[event_col, time_col]].rename(
        columns={event_col: "event", time_col: "time"}
    )

    merged = preds.merge(outcomes, left_index=True, right_index=True, how="left")

    # NOTE(review): rows without a match in data_outcomes have NaN event/time,
    # fail both comparisons and therefore end up with event = 1 — confirm that
    # this is intended.
    censored = (merged['event'] == 0) | (merged['time'] > t_eval)

    merged["event"] = np.where(censored, 0, 1)
    merged["time"] = np.where(censored, t_eval, merged['time'])
    return merged

In [None]:
from lifelines.utils import concordance_index

def calculate_cindex(data_outcomes, endpoint, score, time, iteration, eids_i):
    """Compute ``1 - concordance_index`` for one endpoint/score on a subset of ids.

    Args:
        data_outcomes: Outcome table passed through to ``prepare_data``.
        endpoint: Endpoint to evaluate.
        score: Score whose predictions are evaluated.
        time: Evaluation horizon passed to ``prepare_data``.
        iteration: Label copied into the result record.
        eids_i: Ids to keep; rows whose index is not among them are dropped.

    Returns:
        Dict with keys ``endpoint``, ``score``, ``iteration``, ``time`` and
        ``cindex``; ``cindex`` is NaN when ``concordance_index`` raises
        ``ZeroDivisionError``.
    """
    prepared = prepare_data(data_outcomes, endpoint, score, time)
    # NOTE(review): isin() keeps each row at most once, so ids repeated in
    # eids_i do not duplicate rows — confirm this is the intended resampling.
    subset = prepared[prepared.index.isin(eids_i)]

    try:
        cindex = 1 - concordance_index(subset["time"], subset["Ft"], subset["event"])
    except ZeroDivisionError:
        cindex = np.nan

    return dict(endpoint=endpoint, score=score, iteration=iteration, time=time, cindex=cindex)

@ray.remote
def calculate_iteration(data_outcomes, endpoint, score, time, iteration, eids_i):
    """Ray task: compute the C-index records of one endpoint for several scores.

    The scores are taken from the ``score`` argument and the horizon from
    ``time``. Previously the function looped over the module-level ``scores``
    and a hard-coded horizon of 10, silently ignoring both arguments.

    Args:
        data_outcomes: Outcome table forwarded to ``calculate_cindex``.
        endpoint: Endpoint to evaluate.
        score: A single score name or an iterable of score names.
        time: Evaluation horizon forwarded to ``calculate_cindex``.
        iteration: Label copied into every result record.
        eids_i: Ids to evaluate on.

    Returns:
        List with one ``calculate_cindex`` record per score, in input order.
    """
    # Accept a bare string as well as a collection of score names.
    score_names = [score] if isinstance(score, str) else list(score)
    return [
        calculate_cindex(data_outcomes, endpoint, score_name, time, iteration, eids_i)
        for score_name in score_names
    ]

In [None]:
iterations=[i for i in range(1000)] # 100

In [None]:
out_path = f"{experiment_path}/benchmarks"
pathlib.Path(out_path).mkdir(parents=True, exist_ok=True)

In [None]:
# Register the outcome table with Ray once; tasks are handed the reference.
ray_outcomes = ray.put(data_outcomes)

# One remote task per (endpoint, iteration); the handles are collected here.
rows_ray = []
for endpoint in tqdm(endpoints):
    eids_e = eids_dict[endpoint]
    for iteration in iterations:
        # Draw len(eids_e) ids with replacement (unseeded).
        eids_i = np.random.choice(eids_e, size=len(eids_e), replace=True)
        ds = calculate_iteration.remote(
            ray_outcomes, endpoint, scores, 10, iteration, eids_i
        )
        rows_ray.append(ds)

In [None]:
rows = [ray.get(r) for r in tqdm(rows_ray)]

In [None]:
rows_finished = [item for sublist in rows for item in sublist]

In [None]:
# Build the frame directly from the list of records; DataFrame.append was
# removed in pandas 2.0.
benchmark_endpoints = pd.DataFrame(rows_finished)

In [None]:
name = f"benchmark_cindex_agesexcoxph_{today}"
benchmark_endpoints.to_feather(f"{experiment_path}/{name}.feather")

In [None]:
print(f"{experiment_path}/{name}")

In [None]:
len(rows_finished), len(rows)

In [None]:
# Build the frame directly from the list of records (DataFrame.append was
# removed in pandas 2.0) and write it to a file named after `endpoint`.
pd.DataFrame(rows_finished).to_feather(f"{out_path}/{endpoint}.feather")

In [None]:
%%time
temp_data = prepare_data(data_outcomes, endpoint, score, 10)

In [None]:
# cancel jobs


# Issue one `scancel` call per job id in [528114, 528500).
for i in range(528114, 528500):
    # Argument list instead of a shell string: no shell is spawned per call.
    sub_cmd = ["scancel", str(i)]
    subprocess.run(sub_cmd, cwd=os.getcwd(), capture_output=False)
    
    