# Benchmark Models

In [None]:
import os
import math
import pathlib
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from IPython.display import clear_output
import ray

import warnings
import lifelines
from lifelines.utils import CensoringType
from lifelines.utils import concordance_index

In [None]:
node = !hostname
if "sc" in node[0]:
    base_path = "/sc-projects/sc-proj-ukb-cvd"
else: 
    base_path = "/data/analysis/ag-reils/ag-reils-shared/cardioRS"
print(base_path)

project_label = "22_medical_records"
project_path = f"{base_path}/results/projects/{project_label}"
figure_path = f"{project_path}/figures"
output_path = f"{project_path}/data"

experiment = 230425
experiment_path = f"{output_path}/{experiment}"
pathlib.Path(experiment_path).mkdir(parents=True, exist_ok=True)

In [None]:
in_path = f"{experiment_path}/coxph/predictions"
prediction_paths = !ls $in_path
predictions = pd.Series(prediction_paths).str.split("_", expand=True)\
    .assign(path = prediction_paths)\
    .assign(endpoint = lambda x: x[0]+"_"+x[1])\
    .assign(score = lambda x: x[2])\
    .assign(partition = lambda x: x[3].str.replace(".feather", "", regex=True).astype(int))\
    [["endpoint", "score", "partition", "path"]].sort_values(["endpoint", "score", "partition"]).reset_index(drop=True)
predictions

In [None]:
# Show the distinct score names present in the prediction index.
predictions["score"].unique()

In [None]:
# Persist the prediction index as a feather file inside the experiment directory.
predictions.to_feather(f"{experiment_path}/prediction_paths.feather")

## Submit Benchmark jobs

In [None]:
# %%
import datetime
import itertools
import os
import pathlib
import re
import subprocess

import pandas as pd
from omegaconf import OmegaConf
from tqdm.auto import tqdm

# %% codecell
# --- Identity and code location -------------------------------------------
USER = "USER"  # Anonymized
BASE = pathlib.Path(f"/home/{USER}/code/")

# --- Labels used for generated files and Slurm jobs -------------------------
# EXPERIMENT_NAME: name under which the generated .sh scripts and yamls are stored.
EXPERIMENT_NAME = "22_medhistory"
JOBNAME = "benchmarks"
TAG = 230425

# --- Paths derived from BASE ----------------------------------------------
# TEMPLATE_CONFIG: template yaml to use.
TEMPLATE_CONFIG = f"{BASE}/config/"
TRAIN_SCRIPT = f"{BASE}/MedicalHistoryPhenomeWide/2_downstream_processing/10_benchmarks_iteration.py"

# Shell command placed in every job script before the python call.
ACTIVATE_ENV_CMD = "mamba activate ehrgraphs"

In [None]:
# Display the resolved path of the script each job will run.
TRAIN_SCRIPT

In [None]:
# Create the per-experiment scratch directories for job scripts, configs and
# Slurm output files; already-existing directories are left untouched.
for job_subdir in ("job_submissions", "job_configs", "job_outputs"):
    os.makedirs(f"/home/{USER}/tmp/{EXPERIMENT_NAME}/{job_subdir}", exist_ok=True)

In [None]:
def make_job_script(user, job_name, iteration):
    """Return the text of an sbatch script that runs one benchmark iteration.

    The script activates the environment given by ``ACTIVATE_ENV_CMD``, starts
    a ray head process and then calls ``TRAIN_SCRIPT`` with ``--iteration``.

    Args:
        user: Accepted for interface compatibility; not used in the script.
        job_name: Value written to ``#SBATCH --job-name``.
        iteration: Value passed to the script as ``--iteration``.

    Returns:
        The complete job script as a single string.
    """
    job_script_str = (
        f"""#!/bin/bash

#SBATCH --job-name={job_name}  # Specify job name
#SBATCH --nodes=1              # Specify number of nodes
#SBATCH --mem=485G              # Specify memory per node
#SBATCH --time=1:00:00        # Set a limit on the total run time
#SBATCH --tasks-per-node=1
#SBATCH --exclusive

{ACTIVATE_ENV_CMD}

ray start --head --port=6378 --num-cpus 32
python {TRAIN_SCRIPT} --iteration {iteration}"""
            )

    return job_script_str


In [None]:
def submit(path, job_name, job_script, time_stamp=None):
    """Write a job script to disk and submit it with ``sbatch``.

    The script is stored as ``{path}/{job_name}_{time_stamp}.sh`` and a symlink
    ``{path}/{job_name}.sh`` is (re)pointed at it; the symlink is what gets
    piped into ``sbatch``. Slurm stdout/stderr files are directed to the
    ``job_outputs`` directory of the current experiment.

    Args:
        path: Directory in which the script and its symlink are created.
        job_name: Base name for the script, the symlink and the output files.
        job_script: Full text of the sbatch script.
        time_stamp: Optional suffix for the script file name; defaults to the
            current local time formatted as ``%Y-%m-%d_%H:%M:%S``.

    Returns:
        The job id reported by ``sbatch`` as a string, or ``None`` when the
        submission failed or the output did not contain a job id.
    """
    if not time_stamp:
        time_stamp = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")

    script_path_long = f"{path}/{job_name}_{time_stamp}.sh"

    with open(script_path_long, "w") as outfile:
        outfile.write(job_script)
    script_path = f"{path}/{job_name}.sh"
    try:
        os.unlink(script_path)
    except FileNotFoundError:  # because we cannot overwrite symlinks directly
        pass
    os.symlink(os.path.realpath(script_path_long), script_path)

    output_path = f"/home/{USER}/tmp/{EXPERIMENT_NAME}/job_outputs/{job_name}"

    print(job_script)
    print("\n\nSubmission:\n===========\n")
    sub_cmd = (
        f"sbatch --error={output_path}_%j_stderr.out --output={output_path}_%j_stdout.out <"
        f" {script_path}"
    )
    print(sub_cmd)

    ret = subprocess.run(sub_cmd, shell=True, cwd=os.getcwd(), capture_output=True)
    stdout_text = ret.stdout.decode()
    print(stdout_text)

    # Surface sbatch diagnostics instead of silently discarding them.
    stderr_text = ret.stderr.decode()
    if stderr_text:
        print(stderr_text)
    if ret.returncode != 0:
        return None

    # sbatch normally reports "Submitted batch job <id>"; hand the id back so
    # callers can keep track of what they submitted.
    tokens = stdout_text.split()
    if len(tokens) > 3 and tokens[:3] == ["Submitted", "batch", "job"]:
        return tokens[3]
    return None

In [None]:
# Iteration indices to submit: 1 through 99 inclusive.
iterations = list(range(1, 100))

In [None]:
import time

# Directory that receives the generated .sh scripts for every job.
submission_dir = f"/home/{USER}/tmp/{EXPERIMENT_NAME}/job_submissions"

# Generate and submit one job per iteration, collecting whatever submit()
# returns for each of them.
jobids = []
for iteration in iterations:
    job_name = f"{iteration}_{JOBNAME}"
    job_script = make_job_script(user=USER, job_name=job_name, iteration=iteration)
    jobid = submit(path=submission_dir, job_name=job_name, job_script=job_script)
    jobids.append(jobid)

print(jobids)

## Check progress

In [None]:
from pathlib import Path

# Collect every feather file written below the benchmarks directory.
# NOTE(review): this location is hard-coded; it equals
# f"{experiment_path}/benchmarks" only when the "sc" base path is active.
benchmark_root = Path("/sc-projects/sc-proj-ukb-cvd/results/projects/22_medical_records/data/230425/benchmarks/")
benchmark_paths = paths = list(benchmark_root.rglob('*.feather'))

# Stack all per-file results row-wise into a single frame.
benchmark_frames = [pd.read_feather(feather_file) for feather_file in benchmark_paths]
benchmarks_df = pd.concat(benchmark_frames, axis=0)

# Number of result rows per iteration, ordered by iteration.
benchmarks_df.value_counts(["iteration"]).to_frame().sort_values("iteration")

In [None]:
# Save the combined benchmark results (with a fresh 0..n-1 index) into the experiment directory.
benchmarks_df.reset_index(drop=True).to_feather(f"{experiment_path}/benchmarks_cindex_230425.feather")

In [None]:
# Per-score mean of every numeric column, ordered by the mean "cindex".
# The previous call passed "cindex" positionally, where GroupBy.mean expects
# `numeric_only`; the truthy string acted as numeric_only=True, so spell that
# out explicitly.
benchmarks_df.groupby(["score"]).mean(numeric_only=True).sort_values("cindex")

In [None]:
# Per-score mean of every numeric column, ordered by the mean "cindex".
# "cindex" used to be passed positionally into GroupBy.mean's `numeric_only`
# slot; the explicit keyword keeps the same result without the misuse.
benchmarks_df.groupby(["score"]).mean(numeric_only=True).sort_values("cindex")