# Fit Cox Models

## Initialize

In [None]:
import os
import math
import pathlib
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from IPython.display import clear_output
import ray
import datetime
import subprocess
import warnings
import lifelines
from lifelines.utils import CensoringType
from lifelines.utils import concordance_index

In [None]:
# Base directory; the project data directory is derived from it.
base_path = "/home/jakobs"
project_path = f"{base_path}/data"

# Disabled: a hard-coded experiment directory. The fitting loop further down
# takes experiment_path from load_data() instead.
#experiment = '230629'
#experiment_path = f"{project_path}/{experiment}"
#pathlib.Path(experiment_path).mkdir(parents=True, exist_ok=True)

# Indices of the data partitions to process (0 through 9).
partitions = list(range(10))

In [None]:
# Limit MKL, numexpr and OpenMP to a single thread each via their
# environment variables.
# NOTE(review): numpy is already imported above; some numeric runtimes read
# these variables only when they are loaded -- confirm the limits take effect
# where they are needed.
os.environ.update(
    {
        'MKL_NUM_THREADS': "1",
        'NUMEXPR_NUM_THREADS': "1",
        'OMP_NUM_THREADS': "1",
    }
)

In [None]:
from scripts.coxph_fit_partition import load_data, fit_endpoint

In [None]:
# Display the partition indices that the fitting loop below iterates over.
partitions

In [None]:
# Fit the Cox models one data partition at a time, dispatching one remote
# task per endpoint through Ray.
for partition in tqdm(partitions):
    (
        eids_dict,
        score_defs,
        endpoint_defs,
        endpoints,
        models,
        model_path,
        experiment_path,
        data_partition,
    ) = load_data(partition)

    # Place the shared inputs in Ray's object store (plasma) once per
    # partition; the tasks below are handed the resulting references.
    # ray.init(num_cpus=24)  # disabled: crashes if num_cpus > 16 (reason still unknown)
    ray_eids = ray.put(eids_dict)
    ray_score_defs = ray.put(score_defs)
    ray_endpoint_defs = ray.put(endpoint_defs)
    ray_partition = ray.put(data_partition)

    # Submit one remote fit per endpoint of this partition.
    progress = []
    for endpoint in endpoints:
        progress.append(
            fit_endpoint.remote(
                ray_partition,
                ray_eids,
                ray_score_defs,
                ray_endpoint_defs,
                endpoint,
                partition,
                models,
                model_path,
                experiment_path,
            )
        )

    # Wait for the submitted fits in submission order; the inner progress bar
    # advances per finished endpoint. Fetched return values are not kept.
    for pending_fit in tqdm(progress):
        ray.get(pending_fit)

## Check progress

In [None]:
# List the entries currently present in the experiment's model directory.
cox_paths = os.listdir(f"{experiment_path}/coxph/models/")
#len(cox_paths), cox_paths[0]

# Drop the last two characters of every file name
# (presumably a two-character extension -- TODO confirm).
trimmed_names = [file_name[:-2] for file_name in cox_paths]
path_df = pd.DataFrame(data=trimmed_names).rename(columns={0: "path"})
print(path_df.head())

# Split each name on "_" into its five parts. The assignment raises if the
# split does not produce exactly five columns.
name_parts = path_df["path"].str.split("_", expand=True)
path_df[["endpoint_1", "endpoint_2", "score", "model", "partition"]] = name_parts

In [None]:
# Display the experiment directory most recently returned by load_data().
experiment_path

In [None]:
# Display experiment_path once more (same check as the preceding cell).
experiment_path

In [None]:
# Display the raw file names listed from the models directory.
cox_paths

In [None]:
# Re-join the two endpoint fragments into a single "endpoint" column.
path_df["endpoint"] = path_df["endpoint_1"].str.cat(path_df["endpoint_2"], sep="_")

# Number of model files found per partition, shown as a one-column frame.
partition_counts = path_df.value_counts(subset=["partition"])
partition_counts.to_frame()