# Benchmarks

## Initialize

In [None]:
import os
import math
import pathlib
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.feather as feather
from tqdm.auto import tqdm
from IPython.display import clear_output

import warnings
from lifelines.utils import CensoringType
from lifelines.utils import concordance_index

In [None]:
import ray
# NOTE(review): presumably clears a Ray session left over from an earlier
# run of this notebook before any new work starts — TODO confirm.
ray.shutdown()

In [None]:
# Workspace root; project_path and experiment_path are built from it.
base_path = "/home/jakobs"
project_path = f"{base_path}/data"

# Each experiment gets its own folder, named after the experiment id.
experiment = "230321"
experiment_path = f"{project_path}/{experiment}"

# Create the experiment folder together with any missing parents; an
# already existing folder is not an error.
pathlib.Path(experiment_path).mkdir(exist_ok=True, parents=True)

# Partition ids 0..9.
partitions = list(range(10))

In [None]:
# Input directory: the "loghs" folder inside the experiment folder
# (presumably holds the per-model log-hazard predictions — TODO confirm).
in_path = pathlib.Path(f"{experiment_path}/loghs")

In [None]:
# Every sub-directory of in_path is treated as one model. Jupyter's
# ".ipynb_checkpoints" folders are skipped by looking at the entry name only,
# so a parent directory that happens to contain that text cannot filter out
# every model. The names are sorted because Path.iterdir() yields entries in
# arbitrary order; without sorting, models[0] could differ between runs.
models = sorted(
    model_dir.name
    for model_dir in in_path.iterdir()
    if model_dir.is_dir() and "ipynb_checkpoints" not in model_dir.name
)
models

In [None]:
from sklearn.preprocessing import StandardScaler
import pickle
import zstandard

def read_data(fp_in, split):
    """Read ``{fp_in}/{split}.feather`` and return it indexed by ``eid``."""
    feather_file = f"{fp_in}/{split}.feather"
    frame = pd.read_feather(feather_file)
    return frame.set_index("eid")
    
def save_pickle(data, data_path):
    """Pickle ``data`` and write it to ``data_path`` through a zstd writer.

    The object is serialized with the highest pickle protocol and pushed
    through a ``zstandard`` stream writer wrapped around the output file.
    """
    with open(data_path, "wb") as sink:
        with zstandard.ZstdCompressor().stream_writer(sink) as zstd_out:
            payload = pickle.dumps(data, protocol=pickle.HIGHEST_PROTOCOL)
            zstd_out.write(payload)
    
def read_predictions(model, partition, split):
    """Load the ``split`` data stored for ``model`` in ``partition``.

    The directory ``{in_path}/{model}/{partition}`` is handed to
    ``read_data`` together with ``split``.

    Raises:
        NotImplementedError: if that directory does not exist; the missing
            path is printed before raising.
    """
    prediction_dir = f"{in_path}/{model}/{partition}"

    # Guard clause: report the missing directory and bail out.
    if not pathlib.Path(prediction_dir).is_dir():
        print(prediction_dir)
        raise NotImplementedError()

    return read_data(prediction_dir, split)

In [None]:
# Sanity check on the first model: for each partition and split, print how
# many columns of the loaded frame contain at least one missing value.
for partition in partitions:  # test: in [0, 10, 21]
    for split in ("train",):  # "test_left", 'test_right'
        temp = read_predictions(models[0], partition, split)
        print(partition, split, temp.isna().any().sum())