In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import glob
import os
import matplotlib.pyplot as plt
import shutil
from prediction_utils.util import df_dict_concat, yaml_read, yaml_write

In [2]:
# Root directory of the project; experiment outputs are read from its
# 'experiments' sub-directory.
project_dir = "/share/pi/nigam/projects/spfohl/cohorts/admissions/optum"
experiments_dir = os.path.join(project_dir, 'experiments')
# Show which experiment directories exist.
os.listdir(experiments_dir)

['baseline_tuning_fold_1_10',
 'baseline_tuning_fold_1',
 'fair_tuning_fold_1_10']

In [3]:
# Name of the experiment directory (under <project_dir>/experiments) whose
# results are loaded and whose configs are read in the cells below.
experiment_name = 'baseline_tuning_fold_1'

In [4]:
# Recursively collect every 'result_df_training_eval.parquet' file found at
# any depth below the selected experiment's directory.
result_file_pattern = os.path.join(
    project_dir,
    'experiments',
    experiment_name,
    '**',
    'result_df_training_eval.parquet',
)
baseline_files = glob.glob(result_file_pattern, recursive=True)

In [5]:
# Key each loaded result frame by the three path components that directly
# precede the parquet file name; these are passed to df_dict_concat under the
# names 'task', 'config_filename' and 'fold'.
baseline_df_dict = {}
for result_path in baseline_files:
    run_key = tuple(result_path.split('/')[-4:-1])
    baseline_df_dict[run_key] = pd.read_parquet(result_path)

# NOTE(review): df_dict_concat is a project helper; presumably it stacks the
# frames and exposes the key parts as the named columns -- confirm.
baseline_df = df_dict_concat(
    baseline_df_dict,
    ['task', 'config_filename', 'fold'],
)

In [6]:
# Preview the first rows of the combined results to check the columns.
baseline_df.head()

Unnamed: 0,task,config_filename,fold,metric,phase,epoch,performance
0,LOS_7,8.yaml,1,auc,val,0,0.801082
1,LOS_7,8.yaml,1,auprc,val,0,0.494147
2,LOS_7,8.yaml,1,brier,val,0,0.123136
3,LOS_7,8.yaml,1,loss_bce,val,0,0.386069
4,LOS_7,8.yaml,1,loss,val,0,0.386069


In [7]:
# Mean validation loss for every (config_filename, task) pair, averaged over
# all matching rows of baseline_df.
mean_performance = (
    baseline_df
    .query('metric == "loss" & phase == "val"')
    .groupby(['config_filename', 'task'])
    .agg(performance=('performance', 'mean'))
    .reset_index()
)

# For each task, keep the config(s) whose mean validation loss equals that
# task's minimum.
#
# `task` must be a regular column and an explicit merge key here: if it is
# left in the index after the groupby, `merge` infers `performance` as the
# only shared column and would also match configs from a *different* task
# that happen to have the same loss value.
#
# Configs tied for the minimum within a task are all retained.
best_model = (
    mean_performance
    .groupby('task')
    .agg(performance=('performance', 'min'))
    .reset_index()
    .merge(mean_performance, on=['task', 'performance'])
)
 
# mean_performance

In [8]:
# Identify the selected configurations by (config_filename, task) only ...
best_model_config_df = best_model.loc[:, ['config_filename', 'task']]

# ... and keep every result row (all metrics and phases) that belongs to one
# of those selected configurations.
best_model_performance = baseline_df.merge(
    best_model_config_df,
    on=['task', 'config_filename'],
)

In [9]:
# Show all metric rows retained for the selected configurations.
best_model_performance

Unnamed: 0,task,config_filename,fold,metric,phase,epoch,performance
0,LOS_7,2.yaml,1,auc,val,0,0.802281
1,LOS_7,2.yaml,1,auprc,val,0,0.495219
2,LOS_7,2.yaml,1,brier,val,0,0.122919
3,LOS_7,2.yaml,1,loss_bce,val,0,0.385209
4,LOS_7,2.yaml,1,loss,val,0,0.385209
5,LOS_7,2.yaml,1,auc,test,0,0.802328
6,LOS_7,2.yaml,1,auprc,test,0,0.49613
7,LOS_7,2.yaml,1,brier,test,0,0.122795
8,LOS_7,2.yaml,1,loss_bce,test,0,0.384662
9,LOS_7,2.yaml,1,loss,test,0,0.384662


In [10]:
# Distinct (task, config_filename) pairs present in the filtered results, as
# a check on which configs were selected.
best_model_performance[['task', 'config_filename']].drop_duplicates()

Unnamed: 0,task,config_filename
0,LOS_7,2.yaml
10,readmission_30,2.yaml


In [11]:
# Show the selected (config_filename, task) pairs used to write configs below.
best_model_config_df

Unnamed: 0,config_filename,task
0,2.yaml,LOS_7
1,2.yaml,readmission_30


In [12]:
# Show the full combined results frame for reference.
baseline_df

Unnamed: 0,task,config_filename,fold,metric,phase,epoch,performance
0,LOS_7,8.yaml,1,auc,val,0,0.801082
1,LOS_7,8.yaml,1,auprc,val,0,0.494147
2,LOS_7,8.yaml,1,brier,val,0,0.123136
3,LOS_7,8.yaml,1,loss_bce,val,0,0.386069
4,LOS_7,8.yaml,1,loss,val,0,0.386069
...,...,...,...,...,...,...,...
995,readmission_30,49.yaml,1,auc,test,0,0.764664
996,readmission_30,49.yaml,1,auprc,test,0,0.243207
997,readmission_30,49.yaml,1,brier,test,0,0.069601
998,readmission_30,49.yaml,1,loss_bce,test,0,0.250835


In [13]:
# Directory holding the experiment's configs, one sub-directory per task.
base_config_path = os.path.join(
    project_dir, 'experiments', experiment_name, 'config'
)
# retrain_experiment_name = 'baseline_best'
# Selected configs are written to a 'selected_models' sub-directory of the
# config directory.
selected_config_path = os.path.join(base_config_path, 'selected_models')

In [14]:
# Copy each selected config into <selected_config_path>/<task>/, keeping its
# original file name.
for selected in best_model_config_df.itertuples(index=False):
    source_path = os.path.join(
        base_config_path, selected.task, selected.config_filename
    )
    target_dir = os.path.join(selected_config_path, selected.task)

    the_config = yaml_read(source_path)
    print(the_config)
    # Set the label column to the task this config was selected for.
    the_config['label_col'] = selected.task

    os.makedirs(target_dir, exist_ok=True)
    yaml_write(the_config, os.path.join(target_dir, selected.config_filename))

{'batch_size': 512, 'drop_prob': 0.25, 'early_stopping': True, 'early_stopping_patience': 10, 'gamma': 1.0, 'hidden_dim': 128, 'label_col': 'LOS_7', 'lr': 1e-05, 'num_epochs': 150, 'num_hidden': 3}
{'batch_size': 512, 'drop_prob': 0.25, 'early_stopping': True, 'early_stopping_patience': 10, 'gamma': 1.0, 'hidden_dim': 128, 'label_col': 'readmission_30', 'lr': 1e-05, 'num_epochs': 150, 'num_hidden': 3}
