In [1]:
import pandas as pd
import json
from IPython.display import display

In [25]:
def load_data_to_df(filepath):
    """Load an experiment result file into a single two-level DataFrame.

    The JSON file at ``filepath`` must contain a ``'results'`` entry holding
    a sequence; each element is turned into its own DataFrame and the frames
    are stacked on top of each other, keyed by their position in the sequence.

    Args:
        filepath: Path of the JSON result file.

    Returns:
        A DataFrame whose index levels are named ``'rollout'`` (position of
        the element inside ``'results'``) and ``'config_id'`` (the row label
        inside that element's frame). The ``'hyperparam'`` column is replaced
        by one column per hyperparameter.
    """
    with open(filepath, 'rb') as handle:
        rollouts = json.load(handle)['results']

    # One frame per rollout; the outer index key is the rollout's position.
    frames = [pd.DataFrame(entry) for entry in rollouts]
    stacked = pd.concat(frames, axis=0, keys=range(len(frames)))

    # Spread each 'hyperparam' cell into separate columns (presumably every
    # cell is a dict of hyperparameter name -> value; verify against the files).
    hparam_columns = stacked['hyperparam'].apply(pd.Series)
    widened = pd.concat([stacked, hparam_columns], axis=1).drop(columns='hyperparam')

    return widened.rename_axis(['rollout', 'config_id'], axis='index')


def get_mean_and_sem(df, test_metric='test_risk', val_metric='val_loss', hparam_config=None):
    """Mean and standard error of the test metric after per-rollout model selection.

    Args:
        df: Result frame with an index level (or column) named ``'rollout'``.
        test_metric: Column that is averaged over the selected rows.
        val_metric: Column that is minimised inside every rollout to pick
            the row that represents that rollout.
        hparam_config: Optional mapping ``column -> value``. When given, only
            rows matching every entry are considered.

    Returns:
        Tuple ``(mean, sem)`` of ``test_metric`` over the selected rows.
    """
    if hparam_config:
        # Build one combined mask instead of re-filtering once per key.
        keep = pd.Series(True, index=df.index)
        for column, wanted in hparam_config.items():
            keep &= df[column] == wanted
        df = df[keep]

    # Index label of the row with the smallest validation metric per rollout.
    winners = df.groupby('rollout')[val_metric].idxmin()
    selected_scores = df.loc[winners][test_metric]
    return selected_scores.mean(), selected_scores.sem()


# Best hparam configs
def get_best_hparam_results(df, metric='val_loss', num_best=5):
    """Return the hyperparameter configurations with the lowest average metric.

    Rows are grouped by ``'config_id'`` and averaged; the resulting
    per-configuration means are sorted ascending by ``metric``.

    Args:
        df: Result frame with an index level (or column) named ``'config_id'``.
        metric: Numeric column used for ranking (smaller is better).
        num_best: Number of leading configurations to keep.

    Returns:
        DataFrame indexed by ``config_id`` holding the group means of all
        numeric columns, limited to the first ``num_best`` rows.
    """
    # numeric_only=True: frames may carry non-numeric columns (e.g. a string
    # label added when merging datasets). pandas >= 2.0 raises a TypeError when
    # asked to average those; older versions silently dropped them, which is
    # the behaviour kept here.
    per_config = df.groupby('config_id').mean(numeric_only=True)
    return per_config.sort_values(by=metric).iloc[:num_best]

In [7]:
# Merge datasets
def load_and_merge_datasets(filepaths, property_dict=None, merge='hparam_configs'):
    """Load several result files and concatenate them into one DataFrame.

    Every file is read with ``load_data_to_df``, tagged with a per-file
    property value and appended to the others. To keep the
    ``(rollout, config_id)`` index free of collisions, the ids along the
    ``merge`` dimension of each subsequent file are shifted past the largest
    id seen so far.

    Args:
        filepaths: A single path string or an iterable of paths.
        property_dict: Optional mapping ``{column_name: values}`` with one
            value per file. Only the first key of the mapping is used. When
            omitted, a ``'version'`` column holding the file's position
            (0, 1, ...) is added instead.
        merge: ``'hparam_configs'`` shifts ``config_id`` between files (files
            contribute additional configurations of the same rollouts);
            ``'rollouts'`` shifts ``rollout`` (files contribute additional
            rollouts of the same configurations).

    Returns:
        DataFrame indexed by ``['rollout', 'config_id']``.

    Raises:
        NotImplementedError: If ``merge`` is not one of the supported modes.
        ValueError: If no file path is given, or if the number of property
            values does not match the number of files.
    """
    if merge == 'rollouts':
        merge_property = 'rollout'
    elif merge == 'hparam_configs':
        merge_property = 'config_id'
    else:
        raise NotImplementedError(
            "Unsupported merge mode {!r}; use 'rollouts' or 'hparam_configs'.".format(merge)
        )

    if isinstance(filepaths, str):
        filepaths = [filepaths]
    else:
        # Materialise so the length can be checked and generators are accepted.
        filepaths = list(filepaths)

    if not filepaths:
        raise ValueError('No file paths given; nothing to load.')

    if not property_dict:
        prop_name = 'version'
        vals = list(range(len(filepaths)))
    else:
        prop_name = next(iter(property_dict))
        vals = list(property_dict[prop_name])

    # zip() would silently truncate to the shorter sequence and drop data.
    if len(vals) != len(filepaths):
        raise ValueError(
            'Got {} value(s) for property {!r} but {} file path(s).'.format(
                len(vals), prop_name, len(filepaths)
            )
        )

    # Columns removed from every file when present; files lacking some of
    # them are still accepted.
    dropped_columns = ['test_risk_optim', 'parameter_mse_optim', 'best_index']

    start_merge_id = 0
    dfs = []

    for filepath, prop in zip(filepaths, vals):
        data_frame = load_data_to_df(filepath)
        data_frame = data_frame.drop(columns=dropped_columns, errors='ignore')
        data_frame[prop_name] = prop

        # Move the index levels into columns so the merge id can be shifted.
        data_frame = data_frame.reset_index()
        data_frame[merge_property] += start_merge_id
        # The next file continues right after the largest id used so far.
        start_merge_id = data_frame[merge_property].max() + 1
        dfs.append(data_frame)

    df = pd.concat(dfs, ignore_index=True)
    df = df.set_index(['rollout', 'config_id'])
    return df

In [4]:
# Compare two result files individually and after merging them.
# NOTE(review): filepath1 and filepath2 point at the very same file although
# they are labelled 'kl' and 'log' below -- looks like a copy-paste slip;
# confirm the intended second path.
filepath1 = '../results/bennet_hetero/bennet_hetero_method=KMM-RF_n=2000.json'
filepath2 = '../results/bennet_hetero/bennet_hetero_method=KMM-RF_n=2000.json'
fps = [filepath1, filepath2]
property_dict = {'divergence': ['kl', 'log']}

# Individual frames first, then the merged frame tagged with 'divergence'.
df1, df2 = [load_data_to_df(path) for path in fps]
df3 = load_and_merge_datasets(fps, property_dict)

# (mean, sem) of the test metric for each frame, one line per frame.
for frame in (df1, df2, df3):
    print(get_mean_and_sem(frame))

# Configurations of the merged frame ranked by average test risk.
best = get_best_hparam_results(df3, metric='test_risk')
best

NameError: name 'filepath2' is not defined

In [26]:
# Load one result file through the merge helper. Without a property_dict the
# helper tags every row with version=0.
df = load_and_merge_datasets(
    filepaths='../results/bennet_hetero/bennet_hetero_method=KMM-RF_n=2000.json',
)
# Up to 20 configurations with the lowest mean test risk, best first.
best = get_best_hparam_results(df, num_best=20, metric='test_risk')
best

dict_keys(['results_summarized', 'results'])


Unnamed: 0_level_0,test_risk,mse,val_loss,kl_reg_param,reg_param,version
config_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
5,0.23828,0.249667,0.862316,10.0,1.0,0
4,0.638901,1.124823,0.931961,10.0,0.01,0
3,0.773928,1.370843,0.946644,10.0,0.0001,0
2,0.840228,1.555402,0.939794,1.0,1.0,0
1,0.980081,1.811482,0.96504,1.0,0.01,0
0,0.985541,1.820536,0.965894,1.0,0.0001,0


In [17]:
# Show all rows whose first index level equals 19. Assuming `df` is the frame
# returned by load_and_merge_datasets (indexed by rollout, config_id), this is
# every configuration of rollout 19.
df.loc[19]

Unnamed: 0_level_0,test_risk,mse,val_loss,kl_reg_param,reg_param,version
config_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1.324993,2.279449,1.033055,1.0,0.0001,0
1,1.309273,2.264011,1.037385,1.0,0.01,0
2,1.319924,2.283681,1.0199,1.0,1.0,0
3,1.301057,2.224433,1.029386,10.0,0.0001,0
4,1.279789,2.206945,1.02817,10.0,0.01,0
5,0.447728,0.464285,0.928658,10.0,1.0,0
