# This notebook uses Papermill to batch run models

In [1]:
%load_ext lab_black
import papermill as pm
import os
from multiprocessing import Pool

## Batch configurations

In [2]:
batch_name = "O2P_main"

# Pre-trained attractor checkpoints to embed, as (cleanup_units, epochs).
# Each cleanup-layer size has its own set of checkpoint epochs; listing them
# in one table lets a single loop body build every configuration.
attractor_stages = (
    (25, (0, 195, 300)),
    (50, (0, 70, 125)),
)

# Create batch cfgs
batch_cfgs = []
i = 0  # running serial number (1-based once incremented); drives code_name

# Loop order matters: the serial number, and therefore every model folder
# name, is assigned in noise -> hidden units -> cleanup units -> checkpoint
# order.
for noise in [0., 1., 2., 4., 8.]:
    for h in [25, 50, 100, 250]:
        for c_unit, c_stages in attractor_stages:
            for c_stage in c_stages:
                i += 1
                code_name = batch_name + "_model_{:04d}".format(i)
                batch_cfg = dict(
                    sn=i,
                    in_notebook="basicOSP_master.ipynb",
                    code_name=code_name,
                    model_folder="models/" + code_name + "/",
                    out_notebook="models/" + code_name + "/output.ipynb",
                    # Everything under `params` is injected into the master
                    # notebook by Papermill.
                    params=dict(
                        code_name=code_name,
                        sample_name='hs04',
                        sample_rng_seed=329,
                        tf_rng_seed=123,
                        use_semantic=False,
                        sem_param_gf=0.,
                        sem_param_gi=0.,
                        sem_param_kf=0.,
                        sem_param_ki=0.,
                        sem_param_hf=0.,
                        sem_param_hi=0.,
                        o_input_dim=119,
                        hidden_units=h,
                        pho_units=250,
                        cleanup_units=c_unit,
                        rnn_activation='sigmoid',
                        regularizer_const=5e-6,
                        embed_attractor_cfg=(
                            'models/Attractor_{}/model_config.json'
                            .format(c_unit)
                        ),
                        embed_attractor_h5='ep{0:04d}.h5'.format(c_stage),
                        p_noise=noise,  # i.e. w_pp, w_pc, and w_cp noise
                        tau=0.2,
                        max_unit_time=4.,
                        n_mil_sample=.04,
                        batch_size=128,
                        learning_rate=0.005,
                        save_freq=1,
                        bq_dataset=batch_name
                    )
                )
                batch_cfgs.append(batch_cfg)


# Run
def run_batch(cfg):
    """Execute one batch configuration through Papermill.

    Parameters
    ----------
    cfg : dict
        One entry of ``batch_cfgs``. The keys read here are ``sn``,
        ``code_name``, ``model_folder``, ``in_notebook``, ``out_notebook``
        and ``params``.

    Returns
    -------
    None
        Failures are reported on stdout instead of being raised, so one
        failing model does not abort the rest of the pool.
    """
    try:
        print("Running model {}".format(cfg['sn']))

        # makedirs(exist_ok=True) also creates a missing parent folder and
        # does not fail if another worker created the folder first.
        os.makedirs(cfg['model_folder'], exist_ok=True)

        pm.execute_notebook(
            cfg['in_notebook'],
            cfg['out_notebook'],
            parameters=cfg['params'],
        )

    except Exception as err:
        # Deliberate best-effort: keep the batch going, but say what went
        # wrong. KeyboardInterrupt / SystemExit are no longer swallowed.
        print("Error occur in {}: {!r}".format(cfg['code_name'], err))

## Run batch

In [3]:
# Hand every batch configuration to a pool of 12 worker processes;
# map() blocks until all of them have finished.
with Pool(processes=12) as workers:
    workers.map(run_batch, batch_cfgs)

In [None]:
# Push results to BQ
from meta import model_cfg, connect_gbq
import pandas as pd
from tqdm import tqdm

# Open the BigQuery connection once and reuse it for every model
bq = connect_gbq()

for sn in tqdm(range(len(batch_cfgs))):

    # Folder names are rebuilt from the 1-based serial number
    model_folder = 'models/{0:s}_model_{1:04d}'.format(batch_name, sn + 1)
    cfg_json = model_folder + '/model_config.json'

    # Restore the saved model configuration and point it at this batch
    cfg = model_cfg(None)
    cfg.load_cfg_json(cfg_json)
    cfg.bq_dataset = batch_name

    # Item-level results: Strain first, then Grain
    strain_i_hist, grain_i_hist = (
        pd.read_csv(model_folder + '/' + csv_name)
        for csv_name in ('result_strain_item.csv', 'result_grain_item.csv')
    )

    bq.push_all(cfg, strain_i_hist, grain_i_hist)

#### Shutdown compute engine

In [None]:
# Wait 30 seconds, then power the machine off through a shell escape.
# WARNING: running this cell shuts down the host the kernel runs on.
from time import sleep
sleep(30)
!sudo poweroff  

## Compile results

In [3]:
import os, json
import pandas as pd
from meta import connect_gbq
from evaluate import vis
from tqdm import tqdm

conn = connect_gbq()
cfgs = conn.read_bq_cfg(batch_name)

# Read cfg files from BQ and report which hyperparameters vary in this batch
print('===== Batch level hyperparams (columns that have >1 unique value) =====')
for x in cfgs.columns:
    # Identifier columns differ for every run by construction; skip them
    if x in ('code_name', 'uuid'):
        continue
    unique_values = cfgs[x].unique()
    if len(unique_values) > 1:
        print(
            'Column <{}> has these unique values: {}'.format(x, unique_values)
        )

# Parse each run by batch_eval, which aggregates item level data to condition
# level and merges Grain and Strain into one single file (using local files
# instead of BQ; BQ may be worth it for much larger data).

# Model folders are derived from the 1-based serial number
models_path = [
    'models/' + batch_name + '_model_{0:04d}'.format(i + 1)
    for i in range(len(cfgs))
]

# Collect the per-model frames and concatenate once at the end, instead of
# re-copying the growing frame on every iteration.
cond_frames = []

for model_path in tqdm(models_path):

    this_eval = vis(
        model_path, 'result_strain_item.csv', 'result_grain_item.csv'
    )  # Eval lesion and grain
    this_eval.parse_cond_df()
    cond_frames.append(this_eval.cdf)

# pd.concat raises on an empty list, so keep the original empty-frame result
batch_acc = (
    pd.concat(cond_frames, ignore_index=True)
    if cond_frames else pd.DataFrame()
)

# Attach each run's configuration to its condition-level results
df = pd.merge(batch_acc, cfgs, how='left', on='code_name')
print('Done')

Downloading: 100%|██████████| 120/120 [00:00<00:00, 213.98rows/s]
  0%|          | 0/120 [00:00<?, ?it/s]

===== Batch level hyperparams (columns that have >1 unique value) =====
Column <hidden_units> has these unique values: [ 25  50 100 250]
Column <cleanup_units> has these unique values: [25 50]
Column <embed_attractor_cfg> has these unique values: ['models/Attractor_25/model_config.json'
 'models/Attractor_50/model_config.json']
Column <embed_attractor_h5> has these unique values: ['ep0300.h5' 'ep0195.h5' 'ep0000.h5' 'ep0125.h5' 'ep0070.h5']
Column <w_pp_noise> has these unique values: [0. 1. 2. 8. 4.]
Column <w_pc_noise> has these unique values: [0. 1. 2. 8. 4.]
Column <w_cp_noise> has these unique values: [0. 1. 2. 8. 4.]


100%|██████████| 120/120 [02:12<00:00,  1.10s/it]


Done


In [5]:
def cal_attractor_acc(cleanup_units, embed_attractor_h5):
    """Look up the accuracy label of an embedded attractor checkpoint.

    Parameters
    ----------
    cleanup_units : int
        Number of cleanup units of the attractor (25 or 50).
    embed_attractor_h5 : str
        Checkpoint file name, e.g. ``'ep0195.h5'``.

    Returns
    -------
    float
        0., 0.6 or 0.9 depending on the checkpoint.

    Raises
    ------
    ValueError
        If the (cleanup_units, checkpoint) pair is not one of the known
        combinations. The original fell through every branch and failed
        with an opaque UnboundLocalError instead.
    """
    acc_by_checkpoint = {
        (25, 'ep0000.h5'): 0.,
        (25, 'ep0195.h5'): 0.6,
        (25, 'ep0300.h5'): 0.9,
        (50, 'ep0000.h5'): 0.,
        (50, 'ep0070.h5'): 0.6,
        (50, 'ep0125.h5'): 0.9,
    }

    try:
        return acc_by_checkpoint[(cleanup_units, embed_attractor_h5)]
    except KeyError:
        raise ValueError(
            'Unknown attractor checkpoint: cleanup_units={!r}, '
            'embed_attractor_h5={!r}'.format(cleanup_units, embed_attractor_h5)
        ) from None

In [6]:
# One accuracy label per row, in row order
tmp_acc = [
    cal_attractor_acc(units, checkpoint)
    for units, checkpoint in zip(df.cleanup_units, df.embed_attractor_h5)
]

# The column name (including its spelling) is what later cells read
df['attactor_acc'] = tmp_acc

# Save to h5 format
cdf_file = 'batch_eval/{}_cdf.h5'.format(batch_name)
df.to_hdf(cdf_file, key='df', mode='w')

### Read from file

In [8]:
# Reload the compiled condition-level results saved by the previous section
df = pd.read_hdf('batch_eval/{}_cdf.h5'.format(batch_name), 'df')

# Keep only the last time step and the three conditions of interest
at_last_timestep = df.timestep == df.timestep.max()
in_selected_cond = df.cond.isin(['INC_HF', 'ambiguous', 'unambiguous'])

ind_columns = [
    'code_name', 'epoch', 'hidden_units', 'cleanup_units',
    'w_pp_noise', 'attactor_acc', 'acc', 'exp'
]

df_ind = df.loc[at_last_timestep & in_selected_cond, ind_columns]

## Overview plot

In [44]:
# Spread the two experiments ('exp') into side-by-side columns, one row per
# (code_name, epoch); pivot_table aggregates with its default function.
pvt = df_ind.pivot_table(index=['code_name', 'epoch'],
                         columns='exp').reset_index()

# Flat frame for plotting. Columns are added in the same order as before;
# the original assigned hidden_units three times, which had no extra effect.
# NOTE(review): the repeated lines may have been meant for cleanup_units /
# attactor_acc -- confirm before adding them.
plt_df = pd.DataFrame().assign(
    code_name=pvt.code_name,
    epoch=pvt.epoch,
    nonword_acc=pvt.acc.grain,
    word_acc=pvt.acc.strain,
    hidden_units=pvt.hidden_units.strain,
    w_pp_noise=pvt.w_pp_noise.strain,
)

In [45]:
# Check that the pivot aligned correctly: each run-level column must hold the
# same value under the 'grain' and 'strain' halves of the pivot.
for run_col in ['attactor_acc', 'cleanup_units', 'hidden_units', 'w_pp_noise']:
    print(all(pvt[run_col].grain == pvt[run_col].strain))

True
True
True
True


In [46]:
# Display the plotting frame built above (one row per code_name and epoch)
plt_df

Unnamed: 0,code_name,epoch,nonword_acc,word_acc,hidden_units,w_pp_noise
0,O2P_main_model_0001,1,0.000000,0.000,25,0.0
1,O2P_main_model_0001,2,0.000000,0.000,25,0.0
2,O2P_main_model_0001,3,0.000000,0.000,25,0.0
3,O2P_main_model_0001,4,0.000000,0.000,25,0.0
4,O2P_main_model_0001,5,0.008333,0.025,25,0.0
...,...,...,...,...,...,...
2875,O2P_main_model_0120,80,0.000000,0.000,250,8.0
2876,O2P_main_model_0120,85,0.000000,0.025,250,8.0
2877,O2P_main_model_0120,90,0.000000,0.025,250,8.0
2878,O2P_main_model_0120,95,0.033333,0.025,250,8.0


In [None]:
# Plotting results
import altair as alt

alt.data_transformers.enable("default")
alt.data_transformers.disable_max_rows()

# Selectors for interactions
sel_run = alt.selection(type="multi", on="click", fields=["code_name"])
sel_cond = alt.selection(
    type="multi", on="click", fields=["cond"], bind="legend"
)

# Heatmap for final epoch & timestep (Overview)
plot_timestep = df.timestep.max()
# plot_timestep = 19


def _overview_heatmap(exp_df):
    """Build the overview heat map of `acc` for one experiment's rows.

    Cells are laid out as hidden_units (x) by w_pp_noise (y), faceted by
    attactor_acc (columns) and cleanup_units (rows). Clicking a cell selects
    that run through `sel_run`.
    """
    return (
        alt.Chart(exp_df).mark_rect().encode(
            x="hidden_units:O",
            y="w_pp_noise:O",
            column='attactor_acc',
            row='cleanup_units',
            color=alt.Color("acc", scale=alt.Scale(scheme="redyellowgreen")),
            opacity=alt.condition(sel_run, alt.value(1), alt.value(0.1)),
            tooltip=["code_name", "acc"],
        ).add_selection(sel_run)
    )


# Rows at the last epoch and the chosen time step, split by experiment
is_final_snapshot = (df.epoch == df.epoch.max()) & (df.timestep == plot_timestep)

# Plot strain
df_ov_strain = df[is_final_snapshot & (df.exp == 'strain')]
overview_strain = _overview_heatmap(df_ov_strain)

# Plot grain
df_ov_grain = df[is_final_snapshot & (df.exp == 'grain')]
overview_grain = _overview_heatmap(df_ov_grain)

# Nonword accuracy against word accuracy for the selected run(s)
# NOTE(review): sel_cond filters on 'cond', which is not a column of plt_df,
# and the two chart titles below look swapped -- confirm before changing.
wnw_plot = (
    alt.Chart(plt_df).mark_line().encode(
        y=alt.Y("nonword_acc:Q", scale=alt.Scale(domain=(0, 1))),
        x=alt.X("word_acc:Q", scale=alt.Scale(domain=(0, 1))),
        color="epoch",
        opacity=alt.condition(sel_cond, alt.value(1), alt.value(0.1)),
        tooltip=["code_name", "word_acc", "nonword_acc"],
    ).transform_filter(sel_run).properties(
        title="Full model at final time step"
    )
)

# Accuracy over epoch at last time step for selected model
last_time_point = df[df.timestep == df.timestep.max()]

acc_plot = (
    alt.Chart(last_time_point).mark_line().encode(
        y=alt.Y("acc:Q", scale=alt.Scale(domain=(0, 1))),
        x="epoch",
        color="cond",
        opacity=alt.condition(sel_cond, alt.value(1), alt.value(0.1)),
        tooltip=["code_name", "acc"],
    ).transform_filter(sel_run).properties(
        title="Word vs. Nonword at final time step"
    )
)

# Layout: the two overviews stacked on the left, the detail plots on the right
overview = overview_strain & overview_grain
mainplots = acc_plot & wnw_plot
plot = overview | mainplots
plot

### Save plot

In [None]:
# Write the combined chart to an HTML file.
# NOTE(review): the file name is hard-coded as 'O2P_l2reg' while batch_name
# in this notebook is "O2P_main" -- confirm this is the intended target.
plot.save('batch_eval/O2P_l2reg.html')

## Eval one model

In [None]:
from meta import model_cfg
from evaluate import vis

# Pick a single run of the batch by its serial number (here: 47)
code_name = batch_name + '_model_{:04d}'.format(47)

# Load cfg from json
cfg = model_cfg(None)
cfg.load_cfg_json('models/{}/model_config.json'.format(code_name))

# NOTE: this rebinds the imported name `vis` to an instance; the cells below
# rely on that, and the import at the top of this cell restores the class
# whenever the cell is re-run.
vis = vis(
    cfg.path_model_folder,
    'result_strain_item.csv',
    'result_grain_item.csv',
)
vis.parse_cond_df()

# Interactive plot of 'acc', shown as the cell output
full = vis.plot_dev_interactive('acc').properties(title='Full input')
full

### Flexible development plot

In [None]:
# Uses the `vis` instance created in the "Eval one model" cell: plot 'acc'
# with no experiment filter (exp=None), grouped by 'cond', at timestep 2.
# NOTE(review): exact plot semantics live in evaluate.vis -- not visible here.
vis.plot_dev('acc', exp=None, condition='cond', timestep=2)

### Flexible time plot

In [None]:
# Uses the `vis` instance created in the "Eval one model" cell: plot 'acc'
# for the 'strain' experiment, grouped by 'cond', at epoch 400.
# NOTE(review): confirm that epoch 400 exists for the selected model.
vis.plot_time('acc', exp='strain', condition='cond', epoch=400)