# Batch run models
1. Running model
2. Evaluating results

In [None]:
%load_ext lab_black
import os, itertools, json, multiprocessing
import pandas as pd
import altair as alt
import papermill as pm
from time import sleep
from meta import model_cfg, batch_cfgs_to_df, parse_batch_results, check_cfgs_params
from evaluate import make_df_wnw

## Run batch
Make configs

In [None]:
# To sweep over random seeds instead of the fixed seed below, uncomment:
# import random
# seeds = [int(random.random() * 1e5) for x in range(10)]

batch_name = "O2P_rr2019"

# Everything produced for this batch is written under this folder
batch_output_dir = "batch_eval/{}/".format(batch_name)
os.makedirs(batch_output_dir, exist_ok=True)

# Hyperparameters that vary between runs
param_grid = dict(
    p_noise=[0.0, 1.0, 2.0, 3.0],
    hidden_units=[50, 100, 150, 200],
    learning_rate=[0.001, 0.005, 0.01],
    cleanup_units=[10, 50],
)

# Hyperparameters held constant for every run in the batch
static_hpar = dict(
    sample_name="jay",
    rng_seed=4321,
    use_semantic=False,
    input_dim=119,
    output_dim=250,
    use_attractor=False,
    rnn_activation="sigmoid",
    regularizer_const=None,
    w_initializer="glorot_uniform",
    tau=0.2,
    max_unit_time=4.0,
    optimizer="adam",
    n_mil_sample=1.0,
    batch_size=1,
    save_freq=10,
    bq_dataset=batch_name,
)

# A hyperparameter may be declared in only one of the two dicts;
# stop at the first name that appears in both
for key in static_hpar:
    if key in param_grid:
        raise ValueError("Key duplicate: {}".format(key))

# Build the batch-level list `batch_cfgs`: one entry per combination of the
# varying hyperparameter values (Cartesian product, in param_grid key order)
batch_cfgs = []
varying_hpar_names, varying_hpar_values = zip(*param_grid.items())
for run_idx, combo in enumerate(itertools.product(*varying_hpar_values)):
    run_name = batch_name + "_r{:04d}".format(run_idx)
    run_dir = "models/" + run_name + "/"

    # Varying values first, then the static ones, then the run identifier
    run_params = {
        **dict(zip(varying_hpar_names, combo)),
        **static_hpar,
        "code_name": run_name,
    }

    # Construct a model_cfg now so that an invalid parameter set raises
    # here rather than after the batch has started
    model_cfg(**run_params)

    batch_cfgs.append(
        {
            "sn": run_idx,
            "in_notebook": "OSP_master.ipynb",
            "code_name": run_name,
            "model_folder": run_dir,
            "out_notebook": run_dir + "output.ipynb",
            "params": run_params,
        }
    )

# Persist the full list of run configs next to the other batch outputs
with open(batch_output_dir + "batch_config.json", "w") as cfg_file:
    json.dump(batch_cfgs, cfg_file)

n = len(batch_cfgs)
print("There are {} models in this batch".format(n))

Parallel run

In [None]:
# Run
def run_batch(cfg):
    """
    Execute one parameterized notebook run through papermill.

    `cfg` is a dict; this function reads its 'sn', 'model_folder',
    'in_notebook', 'out_notebook' and 'params' entries. The model folder is
    created first (no error if it already exists), then the input notebook
    is executed with `params` as its parameters and written to the output
    notebook path.
    """
    print("Running model {}".format(cfg['sn']))

    # The run's folder must exist before papermill is invoked
    os.makedirs(cfg['model_folder'], exist_ok=True)

    source_nb, executed_nb = cfg['in_notebook'], cfg['out_notebook']
    pm.execute_notebook(source_nb, executed_nb, parameters=cfg['params'])


# Execute every config in `batch_cfgs` across 4 worker processes;
# map() returns only once all runs are done (its results are discarded)
with multiprocessing.Pool(processes=4) as workers:
    workers.map(run_batch, batch_cfgs)

Compile and save results

In [None]:
# `cfgs` is not assigned by any earlier cell in this section (only the list
# `batch_cfgs` is), so build the config DataFrame here from `batch_cfgs`
# before handing it to parse_batch_results.
cfgs = batch_cfgs_to_df(batch_cfgs)

# Gather the results of every run into one DataFrame and save it as CSV
df = parse_batch_results(cfgs)
df.to_csv(batch_output_dir + 'bcdf.csv')

Shut down the compute engine

In [None]:
# Notify, wait 30 seconds, then power the machine off.
# NOTE(review): `send_mail` is not imported or defined in any cell shown in
# this notebook — confirm where it comes from; if it is missing this cell
# raises NameError before reaching the shutdown. Presumably it emails a
# completion notice for the batch — verify.
send_mail(batch_name)
sleep(30)
# IPython shell escape (notebook-only syntax, not plain Python)
!sudo poweroff  

In [None]:
# NOTE(review): scratch call — `tmp` is not used by any other cell shown here.
# Everywhere else in this notebook model_cfg is called with keyword
# hyperparameters (model_cfg(**this_hpar)); confirm that a single positional
# path argument is a supported way to call it.
tmp = model_cfg('models/')

## Plotting

Review the batch structure

In [None]:
# Review the batch structure via check_cfgs_params (imported from meta).
# NOTE: `cfgs` is not assigned in this cell; it must already exist in the
# kernel namespace when this cell runs.
check_cfgs_params(cfgs)

Create a reusable overview heatmap and the word vs. nonword df

In [None]:
# Use Altair's "default" data transformer and remove its max-rows limit
alt.data_transformers.enable("default")
alt.data_transformers.disable_max_rows()

# Click-driven multi-selections shared by the charts in this notebook:
# sel_run selects on code_name, sel_cond selects on cond and is bound to the legend
sel_run = alt.selection(type="multi", on="click", fields=["code_name"])
sel_cond = alt.selection(type="multi", on="click", fields=["cond"], bind="legend")

# Overview data: only rows at the final epoch and the final timestep
at_last_epoch = df.epoch == df.epoch.max()
at_last_timestep = df.timestep == df.timestep.max()
df_ov = df[at_last_epoch & at_last_timestep]

# Shared overview heatmap: p_noise by hidden_units, faceted by learning_rate
# (rows) and cleanup_units (columns), coloured by accuracy. Cells not in
# sel_run are drawn with opacity 0.
overview_encoding = dict(
    x="p_noise:O",
    y="hidden_units:O",
    row="learning_rate:O",
    column="cleanup_units:O",
    color=alt.Color("acc", scale=alt.Scale(scheme="redyellowgreen")),
    opacity=alt.condition(sel_run, alt.value(1), alt.value(0)),
    tooltip=["code_name", "acc"],
)
overview = (
    alt.Chart(df_ov)
    .mark_rect()
    .encode(**overview_encoding)
    .add_selection(sel_run)
    .properties(title="Overall accuracy")
)

# Word vs. nonword accuracy table for the selected conditions
df_wnw = make_df_wnw(df, selected_cond=["INC_HF", "ambiguous", "unambiguous"])

Single-run plots

In [None]:
# Keep only rows at the final timestep; the chart below plots them by epoch
df_laststep = df[df.timestep == df.timestep.max()]

# Accuracy by epoch, one line per condition, restricted to the run(s) in
# sel_run; lines for conditions not in sel_cond get opacity 0
acc_encoding = dict(
    y=alt.Y("acc:Q", scale=alt.Scale(domain=(0, 1))),
    x="epoch",
    color="cond",
    opacity=alt.condition(sel_cond, alt.value(1), alt.value(0)),
    tooltip=["code_name", "acc"],
)
acc_plot = (
    alt.Chart(df_laststep)
    .mark_line(point=True)
    .encode(**acc_encoding)
    .add_selection(sel_cond)
    .transform_filter(sel_run)
    .properties(title="Full model at final time step")
)

# Nonword accuracy against word accuracy, coloured by epoch, for the run(s)
# in sel_run
wnw_encoding = dict(
    y=alt.Y("nonword_acc:Q", scale=alt.Scale(domain=(0, 1))),
    x=alt.X("word_acc:Q", scale=alt.Scale(domain=(0, 1))),
    color=alt.Color("epoch", scale=alt.Scale(scheme="redyellowgreen")),
    tooltip=["epoch", "word_acc", "nonword_acc"],
)
wnw_plot = (
    alt.Chart(df_wnw)
    .mark_point()
    .encode(**wnw_encoding)
    .transform_filter(sel_run)
    .properties(title="Word vs. Nonword accuracy at final time step")
)

# Black reference line from (0, 0) to (1, 1)
diag_points = pd.DataFrame({"x": [0, 1], "y": [0, 1]})
diagline = alt.Chart(diag_points).mark_line(color="black").encode(x="x", y="y")

wnw_with_diag = wnw_plot + diagline

# overview = overview_strain & overview_grain
# Layout: overview on the left, the two detail charts stacked on the right
mainplots = acc_plot & wnw_with_diag
splot = overview | mainplots

# Save as standalone HTML, then display in the notebook
splot.save(batch_output_dir + "single_run.html")
splot

Multi-run plots

In [None]:
# Melt the word/nonword accuracy columns into long format: `wnw` names the
# measure and `acc` holds its value, keyed by code_name and epoch
wnw_mdf = df_wnw.melt(
    id_vars=["code_name", "epoch"],
    value_vars=["word_acc", "nonword_acc"],
    var_name="wnw",
    value_name="acc",
)

# Accuracy by epoch: colour per run, point shape per word/nonword measure,
# filtered to the run(s) in sel_run
epoch_encoding = dict(
    y=alt.Y("acc:Q", scale=alt.Scale(domain=(0, 1))),
    x="epoch:Q",
    color="code_name:N",
    shape="wnw:N",
    opacity=alt.condition(sel_run, alt.value(1), alt.value(0)),
    tooltip=["code_name", "epoch", "acc"],
)
plot_epoch = (
    alt.Chart(wnw_mdf)
    .mark_point(size=80)
    .encode(**epoch_encoding)
    .add_selection(sel_run)
    .transform_filter(sel_run)
    .properties(title="Plot word and nonword accuracy by epoch")
)

# Nonword against word accuracy, one line per run; runs not in sel_run are
# drawn with opacity 0 (no filter is applied to this chart)
wnw_encoding = dict(
    y=alt.Y("nonword_acc:Q", scale=alt.Scale(domain=(0, 1))),
    x=alt.X("word_acc:Q", scale=alt.Scale(domain=(0, 1))),
    color="code_name:N",
    opacity=alt.condition(sel_run, alt.value(1), alt.value(0)),
    tooltip=["code_name", "epoch", "word_acc", "nonword_acc"],
)
plot_wnw = (
    alt.Chart(df_wnw)
    .mark_line(point=True)
    .encode(**wnw_encoding)
    .add_selection(sel_run)
    .properties(title="Word vs. Nonword accuracy at final time step")
)

# `diagline` is the (0, 0)-(1, 1) reference line built in the single-run cell
plot_wnw_diag = plot_wnw + diagline

# Layout: overview on the left, the two multi-run charts stacked on the right
multi_plot = overview | (plot_epoch & plot_wnw_diag)

# Save as standalone HTML, then display in the notebook
multi_plot.save(batch_output_dir + "multi_runs.html")
multi_plot

## Save and shutdown

In [None]:
# Export a notebook to HTML in the batch output folder.
# `!` is an IPython shell escape; IPython substitutes the Python variable
# `batch_output_dir` for `$batch_output_dir` before running the command.
# NOTE(review): the notebook file name is hard-coded as batch.ipynb — confirm
# that it matches this notebook's file name.
!jupyter nbconvert --output-dir=$batch_output_dir --to html batch.ipynb

# Manually hand-picked experiment: parallel run

In [None]:
# %load_ext lab_black
import os, itertools, json, multiprocessing
import pandas as pd
import altair as alt
import papermill as pm
from time import sleep
from meta import model_cfg, batch_cfgs_to_df, parse_batch_results, check_cfgs_params
from evaluate import make_df_wnw

batch_name = "O2P_batchsize_opt_test"

# Everything produced for this batch is written under this folder
batch_output_dir = "batch_eval/{}/".format(batch_name)
os.makedirs(batch_output_dir, exist_ok=True)

# To sweep over random seeds instead of the fixed seed below, uncomment:
# import random
# seeds = [int(random.random() * 1e5) for x in range(10)]

# Hyperparameters that vary between runs
param_grid = dict(
    optimizer=["adam", "sgd"],
    batch_size=[1, 32, 128],
)

# Hyperparameters held constant for every run in the batch
static_hpar = dict(
    sample_name="jay",
    rng_seed=4321,
    use_semantic=False,
    input_dim=119,
    output_dim=250,
    use_attractor=False,
    rnn_activation="sigmoid",
    regularizer_const=None,
    w_initializer="glorot_uniform",
    tau=0.2,
    max_unit_time=4.0,
    n_mil_sample=1.0,
    save_freq=10,
    bq_dataset=None,
    p_noise=0.0,
    hidden_units=100,
    learning_rate=0.008,
    cleanup_units=10,
)

In [None]:
# Refuse to build the batch if a hyperparameter is declared both as varying
# and as static: update() below would let the static value overwrite the
# varying one, so every run would silently get the same value for that key
for key in static_hpar:
    if key in param_grid:
        raise ValueError('Key duplicate: {}'.format(key))

# Iterate and create batch level super object: batch_cfgs
# (one entry per combination of the varying hyperparameter values)
batch_cfgs = []
varying_hpar_names, varying_hpar_values = zip(*param_grid.items())
for i, v in enumerate(itertools.product(*varying_hpar_values)):
    code_name = batch_name + "_r{:04d}".format(i)

    # Varying values for this run, then the shared static values
    this_hpar = dict(zip(varying_hpar_names, v))
    this_hpar.update(static_hpar)

    # Add identifier params into param dict
    this_hpar['code_name'] = code_name

    # Pass into model_cfg to catch error early
    model_cfg(**this_hpar)

    batch_cfg = dict(
        sn=i,
        in_notebook="OSP_master.ipynb",
        code_name=code_name,
        model_folder="models/" + code_name + "/",
        out_notebook="models/" + code_name + "/output.ipynb",
        params=this_hpar
    )

    batch_cfgs.append(batch_cfg)

# Save cfgs: convert the list to a DataFrame (also used by later cells as
# `cfgs`) and write it as CSV into the batch output folder
cfgs = batch_cfgs_to_df(batch_cfgs)
cfgs.to_csv(batch_output_dir + 'cfgs.csv')

In [None]:
# Run
def run_batch(cfg):
    """
    Execute one parameterized notebook run through papermill.

    `cfg` is a dict; this function reads its 'sn', 'model_folder',
    'in_notebook', 'out_notebook' and 'params' entries. The model folder is
    created first (no error if it already exists), then the input notebook
    is executed with `params` as its parameters and written to the output
    notebook path.
    """
    print("Running model {}".format(cfg['sn']))

    # The run's folder must exist before papermill is invoked
    os.makedirs(cfg['model_folder'], exist_ok=True)

    source_nb, executed_nb = cfg['in_notebook'], cfg['out_notebook']
    pm.execute_notebook(source_nb, executed_nb, parameters=cfg['params'])


# Execute every config in `batch_cfgs` across 4 worker processes;
# map() returns only once all runs are done (its results are discarded)
with multiprocessing.Pool(processes=4) as workers:
    workers.map(run_batch, batch_cfgs)

In [None]:
# Gather the results for the configs in `cfgs` into one DataFrame and save
# it as CSV in the batch output folder
bcdf_path = batch_output_dir + 'bcdf.csv'
df = parse_batch_results(cfgs)
df.to_csv(bcdf_path)

In [None]:
# Notify, wait 30 seconds, then power the machine off.
# NOTE(review): `send_mail` is not imported or defined in any cell shown in
# this notebook — confirm where it comes from; if it is missing this cell
# raises NameError before reaching the shutdown. Presumably it emails a
# completion notice for the batch — verify.
send_mail(batch_name)
sleep(30)
# IPython shell escape (notebook-only syntax, not plain Python)
!sudo poweroff  