# Examine Rueckl19 dataset

In [None]:
%load_ext lab_black
import pandas as pd
import altair as alt
# Select the data transformer registered under the name "default".
alt.data_transformers.enable("default")
# Turn off Altair's max-rows check so the full results table can be charted.
alt.data_transformers.disable_max_rows()

### Ingest, tidy

In [None]:
# NOTE(review): this cell reads `df`, which is only created by the ingest cell
# that follows; on a fresh top-to-bottom run it raises NameError. Run the
# ingest cell first (there it shows the tidied column names).
df.columns

In [None]:
# Load the results table; the first CSV column becomes the index.
df = pd.read_csv('plotdf.csv', index_col=0)

# Row-wise summary scores: word accuracy is the unweighted mean of the four
# HF/LF x CON/INC accuracy columns, nonword accuracy the mean of the two NW
# accuracy columns.
df = df.assign(
    word_acc=df[[
        'HF_CON_Accuracy', 'HF_INC_Accuracy', 'LF_CON_Accuracy',
        'LF_INC_Accuracy'
    ]].mean(axis=1),
    nonword_acc=df[['NW_AMB_Accuracy', 'NW_UN_Accuracy']].mean(axis=1),
)

# Translate the source column names into the names the plotting code expects.
df = df.rename(
    columns={
        'ID': 'code_name',  # model identifier
        'Trial.Scaled': 'epoch',  # scaled trial count is treated as epoch
        'Pnoise': 'p_noise',
        'Hidden': 'hidden_units',
        'Epsilon': 'learning_rate',
        'PhoHid': 'cleanup_units'
    }
)

# Keep only the identifier, the epoch, the four settings and the two scores.
df = df.loc[:, [
    'code_name', 'epoch', 'hidden_units', 'cleanup_units', 'p_noise',
    'learning_rate', 'word_acc', 'nonword_acc'
]]

### Is there more than one model per unique combination of settings? Yes...

In [None]:
def count_models(df):
    """Chart how many models share each combination of settings.

    Args:
        df: Frame with a ``code_name`` column identifying each model and the
            setting columns ``p_noise``, ``hidden_units``, ``learning_rate``
            and ``cleanup_units``. It may hold many rows per model.

    Returns:
        An Altair heatmap (x = p_noise, y = hidden_units, faceted by
        learning_rate rows and cleanup_units columns) whose colour and
        tooltip field ``n`` is the number of models in that cell.
    """
    setting_cols = [
        'p_noise', 'hidden_units', 'learning_rate', 'cleanup_units'
    ]

    # Collapse to one row per model. The first recorded value is taken rather
    # than a mean: averaging repeated floats is not guaranteed to give back
    # the exact same float, and a perturbed learning_rate / p_noise key would
    # split one setting into several groups below.
    per_model = df.groupby('code_name')[setting_cols].first().reset_index()

    # Number of models per unique setting combination.
    counts = (
        per_model.groupby(setting_cols)['code_name'].count().reset_index(
            name='n'
        )
    )

    plot_n = alt.Chart(counts).mark_rect().encode(
        x="p_noise:O",
        y="hidden_units:O",
        row="learning_rate:O",
        column="cleanup_units:O",
        color="n:O",
        tooltip=[
            "p_noise", "hidden_units", "cleanup_units", "learning_rate", "n"
        ],
    ).properties(title="Model counts")

    return plot_n

# Model-count heatmap over the full dataset, exported as a standalone HTML page.
plot_n = count_models(df)
plot_n.save('count_model_all.html')

# Main plot word vs. nonword

- To reuse my pipeline as much as possible, I select just one model when there are multiple runs with the same setting

In [None]:
def main_dashboard(df):
    """Assemble the linked word/nonword accuracy dashboard.

    A click selection keyed on ``code_name`` is shared by all three panels.
    Selected models are drawn fully opaque; unselected ones are dimmed in the
    heatmap (opacity 0.1) and hidden in the two detail plots (opacity 0).

    Args:
        df: Frame with ``code_name``, ``epoch``, the setting columns
            ``p_noise``, ``hidden_units``, ``learning_rate`` and
            ``cleanup_units``, plus ``word_acc`` and ``nonword_acc``.

    Returns:
        The compound chart ``overview | (plot_epoch & (diagonal + wnw_line))``.
    """
    sel_run = alt.selection(type="multi", on="click", fields=["code_name"])

    # The overview heatmap only shows rows recorded at the largest epoch.
    df_ov = df[df.epoch == df.epoch.max()]

    # Left panel: word accuracy per setting; also the main place to click.
    overview = (
        alt.Chart(df_ov).mark_rect().encode(
            x="p_noise:O",
            y="hidden_units:O",
            row="learning_rate:O",
            column="cleanup_units:O",
            color=alt.Color(
                "word_acc",
                scale=alt.Scale(scheme="redyellowgreen", domain=(0, 1))
            ),
            opacity=alt.condition(sel_run, alt.value(1), alt.value(0.1)),
            tooltip=[
                "code_name", "p_noise", "hidden_units", "cleanup_units",
                "learning_rate", "word_acc", "nonword_acc"
            ],
        ).add_selection(sel_run).properties(title="Overall accuracy")
    )

    # Long format: one row per (model, epoch, word-or-nonword) accuracy value.
    wnw_mdf = df.melt(
        id_vars=['code_name', 'epoch'],
        value_vars=['word_acc', 'nonword_acc'],
        var_name='wnw',
        value_name='acc'
    )

    # Top-right panel: accuracy against epoch, marker shape = word vs nonword.
    plot_epoch = alt.Chart(wnw_mdf).mark_point(size=80).encode(
        y=alt.Y("acc:Q", scale=alt.Scale(domain=(0, 1))),
        x="epoch:Q",
        color="code_name:N",
        shape="wnw:N",
        opacity=alt.condition(sel_run, alt.value(1), alt.value(0)),
        tooltip=["code_name", "epoch", "acc"],
    ).add_selection(sel_run).properties(
        title="Plot word and nonword accuracy by epoch"
    )

    # Bottom-right panel: nonword against word accuracy, one line per model
    # drawn from every epoch in df. The previous title claimed this showed
    # only the final time step, which did not match the data being plotted.
    wnw_line = alt.Chart(df).mark_line(point=True).encode(
        y=alt.Y("nonword_acc:Q", scale=alt.Scale(domain=(0, 1))),
        x=alt.X("word_acc:Q", scale=alt.Scale(domain=(0, 1))),
        color="code_name:N",
        opacity=alt.condition(sel_run, alt.value(1), alt.value(0)),
        tooltip=["code_name", "epoch", "word_acc", "nonword_acc"],
    ).add_selection(sel_run).properties(
        title="Word vs. Nonword accuracy across epochs"
    )

    # y = x reference line: points on it have equal word and nonword accuracy.
    diagonal = alt.Chart(pd.DataFrame({
        'x': [0, 1],
        'y': [0, 1]
    })).mark_line(color='black').encode(x='x', y='y')

    return overview | (plot_epoch & (diagonal + wnw_line))


# Dashboard over every model in the dataset, exported as a standalone HTML page.
main_plot = main_dashboard(df)
main_plot.save('dashboard_all.html')

# Single dimension main effect plots

In [None]:
def main_effect_plot(df, var):
    """Plot the main effect of one setting on word and nonword accuracy.

    Accuracies are averaged over every row sharing the same ``epoch`` and
    ``var`` value, then shown as three linked panels.

    Args:
        df: Frame with ``epoch``, ``word_acc``, ``nonword_acc`` and the
            column named by ``var``.
        var: Name of the setting column to examine (e.g. ``'hidden_units'``).

    Returns:
        ``overview_wacc | overview_bias | wnw``: a word-accuracy heatmap, a
        word-minus-nonword heatmap, and nonword-vs-word lines over a y = x
        reference. Clicking a heatmap cell selects that value of ``var``.
    """
    # Mean accuracy per (epoch, var) cell; pivot_table aggregates with mean.
    pdf = df.pivot_table(
        index=['epoch', var], values=['word_acc', 'nonword_acc']
    ).reset_index()
    pdf['word_advantage'] = pdf.word_acc - pdf.nonword_acc

    sel_var = alt.selection(type="multi", on="click", fields=[var])

    def _heatmap(field, domain, title):
        # Epoch-by-var heatmap coloured by `field`; unselected rows are dimmed.
        return (
            alt.Chart(pdf).mark_rect().encode(
                y=alt.Y(var, type="ordinal"),
                x='epoch:O',
                color=alt.Color(
                    field,
                    scale=alt.Scale(scheme="redyellowgreen", domain=domain)
                ),
                opacity=alt.condition(sel_var, alt.value(1), alt.value(0.1)),
                tooltip=["word_acc", "nonword_acc"],
            ).add_selection(sel_var).properties(title=title)
        )

    overview_bias = _heatmap(
        "word_advantage", (-.2, .2),
        "Word - Nonword accuracy (word_advantage) heatmap by {} and epoch".
        format(var)
    )

    # This panel is coloured by word_acc; it previously reused the
    # word_advantage title verbatim, which mislabelled the chart.
    overview_wacc = _heatmap(
        "word_acc", (0, 1),
        "Word accuracy heatmap by {} and epoch".format(var)
    )

    # One line per value of var; lines for unselected values are hidden.
    wnw_line = alt.Chart(pdf).mark_line().encode(
        y=alt.Y("nonword_acc:Q", scale=alt.Scale(domain=(0, 1))),
        x=alt.X("word_acc:Q", scale=alt.Scale(domain=(0, 1))),
        color=alt.Color(var, type="ordinal", scale=alt.Scale(scheme="magma")),
        opacity=alt.condition(sel_var, alt.value(0.9), alt.value(0)),
        tooltip=[var, "epoch", "word_acc", "nonword_acc"],
    )

    # y = x reference line: equal word and nonword accuracy.
    diagonal = alt.Chart(pd.DataFrame({
        'x': [0, 1],
        'y': [0, 1]
    })).mark_line(color='black').encode(x='x', y='y')

    wnw = diagonal + wnw_line

    return overview_wacc | overview_bias | wnw


# Stack one main-effect row per setting (all models) and export the page.
p = alt.vconcat()
for v in ('hidden_units', 'cleanup_units', 'p_noise', 'learning_rate'):
    p = p & main_effect_plot(df, v)

p.save('main_effect_all.html')

# (Somewhat) Direct comparison with my data

- Perhaps can compare: hidden units, cleanup units
- probably not comparable: lr, p-noise
- set up an even grid for easier comparison

In [None]:
# Keep only rows whose settings fall on the evenly spaced comparison grid:
# four hidden-layer sizes, two cleanup sizes, p_noise below 6 and five
# learning rates.
sdf = df.loc[
    df.hidden_units.isin([50, 100, 150, 200])
    & df.cleanup_units.isin([10, 50])
    & (df.p_noise < 6)
    & df.learning_rate.isin([.002, .004, .006, .008, .010])
]

In [None]:
grid_cols = ['hidden_units', 'cleanup_units', 'p_noise', 'learning_rate']

# One row per model with its settings. The first recorded value is used
# instead of a mean so the float settings stay bit-identical and models with
# the same setting really land in the same group below.
psdf = sdf.groupby('code_name')[grid_cols].first().reset_index()

# Pick one model per setting combination. The draw is seeded so re-running
# the notebook selects the same models and regenerates the same HTML outputs;
# only the code_name column is sampled, and the lambda argument no longer
# shadows the global `df`.
sel_code = psdf.groupby(
    grid_cols, group_keys=False
)['code_name'].apply(lambda names: names.sample(1, random_state=0))

# Restrict the grid subset to the chosen models and confirm, via the count
# heatmap, that each setting now holds a single model.
ssdf = sdf.loc[sdf.code_name.isin(sel_code)]
plot_n_sel = count_models(ssdf)
plot_n_sel.save('count_model_sel.html')

### Plot dashboard and main effect plot

In [None]:
# Dashboard restricted to the one-model-per-setting selection (ssdf).
plot_main_sel = main_dashboard(ssdf)
plot_main_sel.save('dashboard_sel.html')

In [None]:
# Stack one main-effect row per setting for the selected models and export.
# NOTE(review): 'cleanup_units' is left out here although the all-model
# version includes it and the grid keeps two cleanup sizes; confirm that the
# omission is intended.
p = alt.vconcat()
for v in ('hidden_units', 'p_noise', 'learning_rate'):
    p = p & main_effect_plot(ssdf, v)

p.save('main_effect_sel.html')