# Individual differences simulation paper part III

In [None]:
%load_ext lab_black
import pandas as pd
import altair as alt
from altair.expr import datum
import numpy as np
from IPython.display import clear_output

alt.data_transformers.disable_max_rows()

### Notes:
- Change heatmap directions (Top, Left always leads to better performance)
- Add W-NW heatmap
- Use line type to indicate 2x2 structure in performance space
- Add origin

### Load new (Aug 27, 2020) combined dataset 1520

In [None]:
# Read the combined simulation results (the first CSV column is the row index)
# and translate the raw headers into the snake_case names used below.
df = pd.read_csv("1520_sims.csv", index_col=0).rename(
    columns={
        "ID": "code_name",
        "Trial.Scaled": "epoch",
        "Hidden": "hidden_units",
        "PhoHid": "cleanup_units",
        "Pnoise": "p_noise",
        "Epsilon": "learning_rate",
        "Type": "cond",
        "Measure": "measure",
        "Score": "score",
        "Freq": "cond_freq",
        "Cons": "cond_cons",
    }
)

# Only the accuracy measure is analysed in this notebook.
df = df[df["measure"] == "Accuracy"]

In [None]:
def add_origin(df):
    """Return ``df`` with an (epoch 0, score 0) row added for each series.

    The rows recorded at ``epoch == 1.0`` serve as a template: they are
    copied with ``epoch`` and ``score`` both set to 0, appended to ``df``,
    and the result is sorted by ``code_name``, ``cond`` and ``epoch`` with a
    fresh integer index.

    If the smallest epoch is not above 0, a message is printed and ``df``
    itself is returned unchanged.
    """
    if not (df.epoch.min() > 0):
        print("Already have origin, returning original df")
        return df

    # Reuse the epoch == 1.0 rows as the frame for the new epoch-0 rows.
    origin_rows = df[df.epoch == 1.0].assign(score=0, epoch=0)
    combined = pd.concat([df, origin_rows], ignore_index=True)
    combined = combined.sort_values(by=["code_name", "cond", "epoch"])
    return combined.reset_index(drop=True)

In [None]:
# Rebind df to the frame returned by add_origin (epoch-0 rows added if missing).
df = add_origin(df)

### Count model in h-grid

In [None]:
def count_grid(df, hpar):
    """Build a heatmap of the number of runs in each hyper-parameter cell.

    df: long-format results with a ``code_name`` column and the ``hpar`` columns
    hpar: list of hyper-parameter column names; the chart encodings expect
        ``p_noise``, ``hidden_units``, ``learning_rate`` and ``cleanup_units``

    Returns an Altair rect chart whose colour encodes the count ``n``.
    """
    # One row per model: pivoting on code_name averages its repeated rows.
    per_model = df[["code_name"] + hpar].pivot_table(index="code_name")
    per_model["code_name"] = per_model.index
    per_model["learning_rate"] = per_model.learning_rate.round(4)

    # Count the models that fall in each combination of hyper-parameters.
    cell_counts = (
        per_model.pivot_table(index=hpar, aggfunc="count", values="code_name")
        .reset_index()
        .rename(columns={"code_name": "n"})
    )

    heatmap = alt.Chart(cell_counts).mark_rect()
    heatmap = heatmap.encode(
        x="p_noise:O",
        y=alt.Y("hidden_units:O", sort="descending"),
        row=alt.Row("learning_rate:O", sort="descending"),
        column=alt.Column("cleanup_units:O", sort="descending"),
        color="n:O",
        tooltip=hpar + ["n"],
    )
    return heatmap.properties(title="Model counts")


# Hyper-parameter columns that define the grid; the count heatmap is written to HTML.
hpar = ["hidden_units", "cleanup_units", "p_noise", "learning_rate"]
count_grid(df, hpar).save("count_models.html")

### Subset to 20 cleanup units and aggregate within each h-param cell (sdf)

In [None]:
# Keep only the models with 20 cleanup units.
sdf = df.loc[df.cleanup_units == 20]

# Average all runs that share a control-parameter cell, per epoch and condition.
# numeric_only=True is spelled out because pandas >= 2.0 no longer drops string
# columns (e.g. "measure") silently and raises TypeError instead.
sdf = sdf.groupby(
    ["epoch", "p_noise", "hidden_units", "learning_rate", "cond"], as_index=False
).mean(numeric_only=True)
# The averaged run ids are meaningless; errors="ignore" covers the case where
# a column was already dropped by the numeric-only aggregation.
sdf.drop(columns=["code_name", "cleanup_units"], inplace=True, errors="ignore")

# Cell-specific code_name built from the three remaining control parameters.
sdf["code_name"] = sdf.agg(
    lambda x: f'n{x["p_noise"]}_h{x["hidden_units"]}_l{x["learning_rate"]}', axis=1
)

# Word vs. nonword label
sdf["type"] = sdf.cond.apply(
    lambda x: "word" if x in ["HF_CON", "HF_INC", "LF_CON", "LF_INC"] else "nonword"
)


sdf.sample(5)

### Last epoch accuracy in mean word and nonword

In [None]:
def plot_type_acc(df, type_name):
    """Heatmap of accuracy at epoch 1.0 for one stimulus type.

    df: frame with ``epoch``, ``type``, ``score`` and the control-parameter columns
    type_name: value of the ``type`` column to plot (also used in the title)
    """
    at_last_epoch = df.epoch == 1.0
    is_requested_type = df.type == type_name
    plot_df = df[at_last_epoch & is_requested_type]

    # Fixed 0-1 colour scale so heatmaps for different types are comparable.
    score_color = alt.Color(
        "score", scale=alt.Scale(scheme="redyellowgreen", domain=(0, 1))
    )

    heatmap = alt.Chart(plot_df).mark_rect()
    heatmap = heatmap.encode(
        x="p_noise:O",
        y=alt.Y("hidden_units:O", sort="descending"),
        column=alt.Column("learning_rate:O", sort="descending"),
        color=score_color,
        tooltip=["score"],
    )
    return heatmap.properties(title=f"{type_name} accuracy at the end of training")

In [None]:
# End-of-training accuracy heatmap for rows labelled "word".
plot_type_acc(sdf, "word")

In [None]:
# End-of-training accuracy heatmap for rows labelled "nonword".
plot_type_acc(sdf, "nonword")

### Parse df_wnw (widen)

In [None]:
variates = ["hidden_units", "p_noise", "learning_rate"]

# Keep one word condition (HF_INC) and one nonword condition (NW_UN).
df_wnw = sdf.loc[
    (sdf.cond.isin(["HF_INC", "NW_UN"])),
    variates + ["code_name", "epoch", "cond", "score"],
]

# Widen: one row per model and epoch, one score column per condition.
df_wnw = df_wnw.pivot_table(
    index=variates + ["epoch", "code_name"], columns="cond"
).reset_index()

# Flatten the ("score", <cond>) column MultiIndex into plain strings.
# (The original assigned the same list to df_wnw.columns twice in one statement.)
df_wnw.columns = ["".join(c).strip() for c in df_wnw.columns.values]
df_wnw.rename(
    columns={"scoreHF_INC": "word_acc", "scoreNW_UN": "nonword_acc",}, inplace=True,
)

# Positive values mean the word condition is ahead of the nonword condition.
df_wnw["word_advantage"] = df_wnw.word_acc - df_wnw.nonword_acc
df_wnw

### Mini dashboard

In [None]:
# Multi-select on click, keyed by code_name.  An empty selection matches
# nothing, and one setting is pre-selected when the dashboard loads.
select_control_space = alt.selection(
    type="multi",
    on="click",
    empty="none",
    fields=["code_name"],
    init=[{"code_name": "n0_h100_l0.01"}],
)

# Control space: final-epoch word accuracy for every parameter setting.
df_overview = df_wnw.loc[df_wnw.epoch == df_wnw.epoch.max()]

control_space = (
    alt.Chart(df_overview)
    .mark_rect()
    .encode(
        x="p_noise:O",
        y=alt.Y("hidden_units:O", sort="descending"),
        column=alt.Column("learning_rate:O", sort="descending"),
        color=alt.Color(
            "word_acc", scale=alt.Scale(scheme="redyellowgreen", domain=(0, 1))
        ),
        # Dim every cell that is not part of the current selection.
        opacity=alt.condition(select_control_space, alt.value(1), alt.value(0.3)),
        tooltip=["code_name", "word_acc", "nonword_acc", "word_advantage"],
    )
    .add_selection(select_control_space)
    .properties(title="Select a control parameter setting:")
)

# Development space: accuracy over epochs for the selected settings.
sdf.sort_values(by=["code_name", "cond"], inplace=True)

development_space = (
    alt.Chart(sdf)
    .mark_line()
    .encode(
        y=alt.Y("score:Q", scale=alt.Scale(domain=(0, 1))),
        x="epoch:Q",
        color="cond:N",
        tooltip=["code_name", "epoch", "score"],
    )
    .transform_filter(select_control_space)
    .properties(title="Developmental space: Accuracy in each condition over epoch")
)

# Performance space: nonword accuracy against word accuracy.
wnw_line = (
    alt.Chart(df_wnw)
    .mark_line(color="black")
    .encode(
        y=alt.Y("nonword_acc:Q", scale=alt.Scale(domain=(0, 1))),
        x=alt.X("word_acc:Q", scale=alt.Scale(domain=(0, 1))),
        tooltip=["code_name", "epoch", "word_acc", "nonword_acc"],
    )
    .transform_filter(select_control_space)
)

# Grey reference diagonal (word accuracy == nonword accuracy).
diagonal = (
    alt.Chart(pd.DataFrame({"x": [0, 1], "y": [0, 1]}))
    .mark_line(color="#D3D3D3")
    .encode(
        x=alt.X("x", axis=alt.Axis(title="word")),
        # Fixed: the y channel was previously built with alt.X.
        y=alt.Y("y", axis=alt.Axis(title="nonword")),
    )
)

performance_space = (diagonal + wnw_line).properties(
    title="Performance space: Nonword accuracy vs. Word accuracy"
)


# Merge dashboard: control space on top, the two linked views side by side below.
dashboard = control_space & (development_space | performance_space)
dashboard.save("dashboard.html")

# Dyslexia: Heterogeneity in cause

In [None]:
from altair.expr import datum

# Keep only the settings at the extreme (min or max) value of all three
# control parameters, i.e. the corners of the control space.
df_corners = df_wnw.loc[
    df_wnw.learning_rate.isin([df_wnw.learning_rate.max(), df_wnw.learning_rate.min()])
    & df_wnw.hidden_units.isin([df_wnw.hidden_units.max(), df_wnw.hidden_units.min()])
    & df_wnw.p_noise.isin([df_wnw.p_noise.max(), df_wnw.p_noise.min()])
].copy()


# Label each corner "high"/"low".  The comparison is vectorised so the column
# maximum is computed once rather than once per row inside a lambda.
df_corners["h_group"] = (
    df_corners.hidden_units == df_corners.hidden_units.max()
).map({True: "high", False: "low"})

df_corners["p_group"] = (df_corners.p_noise == df_corners.p_noise.max()).map(
    {True: "high", False: "low"}
)

# Combined label, e.g. "low noise, high hidden".
df_corners["condition_hp"] = (
    df_corners.p_group + " noise, " + df_corners.h_group + " hidden"
)

df_corners.sample(10)

### Control parameter in extreme corner without smoothing

In [None]:
def easy_plot(df, x, y, smooth=False):
    """Easy plot for part III analysis: one line panel per learning rate.

    Panels are placed side by side. Within a panel, colour encodes
    ``h_group`` and stroke dash encodes ``p_group``; points are connected in
    ``epoch`` order.

    df: must be in wide format, with word_acc and nonword_acc columns, plus
        h_group, p_group, code_name, epoch and learning_rate
    x: x-axis variable in altair format
    y: y-axis variable in altair format
    smooth: currently unused; kept so existing callers keep working

    Returns the horizontally concatenated Altair chart.
    """

    base = (
        alt.Chart(df)
        .mark_line()
        .encode(
            x=alt.X(x, scale=alt.Scale(domain=(0, 1))),
            y=alt.Y(y, scale=alt.Scale(domain=(0, 1))),
            order=["epoch"],
            color="h_group:N",
            strokeDash="p_group:N",
            tooltip=["code_name", "epoch", "word_acc", "nonword_acc"],
        )
    )

    # Accumulate with hconcat: `|=` on a vconcat wraps the (empty) vconcat in a
    # new hconcat, leaving an empty sub-chart in the spec.
    plot = alt.hconcat()

    highest_lr = df.learning_rate.max()

    # tolist() yields native Python scalars for the filter expression.
    # NOTE(review): presumably safer than NumPy scalars, whose repr may not be
    # a valid expression literal — confirm against the installed Altair/NumPy.
    for lr in df.learning_rate.unique().tolist():
        plot |= base.transform_filter(datum.learning_rate == lr).properties(
            title=["High learning rate" if lr == highest_lr else "Low learning rate"]
        )

    return plot

In [None]:
# Performance space (nonword vs. word accuracy) for the corner models, saved to HTML.
corner_performance = easy_plot(df=df_corners, x="word_acc:Q", y="nonword_acc:Q")
corner_performance.save("corner_performance.html")

### Developmental space 

#### Words (HF_INC)

In [None]:
# Word accuracy (HF_INC) over epochs for the corner models.
corner_dev_w = easy_plot(df=df_corners, x="epoch:Q", y="word_acc:Q").properties(
    title="Word accuracy (HF_INC)"
)
corner_dev_w.save("corner_dev_word.html")
# Display last: a bare expression is only rendered when it ends the cell.
corner_dev_w

#### Nonwords (NW_UN)

In [None]:
# Nonword accuracy (NW_UN) over epochs for the corner models.
corner_dev_nw = easy_plot(df=df_corners, x="epoch:Q", y="nonword_acc:Q").properties(
    title="Nonword accuracy (NW_UN)"
)
corner_dev_nw.save("corner_dev_nonword.html")
# Display last: a bare expression is only rendered when it ends the cell.
corner_dev_nw

# Word advantage heatmap

In [None]:
# Heatmap of word advantage (word_acc - nonword_acc): one row of panels per
# learning rate and one column of panels per epoch.
word_advantage_over_epoch = (
    alt.Chart(df_wnw)
    .mark_rect()
    .encode(
        x="p_noise:O",
        y=alt.Y("hidden_units:O", sort="descending"),
        # Fixed: the row channel was previously built with alt.Column.
        row=alt.Row("learning_rate", sort="descending"),
        column="epoch:O",
        color=alt.Color(
            "word_advantage",
            # Symmetric domain so 0 (no advantage) maps to the scheme's midpoint.
            scale=alt.Scale(scheme="redyellowgreen", domain=(-0.3, 0.3)),
            title="word advantage (W-NW)",
        ),
        tooltip=["word_acc", "nonword_acc", "word_advantage"],
    )
)

word_advantage_over_epoch.save("word_advantage.html")
word_advantage_over_epoch

# Dyslexia: Heterogeneity in consequence

#### Model level performance grouping 0-25, 25-75, 75-100 percentile

In [None]:
# Mean accuracy per model across all epochs and conditions.  "score" is
# selected before aggregating so the string columns are never averaged
# (a plain .mean() over them raises TypeError on pandas >= 2.0).
gacc = df.groupby("code_name", as_index=False)["score"].mean()

# Percentile rank of each model, split into Low (<= 25%), Mid and High (> 75%).
gacc["rank_pc"] = gacc.score.rank(pct=True)
gacc["group"] = gacc.rank_pc.map(
    lambda x: "High" if x > 0.75 else ("Mid" if x > 0.25 else "Low")
)

# Attach the group label to every row of the long-format data.
df_group = df.merge(gacc[["code_name", "group"]], how="left", on="code_name")

# Group-level mean per epoch and condition (numeric columns only).
df_group_mean = df_group.groupby(["group", "epoch", "cond"], as_index=False).mean(
    numeric_only=True
)

#### Developmental plot for details

In [None]:
# Grid of developmental plots, two conditions per row.
# Rows are hconcat charts (extended with `|=`) stacked into a vconcat
# (extended with `&=`); using the matching container avoids nesting empty
# concat charts in the spec.
plot_group_dev = alt.vconcat()

base = (
    alt.Chart(df_group_mean)
    .mark_line()
    .encode(
        x=alt.X("epoch:Q", scale=alt.Scale(domain=(0, 1))),
        y=alt.Y("score:Q", scale=alt.Scale(domain=(0, 1))),
        color="group:N",
    )
)

conds = df_group_mean.cond.unique().tolist()

# Walk the conditions in pairs so a trailing unpaired condition still gets
# its own (single-panel) row instead of being dropped.
for i in range(0, len(conds), 2):
    row = alt.hconcat()
    for x in conds[i : i + 2]:
        row |= base.transform_filter(datum.cond == x).properties(title=x)
    plot_group_dev &= row

In [None]:
# Write the per-condition developmental grid to a standalone HTML file.
plot_group_dev.save("group_dev_all.html")

### Fancier plot with SD band in word and nonword

In [None]:
# Mean accuracy per performance group over epochs (aggregated inside the chart).
line = (
    alt.Chart(df_group)
    .mark_line()
    .encode(
        x=alt.X("epoch:Q", scale=alt.Scale(domain=(0, 1))),
        y=alt.Y("mean(score):Q", scale=alt.Scale(domain=(0, 1))),
        color="group:N",
    )
)

# Standard-deviation band around the mean, computed from the per-model rows.
band = (
    alt.Chart(df_group)
    .mark_errorband(extent="stdev")
    .encode(
        x=alt.X("epoch:Q", scale=alt.Scale(domain=(0, 1))),
        y=alt.Y("score:Q", scale=alt.Scale(domain=(0, 1))),
        color="group:N",
    )
)

# Layer the mean line and the band.
base = line + band

plot = alt.hconcat()

# One panel per condition: the word condition (HF_INC) and the nonword condition (NW_UN).
for x in ["HF_INC", "NW_UN"]:

    plot |= base.transform_filter(datum.cond == x).properties(
        title=["Word (HF_INC)" if x == "HF_INC" else "Nonword (NW_UN)"]
    )


plot.save("group_dev_wnw.html")
plot

#### Mean plot for overall picture

In [None]:
# Wide table with one row per (group, epoch, model) and one score column per condition.
df_wnw_group = df_group.pivot_table(
    index=["group", "epoch", "code_name"], columns="cond", values="score"
).reset_index()

# Same layout without code_name in the index, so scores are averaged over models.
df_wnw_group_mean = df_group.pivot_table(
    index=["group", "epoch"], columns="cond", values="score"
).reset_index()

# Group-mean trajectory in performance space (NW_UN against HF_INC).
line = (
    alt.Chart(df_wnw_group_mean)
    .mark_line()
    .encode(
        x=alt.X("HF_INC", scale=alt.Scale(domain=(0, 1))),
        y=alt.Y("NW_UN", scale=alt.Scale(domain=(0, 1))),
        color="group",
    )
)

# Error band (extent="ci") built from the per-model rows.
band = (
    alt.Chart(df_wnw_group)
    .mark_errorband(extent="ci")
    .encode(
        x=alt.X("HF_INC", scale=alt.Scale(domain=(0, 1))),
        y=alt.Y("NW_UN", scale=alt.Scale(domain=(0, 1))),
        color="group",
    )
)

line + band

In [None]:
# Show the group-mean line chart on its own, without the band layer.
line

In [None]:
def easy_plotwnw(df, group, sample=None):
    """Plot per-model NW_UN vs. HF_INC trajectories for one performance group.

    df: wide frame with ``group``, ``code_name``, ``HF_INC`` and ``NW_UN`` columns
    group: value of the ``group`` column to keep
    sample: if given, the number of distinct models (``code_name``) to draw at
        random from this group; capped at the number of models available

    Returns an Altair line chart with one line per model.
    """

    pdf = df.loc[df.group == group]

    # Random sampling by code_name.  Ids are drawn from the models of *this*
    # group in *this* frame, without replacement, so `sample` is the number of
    # lines actually plotted (the original drew from a notebook global across
    # all groups, with replacement, and then filtered).
    if sample is not None:
        candidates = pdf.code_name.unique()
        n_pick = min(sample, len(candidates))
        ids = np.random.choice(candidates, n_pick, replace=False)
        pdf = pdf.loc[pdf.code_name.isin(ids)]

    # Plot: `detail` separates the lines by model without adding a legend.
    return (
        alt.Chart(pdf)
        .mark_line()
        .encode(
            x=alt.X("HF_INC", scale=alt.Scale(domain=(0, 1))),
            y=alt.Y("NW_UN", scale=alt.Scale(domain=(0, 1))),
            color="group",
            detail="code_name",
            opacity=alt.value(0.9),
        )
    )


# Layer a 30-model sample request from each performance group into one chart.
easy_plotwnw(df_wnw_group, "Low", 30) + easy_plotwnw(
    df_wnw_group, "Mid", 30
) + easy_plotwnw(df_wnw_group, "High", 30)

# Defining typically developing readers
- Since we know one control parameter can offset the detrimental effect of another control parameter
    - The same "good" behavioral outcome is not necessarily "typical" in some sense...
    - i.e., we need to differentiate TD from "compensated" DD
    - This justifies using the Part I & II control space as one of the filtering criteria
- TD readers develop at a reasonable pace
    - Not too fast:  <Threshold_low at early epoch (t_low)
    - Not too slow:  >Threshold_hi at late epoch (t_hi)

### Step-by-step details
Done:

1. Load all part III data (df)
2. Add origin (just looks a little bit better)
3. Average within cell in control space (sdf)

New:

4. Select by criteria
5. Plot control space and developmental space (over epoch)

In [None]:
class Select_Model:
    """Helper class for defining typically developing (TD) models.

    I: Selection:
    1. Simple 2 points accuracy filter: mean accuracy over ``conds`` must be
       below ``threshold_low`` at epoch ``t_low`` and above ``threshold_hi``
       at epoch ``t_hi``.
    2. Optional control space filter (``within_part1``): ``p_noise`` at most
       ``max_p_noise`` and ``hidden_units`` at least ``min_hidden_units``.

    II: Plotting:
    1. Where the selected models are in the control space
    2. Their average performance over epochs

    The full rows of the selected models are stored in ``selected_df``.
    """

    def __init__(
        self,
        df,
        threshold_low,
        threshold_hi,
        t_low,
        t_hi,
        conds,
        within_part1,
        max_p_noise=4,
        min_hidden_units=50,
    ):
        """Run the selection and store the result in ``self.selected_df``.

        df: long-format data with code_name, hidden_units, p_noise,
            learning_rate, cond, epoch and score columns
        threshold_low: accuracy must be below this at ``t_low``
        threshold_hi: accuracy must be above this at ``t_hi``
        t_low, t_hi: the two epochs at which accuracy is checked
        conds: conditions whose scores are averaged for the check
        within_part1: if true, also apply the control space filter
        max_p_noise, min_hidden_units: bounds of the control space filter

        Raises ValueError (from pivot_to_wide) if the two epochs coincide or
        have no rows in ``df`` for the given conditions.
        """
        self.df = df
        self.threshold_low = threshold_low
        self.threshold_hi = threshold_hi
        self.t_low = t_low
        self.t_hi = t_hi
        self.conds = conds
        self.within_part1 = within_part1
        self.max_p_noise = max_p_noise
        self.min_hidden_units = min_hidden_units

        wide = self.pivot_to_wide(self.df, self.t_low, self.t_hi, self.conds)

        # Not too fast early on, and good enough late in training.
        keep = (wide.t_low < self.threshold_low) & (wide.t_hi > self.threshold_hi)
        if within_part1:
            keep &= (wide.p_noise <= max_p_noise) & (
                wide.hidden_units >= min_hidden_units
            )
        selected_names = wide.loc[keep, "code_name"]

        # Create full dataframe of selected models.  reset_index() keeps the
        # previous row labels as an "index" column, as before.
        self.selected_df = (
            self.df.loc[self.df.code_name.isin(selected_names)]
            .sort_values(by=["code_name", "cond"])
            .reset_index()
        )

    def pivot_to_wide(self, df, t_low, t_hi, conds):
        """Create a pivot table with each model's t_low and t_hi scores as columns.

        df: input data
        t_low: epoch used in applying threshold_low
        t_hi : epoch used in applying threshold_hi
        conds: scores are averaged across these conditions

        Returns a frame with columns code_name, hidden_units, p_noise,
        learning_rate, t_low, t_hi.

        Raises ValueError if ``t_low == t_hi`` or if either epoch has no rows
        in ``df`` for the requested conditions.
        """
        if t_low == t_hi:
            raise ValueError("t_low and t_hi must be two different epochs")

        tmp = df.loc[(df.epoch.isin([t_low, t_hi])) & df.cond.isin(conds)]

        index_names = [
            "code_name",
            "hidden_units",
            "p_noise",
            "learning_rate",
        ]

        pvt = tmp.pivot_table(
            index=index_names, columns="epoch", values="score",
        ).reset_index()

        missing = [t for t in (t_low, t_hi) if t not in pvt.columns]
        if missing:
            raise ValueError(f"No rows found at epoch(s) {missing} for conds {conds}")

        # Label the epoch columns by value rather than by position, so the
        # result is right even when t_low is numerically larger than t_hi.
        pvt = pvt.rename(columns={t_low: "t_low", t_hi: "t_hi"})
        return pvt[index_names + ["t_low", "t_hi"]].rename_axis(columns=None)

    def plot_control_space(self):
        """Plot selected models at control space"""

        control_space = (
            alt.Chart(self.selected_df)
            .mark_rect()
            .encode(
                x="p_noise:O",
                y=alt.Y("hidden_units:O", sort="descending"),
                column=alt.Column("learning_rate:O", sort="descending"),
            )
        )
        return control_space

    def plot_mean_development(self):
        """Plot the mean development (with SD band) of all selected models"""

        development_space_sd = (
            alt.Chart(self.selected_df)
            .mark_errorband(extent="stdev")
            .encode(
                y=alt.Y("score:Q", scale=alt.Scale(domain=(0, 1))),
                x="epoch:Q",
                color="cond:N",
            )
            .properties(
                title="Developmental space: Accuracy in each condition over epoch"
            )
        )

        # Reuse the band's spec for the mean line, swapping the mark and y.
        development_space_mean = development_space_sd.mark_line().encode(
            y="mean(score):Q"
        )
        return development_space_mean + development_space_sd

    def plot(self):
        """Stack the control space plot above the mean development plot"""
        return self.plot_control_space() & self.plot_mean_development()

In [None]:
# Example selection on the cell-averaged data (sdf): mean accuracy over the
# four listed conditions must be below 0.5 at epoch 0.05 and above 0.9 at
# epoch 0.8, with the control space filter switched on.
test = Select_Model(
    sdf,
    t_low=0.05,
    threshold_low=0.5,
    t_hi=0.8,
    threshold_hi=0.9,
    conds=["HF_INC", "LF_INC", "HF_CON", "LF_CON"],
    within_part1=True,
)
test.plot()