# Individual differences (part 3)

In [None]:
%load_ext lab_black
import pandas as pd
import numpy as np
import altair as alt
import helper
from altair.expr import datum

# Turn off Altair's row-count limit for chart data; the simulation tables
# below are handed to alt.Chart whole.
alt.data_transformers.disable_max_rows()

### Load new (Sept 30, 2020) combined and even grid dataset 525

In [None]:
# Full simulation table; the baseline (reference) models are selected from it below.
df_all = helper.parse_from_file("../sims/1520_sims.csv")
# Hyper-parameter column names, passed to helper.count_grid as the grid to count over.
hpar = ["hidden_units", "cleanup_units", "p_noise", "learning_rate"]

In [None]:
# Baseline group: restrict the full table to the 3x3x3 block of control
# settings listed here. `baseline.df` is later passed to Select_RD as the
# reference (td_df) table.
baseline = helper.Select_Model(df_all)
baseline.select_by_control(
    hidden_units=[100, 150, 200], p_noise=[1, 2, 3], learning_rate=[0.004, 0.006, 0.008]
)
# Write the baseline overview plot to disk.
baseline.plot("Baseline: Middle control space 3x3x3").save("baseline.html")

### Load Part3 Datafile (n=525)

In [None]:
# Part-3 simulation table; this is the `df` analysed by every Select_RD below.
df = helper.parse_from_file("../sims/part3_525.csv")
# Save the per-grid-cell model count over the hyper-parameter columns in `hpar`.
# NOTE(review): count_grid presumably returns an Altair chart (it exposes .save) -- confirm in helper.
helper.count_grid(df, hpar).save("count_models_525.html")

### Mang

In [None]:
class Select_RD(helper.Select_Model):
    """Score models against a reference (TD) group and flag them as RD or TD.

    On construction the class:

    1. summarises the reference table ``td_df`` into a per-epoch mean and SD
       of the model-level score (``td_stat``);
    2. averages ``df`` over the selected conditions per model and epoch,
       converts each score to a z-score against ``td_stat`` and adds one 0/1
       group column per cutoff (``cadf``);
    3. melts the group columns into long format for the interactive plot
       (``mdf``).

    NOTE(review): ``helper.Select_Model.__init__`` is intentionally not called
    here (the original did not call it either); the plotting helpers used in
    ``plot_bundle`` are assumed to be inherited and to need only ``self.df``
    -- confirm against ``helper``.
    """

    # Conditions averaged when the caller does not pass ``include_conds``.
    _DEFAULT_CONDS = ("HF_CON", "HF_INC", "LF_CON", "LF_INC")

    # RD cutoffs in tenths of an SD below the TD mean (10 -> -1.0, ..., 30 -> -3.0).
    # Drives both the ``group_XX`` columns and the melt, so they cannot drift apart.
    _CUTOFF_TENTHS = tuple(range(10, 31))

    # Identifier columns kept as-is when melting the group columns.
    _ID_VARS = (
        "code_name",
        "epoch",
        "hidden_units",
        "cleanup_units",
        "p_noise",
        "learning_rate",
    )

    def __init__(self, df, td_df, include_conds=None):
        """Build the TD statistics and the derived tables.

        Args:
            df: table of the models to classify (needs ``code_name``,
                ``epoch``, ``cond``, ``score`` and the hyper-parameter columns).
            td_df: table of the reference models (same layout).
            include_conds: conditions to average over; defaults to the four
                word conditions HF_CON, HF_INC, LF_CON, LF_INC.
        """
        # None sentinel instead of a list literal: a mutable default would be
        # shared by every instance created without this argument.
        if include_conds is None:
            include_conds = list(self._DEFAULT_CONDS)

        self.include_conds = include_conds
        self.df = df
        self.td_df = td_df
        self.td_stat = self.get_stat()
        self.cadf = self.make_condition_averaged_df()
        self.mdf = self.melt_df(self.cadf)

    def _group_columns(self):
        """Return the ``group_XX`` column names, one per cutoff."""
        return [f"group_{tenths}" for tenths in self._CUTOFF_TENTHS]

    def get_stat(self):
        """Return the per-epoch mean and SD of the reference group's score.

        Scores are first averaged over the included conditions within each
        model and epoch, then summarised across models per epoch.

        Returns:
            dict in ``DataFrame.to_dict()`` layout with keys ``"epoch"``,
            ``"mean"`` and ``"std"``; each maps the positional row index
            (0, 1, ...) of the epoch-sorted table to the value.
        """
        td = self.td_df.loc[self.td_df.cond.isin(self.include_conds)]

        # Select ``score`` before aggregating: averaging the whole frame
        # fails on the string columns under pandas >= 2.0.
        per_model = td.groupby(["code_name", "epoch"])["score"].mean().reset_index()

        return (
            per_model.groupby("epoch")["score"].agg(["mean", "std"]).reset_index()
        ).to_dict()

    def make_condition_averaged_df(self):
        """Return the condition-averaged table with z-scores and group flags.

        One row per model and epoch (restricted to the epochs kept by
        ``reduce_epoch_resolution``), with ``z_deviance`` and one
        ``group_XX`` column per cutoff, where 0 = RD and 1 = TD.
        """
        cadf = (
            self.df.loc[self.df.cond.isin(self.include_conds)]
            .groupby(["code_name", "epoch"])
            # numeric_only keeps the hyper-parameter means and drops ``cond``,
            # which newer pandas refuses to average.
            .mean(numeric_only=True)
            .reset_index()
        )
        cadf["learning_rate"] = cadf.learning_rate.round(4)

        # Position of each epoch in ``td_stat``: epoch * 100 up to 0.1, then
        # epoch * 10 + 9 (0.2 -> 11, ..., 1.0 -> 19). Rounded before the int
        # cast so float noise (e.g. 0.07 * 100 == 7.000000000000001) cannot
        # shift an index.
        # NOTE(review): this assumes the reference table's epochs line up with
        # these positions -- confirm against the simulation output.
        cadf["epoch_idx"] = (
            np.where(cadf.epoch <= 0.1, cadf.epoch * 100, cadf.epoch * 10 + 9)
            .round()
            .astype(int)
        )
        cadf["z_deviance"] = cadf.apply(self.calcuate_z_deviance, axis=1)

        # One flag per cutoff: 1 (TD) when the z-score is above -cutoff, else 0 (RD).
        for tenths in self._CUTOFF_TENTHS:
            cadf[f"group_{tenths}"] = 1 * (cadf.z_deviance > -tenths / 10)

        return self.reduce_epoch_resolution(cadf)

    def plot_heatmap(self, var):
        """Return a faceted heatmap of the mean of ``var`` over the control space.

        Cells are p_noise x hidden_units, faceted by learning rate (rows) and
        epoch (columns). The colour domain is (-5, 5) for ``"z_deviance"``
        and (0, 1) for any other variable.
        """
        domain = (-5, 5) if var == "z_deviance" else (0, 1)

        return (
            alt.Chart(self.cadf)
            .mark_rect()
            .encode(
                x="p_noise:O",
                y=alt.Y("hidden_units:O", sort="descending"),
                # alt.Row is the channel class that belongs to ``row=``.
                row=alt.Row("learning_rate:O", sort="descending"),
                column="epoch:O",
                color=alt.Color(
                    f"mean({var})",
                    scale=alt.Scale(domain=domain, scheme="redyellowgreen"),
                ),
                tooltip=["mean(z_deviance)", "mean(score)"],
            )
        )

    def reduce_epoch_resolution(self, df):
        """Keep only the rows of ``df`` at the ten epochs used for plotting."""
        sel_epoch = [0.01, 0.03, 0.05, 0.07, 0.09, 0.2, 0.4, 0.6, 0.8, 1.0]
        return df.loc[df.epoch.isin(sel_epoch)]

    def melt_df(self, df):
        """Return ``df`` in long format with one row per model, epoch and cutoff.

        The ``group_XX`` columns become ``variable``/``value`` pairs, and a
        numeric ``cutoff`` column (1.0 ... 3.0) is parsed from the last two
        characters of the column name.
        """
        mdf = df.melt(id_vars=list(self._ID_VARS), value_vars=self._group_columns())
        mdf["cutoff"] = mdf.variable.str[-2:].astype(float) / 10
        return mdf

    def calcuate_z_deviance(self, row):
        """Return the z-score of ``row["score"]`` against the TD group.

        The TD mean and SD are looked up in ``td_stat`` by ``row["epoch_idx"]``;
        a missing index raises ``KeyError``.
        """
        m = self.td_stat["mean"][row["epoch_idx"]]
        sd = self.td_stat["std"][row["epoch_idx"]]

        # Avoid zero division
        if sd == 0:
            sd = 1e-6

        return (row["score"] - m) / sd

    def get_acc_cut(self, epoch, xsd, cond=None):
        """Return the accuracy cutoff ``xsd`` SDs below the TD mean.

        Args:
            epoch: list of epochs over which each TD model's score is averaged.
            xsd: how many SDs below the TD mean the cutoff sits.
            cond: conditions to include; ``None`` means no condition filter.

        Returns:
            ``mean - xsd * std`` of the per-model mean scores in ``td_df``.
        """
        keep = self.td_df.epoch.isin(epoch)
        if cond is not None:
            keep = keep & self.td_df.cond.isin(cond)

        # Select ``score`` before averaging so string columns are never averaged.
        model_means = self.td_df.loc[keep].groupby("code_name")["score"].mean()
        stat = model_means.agg(["mean", "std"])
        return stat["mean"] - xsd * stat["std"]

    def select_by_relative_sd(self, epoch, xsd, cond=None):
        """Keep only the models scoring below the TD cutoff.

        A model is kept when its mean score over ``epoch`` (and ``cond``, if
        given) is lower than ``get_acc_cut(epoch, xsd, cond)``. ``self.df`` is
        replaced by the filtered table and both derived tables are rebuilt.
        """
        keep = self.df.epoch.isin(epoch)
        if cond is not None:
            keep = keep & self.df.cond.isin(cond)

        model_means = (
            self.df.loc[keep].groupby("code_name")["score"].mean().reset_index()
        )
        sel = model_means.loc[model_means.score < self.get_acc_cut(epoch, xsd, cond)]
        self.df = self.df.loc[self.df.code_name.isin(sel["code_name"])]

        # Rebuild both derived tables; refreshing only ``cadf`` would leave
        # the interactive heatmap (which reads ``mdf``) on the unfiltered models.
        self.cadf = self.make_condition_averaged_df()
        self.mdf = self.melt_df(self.cadf)

    def plot_bundle(self, baseline_model):
        """Return the combined figure, with ``baseline_model`` as reference group.

        Stacks the control-space plot, three development/performance panels
        (this selection layered over the baseline) and the z-score heatmap.
        """
        dev_cond = (
            self.plot_mean_development(show_sd=False)
            + baseline_model.plot_mean_development(show_sd=True)
        ).properties(title="Each condition")

        dev_mean = (
            self.plot_all_cond_mean(show_sd=False)
            + baseline_model.plot_all_cond_mean(show_sd=True)
        ).properties(title="Mean in all conditions")

        wnw_mean = (
            self.plot_wnw(mean=True) + baseline_model.plot_wnw(mean=True)
        ).properties(title="Word vs. NW")

        return (
            self.plot_control_space()
            & (dev_cond | dev_mean | wnw_mean)
            & self.plot_heatmap("z_deviance")
        ).properties(title=self.stat_header())

    def plot_interactive_group_heatmap(self):
        """Return a heatmap of the TD proportion with a slider over the cutoff.

        Each cell shows the mean 0/1 group flag of the models in that control
        setting and epoch, at the cutoff currently chosen on the slider.
        """
        df = (
            self.mdf.groupby(
                ["hidden_units", "p_noise", "learning_rate", "epoch", "cutoff"]
            )
            # ``mdf`` still carries code_name/variable strings; average numbers only.
            .mean(numeric_only=True)
            .reset_index()
        )

        # Slider bounds follow the cutoffs that actually exist in ``mdf``.
        slider = alt.binding_range(
            min=min(self._CUTOFF_TENTHS) / 10,
            max=max(self._CUTOFF_TENTHS) / 10,
            step=0.1,
            name="cutoff:",
        )
        selector = alt.selection_single(
            name="SelectorName", fields=["cutoff"], bind=slider, init={"cutoff": 1.0}
        )

        return (
            alt.Chart(df)
            .mark_rect()
            .encode(
                x="p_noise:O",
                y=alt.Y("hidden_units:O", sort="descending"),
                row=alt.Row("learning_rate:O", sort="descending"),
                column="epoch:O",
                color=alt.Color(
                    "value", scale=alt.Scale(domain=(0, 1), scheme="redyellowgreen"),
                ),
            )
            .add_selection(selector)
            .transform_filter(selector)
        )

### Instantiate RD analysis class

In [None]:
# Three RD analyses against the same baseline table, differing only in which
# conditions are averaged: every condition present in df, the four word
# conditions, and HF_INC alone.
rd_all_cond = Select_RD(df, baseline.df, include_conds=df.cond.unique())
rd_word = Select_RD(
    df, baseline.df, include_conds=["HF_INC", "LF_INC", "HF_CON", "LF_CON"]
)
rd_hfinc = Select_RD(df, baseline.df, include_conds=["HF_INC"])

### Interactive group heatmap

In [None]:
# Save one interactive (cutoff-slider) heatmap per analysis.
# NOTE(review): "interative" in the file names is a typo of "interactive";
# left unchanged because other files may link to these outputs.
rd_all_cond.plot_interactive_group_heatmap().save(
    "interative_group_heatmap_all_conds.html"
)
rd_word.plot_interactive_group_heatmap().save("interative_group_heatmap_all_words.html")
rd_hfinc.plot_interactive_group_heatmap().save("interative_group_heatmap_hfinc.html")

### Other static heatmaps

In [None]:
# For each analysis, save two static heatmaps: the raw mean score and the
# z-score deviance from the baseline.
rd_all_cond.plot_heatmap("score").save("heatmap_raw_all_conds.html")
rd_all_cond.plot_heatmap("z_deviance").save("heatmap_z_all_conds.html")

rd_word.plot_heatmap("score").save("heatmap_raw_all_words.html")
rd_word.plot_heatmap("z_deviance").save("heatmap_z_all_words.html")

rd_hfinc.plot_heatmap("score").save("heatmap_raw_hfinc.html")
rd_hfinc.plot_heatmap("z_deviance").save("heatmap_z_hfinc.html")

### Old interactive plot

In [None]:
# Control parameters that identify a cell of the control space.
variates = ["hidden_units", "p_noise", "learning_rate"]

# Keep only the word (HF_INC) and nonword (NW_UN) conditions and the columns
# needed for the word-vs-nonword comparison.
df_wnw = df.loc[
    (df.cond.isin(["HF_INC", "NW_UN"])),
    variates + ["code_name", "epoch", "cond", "score"],
]

# One row per model and epoch, with one score column per condition.
df_wnw = df_wnw.pivot_table(
    index=variates + ["epoch", "code_name"], columns="cond"
).reset_index()

# Flatten the two-level column index produced by the pivot:
# ("score", "HF_INC") -> "scoreHF_INC", ("epoch", "") -> "epoch".
df_wnw.columns = ["".join(c).strip() for c in df_wnw.columns.values]
df_wnw.rename(
    columns={"scoreHF_INC": "word_acc", "scoreNW_UN": "nonword_acc"}, inplace=True,
)

# Positive values mean the model reads words better than nonwords.
df_wnw["word_advantage"] = df_wnw.word_acc - df_wnw.nonword_acc
df_wnw

In [None]:
# Click selection on model name; it drives every linked panel below.
# empty="none" means nothing is shown until a model is selected, and `init`
# pre-selects one model so the dashboard opens with content.
select_control_space = alt.selection(
    type="multi",
    on="click",
    empty="none",
    fields=["code_name"],
    init=[{"code_name": "n0_h100_l0.01"}],
)

# Control space: one cell per control setting, coloured by word accuracy at
# the last epoch. Clicking a cell selects that model.
df_overview = df_wnw.loc[df_wnw.epoch == df_wnw.epoch.max()]

control_space = (
    alt.Chart(df_overview)
    .mark_rect()
    .encode(
        x="p_noise:O",
        y=alt.Y("hidden_units:O", sort="descending"),
        column=alt.Column("learning_rate:O", sort="descending"),
        color=alt.Color(
            "word_acc", scale=alt.Scale(scheme="redyellowgreen", domain=(0, 1))
        ),
        opacity=alt.condition(select_control_space, alt.value(1), alt.value(0.3)),
        tooltip=["code_name", "word_acc", "nonword_acc", "word_advantage"],
    )
    .add_selection(select_control_space)
    .properties(title="Select a control parameter setting:")
)

# Development space: accuracy over epoch, one line per condition, for the
# selected model(s).
# NOTE: this sorts the shared `df` in place, so every later use of `df` sees
# the sorted order.
df.sort_values(by=["code_name", "cond"], inplace=True)

development_space = (
    alt.Chart(df)
    .mark_line()
    .encode(
        y=alt.Y("score:Q", scale=alt.Scale(domain=(0, 1))),
        x="epoch:Q",
        color="cond:N",
        tooltip=["code_name", "epoch", "score"],
    )
    .transform_filter(select_control_space)
    .properties(title="Developmental space: Accuracy in each condition over epoch")
)

# Performance space: nonword accuracy against word accuracy for the selected
# model(s), drawn over a y = x reference diagonal.
wnw_line = (
    alt.Chart(df_wnw)
    .mark_line(color="black")
    .encode(
        y=alt.Y("nonword_acc:Q", scale=alt.Scale(domain=(0, 1))),
        x=alt.X("word_acc:Q", scale=alt.Scale(domain=(0, 1))),
        tooltip=["code_name", "epoch", "word_acc", "nonword_acc"],
    )
    .transform_filter(select_control_space)
)

diagonal = (
    alt.Chart(pd.DataFrame({"x": [0, 1], "y": [0, 1]}))
    .mark_line(color="#D3D3D3")
    .encode(
        x=alt.X("x", axis=alt.Axis(title="word")),
        # The y channel takes alt.Y; the original built it with alt.X.
        y=alt.Y("y", axis=alt.Axis(title="nonword")),
    )
)

performance_space = (diagonal + wnw_line).properties(
    title="Performance space: Nonword accuracy vs. Word accuracy"
)

# NOTE(review): empty placeholder chart, not used by the dashboard below;
# kept in case a later cell refers to it.
dev_heat = alt.Chart()


# Merge dashboard
dashboard = control_space & (development_space | performance_space)
dashboard.save("dashboard.html")