# Individual differences simulation paper part III

In [None]:
%load_ext lab_black
import helper
import pandas as pd
import numpy as np
import altair as alt
from altair.expr import datum

# Turn off Altair's maximum-row check so the full simulation table can be charted.
alt.data_transformers.disable_max_rows()
# Load the simulation results via the project helper.
# NOTE(review): presumably returns a long-format DataFrame with the columns used
# below (code_name, hidden_units, p_noise, learning_rate, cond, epoch, score,
# rank_pc) -- confirm against helper.parse_from_file.
df = helper.parse_from_file("../sims/1520_sims.csv")

# Defining typically developing readers

### Goal: Somewhat meaningful TD (acknowledging it is arbitrary)

### Strategy:

1. Filtering on control space
    - Since we know one control parameter can offset the detrimental effect from another control parameter
    - "good" behavior outcome is not equal to "typical" in some cases... 
    - i.e., we need to differentiate TD from "compensated" DD
    - Which justifies using the Part I & II control space as one of the filtering criteria
    - (follow up) n1250 - lowest level - highest level
    
2. Filtering on development (Accuracy based)
    - TD develop at a reasonable pace
    - Not too fast:  <Threshold_low at early epoch (t_low)
    - Not too slow:  >Threshold_hi at late epoch (t_hi)
3. Filtering on rank
    - Global mean all cond, all epoch
    - Max min rank within each class
    - Maybe start with smaller bin size... 
    
    
### Meeting notes (Sep 11, 2020)
- Not to average within cell. (OK)
- Table for descriptives of selected "TD" (OK)
- Not worse level in 1250 (try) (OK)
- Add plot (mean of all condition only (not epoch)) (OK)
- Control space descriptives (OK)
- 3-4 solution per set (OK)



### Master Select Model Class

In [None]:
class Select_Model:
    """Helper class for defining typically developing (TD) models.

    Wraps a results table and narrows it down, in place, through successive
    filters. The methods read these columns from the table: ``code_name``,
    ``hidden_units``, ``p_noise``, ``learning_rate``, ``cond``, ``epoch``,
    ``score`` and ``rank_pc``.

    I: Selection:
    1. Control space filter
    2. Rank filter
    3. Accuracy filter (developmental)

    II: Plotting:
    1. Where are the selected models in the control space
    2. How is their average performance (in each cond / mean of all conds)
    3. Some basic descriptives in title
    """

    def __init__(self, df):
        """Store the table to filter; every ``select_*`` call replaces ``self.df``."""
        self.df = df

    def count_model(self):
        """Return the number of distinct ``code_name`` values currently selected."""
        return len(self.df.code_name.unique())

    def _report_selection(self, n_pre):
        """Print how many models remain compared with ``n_pre`` before filtering."""
        n_post = self.count_model()
        print(f"Selected {n_post} models from the original {n_pre} models")

    # Selection related functions

    def select_by_performance(self, threshold_low, threshold_hi, t_low, t_hi):
        """Keep models that develop neither too fast nor too slowly.

        A model is kept when its score at epoch ``t_low`` is strictly below
        ``threshold_low`` AND its score at epoch ``t_hi`` is strictly above
        ``threshold_hi``. Scores are the per-model means produced by
        ``pivot_to_wide``.

        Raises:
            ValueError: if either epoch has no rows (see ``pivot_to_wide``).
        """
        n_pre = self.count_model()
        wide = self.pivot_to_wide(self.df, t_low, t_hi)

        # Models passing both developmental criteria
        passing = (wide.t_low < threshold_low) & (wide.t_hi > threshold_hi)
        keep = wide.loc[passing, "code_name"]

        # Create full dataframe of selected models. drop=True so the old row
        # labels are not injected as a spurious "index" column, which would
        # otherwise leak into later group means and clash on repeated calls.
        self.df = (
            self.df.loc[self.df.code_name.isin(keep)]
            .sort_values(by=["code_name", "cond", "epoch"])
            .reset_index(drop=True)
        )

        self._report_selection(n_pre)

    def select_by_control(self, hidden_units=None, p_noise=None, learning_rate=None):
        """Keep rows whose control parameters are in the given collections.

        Each argument is a list-like of allowed values for the column of the
        same name; ``None`` leaves that column unfiltered.
        """
        n_pre = self.count_model()

        allowed = {
            "hidden_units": hidden_units,
            "p_noise": p_noise,
            "learning_rate": learning_rate,
        }
        for column, values in allowed.items():
            if values is not None:
                self.df = self.df.loc[self.df[column].isin(values)]

        self._report_selection(n_pre)

    def select_by_rankpc(self, minpc, maxpc):
        """Keep rows with ``minpc <= rank_pc <= maxpc`` (both ends inclusive)."""
        n_pre = self.count_model()
        self.df = self.df.loc[(self.df.rank_pc >= minpc) & (self.df.rank_pc <= maxpc)]
        self._report_selection(n_pre)

    def select_by_cond(self, conds):
        """Keep rows whose ``cond`` is one of ``conds``."""
        n_pre = self.count_model()
        self.df = self.df.loc[self.df.cond.isin(conds)]
        self._report_selection(n_pre)

    # Descriptives related functions

    @staticmethod
    def _format_desc(desc):
        """Format mean / std / min / max of a ``describe()`` result for plot titles."""
        return f"M:{desc['mean']:.3f} SD: {desc['std']:.3f} Min: {desc['min']:.3f} Max: {desc['max']:.3f}"

    def get_rankpc_desc(self):
        """Describe the per-model mean ``rank_pc`` across the selected models."""
        # Select the column before averaging so non-numeric columns (e.g. cond)
        # never reach mean(), which raises on recent pandas versions.
        per_model = self.df.groupby("code_name")["rank_pc"].mean()
        return self._format_desc(per_model.describe())

    def get_acc_desc(self):
        """Describe the per-model mean ``score`` across the selected models."""
        per_model = self.df.groupby("code_name")["score"].mean()
        return self._format_desc(per_model.describe())

    # Plotting related functions

    def pivot_to_wide(self, df, t_low, t_hi):
        """Create a pivot table with each model's t_low and t_hi scores as columns.

        df: input dataframe
        t_low: epoch used in applying threshold_low
        t_hi : epoch used in applying threshold_hi

        Returns a dataframe with columns code_name, hidden_units, p_noise,
        learning_rate, t_low, t_hi. Each score is the mean over that model's
        rows at the epoch (pivot_table's default aggregation).

        Raises:
            ValueError: if no rows exist for ``t_low`` or ``t_hi``.
        """
        tmp = df.loc[(df.epoch.isin([t_low, t_hi]))]

        index_names = [
            "code_name",
            "hidden_units",
            "p_noise",
            "learning_rate",
        ]

        pvt = tmp.pivot_table(index=index_names, columns="epoch", values="score")

        missing = [t for t in (t_low, t_hi) if t not in pvt.columns]
        if missing:
            raise ValueError(f"No score data found for epoch(s): {missing}")

        # Pick the columns by epoch label rather than by position: pivot_table
        # sorts its columns, so positional renaming would swap the labels
        # whenever t_low > t_hi and break when t_low == t_hi.
        wide = pvt[[t_low, t_hi]].copy()
        wide.columns = ["t_low", "t_hi"]
        return wide.reset_index()

    def plot_control_space(self):
        """Plot selected models at control space"""

        # One row per model; numeric_only skips string columns such as cond.
        pdf = (
            self.df.groupby("code_name")
            .mean(numeric_only=True)
            .round(3)
            .reset_index()
        )

        control_space = (
            alt.Chart(pdf)
            .mark_rect()
            .encode(
                x="p_noise:O",
                y=alt.Y("hidden_units:O", sort="descending"),
                column=alt.Column("learning_rate:O", sort="descending"),
                color="count(code_name)",
            )
        )
        return control_space

    def plot_mean_development(self):
        """Plot the mean development of all selected models"""

        development_space_sd = (
            alt.Chart(self.df)
            .mark_errorband(extent="stdev")
            .encode(
                y=alt.Y("score:Q", scale=alt.Scale(domain=(0, 1))),
                x="epoch:Q",
                color="cond:N",
            )
            .properties(
                title="Developmental space: Accuracy in each condition over epoch"
            )
        )

        # Same encoding as the error band, redrawn as a line through the mean
        development_space_mean = development_space_sd.mark_line().encode(
            y="mean(score):Q"
        )
        return development_space_mean + development_space_sd

    def plot_all_cond_mean(self):
        """Plot the average accuracy in all conditions over epoch of all selected models"""
        group_var = ["code_name", "hidden_units", "p_noise", "learning_rate", "epoch"]
        # Collapse over cond: one mean score per model per epoch.
        pdf = self.df.groupby(group_var).mean(numeric_only=True).reset_index()

        dev_all_sd = (
            alt.Chart(pdf)
            .mark_errorband(extent="stdev")
            .encode(y=alt.Y("score:Q", scale=alt.Scale(domain=(0, 1))), x="epoch:Q")
            .properties(
                title="Developmental space: Mean Accuracy in all conditions over epoch"
            )
        )

        dev_all_m = dev_all_sd.mark_line().encode(y="mean(score):Q")
        return dev_all_m + dev_all_sd

    def plot(self, title=None):
        """Plot the control space above the two development charts.

        title: optional heading; when given it is shown with the model count,
        followed by the rank and accuracy descriptives.
        """

        n = self.count_model()

        t = [
            "Grand mean rank: " + self.get_rankpc_desc(),
            "Grand mean acc  : " + self.get_acc_desc(),
        ]

        if title is not None:
            t = [title + f" (n={n})"] + t

        all_plot = (
            self.plot_control_space()
            & (self.plot_mean_development() | self.plot_all_cond_mean())
        ).properties(title=t)

        return all_plot

### Results: Control space selection

In [None]:
# C1: keep only the models at a single control-space point
# (hidden_units=150, p_noise=2, learning_rate=0.006), then show the summary plot.
c1 = Select_Model(df)
c1.select_by_control(hidden_units=[150], p_noise=[2], learning_rate=[0.006])
c1.plot("C1: Dead center ")

In [None]:
# C2: keep models whose three control parameters each fall within three listed levels.
c2 = Select_Model(df)
c2.select_by_control(
    hidden_units=[100, 150, 200], p_noise=[1, 2, 3], learning_rate=[0.004, 0.006, 0.008]
)
c2.plot("C2: Middle levels")

In [None]:
# C3: the C2 levels plus hidden_units=250, p_noise=0 and learning_rate=0.01.
c3 = Select_Model(df)
c3.select_by_control(
    hidden_units=[100, 150, 200, 250],
    p_noise=[0, 1, 2, 3],
    learning_rate=[0.004, 0.006, 0.008, 0.01],
)
c3.plot("C3: Better")

In [None]:
# C4: the C2 levels plus hidden_units=50, p_noise=4 and learning_rate=0.002.
c4 = Select_Model(df)
c4.select_by_control(
    hidden_units=[50, 100, 150, 200],
    p_noise=[1, 2, 3, 4],
    learning_rate=[0.002, 0.004, 0.006, 0.008],
)
c4.plot("C4: Worse")

### Results: Rank selection

In [None]:
# R1: keep rows whose rank_pc lies in [0.25, 0.75] (bounds inclusive), then plot.
r1 = Select_Model(df)
r1.select_by_rankpc(minpc=0.25, maxpc=0.75)
r1.plot("R1: 25-75%")

In [None]:
# R2: keep rows whose rank_pc lies in [0.25, 0.5] (bounds inclusive), then plot.
r2 = Select_Model(df)
r2.select_by_rankpc(minpc=0.25, maxpc=0.5)
r2.plot("R2: 25-50%")

In [None]:
# R3: keep rows whose rank_pc lies in [0.5, 0.75] (bounds inclusive), then plot.
r3 = Select_Model(df)
r3.select_by_rankpc(minpc=0.5, maxpc=0.75)
r3.plot("R3: 50-75%")

### Results: Accuracy selection

In [None]:
# A1: keep models scoring below 0.8 at epoch 0.1 and above 0.9 at epoch 0.6, then plot.
a1 = Select_Model(df)
a1.select_by_performance(t_low=0.1, threshold_low=0.8, t_hi=0.6, threshold_hi=0.9)
a1.plot("A1: Mean-ish")

In [None]:
# A2: same as A1 but with the early-epoch ceiling raised to 0.85.
a2 = Select_Model(df)
a2.select_by_performance(t_low=0.1, threshold_low=0.85, t_hi=0.6, threshold_hi=0.9)
a2.plot("A2: More low-end variance")

In [None]:
# A3: early-epoch ceiling 0.85 (as in A2) and late-epoch floor lowered to 0.85.
a3 = Select_Model(df)
a3.select_by_performance(t_low=0.1, threshold_low=0.85, t_hi=0.6, threshold_hi=0.85)
a3.plot("A3: More hi-end variance")