1. Align with accuracy instead of epoch
- For each network, select the one epoch that is closest to 80% accuracy on PHO
2. Plot individual “network” difference beta over grid
- Taraban : y~lm(freq x cons)
- IMG-HS04 : y~lm(fxcximg)
- Nonword Glushko overall: just acc
3. Big stat model on the entire grid
- Y ~ batch_size  or epsilon check same dimensions or not… 
- y ~ lm/lmer(batch_size  or epsilon * stimprop)  | testset x
4. Also summarize DoL within the same grid [raw, same epoch at 1]
- P: intact, OP, OSP
- S: intact, OS, OPS


# Get merged data

In [None]:
import meta
import os
import pandas as pd
import numpy as np
import altair as alt
from itertools import chain
from tqdm import tqdm
import statsmodels.formula.api as smf
import statsmodels.api as sm
from scipy.stats.mstats import zscore

In [None]:
class Batch:
    """Load and manipulate the evaluation results of one batch of models.

    The batch configuration is read from
    ``models/<batch_name>/batch_config.json``. Evaluation CSVs are mounted into
    ``self.df``, which can then be subset, checkpointed and restored.
    """

    def __init__(self, batch_name: str, tf_root: str = None):
        self.batch_name = batch_name
        # NOTE(review): unlike the eval paths built in get_eval_file_names, this
        # path is not prefixed with tf_root -- confirm that
        # meta.batch_json_to_df resolves it as intended.
        self.json = os.path.join("models", batch_name, "batch_config.json")
        self.tf_root = tf_root if tf_root else "./"
        self.cfg_df = self.parse_batch_config()
        self.code_names = self.cfg_df.code_name.unique().tolist()

        # Dataframe to be loaded
        self.df = None
        self.backup_df = None

    def mount_testset(self, csv: list):
        """Load the eval files named in `csv` for every model and checkpoint them."""
        self.df = self.parse_df(csv)
        self.checkpoint_df()

    def checkpoint_df(self):
        """Make a df checkpoint copy"""
        self.backup_df = self.df.copy()

    def restore_df(self):
        """Restore self.df to the last checkpoint.

        A copy is handed out so that later in-place edits of self.df cannot
        silently change the checkpoint as well.
        """
        self.df = None if self.backup_df is None else self.backup_df.copy()

    def subset_df(
        self,
        code_name: str = None,
        epoch: int = None,
        output_name: str = None,
        timetick: list = None,
        cond: list = None,
        train_task: str = None,
    ):
        """Subset self.df to spec.

        Every argument left as None is ignored. `timetick` and `cond` are
        membership filters; the other arguments are equality filters.
        """
        df = self.df
        if code_name is not None:
            df = df.loc[df.code_name == code_name]
        if epoch is not None:
            df = df.loc[df.epoch == epoch]
        if output_name is not None:
            df = df.loc[df.output_name == output_name]
        if timetick is not None:
            df = df.loc[df.timetick.isin(timetick)]
        if cond is not None:
            df = df.loc[df.cond.isin(cond)]
        if train_task is not None:
            df = df.loc[df.train_task == train_task]
        return df

    def subset_by_epoch_dict(self, sel_epoch: dict):
        """Return a subset of the dataframe using a epoch dictionary.
        args:
            sel_epoch: dictionary of epochs to select with k=code_name, v=epoch
        """
        dfs = [self.subset_df(code_name=k, epoch=v) for k, v in sel_epoch.items()]
        return self.concat_dfs(dfs)

    def parse_batch_config(self):
        """Read the batch config json and return one row of settings per model."""
        df = meta.batch_json_to_df(self.json, tf_root=self.tf_root)
        # Just in case I forgot to change below line in other batches
        assert (
            self.batch_name == "task_effect"
        ), "train_task labels below are hard-coded for the task_effect batch"
        # Caution: this is a hack to get around list type config, only works for this batch
        df["train_task"] = [
            "OP",
            "OS",
            "Triangle",
        ] * 12
        return df[["code_name", "batch_size", "learning_rate", "train_task"]]

    def parse_df(self, csv: list) -> pd.DataFrame:
        """Read the named eval csvs of every model and attach the model settings."""
        files = chain.from_iterable([self.get_eval_file_names(x) for x in csv])
        df = self.merge_from_file_names(files)
        return df.merge(self.cfg_df, on="code_name", how="left")

    def get_eval_file_names(self, csv_name: str) -> list:
        """Return the path of `csv_name` inside the eval folder of every model."""
        return [
            os.path.join(
                self.tf_root, "models", self.batch_name, code_name, "eval", csv_name
            )
            for code_name in self.code_names
        ]

    def find_code_name(self, criteria: dict) -> list:
        """Return the code_names whose settings satisfy every criterion.

        args:
            criteria: dictionary with k=column of self.cfg_df, v=allowed value
                or list of allowed values
        An empty dictionary matches every model.
        """
        # A plain boolean array can be combined any number of times and is a
        # valid .loc indexer (the previous generator-based mask was neither).
        mask = np.ones(len(self.cfg_df), dtype=bool)
        for column, allowed in criteria.items():
            if not pd.api.types.is_list_like(allowed):
                allowed = [allowed]
            mask &= self.cfg_df[column].isin(allowed).to_numpy()

        return self.cfg_df.code_name.loc[mask].tolist()

    def find_epoch_by_acc(self, code_name: str, acc: float) -> int:
        """Return the epoch whose mean accuracy is nearest to `acc` for one model.

        raises:
            ValueError: if self.df has no rows for `code_name`
        """
        df = self.df.loc[self.df.code_name == code_name]
        if df.empty:
            raise ValueError(f"No rows for code_name {code_name!r} in the mounted df")
        # Average only `acc`: averaging the whole frame fails on text columns
        # in pandas >= 2.0.
        mean_acc = df.groupby("epoch")["acc"].mean()
        idx = self.find_nearest(mean_acc, acc)  # Find nearest accuracy
        return mean_acc.index[idx]  # Return epoch

    @staticmethod
    def merge_from_file_names(filenames: list) -> pd.DataFrame:
        """Read every csv in `filenames` and merge them into one dataframe."""
        dfs = [pd.read_csv(f) for f in filenames]
        return Batch.concat_dfs(dfs)

    @staticmethod
    def concat_dfs(dfs: list) -> pd.DataFrame:
        """Return a dataframe from a list of dataframes."""
        return pd.concat(dfs, ignore_index=True)

    @staticmethod
    def find_nearest(array, value):
        """Return the position of the element of `array` nearest to `value`."""
        array = np.asarray(array)
        idx = (np.abs(array - value)).argmin()
        return idx

    def get_acc_based_df(self, acc: float, code_name: str = None) -> pd.DataFrame:
        """Return the rows at the epoch whose mean accuracy is nearest to `acc`.

        args:
            acc: target accuracy
            code_name: model to select; when None, every model in self.df is
                selected at its own nearest epoch
        """
        if code_name is None:
            code_names = self.df.code_name.unique()
        else:
            code_names = [code_name]
        sel_epoch = {x: self.find_epoch_by_acc(x, acc) for x in code_names}
        return self.subset_by_epoch_dict(sel_epoch)


# Instantiate the batch; the constructor parses models/task_effect/batch_config.json
b = Batch("task_effect")


# Find the epoch that is closest to 80% accuracy in each network

- Define by Taraban
- at 8-12 ticks
- Train task: Triangle
- Output at PHO

In [None]:
# The four Taraban conditions used to define the 80% accuracy criterion
sel_conds = [
    "High-frequency exception",
    "Regular control for High-frequency exception",
    "Low-frequency exception",
    "Regular control for Low-frequency exception",
]

# Load the Taraban eval file of every network
b.mount_testset(['taraban_triangle.csv'])

# Keep PHO output at ticks 8-12 of the Triangle-trained networks, then
# checkpoint the filtered frame
b.df = b.subset_df(
    train_task="Triangle",
    output_name="pho",
    timetick=range(8, 13),
    cond=sel_conds,
)
b.checkpoint_df()

In [None]:
# For every network, pick the epoch whose mean accuracy is nearest to 80%
sel_epoch = {
    code_name: b.find_epoch_by_acc(code_name, 0.8)
    for code_name in tqdm(b.df.code_name.unique())
}
print(sel_epoch)

# Keep only the selected epoch of each network
df = b.subset_by_epoch_dict(sel_epoch)

# Label the two Taraban factors from the condition name
df["freq"] = np.where(
    df.cond.isin(
        ["High-frequency exception", "Regular control for High-frequency exception"]
    ),
    "High",
    "Low",
)
df["reg"] = np.where(df.cond.str.startswith("Regular"), "Regular", "Exception")

# Keep the columns needed downstream and export them
df = df[
    [
        'batch_size',
        'learning_rate',
        'code_name',
        'epoch',
        'timetick',
        'freq',
        'reg',
        'word',
        'acc',
        'sse',
    ]
]
df.to_csv(os.path.join('issues', '0_batchsize_lr', 'taraban80.csv'))



## Selection quality

In [None]:

# Aggregate per network and condition inside this cell, so the chart does not
# depend on `mdf`, which is only created in a later cell.
# numeric_only=True keeps the text column `word` out of the mean (averaging it
# raises a TypeError in pandas >= 2.0).
sel_mdf = (
    df.groupby(['batch_size', 'learning_rate', 'code_name', 'freq', 'reg'])
    .mean(numeric_only=True)
    .reset_index()
)

# Mean accuracy, printed in the lower half of each grid cell
acc_txt = alt.Chart(sel_mdf).mark_text(dy=6).encode(
    x='learning_rate:O',
    y=alt.Y('batch_size:O'),
    text=alt.Text('mean(acc):Q', format='.2f'),
).properties(title="Selected epoch and mean accuracy in Taraban testset", width=200, height=200)

# Selected epoch, printed in the upper half of each grid cell
epoch_txt = acc_txt.mark_text(dy=-6).encode(
    text=alt.Text('mean(epoch):Q', format='.0f'),
)

# Background colour = mean accuracy
heatmap = acc_txt.mark_rect().encode(
    color="mean(acc):Q"
)

heatmap + acc_txt + epoch_txt

## Descriptives

In [None]:
# Effect-code both factors as +0.5 / -0.5
df['reg_num'] = df.reg.apply(lambda x: 0.5 if x == 'Regular' else -0.5)
df['freq_num'] = df.freq.apply(lambda x: 0.5 if x == 'High' else -0.5)

# Average per network and condition. numeric_only=True keeps the text column
# `word` out of the mean; averaging it raises a TypeError in pandas >= 2.0.
mdf = (
    df.groupby(['batch_size', 'learning_rate', 'code_name', 'freq', 'reg'])
    .mean(numeric_only=True)
    .reset_index()
)

def plot_taraban(df, metric: str = 'acc'):
    """Plot the mean of `metric` by frequency and regularity over the grid.

    One line per `reg` level, `freq` on a reversed x axis, one row per
    batch_size and one column per learning_rate. The y axis is fixed to
    (0, 1) only when `metric` is "acc".
    """
    if metric == "acc":
        y_scale = alt.Scale(domain=(0, 1))
    else:
        y_scale = alt.Scale()

    lines = alt.Chart(df).mark_line()
    lines = lines.encode(
        x=alt.X("freq:N", scale=alt.Scale(reverse=True)),
        y=alt.Y(f"mean({metric}):Q", scale=y_scale),
        row="batch_size:O",
        column="learning_rate:O",
        color="reg:N",
    )
    return lines.properties(width=150, height=150)


# Mean accuracy by frequency and regularity for every batch_size x learning_rate cell
plot_taraban(mdf, 'acc')

## Inferential statistics on mean accuracy

In [None]:
# Model z-scored mean accuracy with the full factorial of the two
# (z-scored) grid settings and the two effect-coded Taraban factors
m = smf.glm(
    data=mdf,
    formula='zscore(acc) ~ zscore(learning_rate) * zscore(batch_size) * reg_num * freq_num',
).fit()
print(m.summary())

## Visualize beta on grid

In [None]:
def get_taraban_params(df, code_name):
    """Fit the frequency x regularity model for one network and return its betas.

    args:
        df: dataframe with the columns code_name, acc, freq_num and reg_num
        code_name: network to fit
    returns:
        One-row dataframe with one numeric column per model coefficient,
        followed by a `code_name` column.
    """
    net_df = df.loc[df.code_name == code_name]
    fit = smf.glm(
        formula="acc ~ freq_num * reg_num",
        data=net_df,
        family=sm.families.Binomial(),
    ).fit()
    # Transpose the coefficients first and attach the label afterwards:
    # appending the string to the params Series would coerce every
    # coefficient to object dtype.
    return fit.params.to_frame().T.assign(code_name=code_name)


In [None]:
# Get all betas
setting_map = mdf[['code_name', 'batch_size', 'learning_rate']].groupby(['code_name']).mean().reset_index()
params = [get_taraban_params(df, code_name=x) for x in tqdm(df.code_name.unique())]
taraban_beta = pd.concat(params, ignore_index=True)
taraban_beta = taraban_beta.merge(setting_map, on='code_name')
taraban_beta

In [None]:
# Rename the columns by position, then reshape to long format (one row per
# network and coefficient).
# NOTE(review): the last column is named `epsilon` but holds the merged
# `learning_rate` values -- confirm the label is intended.
taraban_beta.columns = [
    'intercept',
    'freq_effect',
    'reg_effect',
    'interactions',
    'code_name',
    'batch_size',
    'epsilon',
]
taraban_beta = pd.melt(
    taraban_beta,
    id_vars=['code_name', 'batch_size', 'epsilon'],
    value_vars=['intercept', 'freq_effect', 'reg_effect', 'interactions'],
)

In [None]:
# One heatmap per coefficient over the grid, on a diverging scale clipped to (-25, 25)
(
    alt.Chart(taraban_beta)
    .mark_rect()
    .encode(
        alt.X('epsilon:O'),
        alt.Y('batch_size:O'),
        alt.Color('value:Q', scale=alt.Scale(domain=(-25, 25), scheme='redblue')),
        alt.Column('variable:N'),
    )
    .properties(width=200, height=200)
)

# Nonword

In [None]:
# Load the Glushko nonword eval files, keep PHO output at ticks 8-12 of the
# Triangle-trained networks, and checkpoint the filtered frame
b.mount_testset(['glushko_triangle.csv'])
b.df = b.subset_df(
    train_task="Triangle",
    output_name="pho",
    timetick=range(8, 13),
)
b.checkpoint_df()

In [None]:
# Reuse the epochs selected on the Taraban set so the same epoch of each network is compared
df = b.subset_by_epoch_dict(sel_epoch)

In [None]:
# Average per network and condition. numeric_only=True keeps any text columns
# out of the mean; averaging them raises a TypeError in pandas >= 2.0.
mdf = (
    df.groupby(['batch_size', 'learning_rate', 'code_name', 'cond'])
    .mean(numeric_only=True)
    .reset_index()
)
# Effect-code the condition: +0.5 for 'Regular', -0.5 for everything else
mdf['cond_num'] = mdf.cond.apply(lambda x: 0.5 if x == 'Regular' else -0.5)

In [None]:
# Accuracy over the grid, one panel per condition, colour scale fixed to (0, 1)
(
    alt.Chart(mdf)
    .mark_rect()
    .encode(
        alt.X('learning_rate:O'),
        alt.Y('batch_size:O'),
        alt.Color('acc:Q', scale=alt.Scale(domain=(0, 1))),
        alt.Column('cond:N'),
    )
    .properties(width=200, height=200)
)

In [None]:
# Model z-scored mean accuracy with the full factorial of the two
# (z-scored) grid settings and the effect-coded condition
m = smf.glm(
    data=mdf,
    formula='zscore(acc) ~ zscore(learning_rate) * zscore(batch_size) * cond_num ',
).fit()
print(m.summary())

# Img-HS04

In [None]:
# Load the IMG (HS04) eval files and keep PHO output at ticks 8-12 of the
# Triangle-trained networks
b.mount_testset(['hs04_img_240_triangle.csv'])
b.df = b.subset_df(
    train_task="Triangle",
    output_name="pho",
    timetick=range(8, 13),
)

In [None]:
# Keep, per network, only the epoch selected on the Taraban set
df = b.subset_by_epoch_dict(sel_epoch)
# Checkpoint the filtered b.df so a later restore_df() returns to this state
b.checkpoint_df()

In [None]:
# Split `cond` on "_" into three columns; assumes every label has the form
# <freq>_<op>_<img> -- TODO confirm against the testset file
df[['freq', 'op', 'img']] = df.cond.str.split('_', expand=True)

In [None]:
# Effect-code the three factors: +0.5 for the named level, -0.5 otherwise
df['freq_num'] = np.where(df.freq == 'hf', 0.5, -0.5)
df['op_num'] = np.where(df.op == 'ls', 0.5, -0.5)
df['img_num'] = np.where(df.img == 'hi', 0.5, -0.5)

In [None]:
# Average per network and condition. numeric_only=True keeps the text columns
# (`freq`, `op`, `img`) out of the mean; averaging them raises a TypeError in
# pandas >= 2.0. The effect-coded *_num columns are constant within a
# condition, so they survive the mean unchanged.
mdf = (
    df.groupby(['batch_size', 'learning_rate', 'code_name', 'cond'])
    .mean(numeric_only=True)
    .reset_index()
)

In [None]:
# Display the aggregated table
mdf

In [None]:
# Mean accuracy over the grid, colour scale fixed to (0, 1)
(
    alt.Chart(mdf)
    .mark_rect()
    .encode(
        alt.X('learning_rate:O'),
        alt.Y('batch_size:O'),
        alt.Color('mean(acc):Q', scale=alt.Scale(domain=(0, 1))),
    )
    .properties(title="Mean accuracy in IMG testset", width=200, height=200)
)

In [None]:
# Model z-scored mean accuracy with the full factorial of the two
# (z-scored) grid settings and the three effect-coded stimulus factors
m = smf.glm(
    data=mdf,
    formula='zscore(acc) ~ zscore(learning_rate) * zscore(batch_size) * freq_num * op_num * img_num ',
).fit()
print(m.summary())

In [None]:
# Write the second summary table of the fitted model (presumably the
# coefficient table -- confirm), minus its first row, to disk
x = pd.DataFrame(
    data=m.summary().tables[1][1:],
    columns=['lable', 'coef', 'se', 'z', 'p', 'lci', 'uci'],
)
x.to_csv('tmp.csv')

In [None]:
def get_img_params(df, code_name):
    """Fit the freq x op x img model for one network and return its betas.

    args:
        df: dataframe with the columns code_name, acc, freq_num, op_num
            and img_num
        code_name: network to fit
    returns:
        One-row dataframe with one numeric column per model coefficient,
        followed by a `code_name` column.
    """
    net_df = df.loc[df.code_name == code_name]
    fit = smf.glm(
        formula="acc ~ freq_num * op_num * img_num",
        data=net_df,
        family=sm.families.Binomial(),
    ).fit()
    # Transpose the coefficients first and attach the label afterwards:
    # appending the string to the params Series would coerce every
    # coefficient to object dtype.
    return fit.params.to_frame().T.assign(code_name=code_name)

In [None]:
# Fit one model per network and stack the betas
params = [
    get_img_params(df, code_name=network)
    for network in tqdm(df.code_name.unique())
]
img_beta = pd.concat(params, ignore_index=True)

# Grid settings of every network (one row per code_name), attached to the betas
setting_map = (
    mdf[['code_name', 'batch_size', 'learning_rate']]
    .groupby(['code_name'])
    .mean()
    .reset_index()
)
img_beta = img_beta.merge(setting_map, on='code_name')
img_beta

In [None]:
# Rename the columns by position, then reshape to long format (one row per
# network and coefficient).
# NOTE(review): the last column is named `epsilon` but holds the merged
# `learning_rate` values, and the `reg_*` / `r` labels are applied to the
# op_num terms -- confirm the labels are intended.
img_beta.columns = [
    'intercept',
    'freq_effect',
    'reg_effect',
    'fxr',
    'img_effect',
    'fxi',
    'rxi',
    'fxrxi',
    'code_name',
    'batch_size',
    'epsilon',
]
img_beta = pd.melt(
    img_beta,
    id_vars=['code_name', 'batch_size', 'epsilon'],
    value_vars=[
        'intercept',
        'freq_effect',
        'reg_effect',
        'fxr',
        'img_effect',
        'fxi',
        'rxi',
        'fxrxi',
    ],
)

In [None]:
# One heatmap per coefficient over the grid, on a diverging scale clipped to (-25, 25)
(
    alt.Chart(img_beta)
    .mark_rect()
    .encode(
        alt.X('epsilon:O'),
        alt.Y('batch_size:O'),
        alt.Color('value:Q', scale=alt.Scale(domain=(-25, 25), scheme='redblue')),
        alt.Column('variable:N'),
    )
    .properties(width=200, height=200)
)

# DoL

### PHO output

In [None]:
# Load the three training-set eval files used for the PHO division-of-labor summary
b.mount_testset(
    [
        'train_r100_ort_pho.csv',
        'train_r100_exp_osp.csv',
        'train_r100_triangle.csv',
    ]
)
# Keep PHO output at ticks 8-12 of the Triangle-trained networks
b.df = b.subset_df(
    train_task="Triangle",
    output_name='pho',
    timetick=range(8, 13),
)
# Keep, per network, only the epoch selected on the Taraban set, then checkpoint b.df
df = b.subset_by_epoch_dict(sel_epoch)
b.checkpoint_df()

In [None]:
# Average per network and task. numeric_only=True keeps any text columns out
# of the mean; averaging them raises a TypeError in pandas >= 2.0.
dol_pho_mdf = (
    df.groupby(['batch_size', 'learning_rate', 'code_name', 'task'])
    .mean(numeric_only=True)
    .reset_index()
)

In [None]:
# Accuracy over the grid, one panel per task, colour scale fixed to (0, 1)
(
    alt.Chart(dol_pho_mdf)
    .mark_rect()
    .encode(
        alt.X('learning_rate:O'),
        alt.Y('batch_size:O'),
        alt.Color('acc:Q', scale=alt.Scale(domain=(0, 1))),
        alt.Column('task:N'),
    )
    .properties(width=200, height=200)
)

In [None]:
# Load the three training-set eval files used for the SEM division-of-labor summary
b.mount_testset(
    [
        'cos_train_r100_ort_sem.csv',
        'cos_train_r100_exp_ops.csv',
        'cos_train_r100_triangle.csv',
    ]
)
# Keep SEM output at ticks 8-12 of the Triangle-trained networks
b.df = b.subset_df(
    train_task="Triangle",
    output_name='sem',
    timetick=range(8, 13),
)
# Keep, per network, only the epoch selected on the Taraban set, then checkpoint b.df
df = b.subset_by_epoch_dict(sel_epoch)
b.checkpoint_df()

In [None]:
# Average per network and task. numeric_only=True keeps any text columns out
# of the mean; averaging them raises a TypeError in pandas >= 2.0.
dol_sem_mdf = (
    df.groupby(['batch_size', 'learning_rate', 'code_name', 'task'])
    .mean(numeric_only=True)
    .reset_index()
)

In [None]:
# Accuracy over the grid, one panel per task, colour scale fixed to (0, 1)
(
    alt.Chart(dol_sem_mdf)
    .mark_rect()
    .encode(
        alt.X('learning_rate:O'),
        alt.Y('batch_size:O'),
        alt.Color('acc:Q', scale=alt.Scale(domain=(0, 1))),
        alt.Column('task:N'),
    )
    .properties(width=200, height=200)
)