# Evaluation 3.0

- speed
- GCP BQ support
- support for v4 model

## Consideration
- Everything on TensorBoard is convenient and very fast.
- Need item-level details down the line, which is where BigQuery comes into play, but this is not so important until triangle model v4 is stable
- Currently I am doing something in between: store data locally per model, then aggregate mean-level statistics for batch runs (varying h-params or multiple runs).
- I have already coded almost everything in bits and pieces; it just needs better integration

# Examine one model

In [None]:
# %load_ext lab_black
import os
import altair as alt
import pandas as pd
import numpy as np
import tensorflow as tf
import meta, data_wrangling, modeling, metrics, evaluate
from importlib import reload

In [None]:
# Re-import the local modules under active development so that edits made on
# disk are picked up without restarting the kernel (same order as before).
for _module in (evaluate, data_wrangling, metrics):
    reload(_module)

code_name = "triangle_with_strain"

# Restore the model configuration saved alongside the model.
config_json = os.path.join("models", code_name, "model_config.json")
cfg = meta.ModelConfig.from_json(config_json)

# Build the model and load the weights saved for epoch 250.
model = modeling.MyModel(cfg)
checkpoint = cfg.path["weights_checkpoint_fstring"].format(epoch=250)
model.load_weights(checkpoint)

data = data_wrangling.MyData()

In [None]:
# Evaluate the "triangle" task on the strain test set: the orthographic input
# is repeated once per time step, and both 'pho' and 'sem' outputs are scored.
model.set_active_task("triangle")
y_pred = model([data.testsets["strain"]["ort"]] * cfg.n_timesteps)
y_true = {out: data.testsets["strain"][out] for out in ('pho', 'sem')}

# Fresh metric objects, one accuracy + one SSE per output layer.
pho_acc = metrics.PhoAccuracy()
pho_sse = metrics.SumSquaredError()
sem_acc = metrics.RightSideAccuracy()
sem_sse = metrics.SumSquaredError()

# Score only the last entry of each output sequence ([-1]);
# presumably this is the final time tick -- confirm against the model output layout.
pho_acc.update_state(y_true['pho'], y_pred['pho'][-1])
pho_sse.update_state(y_true['pho'], y_pred['pho'][-1])
sem_acc.update_state(y_true['sem'], y_pred['sem'][-1])
sem_sse.update_state(y_true['sem'], y_pred['sem'][-1])

# Print with four decimals. The previous spec ":04f" only requested a minimum
# width of 4 (a no-op with the default 6-digit precision).
print(f"pho accuracy:{pho_acc.out.numpy():.4f}, sem accuracy:{sem_acc.out.numpy():.4f}")
print(f"pho sse:{pho_sse.out.numpy():.4f}, sem sse:{sem_sse.out.numpy():.4f}")

# Prototype testset implementation (manual)
We need a vectorized map at these dimensions:
- model (1 for now)
- epoch (39)
- timestep (11)
- testset x cond (taraban, glushko, hs04 img)
- task (9, 5 main, 4 experimental)
- output (2 in triangle, otherwise 1)
- metrics (acc, sse, cosine) 

In [None]:
# Evaluate the "ort_sem" task on the strain test set, scoring the last entry
# of the 'sem' output sequence.
model.set_active_task("ort_sem")
y_pred = model([data.testsets["strain"]["ort"]] * cfg.n_timesteps)
y_true = data.testsets["strain"]["sem"]
# NOTE(review): sem_acc / sem_sse are the objects created in the triangle cell;
# if update_state accumulates across calls, the printed values mix tasks --
# confirm, or re-instantiate the metrics before use.
sem_acc.update_state(y_true, y_pred['sem'][-1])
sem_sse.update_state(y_true, y_pred['sem'][-1])
# ".4f" gives four decimals; the previous ":04f" spec had no visible effect.
print(f"sem accuracy:{sem_acc.out.numpy():.4f}, sse:{sem_sse.out.numpy()}")

In [None]:
# Evaluate the experimental "exp_ops" task on the strain test set, scoring the
# last entry of the 'sem' output sequence.
model.set_active_task("exp_ops")
y_pred = model([data.testsets["strain"]["ort"]] * cfg.n_timesteps)
y_true = data.testsets["strain"]["sem"]
# NOTE(review): sem_acc / sem_sse are reused from earlier cells; if
# update_state accumulates across calls, the printed values mix tasks --
# confirm, or re-instantiate the metrics before use.
sem_acc.update_state(y_true, y_pred['sem'][-1])
sem_sse.update_state(y_true, y_pred['sem'][-1])
# ".4f" gives four decimals; the previous ":04f" spec had no visible effect.
print(f"sem accuracy:{sem_acc.out.numpy():.4f}, sse:{sem_sse.out.numpy()}")

In [None]:
# Evaluate the "ort_pho" task on the strain test set, scoring the last entry
# of the 'pho' output sequence.
model.set_active_task("ort_pho")
y_pred = model([data.testsets["strain"]["ort"]] * cfg.n_timesteps)
y_true = data.testsets["strain"]["pho"]
# NOTE(review): pho_acc / pho_sse are the objects created in the triangle cell;
# if update_state accumulates across calls, the printed values mix tasks --
# confirm, or re-instantiate the metrics before use.
pho_acc.update_state(y_true, y_pred['pho'][-1])
pho_sse.update_state(y_true, y_pred['pho'][-1])
# ".4f" gives four decimals; the previous ":04f" spec had no visible effect.
print(f"pho accuracy:{pho_acc.out.numpy():.4f}, pho sse:{pho_sse.out.numpy():.4f}")

In [None]:
# Evaluate the experimental "exp_osp" task on the strain test set, scoring the
# last entry of the 'pho' output sequence.
model.set_active_task("exp_osp")
y_pred = model([data.testsets["strain"]["ort"]] * cfg.n_timesteps)
y_true = data.testsets["strain"]["pho"]
# NOTE(review): pho_acc / pho_sse are reused from earlier cells; if
# update_state accumulates across calls, the printed values mix tasks --
# confirm, or re-instantiate the metrics before use.
pho_acc.update_state(y_true, y_pred['pho'][-1])
pho_sse.update_state(y_true, y_pred['pho'][-1])
# ".4f" gives four decimals; the previous ":04f" spec had no visible effect.
print(f"pho accuracy:{pho_acc.out.numpy():.4f}, pho sse:{pho_sse.out.numpy():.4f}")


- timestep (11)
- testset x cond (taraban, glushko, hs04 img)
- task (9, 5 main, 4 experimental)
- output (2 in triangle, otherwise 1)
- metrics (acc, sse, cosine) 

data.testset object:
'ort': shape = (n items, ort_units)
'pho': shape = () 

### Steps
1. 


In [None]:
# Display the raw 'strain' test set entry as the cell output for inspection.
data.testsets['strain']

In [None]:
from tqdm import tqdm

class TestSet:
    """Universal test set object for evaluating model results.

    1. Single condition, single metric, single value output for maximum compatibility
    2. Model level info should be stored in a separate table and merged in at the end
    """

    # Metric objects keyed by output layer name, then by metric name.
    # Only the 'acc' entries are used by `_eval_one` at the moment.
    METRICS_MAP = {
        'pho': {'acc': metrics.PhoAccuracy(), 'sse': metrics.SumSquaredError()},
        'sem': {'acc': metrics.RightSideAccuracy(), 'sse': metrics.SumSquaredError()},
    }

    def __init__(self, cfg, model):
        """Keep references to the model config and the model to be evaluated."""
        self.cfg = cfg
        self.model = model

    def eval(self, testset_name, task, epochs=None):
        """Evaluate one test set on one task across epochs and output time ticks.

        Args:
            testset_name: file stem of the test set; `<testset_name>.pkl.gz` is
                loaded from the hard-coded test set directory.
            task: task name passed to `model.set_active_task`; also used to look
                up the input/output layer names in `modeling.IN_OUT`.
            epochs: iterable of epoch numbers whose checkpoints are evaluated.
                Defaults to `range(1, 4)` (the previous hard-coded prototype
                range); pass e.g. `cfg.saved_epoches` for a full run.

        Returns:
            A DataFrame with one row per item for every
            (epoch, time tick, output) combination; empty if nothing was evaluated.
        """
        ts_path = '/home/jupyter/tf/dataset/testsets'
        testset_package = data_wrangling.load_testset(os.path.join(ts_path, f"{testset_name}.pkl.gz"))
        self.model.set_active_task(task)

        # Labels are looked up by output name in `_eval_one`. The loaded package
        # is used as the label dictionary (previously `y_true` was undefined here
        # and raised NameError).
        # NOTE(review): assumes the package holds a 'pho' / 'sem' entry for each
        # evaluated output -- confirm against data_wrangling.load_testset.
        y_true = testset_package

        if epochs is None:
            epochs = range(1, 4)

        input_name = modeling.IN_OUT[task][0]
        if task == 'triangle':
            # The triangle task produces both outputs.
            output_names = ('pho', 'sem')
        else:
            output_names = (modeling.IN_OUT[task][1],)

        # Collect per-condition frames and concatenate once at the end:
        # DataFrame.append was removed in pandas 2.0 and re-copied the frame on
        # every call.
        frames = []
        for epoch in tqdm(epochs):
            w = self.cfg.path['weights_checkpoint_fstring'].format(epoch=epoch)
            self.model.load_weights(w)
            y_pred = self.model([testset_package[input_name]] * self.cfg.n_timesteps)

            for timetick_idx in range(self.cfg.output_ticks):
                for output_name in output_names:
                    tag = {
                        'code_name': self.cfg.code_name,
                        'epoch': epoch,
                        'testset': testset_name,
                        'task': task,
                        'output_name': output_name,
                        'timetick_idx': timetick_idx,
                        'timetick': self.output_idx_to_timetick(timetick_idx),
                        'word': testset_package['item'],
                    }
                    frames.append(self._eval_one(y_pred, y_true, tag))

        if not frames:
            return pd.DataFrame()
        return pd.concat(frames, ignore_index=True)

    def output_idx_to_timetick(self, idx):
        """Convert a zero-based output index into a one-based time tick.

        The offset is `cfg.n_timesteps - cfg.output_ticks`; presumably the model
        only exports the last `output_ticks` of its time steps -- confirm.
        """
        d = self.cfg.n_timesteps - self.cfg.output_ticks
        return idx + 1 + d

    def _eval_one(self, y_pred, y_true, tag):
        """Compute item-level accuracy for a single (output, time tick) condition.

        y_pred: prediction dictionary, e.g., {'pho': (time ticks, items, output nodes)}
        y_true: label dictionary (time invariant), e.g., {'sem': (items, maybe n ans., output nodes)}
        tag: condition descriptors; 'output_name' and 'timetick_idx' select the
            slice to score, and every entry is written to the result as a column.

        Returns a DataFrame with an 'acc' column plus one column per tag key.
        """
        out = pd.DataFrame()
        this_y_pred = y_pred[tag['output_name']][tag['timetick_idx']]
        # shape after selecting one time tick: (items, output nodes)

        this_y_true = y_true[tag['output_name']]
        # shape: (items, *maybe n ans., output nodes)

        acc = self.METRICS_MAP[tag['output_name']]['acc']

        if tf.rank(this_y_true) == 3:
            # Multi ans mode: labels carry an extra "answers" axis
            out['acc'] = acc.item_metric_multi_ans(this_y_true, this_y_pred)
        else:
            # Single ans mode
            out['acc'] = acc.item_metric(this_y_true, this_y_pred)

        # Write tag to df (scalars are broadcast to every item row)
        for k, v in tag.items():
            out[k] = v

        return out


In [None]:
# Try the TestSet prototype: evaluate the 'strain' test set on the 'ort_pho'
# task and display the resulting DataFrame as the cell output.
x = TestSet(cfg, model)
x.eval('strain', 'ort_pho')


# Model level examine class (After eval)

In [None]:
class examine:
    """Model-level inspection helper: load (or compute) mean strain results and plot them.

    The results live in `self.df`. When they had to be computed from scratch,
    `self.cfg`, `self.data`, `self.model` and `self.test_strain` are also set.

    NOTE(review): the plotting methods use `interact`, which is not imported in
    the visible part of this file (presumably `ipywidgets.interact`) -- make
    sure it is in scope before calling them.
    """

    def __init__(self, code_name, tf_root="/home/jupyter/tf"):
        """Load the cached mean strain results for `code_name`, or evaluate them.

        Args:
            code_name: model folder name under `<tf_root>/models`.
            tf_root: project root that contains the `models` directory.
        """
        csv_file = os.path.join(tf_root, 'models', code_name, 'eval', 'strain_mean_df.csv')

        try:
            # Fast load from disk
            self.df = pd.read_csv(csv_file)
        except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError):
            # Only a missing/unreadable/unparsable cache triggers the fallback;
            # a bare `except:` would also hide KeyboardInterrupt and real bugs.
            # Eval from scratch
            self.cfg = meta.ModelConfig.from_json(os.path.join(tf_root, 'models', code_name, 'model_config.json'))
            self.data = data_wrangling.MyData()
            self.model = modeling.HS04Model(self.cfg)
            self.model.build()
            self.test_strain = evaluate.EvalOral(self.cfg, self.model, self.data)
            self.df = self.test_strain.strain_mean_df

    def plot_op_strain(self):
        """Create an interactive O-P strain plot: per-condition curves plus two contrasts.

        Expects `self.df` to provide the columns 'timetick', 'epoch', 'cond' and
        the selected metric, with 'cond' values HF_INC, HF_CON, LF_INC, LF_CON.
        """
        df = self.df

        @interact(
            use_y=['acc', 'sse', 'conditional_sse'],
            timetick=(1, 12, 1),
            y_max=(1, 20, 1)
            )
        def plot(use_y='acc', timetick=12, y_max=1):
            sdf = df.loc[(df.timetick == timetick)]

            # Plot by condition
            plot_by_cond = alt.Chart(sdf).mark_line().encode(
                x=alt.X('epoch:Q', scale=alt.Scale(domain=(0, 100), clamp=True)),
                y=alt.Y(f"{use_y}:Q", scale=alt.Scale(domain=(0, y_max))),
                color='cond:N'
            )

            # Contrasts (expressions evaluated on the pivoted per-condition columns)
            contrasts = {}
            contrasts['contrast_frequency'] = """(datum.HF_INC + datum.HF_CON - (datum.LF_INC + datum.LF_CON))/2"""
            contrasts['contrast_consistency'] = """(datum.LF_CON + datum.HF_CON - (datum.LF_INC + datum.HF_INC))/2"""

            def create_contrast_plot(name):
                # Pivot conditions into columns per epoch, then compute the contrast.
                return plot_by_cond.encode(y=alt.Y("difference:Q", scale=alt.Scale(domain=(-y_max, y_max)))
                    ).transform_pivot('cond', value=use_y, groupby=['epoch']
                    ).transform_calculate(difference = contrasts[name]
                    ).properties(title=name)

            return plot_by_cond | create_contrast_plot('contrast_frequency') | create_contrast_plot('contrast_consistency')

    def plot(self):
        """ Create an interactive plot for strain.

        Shows per-testset curves, the mean with an error band, and the
        frequency / consistency / imageability contrasts for the selected
        metric, task and time tick.
        """
        df = self.df

        @interact(
            use_y=['acc', 'sse', 'conditional_sse'],
            task=['pho_sem', 'sem_pho', 'pho_pho', 'sem_sem'],
            timetick=(1, 12, 1),
            y_max=(1, 20, 1)
            )
        def plot(use_y='acc', timetick=12, task='pho_sem', y_max=1):
            sdf = df.loc[(df.timetick == timetick) & (df.task == task)]

            # Plot by condition
            plot_by_cond = alt.Chart(sdf).mark_line().encode(
                x='epoch:Q',
                y=alt.Y(f"{use_y}:Q", scale=alt.Scale(domain=(0, y_max))),
                color='testset:N'
            )

            # Plot average
            plot_average = plot_by_cond.encode(y=alt.Y(f"mean({use_y}):Q", scale=alt.Scale(domain=(0, y_max))), color='task')
            plot_average += plot_average.mark_errorband()

            # Plot contrasts (expressions evaluated on the pivoted per-testset columns)
            contrasts = {}
            contrasts['contrast_frequency'] = """(datum.strain_hf_con_hi + datum.strain_hf_con_li + datum.strain_hf_inc_hi + datum.strain_hf_inc_li - 
                (datum.strain_lf_con_hi + datum.strain_lf_con_li + datum.strain_lf_inc_hi + datum.strain_lf_inc_li))/4"""
            contrasts['contrast_consistency'] = """(datum.strain_hf_con_hi + datum.strain_hf_con_li + datum.strain_lf_con_hi + datum.strain_lf_con_li - 
                (datum.strain_hf_inc_hi + datum.strain_hf_inc_li + datum.strain_lf_inc_hi + datum.strain_lf_inc_li))/4"""
            contrasts['contrast_imageability'] = """(datum.strain_hf_con_hi + datum.strain_lf_con_hi + datum.strain_hf_inc_hi + datum.strain_lf_inc_hi - 
                (datum.strain_hf_con_li + datum.strain_lf_con_li + datum.strain_hf_inc_li + datum.strain_lf_inc_li))/4"""

            def create_contrast_plot(name):
                # Pivot test sets into columns per epoch, then compute the contrast.
                return plot_by_cond.encode(y=alt.Y("difference:Q", scale=alt.Scale(domain=(-y_max, y_max)))
                    ).transform_pivot('testset', value=use_y, groupby=['epoch']
                    ).transform_calculate(difference = contrasts[name]
                    ).properties(title=name)

            # Lay the contrast charts out side by side, in insertion order.
            contrast_plots = alt.hconcat()
            for c in contrasts:
                contrast_plots |= create_contrast_plot(c)

            return (plot_by_cond | plot_average) & contrast_plots


In [None]:
# Load (or evaluate) the 'boo' model results and show the interactive O-P strain plot.
tmp = examine('boo')
tmp.plot_op_strain()
# Observation: the full model looks familiar -- good interaction and fast learning overall
# (it will slow down later; a fast learning rate is used to save time on testing).

In [None]:
# Load (or evaluate) the 'op_half_stationary' model results and show the interactive O-P strain plot.
tmp = examine('op_half_stationary')
tmp.plot_op_strain()
# Observation: learns slower...
# HF_INC seems a tiny bit lower (more apparent in earlier ticks); maybe HF items have more CON O-P tokens?



In [None]:
# Load (or evaluate) the 'op_half_rank_noclip' model results and show the interactive O-P strain plot.
tmp = examine('op_half_rank_noclip')
tmp.plot_op_strain()
# Observation: HF_INC decreases further --> CON > F



In [None]:
# Load (or evaluate) the 'op_half_rank_hc_30000' model results and show the interactive O-P strain plot.
tmp = examine('op_half_rank_hc_30000')
tmp.plot_op_strain()
# Observation: strong frequency effect



## Strain

In [None]:
# Half-pretrain (Chang 2019):
# load (or evaluate) the "half_pretrain" model results and show the interactive strain plot.

half_pretrain = examine("half_pretrain")
half_pretrain.plot()


In [None]:
# Chang 2019:
# load (or evaluate) the "chang_pretrain" model results and show the interactive strain plot.

chang_pretrain = examine("chang_pretrain")
chang_pretrain.plot()

In [None]:
# Full-pretrain:
# load (or evaluate) the "full_pretrain" model results and show the interactive strain plot.
full_pretrain = examine("full_pretrain")
full_pretrain.plot()