# Analyze Station3 batch

## Purposes
- Quantify and visualize effects over the performance space

In [None]:
import os
from importlib import reload
import batch_utils
import tensorflow as tf
import pandas as pd
import sqlite3
import altair as alt
from scipy.stats.mstats import zscore
import statsmodels.formula.api as smf
import statsmodels.api as sm
from tqdm import tqdm
alt.data_transformers.disable_max_rows()

In [None]:
from contextlib import closing

batch_name = 'station_3'
# sqlite3's own context manager only wraps a transaction and never closes the
# connection; closing() releases it once the table listing has been printed.
with closing(sqlite3.connect(f"models/{batch_name}/results.db")) as con:
    cursor = con.cursor()
    # List every table stored in the results database.
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    print(cursor.fetchall())

In [None]:
from contextlib import closing

query = """
SELECT * FROM TARABAN
"""

# `with sqlite3.connect(...)` alone commits/rolls back but leaves the connection
# open; closing() makes sure it is released after the table has been read.
with closing(sqlite3.connect(f"models/{batch_name}/results.db")) as c:
    taraban = pd.read_sql(query, con=c)


In [None]:
# Display the `word` column of the loaded Taraban table (notebook inspection only).
taraban.word

# Use Taraban as a reference to align epoch

In [None]:
# Keep time ticks 8-12 only, then average within each run / epoch / output cell.
df = taraban.loc[taraban['timetick'].isin(range(8, 13))]
# numeric_only=True: since pandas 2.0, mean() raises on non-numeric columns
# (e.g. the string-valued `freq` / `reg`) instead of silently dropping them.
df = df.groupby(['code_name', 'batch_size', 'learning_rate', 'epoch', 'output_name']).mean(numeric_only=True).reset_index()

# Accuracy over epochs, faceted by learning rate (columns) and batch size (rows).
alt.Chart(df).mark_line(point=True).encode(
    x='epoch:Q',
    y='mean(acc):Q',
    column='learning_rate:Q',
    row='batch_size:Q',
    color='output_name:N'
).save('Taraban_acc.html')

# Kept under its own name because `df` is rebound by later cells, while
# epoch_to_mean_acc() keeps reading this table.
acc_df = df

In [None]:
def epoch_to_mean_acc(batch_size, learning_rate, epoch):
    """Return the mean 'pho' accuracy stored in the global ``acc_df`` for one cell.

    The cell is identified by ``batch_size``, ``learning_rate`` and ``epoch``.
    The result is NaN when no row of ``acc_df`` matches.
    """
    selected = (
        (acc_df.epoch == epoch)
        & (acc_df.output_name == 'pho')
        & (acc_df.batch_size == batch_size)
        & (acc_df.learning_rate == learning_rate)
    )
    matching_acc = acc_df.loc[selected, 'acc']
    return matching_acc.mean()

## RUN cell GLM

In [None]:
# Tidying up: 'pho' output at time ticks 8-12, averaged within each
# run / epoch / frequency / regularity cell.
df = taraban.loc[taraban['timetick'].isin(range(8, 13)) & (taraban['output_name'] == 'pho')].copy()
# numeric_only=True: pandas >= 2.0 raises when asked to average non-numeric columns.
df = df.groupby(['code_name', 'batch_size', 'learning_rate', 'epoch', 'output_name', 'freq', 'reg']).mean(numeric_only=True).reset_index()
# Effect-code both factors as +/-0.5 for the GLM.
df['reg_num'] = df.reg.apply(lambda x: 0.5 if x == 'Regular' else -0.5)
df['freq_num'] = df.freq.apply(lambda x: 0.5 if x == 'High' else -0.5)

In [None]:
def get_taraban_beta(df: pd.DataFrame, batch_size:int, learning_rate:float, epoch:int, metric='acc', standardize=False) -> "pd.DataFrame | None":
    """Fit ``metric ~ freq_num * reg_num`` for one (batch_size, learning_rate, epoch) cell.

    Args:
        df: Table with ``batch_size``, ``learning_rate``, ``epoch``, ``freq_num``,
            ``reg_num`` and the ``metric`` column.
        batch_size, learning_rate, epoch: Values selecting the rows to fit.
        metric: Name of the response column.
        standardize: If true, the response is wrapped in ``zscore(...)`` in the formula.

    Returns:
        A one-row DataFrame holding the fitted coefficients plus ``batch_size``,
        ``learning_rate``, ``epoch`` and ``acc`` (from ``epoch_to_mean_acc``),
        or ``None`` when the model cannot be fitted.
    """
    import warnings

    # Select the rows belonging to this cell of the sweep.
    df = df.loc[(df.batch_size == batch_size) & (df.learning_rate == learning_rate) & (df.epoch == epoch)]

    y = f'zscore({metric})' if standardize else metric  # pick y

    # Only the fit itself is best-effort; report why a cell was skipped
    # instead of swallowing the error silently.
    try:
        m = smf.glm(formula=f"{y} ~ freq_num * reg_num", data=df).fit()
    except Exception as e:
        warnings.warn(
            f"Taraban GLM skipped (batch_size={batch_size}, learning_rate={learning_rate}, epoch={epoch}): {e!r}"
        )
        return None

    # Copy so the bookkeeping fields are not written into the fitted result.
    p = m.params.copy()
    p['batch_size'] = batch_size
    p['learning_rate'] = learning_rate
    p['epoch'] = epoch
    p['acc'] = epoch_to_mean_acc(batch_size, learning_rate, epoch)
    return pd.DataFrame(p).T


In [None]:
# Levels of each swept factor present in the current `df`.
epochs, batch_sizes, learning_rates = (
    list(df[factor].unique()) for factor in ('epoch', 'batch_size', 'learning_rate')
)

def run_taraban(metric, standardize):
    """Fit the Taraban GLM in every sweep cell and chart the betas against accuracy.

    Uses the module-level ``df``, ``epochs``, ``batch_sizes`` and ``learning_rates``.

    Args:
        metric: Response column passed to ``get_taraban_beta``.
        standardize: Whether the response is z-scored.

    Returns:
        An Altair chart of beta vs. accuracy, faceted by learning rate and batch size.

    Raises:
        ValueError: If no cell could be fitted.
    """
    # DataFrame.append was removed in pandas 2.0: collect the one-row results
    # and concatenate once. Cells whose fit failed come back as None.
    fits = []
    for epoch in tqdm(epochs):
        for batch_size in batch_sizes:
            for learning_rate in learning_rates:
                fit = get_taraban_beta(df, batch_size, learning_rate, epoch, metric, standardize)
                if fit is not None:
                    fits.append(fit)

    if not fits:
        raise ValueError(f"no Taraban model could be fitted for metric={metric!r}")
    bdf = pd.concat(fits)

    mdf = bdf.melt(id_vars=['batch_size', 'learning_rate', 'acc'],
        value_vars=['Intercept', 'reg_num', 'freq_num', 'freq_num:reg_num'], var_name='param', value_name='beta')

    return alt.Chart(mdf).mark_line(point=True).encode(
        y='beta:Q',
        x='acc:Q',
        column='learning_rate:O',
        row='batch_size:O',
        color='param:N'
    ).properties(title=f'Beta in Taraban (z:{standardize}, y:{metric}) ')


In [None]:
# One HTML chart per (metric, standardize) combination.
taraban_outputs = (
    ('acc', True, 'Taraban_beta_zacc.html'),
    ('acc', False, 'Taraban_beta_acc.html'),
    ('sse', True, 'Taraban_beta_zsse.html'),
    ('sse', False, 'Taraban_beta_sse.html'),
)
for metric, standardize, html_path in taraban_outputs:
    run_taraban(metric, standardize).save(html_path)

# Lexicality

In [None]:
from contextlib import closing

batch_name = 'station_3'

query = """
SELECT * FROM LEXICALITY
"""

# `with sqlite3.connect(...)` alone does not close the connection; closing() does.
with closing(sqlite3.connect(f"models/{batch_name}/results.db")) as c:
    lex = pd.read_sql(query, con=c)


### Lexicality effect over epoch

In [None]:
# 'pho' output at time ticks 8-12, averaged within each run / epoch / condition cell.
df = lex.loc[(lex.timetick.isin(range(8, 13))) & (lex.output_name == 'pho')]
# numeric_only=True: pandas >= 2.0 raises on non-numeric columns such as `output_name`.
df = df.groupby(['code_name', 'batch_size', 'learning_rate', 'epoch', 'cond']).mean(numeric_only=True).reset_index()

# Effect-code lexicality: word = +0.5, everything else = -0.5.
df['lex_num'] = df.cond.apply(lambda x: 0.5 if x == 'word' else -0.5)

alt.Chart(df).mark_line(point=True).encode(
    x='epoch:Q',
    y='mean(acc):Q',
    column='learning_rate:Q',
    row='batch_size:Q',
    color='cond:N'
).save('lexicality_over_epoch.html')


### NW vs. W

In [None]:
# Wide table: one row per run and epoch, one accuracy column per level of `cond`.
pdf = df.pivot_table(index=['code_name', 'batch_size', 'learning_rate', 'epoch'], columns='cond', values='acc').reset_index()

In [None]:
# Scatter of the `nonword` column against the `word` column, both axes fixed to [0, 1].
# Assumes `cond` has levels named 'word' and 'nonword' — TODO confirm against the data.
alt.Chart(pdf).mark_point().encode(
    x=alt.X('word:Q', scale=alt.Scale(domain=[0,1])),
    y=alt.Y('nonword:Q', scale=alt.Scale(domain=[0,1])),
    column='learning_rate:Q',
    row='batch_size:Q',
    color='code_name:N'
).save('nonword_word.html')

In [None]:
def epoch_to_mean_acc(batch_size, learning_rate, epoch):
    """Look up the mean 'pho' accuracy in the global ``acc_df`` for one sweep cell.

    Returns NaN when ``acc_df`` has no row for the given ``batch_size``,
    ``learning_rate`` and ``epoch``.
    """
    is_cell = (
        (acc_df.epoch == epoch)
        & (acc_df.output_name == 'pho')
        & (acc_df.batch_size == batch_size)
        & (acc_df.learning_rate == learning_rate)
    )
    cell_acc = acc_df.loc[is_cell, 'acc']
    return cell_acc.mean()

In [None]:
def get_lex_beta(df: pd.DataFrame, batch_size:int, learning_rate:float, epoch:int, metric='acc', standardize=False) -> "pd.DataFrame | None":
    """Fit ``metric ~ lex_num`` for one (batch_size, learning_rate, epoch) cell.

    Args:
        df: Table with ``batch_size``, ``learning_rate``, ``epoch``, ``lex_num``
            and the ``metric`` column.
        batch_size, learning_rate, epoch: Values selecting the rows to fit.
        metric: Name of the response column.
        standardize: If true, the response is wrapped in ``zscore(...)`` in the formula.

    Returns:
        A one-row DataFrame with the coefficients plus ``batch_size``,
        ``learning_rate``, ``epoch`` and ``acc`` (from ``epoch_to_mean_acc``),
        or ``None`` when the model cannot be fitted.
    """
    import warnings

    # Select the rows belonging to this cell of the sweep.
    df = df.loc[(df.batch_size == batch_size) & (df.learning_rate == learning_rate) & (df.epoch == epoch)]
    y = f'zscore({metric})' if standardize else metric  # pick y

    # Only the fit is best-effort; say why a cell was skipped rather than hiding it.
    try:
        m = smf.glm(formula=f"{y} ~ lex_num", data=df).fit()
    except Exception as e:
        warnings.warn(
            f"Lexicality GLM skipped (batch_size={batch_size}, learning_rate={learning_rate}, epoch={epoch}): {e!r}"
        )
        return None

    # Copy so the bookkeeping fields are not written into the fitted result.
    p = m.params.copy()
    p['batch_size'] = batch_size
    p['learning_rate'] = learning_rate
    p['epoch'] = epoch
    p['acc'] = epoch_to_mean_acc(batch_size, learning_rate, epoch)
    return pd.DataFrame(p).T

In [None]:
# Levels of each swept factor present in the current `df`.
epochs, batch_sizes, learning_rates = (
    list(df[factor].unique()) for factor in ('epoch', 'batch_size', 'learning_rate')
)

def run_lex(metric, standardize):
    """Fit the lexicality GLM in every sweep cell and chart the betas against accuracy.

    Uses the module-level ``df``, ``epochs``, ``batch_sizes`` and ``learning_rates``.

    Args:
        metric: Response column passed to ``get_lex_beta``.
        standardize: Whether the response is z-scored.

    Returns:
        An Altair chart of beta vs. accuracy, faceted by learning rate and batch size.

    Raises:
        ValueError: If no cell could be fitted.
    """
    # DataFrame.append was removed in pandas 2.0: collect the one-row results
    # and concatenate once. Cells whose fit failed come back as None.
    fits = []
    for epoch in tqdm(epochs):
        for batch_size in batch_sizes:
            for learning_rate in learning_rates:
                fit = get_lex_beta(df, batch_size, learning_rate, epoch, metric, standardize)
                if fit is not None:
                    fits.append(fit)

    if not fits:
        raise ValueError(f"no lexicality model could be fitted for metric={metric!r}")
    bdf = pd.concat(fits)

    mdf = bdf.melt(id_vars=['batch_size', 'learning_rate', 'acc'],
        value_vars=['Intercept', 'lex_num'], var_name='param', value_name='beta')

    # The title previously read "Taraban" (copy-paste from run_taraban).
    return alt.Chart(mdf).mark_line(point=True).encode(
        y='beta:Q',
        x='acc:Q',
        column='learning_rate:O',
        row='batch_size:O',
        color='param:N'
    ).properties(title=f'Beta in Lexicality (z:{standardize}, y:{metric}) ')

In [None]:
# One HTML chart per (metric, standardize) combination.
lex_outputs = (
    ('acc', False, 'Lexicality_beta_acc.html'),
    ('acc', True, 'Lexicality_zbeta_acc.html'),
    ('csse', False, 'Lexicality_beta_csse.html'),
    ('csse', True, 'Lexicality_zbeta_csse.html'),
)
for metric, standardize, html_path in lex_outputs:
    run_lex(metric, standardize).save(html_path)

# Imageability

In [None]:
from contextlib import closing

query = """
SELECT * FROM imageability
"""

# `with sqlite3.connect(...)` alone does not close the connection; closing() does.
with closing(sqlite3.connect(f"models/{batch_name}/results.db")) as c:
    img = pd.read_sql(query, con=c)


In [None]:
# Time ticks 8-12, averaged within each run / epoch / condition / output cell.
df = img.loc[(img.timetick.isin(range(8, 13)))]
# numeric_only=True: pandas >= 2.0 raises when asked to average non-numeric columns.
df = df.groupby(['code_name', 'batch_size', 'learning_rate', 'epoch', 'cond', 'output_name']).mean(numeric_only=True).reset_index()

# `cond` is split on "_" into three parts: frequency, op and imageability.
df[["freq", "op", "img"]] = df.cond.str.split("_", expand=True)
# First five characters of `cond`.
df["fc"] = df.cond.apply(lambda x: x[:5])
# Effect-code each factor as +/-0.5.
df["freq_num"] = df.freq.apply(lambda x: 0.5 if x == "hf" else -0.5)
df["op_num"] = df.op.apply(lambda x: 0.5 if x == "ls" else -0.5)
df["img_num"] = df.img.apply(lambda x: 0.5 if x == "hi" else -0.5)


In [None]:
def get_img_beta(df: pd.DataFrame, output_name: str, batch_size:int, learning_rate:float, epoch:int, metric='acc', standardize=False) -> "pd.DataFrame | None":
    """Fit ``metric ~ img_num`` for one (output_name, batch_size, learning_rate, epoch) cell.

    Args:
        df: Table with ``output_name``, ``batch_size``, ``learning_rate``,
            ``epoch``, ``img_num`` and the ``metric`` column.
        output_name: Output layer whose rows are fitted.
        batch_size, learning_rate, epoch: Values selecting the rows to fit.
        metric: Name of the response column.
        standardize: If true, the response is wrapped in ``zscore(...)`` in the formula.

    Returns:
        A one-row DataFrame with the coefficients plus ``batch_size``,
        ``learning_rate``, ``epoch`` and ``acc``, or ``None`` when the model
        cannot be fitted. ``acc`` comes from ``epoch_to_mean_acc``, which
        always reads the 'pho' rows of ``acc_df`` regardless of ``output_name``.
    """
    import warnings

    # Select the rows belonging to this cell of the sweep.
    df = df.loc[(df.batch_size == batch_size) & (df.learning_rate == learning_rate) & (df.epoch == epoch) & (df.output_name == output_name)]
    y = f'zscore({metric})' if standardize else metric  # pick y

    # Only the fit is best-effort; say why a cell was skipped rather than hiding it.
    try:
        m = smf.glm(formula=f"{y} ~ img_num", data=df).fit()
    except Exception as e:
        warnings.warn(
            f"Imageability GLM skipped (output_name={output_name}, batch_size={batch_size}, "
            f"learning_rate={learning_rate}, epoch={epoch}): {e!r}"
        )
        return None

    # Copy so the bookkeeping fields are not written into the fitted result.
    p = m.params.copy()
    p['batch_size'] = batch_size
    p['learning_rate'] = learning_rate
    p['epoch'] = epoch
    p['acc'] = epoch_to_mean_acc(batch_size, learning_rate, epoch)
    return pd.DataFrame(p).T

In [None]:
# Levels of each swept factor present in the current `df`.
epochs, batch_sizes, learning_rates = (
    list(df[factor].unique()) for factor in ('epoch', 'batch_size', 'learning_rate')
)

def run_img(output_name, metric, standardize):
    """Fit the imageability GLM in every sweep cell and chart the betas against accuracy.

    Uses the module-level ``df``, ``epochs``, ``batch_sizes`` and ``learning_rates``.

    Args:
        output_name: Output layer passed to ``get_img_beta``.
        metric: Response column passed to ``get_img_beta``.
        standardize: Whether the response is z-scored.

    Returns:
        An Altair chart of beta vs. accuracy, faceted by learning rate and batch size.

    Raises:
        ValueError: If no cell could be fitted.
    """
    # DataFrame.append was removed in pandas 2.0: collect the one-row results
    # and concatenate once. Cells whose fit failed come back as None.
    fits = []
    for epoch in tqdm(epochs):
        for batch_size in batch_sizes:
            for learning_rate in learning_rates:
                fit = get_img_beta(df, output_name, batch_size, learning_rate, epoch, metric, standardize)
                if fit is not None:
                    fits.append(fit)

    if not fits:
        raise ValueError(
            f"no imageability model could be fitted for output_name={output_name!r}, metric={metric!r}"
        )
    bdf = pd.concat(fits)

    mdf = bdf.melt(id_vars=['batch_size', 'learning_rate', 'acc'],
        value_vars=['Intercept', 'img_num'], var_name='param', value_name='beta')

    return alt.Chart(mdf).mark_line(point=True).encode(
        y='beta:Q',
        x='acc:Q',
        column='learning_rate:O',
        row='batch_size:O',
        color='param:N'
    ).properties(title=f'Beta in IMG (out: {output_name} z:{standardize}, y:{metric}) ')

In [None]:
# 'pho' output: one HTML chart per (metric, standardize) combination.
pho_img_outputs = (
    ('acc', False, 'pho_IMG_beta_acc.html'),
    ('acc', True, 'pho_IMG_zbeta_acc.html'),
    ('csse', False, 'pho_IMG_beta_csse.html'),
    ('csse', True, 'pho_IMG_zbeta_csse.html'),
)
for metric, standardize, html_path in pho_img_outputs:
    run_img(output_name='pho', metric=metric, standardize=standardize).save(html_path)

In [None]:
# 'sem' output: one HTML chart per (metric, standardize) combination.
sem_img_outputs = (
    ('acc', False, 'SEM_IMG_beta_acc.html'),
    ('acc', True, 'SEM_IMG_zbeta_acc.html'),
    ('csse', False, 'SEM_IMG_beta_csse.html'),
    ('csse', True, 'SEM_IMG_zbeta_csse.html'),
)
for metric, standardize, html_path in sem_img_outputs:
    run_img(output_name='sem', metric=metric, standardize=standardize).save(html_path)

# Swap to continuous regressors 

In [None]:
import meta
import pandas as pd
import numpy as np
from scipy.stats.mstats import zscore
import statsmodels.formula.api as smf
import statsmodels.api as sm
import evaluate
import altair as alt

In [None]:
# NOTE(review): `cfg` is only assigned in the following cell, so this lookup
# raises NameError when the notebook is run top to bottom in a fresh kernel.
cfg.tf_root

In [None]:
# Load the configuration of one model and build its test set.
code_name = "tmp_64_005"
config_path = f"models/{code_name}/model_config.json"
cfg = meta.Config.from_json(config_path)
test = evaluate.TestSet(cfg)

In [None]:
# Training-set evaluation: 'pho' output at time ticks 8-12, averaged per epoch and word.
df = test.eval_train('triangle')
df = df.loc[(df.timetick.isin(range(8, 13))) & (df.output_name == 'pho')]
# numeric_only=True: pandas >= 2.0 raises on non-numeric columns such as `output_name`.
df = df.groupby(['epoch', 'word']).mean(numeric_only=True).reset_index()
df = df[['epoch', 'word', 'acc', 'sse']].dropna()

In [None]:
# word -> value of the "uncond.surprisal" column
surprisal = pd.read_csv("/home/jupyter/triangle_model/corpus/noam_surprisal.csv")
word2op_dict = dict(zip(surprisal.word, surprisal["uncond.surprisal"]))

# word -> value of the `wf` column of the training set
df_train = pd.read_csv("/home/jupyter/triangle_model/dataset/df_train.csv")
word2wf_dict = dict(zip(df_train.word, df_train.wf))

def word2op(word):
    """Return the ``word2op_dict`` entry for ``word``, or ``None`` if it has none.

    Only lookup failures are treated as "no value": a missing key, or a key
    that cannot be hashed. Anything else propagates instead of being hidden
    by a bare ``except``.
    """
    try:
        return word2op_dict[word]
    except (KeyError, TypeError):
        return None

def word2wf(word):
    """Return ``log10(wf + 1)`` for ``word``, or ``None`` if it cannot be looked up.

    ``wf`` is read from ``word2wf_dict``. A missing/unhashable key or a
    non-numeric stored value yields ``None``; other errors propagate instead
    of being hidden by a bare ``except``.
    """
    try:
        return np.log10(word2wf_dict[word] + 1)
    except (KeyError, TypeError):
        return None

In [None]:
# Attach the two word-level predictors returned by word2wf / word2op.
df['wf'] = df.word.apply(word2wf)
df['op'] = df.word.apply(word2op)
# Rows with any missing value (including a missing predictor) are removed.
df = df.dropna()
# `csse` is `sse` on rows where acc == 1; other rows get NaN through index alignment.
is_correct = df.acc == 1
df['csse'] = df.loc[is_correct, 'sse']

In [None]:
# Mean acc in each epoch.
# numeric_only=True: `word` is a text column and pandas >= 2.0 refuses to average it.
m_acc_epoch = df.groupby(['epoch']).mean(numeric_only=True).reset_index()
epo_acc = dict(zip(m_acc_epoch.epoch, m_acc_epoch.acc))
# Every epoch in `df` is a key of `epo_acc`, so map() fills every row.
df['epo_macc'] = df.epoch.map(epo_acc)

In [None]:
def get_fc_beta(df: pd.DataFrame, epoch:int, metric='acc') -> pd.DataFrame:
    """Fit ``metric ~ zscore(op) * zscore(wf) + 0`` on the rows of one epoch.

    Args:
        df: Table with ``epoch``, ``op``, ``wf`` and the ``metric`` column.
        epoch: Epoch whose rows are fitted.
        metric: Response column. ``'acc'`` is fitted with a Binomial family;
            any other metric is z-scored and fitted with a Gaussian family,
            matching ``get_beta_csse``.

    Returns:
        A one-row DataFrame with the fitted coefficients plus ``epoch``.
    """
    # Drop missing values only in the columns the model uses. A blanket dropna()
    # also removes every row whose `csse` is NaN (i.e. every row with acc != 1),
    # which leaves `acc` constant and makes the accuracy model meaningless.
    sdf = df.loc[df.epoch == epoch, [metric, 'op', 'wf']].dropna()

    if metric == 'acc':
        formula = "acc ~ zscore(op) * zscore(wf) + 0"
        family = sm.families.Binomial()
    else:
        # Continuous error scores are not valid Binomial responses.
        formula = f"zscore({metric}) ~ zscore(op) * zscore(wf) + 0"
        family = sm.families.Gaussian()

    m = smf.glm(formula=formula, family=family, data=sdf).fit()
    # Copy so that adding `epoch` does not modify the fitted result.
    p = m.params.copy()
    p['epoch'] = epoch

    return pd.DataFrame(p).T


In [None]:
# Accuracy betas for every saved epoch, reshaped to long format
# (one row per epoch and parameter) and tagged with that epoch's mean accuracy.
acc_fits = [get_fc_beta(df, epoch=saved_epoch, metric='acc') for saved_epoch in cfg.saved_epochs]
zbeta_acc = pd.concat(acc_fits, ignore_index=True)
zbeta_acc_long = zbeta_acc.melt(id_vars=['epoch'], var_name='param', value_name='beta')
zbeta_acc_long['acc'] = zbeta_acc_long.epoch.apply(epo_acc.__getitem__)

In [None]:
# csse betas for every saved epoch, reshaped to long format
# (one row per epoch and parameter) and tagged with that epoch's mean accuracy.
csse_fits = [get_fc_beta(df, epoch=saved_epoch, metric='csse') for saved_epoch in cfg.saved_epochs]
zbeta_csse = pd.concat(csse_fits, ignore_index=True)
zbeta_csse_long = zbeta_csse.melt(id_vars=['epoch'], var_name='param', value_name='beta')
zbeta_csse_long['acc'] = zbeta_csse_long.epoch.apply(epo_acc.__getitem__)

In [None]:
# `zbeta_long` is never defined in this notebook; the long tables built above are
# `zbeta_acc_long` and `zbeta_csse_long`. Plot the accuracy betas here
# (swap in `zbeta_csse_long` to inspect the csse betas instead).
dev = alt.Chart(zbeta_acc_long).mark_line(point=True).encode(
    x="epoch:Q",
    y="beta:Q",
    color="param:N")

# Same chart with mean accuracy on the x axis instead of epoch.
per = dev.encode(x="acc:Q")

dev | per

In [None]:
# Snapshot of the rows at epoch 60; copied so that new columns can be added safely.
df_60 = df.loc[df.epoch == 60].copy()

In [None]:
# Median split on each predictor. The medians are computed once instead of
# once per row inside the lambda; values at or below the median (and NaN)
# fall in the second label, exactly as before.
wf_median = df_60.wf.median()
op_median = df_60.op.median()
df_60['wf_gp'] = np.where(df_60.wf > wf_median, 'HF', 'LF')
# NOTE(review): this column holds the `op` split despite the `wf_` prefix;
# the name is kept because the chart cell below reads `wf_op`.
df_60['wf_op'] = np.where(df_60.op > op_median, 'INC', 'CON')

In [None]:
# Lift Altair's row limit so the whole of `df_60` can be charted.
alt.data_transformers.disable_max_rows()
# Mean accuracy per `wf_gp` group, one line per `wf_op` group.
alt.Chart(df_60).mark_line().encode(
    x='wf_gp:N',
    y='mean(acc):Q',
    color='wf_op:N'
).properties(width=200, height=200)

# Full grid F X C

In [None]:
import pandas as pd
import numpy as np
import meta
import json
from scipy.stats.mstats import zscore
import statsmodels.formula.api as smf
import statsmodels.api as sm
import altair as alt

### Retrieve batch settings

In [None]:
json_file = "models/station_3/batch_config.json"

with open(json_file) as f:
    batch_cfgs = json.load(f)

# NOTE(review): the comprehension used to filter on
# `type(cfg["params"].values()) is not list`. dict.values() never returns a
# list, so that test was always true and filtered nothing; it is dropped here.
# If some configs were meant to be skipped, the intended condition still
# needs to be written.
all_params = [pd.DataFrame(batch_cfg["params"]) for batch_cfg in batch_cfgs]
cfgs = pd.concat(all_params, ignore_index=True)
# One row per run. numeric_only=True keeps the reduction working on
# pandas >= 2.0 when `params` carries non-numeric settings.
cfgs = cfgs.groupby(['code_name', 'batch_size', 'learning_rate']).mean(numeric_only=True).reset_index()
cfgs = cfgs[['code_name', 'batch_size', 'learning_rate']]

In [None]:
%load_ext google.cloud.bigquery

In [None]:
# %%bigquery df
# SELECT
#   code_name, epoch, word, acc, sse 
# FROM
#   `majestic-camp-303620.station_3.train`
# WHERE
#   timetick = 12
#   AND output_name = 'pho'

In [None]:
# df.to_csv("models/station_3/pho_lasttick.csv")

In [None]:
# Reload the cached table from disk (the cells above that would export it are commented out).
df = pd.read_csv("models/station_3/pho_lasttick.csv")

In [None]:
# word -> value of the "uncond.surprisal" column
surprisal = pd.read_csv("/home/jupyter/triangle_model/corpus/noam_surprisal.csv")
word2op_dict = dict(zip(surprisal.word, surprisal["uncond.surprisal"]))

# word -> value of the `wf` column of the training set
df_train = pd.read_csv("/home/jupyter/triangle_model/dataset/df_train.csv")
word2wf_dict = dict(zip(df_train.word, df_train.wf))

def word2op(word):
    """Return the ``word2op_dict`` entry for ``word``, or ``None`` if it has none.

    Only lookup failures (missing or unhashable key) are treated as "no value";
    other errors propagate instead of being hidden by a bare ``except``.
    """
    try:
        return word2op_dict[word]
    except (KeyError, TypeError):
        return None

def word2wf(word):
    """Return ``log10(wf + 1)`` for ``word``, or ``None`` if it cannot be looked up.

    ``wf`` is read from ``word2wf_dict``. A missing/unhashable key or a
    non-numeric stored value yields ``None``; other errors propagate instead
    of being hidden by a bare ``except``.
    """
    try:
        return np.log10(word2wf_dict[word] + 1)
    except (KeyError, TypeError):
        return None

In [None]:
# csse: `sse` on rows where acc == 1, NaN elsewhere (filled by index alignment)
is_correct = df.acc == 1
df['csse'] = df.loc[is_correct, 'sse']

# Word-level predictors returned by word2wf / word2op
df['wf'] = df.word.apply(word2wf)
df['op'] = df.word.apply(word2op)

# Attach batch size and learning rate of each run via its code_name
df = pd.merge(df, cfgs, on=['code_name'], how='left')

In [None]:
# Checkpoint.
# index=False: this file is read back with a plain pd.read_csv, so writing the
# index would add another "Unnamed: 0" column on every save/load round trip.
df.to_csv("models/station_3/pho_lasttick.csv", index=False)

In [None]:
def get_beta_acc(df: pd.DataFrame, code_name:str, epoch:int) -> pd.DataFrame:
    """Fit a Binomial GLM ``acc ~ zscore(op) * zscore(wf) + 0`` for one run and epoch.

    Args:
        df: Table with ``epoch``, ``code_name``, ``batch_size``,
            ``learning_rate``, ``word``, ``acc``, ``op`` and ``wf``.
        code_name: Run whose rows are fitted.
        epoch: Epoch whose rows are fitted.

    Returns:
        A one-row DataFrame with the coefficients plus ``epoch``,
        ``code_name``, ``batch_size`` and ``learning_rate``.
    """
    in_cell = (df.epoch == epoch) & (df.code_name == code_name)
    cell = df.loc[in_cell]

    # Read the run's hyperparameters before narrowing to the model columns.
    labels = {
        'epoch': epoch,
        'code_name': code_name,
        'batch_size': cell.batch_size.unique()[0],
        'learning_rate': cell.learning_rate.unique()[0],
    }

    model_data = cell[['word', 'acc', 'op', 'wf']].dropna()
    fit = smf.glm(
        formula="acc ~ zscore(op) * zscore(wf) + 0",
        family=sm.families.Binomial(),
        data=model_data,
    ).fit()

    betas = fit.params
    for label, value in labels.items():
        betas[label] = value

    return pd.DataFrame(betas).T

def get_beta_csse(df: pd.DataFrame, code_name:str, epoch:int) -> "pd.DataFrame | None":
    """Fit ``zscore(csse) ~ zscore(op) * zscore(wf) + 0`` for one run and epoch.

    Args:
        df: Table with ``epoch``, ``code_name``, ``batch_size``,
            ``learning_rate``, ``word``, ``csse``, ``op`` and ``wf``.
        code_name: Run whose rows are fitted.
        epoch: Epoch whose rows are fitted.

    Returns:
        A one-row DataFrame with the coefficients plus ``epoch``,
        ``code_name``, ``batch_size`` and ``learning_rate``. Returns ``None``
        when there is nothing to fit (no rows, or no row with a ``csse``
        value) or when the fit itself fails.
    """
    import warnings

    sdf = df.loc[(df.epoch == epoch) & (df.code_name == code_name)]
    if sdf.empty:
        return None
    batch_size = sdf.batch_size.unique()[0]
    learning_rate = sdf.learning_rate.unique()[0]

    sdf = sdf[['word', 'csse', 'op', 'wf']].dropna()
    if sdf.empty:
        # Epochs without any correct answer have no csse values to model.
        return None

    # Only the fit is best-effort. Unlike a bare `except:`, this neither hides
    # KeyboardInterrupt nor masks unrelated bugs such as a missing column.
    try:
        m = smf.glm(formula="zscore(csse) ~ zscore(op) * zscore(wf) + 0", data=sdf).fit()
    except Exception as err:
        warnings.warn(f"csse GLM skipped (code_name={code_name}, epoch={epoch}): {err!r}")
        return None

    # Copy so the bookkeeping fields are not written into the fitted result.
    p = m.params.copy()
    p['epoch'] = epoch
    p['code_name'] = code_name
    p['batch_size'] = batch_size
    p['learning_rate'] = learning_rate

    return pd.DataFrame(p).T


In [None]:
def make_beta_df(df, func):
    """Run ``func`` for every (code_name, epoch) pair and return a long table of betas.

    Args:
        df: Table with at least ``code_name``, ``epoch`` and ``acc``.
        func: Callable ``func(df, code_name, epoch)`` returning a one-row
            DataFrame that includes ``code_name``, ``epoch``, ``batch_size``
            and ``learning_rate``, or ``None`` when nothing could be fitted.

    Returns:
        One row per (code_name, epoch, param) with columns ``code_name``,
        ``epoch``, ``batch_size``, ``learning_rate``, ``param``, ``beta`` and
        ``acc`` (mean accuracy of that run and epoch). Rows containing NaN
        are dropped.

    Raises:
        ValueError: If ``func`` returned ``None`` for every pair.
    """
    # Average `acc` alone: pandas >= 2.0 raises if mean() meets text columns.
    epoch_acc_map = df.groupby(['code_name', 'epoch'], as_index=False)['acc'].mean()
    code_names = sorted(df.code_name.unique())
    epochs = sorted(df.epoch.unique())

    fits = []
    for code_name in code_names:
        for epoch in epochs:
            fit = func(df, code_name, epoch)
            if fit is not None:
                fits.append(fit)
    if not fits:
        raise ValueError("no model could be fitted for any (code_name, epoch) pair")

    beta_df = pd.concat(fits, ignore_index=True)
    beta_df = beta_df.melt(id_vars=['code_name', 'epoch', 'batch_size', 'learning_rate'], var_name='param', value_name='beta')
    beta_df = pd.merge(beta_df, epoch_acc_map, on=['code_name', 'epoch'], how='left').dropna()

    return beta_df


In [None]:
# Long table of csse betas, one fit per (code_name, epoch) pair.
csse_beta = make_beta_df(df, get_beta_csse)

In [None]:
def plot_and_save(df, file_suffix:str):
    """Chart betas against epoch and against accuracy, saving each chart as HTML.

    Args:
        df: Long table with ``epoch``, ``acc``, ``beta``, ``param``,
            ``batch_size`` and ``learning_rate``.
        file_suffix: Text inserted between the ``dev`` / ``per`` prefix and
            the ``.html`` extension of the two output files.
    """
    encodings = {
        "x": "epoch:Q",
        "y": "beta:Q",
        "color": "param:N",
        "row": "batch_size:O",
        "column": "learning_rate:O",
    }
    dev = alt.Chart(df).mark_line(point=True).encode(**encodings)

    # Same chart with accuracy on the x axis.
    per = dev.encode(x="acc:Q")

    outputs = {f'dev{file_suffix}.html': dev, f'per{file_suffix}.html': per}
    for html_path, chart in outputs.items():
        chart.save(html_path)

In [None]:
# Writes dev_csse.html and per_csse.html.
plot_and_save(csse_beta, "_csse")