# Analyze Station3 batch

## Purposes
- Quantify and visualize over performance space

In [None]:
import os
from importlib import reload
import batch_utils
import pandas as pd
import sqlite3
import altair as alt
from scipy.stats.mstats import zscore
import statsmodels.formula.api as smf
import statsmodels.api as sm
from tqdm import tqdm
alt.data_transformers.disable_max_rows()

In [None]:
# Batch whose results database is inspected. Defined here so this cell also
# runs on a fresh kernel, before the loading cells below assign the same value.
batch_name = 'station_3'

# Print the names of all tables stored in the results database.
con = sqlite3.connect(f"models/{batch_name}/results.db")
try:
    cursor = con.cursor()
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    print(cursor.fetchall())
finally:
    # Close the connection even if the query fails.
    con.close()

In [None]:
# Load the complete TARABAN table of the batch into a DataFrame.
batch_name = 'station_3'

query = """
SELECT * FROM TARABAN
"""

# sqlite3's connection context manager only scopes a transaction and leaves
# the connection open, so close it explicitly once the table has been read.
c = sqlite3.connect(f"models/{batch_name}/results.db")
try:
    taraban = pd.read_sql(query, con=c)
finally:
    c.close()


# Use Taraban as a reference to align epoch

In [None]:
# Keep time ticks 8-12 and average every numeric column within each
# (code_name, batch_size, learning_rate, epoch, output_name) group.
df = taraban.loc[taraban['timetick'].isin(range(8, 13))]
# numeric_only=True: pandas >= 2.0 raises a TypeError when asked to average
# non-numeric columns; older versions silently dropped them, which is the
# behaviour this analysis relies on.
df = (
    df.groupby(['code_name', 'batch_size', 'learning_rate', 'epoch', 'output_name'])
    .mean(numeric_only=True)
    .reset_index()
)

# Accuracy over epoch, faceted by learning rate (columns) and batch size (rows).
alt.Chart(df).mark_line(point=True).encode(
    x='epoch:Q',
    y='mean(acc):Q',
    column='learning_rate:Q',
    row='batch_size:Q',
    color='output_name:N'
).save('Taraban_acc.html')

# Kept under its own name because `df` is reassigned by later cells while
# epoch_to_mean_acc keeps reading this table.
acc_df = df

In [None]:
def epoch_to_mean_acc(batch_size, learning_rate, epoch):
    """Return the mean 'pho' accuracy in ``acc_df`` for one training cell.

    Rows of the module-level ``acc_df`` are selected by exact match on
    ``batch_size``, ``learning_rate`` and ``epoch`` (and ``output_name ==
    'pho'``); the mean of their ``acc`` column is returned (NaN if no row
    matches).
    """
    selected = (
        (acc_df.epoch == epoch)
        & (acc_df.output_name == 'pho')
        & (acc_df.batch_size == batch_size)
        & (acc_df.learning_rate == learning_rate)
    )
    return acc_df.loc[selected, 'acc'].mean()

## RUN cell GLM

In [None]:
# Tidying up: 'pho' output rows at time ticks 8-12, averaged within each
# (code_name, batch_size, learning_rate, epoch, output_name, freq, reg) group.
df = taraban.loc[taraban['timetick'].isin(range(8, 13)) & (taraban['output_name'] == 'pho')].copy()
# numeric_only=True: pandas >= 2.0 raises a TypeError when asked to average
# non-numeric columns; older versions silently dropped them.
df = (
    df.groupby(['code_name', 'batch_size', 'learning_rate', 'epoch', 'output_name', 'freq', 'reg'])
    .mean(numeric_only=True)
    .reset_index()
)
# Numeric +0.5 / -0.5 codes for the two factors, used as GLM predictors.
df['reg_num'] = df.reg.apply(lambda x: 0.5 if x == 'Regular' else -0.5)
df['freq_num'] = df.freq.apply(lambda x: 0.5 if x == 'High' else -0.5)

In [None]:
def get_taraban_beta(df: pd.DataFrame, batch_size:int, learning_rate:float, epoch:int, metric='acc', standardize=False) -> 'pd.DataFrame | None':
    """Fit ``metric ~ freq_num * reg_num`` on one (batch_size, learning_rate, epoch) cell.

    Args:
        df: table with ``batch_size``, ``learning_rate``, ``epoch``,
            ``freq_num``, ``reg_num`` and the ``metric`` column.
        batch_size, learning_rate, epoch: values selecting the rows to fit.
        metric: name of the response column.
        standardize: when true, the response is wrapped in ``zscore(...)``
            inside the formula.

    Returns:
        A one-row DataFrame holding the fitted coefficients plus
        ``batch_size``, ``learning_rate``, ``epoch`` and ``acc`` (the value of
        ``epoch_to_mean_acc`` for the same cell), or ``None`` when the model
        cannot be fitted (a RuntimeWarning is emitted in that case).
    """
    import warnings

    # Parse the dataframe to get the parameters
    cell = df.loc[(df.batch_size == batch_size) & (df.learning_rate == learning_rate) & (df.epoch == epoch)]

    y = f'zscore({metric})' if standardize else metric  # pick y

    # Only the fit is best-effort; failures elsewhere should surface normally.
    try:
        m = smf.glm(formula=f"{y} ~ freq_num * reg_num", data=cell).fit()
    except Exception as e:
        warnings.warn(
            f"GLM fit failed for batch_size={batch_size}, learning_rate={learning_rate}, "
            f"epoch={epoch}, y={y}: {e!r}",
            RuntimeWarning,
            stacklevel=2,
        )
        return None

    # Copy so the extra entries are not written into the fitted model's params.
    p = m.params.copy()
    p['batch_size'] = batch_size
    p['learning_rate'] = learning_rate
    p['epoch'] = epoch
    p['acc'] = epoch_to_mean_acc(batch_size, learning_rate, epoch)
    return pd.DataFrame(p).T


In [None]:
# Distinct epoch / batch-size / learning-rate values present in the tidy
# Taraban table; run_taraban iterates over their full cross product.
epochs, batch_sizes, learning_rates = (
    list(df[column].unique())
    for column in ('epoch', 'batch_size', 'learning_rate')
)

def run_taraban(metric, standardize):
    """Fit the Taraban GLM on every grid cell and chart the betas against accuracy.

    Iterates over the module-level ``epochs`` x ``batch_sizes`` x
    ``learning_rates`` grid, collecting the rows returned by
    ``get_taraban_beta`` for the module-level ``df``.

    Args:
        metric: response column passed to ``get_taraban_beta``.
        standardize: whether the response is z-scored in the GLM.

    Returns:
        An Altair line chart of beta vs. ``acc``, coloured by parameter and
        faceted by learning rate (columns) and batch size (rows).

    Raises:
        ValueError: if no cell produced a fitted model.
    """
    fitted = []

    for epoch in tqdm(epochs):
        for batch_size in batch_sizes:
            for learning_rate in learning_rates:
                beta = get_taraban_beta(df, batch_size, learning_rate, epoch, metric, standardize)
                # get_taraban_beta yields None for cells it could not fit.
                if beta is not None:
                    fitted.append(beta)

    if not fitted:
        raise ValueError(f"No GLM could be fitted for metric={metric!r}, standardize={standardize!r}")

    # One concat replaces the repeated DataFrame.append calls (removed in
    # pandas 2.0, and quadratic in the number of cells).
    bdf = pd.concat(fitted)

    mdf = bdf.melt(id_vars=['batch_size', 'learning_rate', 'acc'],
        value_vars=['Intercept', 'reg_num', 'freq_num', 'freq_num:reg_num'], var_name='param', value_name='beta')

    return alt.Chart(mdf).mark_line(point=True).encode(
        y='beta:Q',
        x='acc:Q',
        column='learning_rate:O',
        row='batch_size:O',
        color='param:N'
    ).properties(title=f'Beta in Taraban (z:{standardize}, y:{metric}) ')


In [None]:
# Save one Taraban beta chart per (metric, standardize) combination.
for metric, standardize, filename in (
    ('acc', True, 'Taraban_beta_zacc.html'),
    ('acc', False, 'Taraban_beta_acc.html'),
    ('sse', True, 'Taraban_beta_zsse.html'),
    ('sse', False, 'Taraban_beta_sse.html'),
):
    run_taraban(metric, standardize).save(filename)

# Lexicality

In [None]:
# Load the complete LEXICALITY table of the batch into a DataFrame.
batch_name = 'station_3'

query = """
SELECT * FROM LEXICALITY
"""

# sqlite3's connection context manager only scopes a transaction and leaves
# the connection open, so close it explicitly once the table has been read.
c = sqlite3.connect(f"models/{batch_name}/results.db")
try:
    lex = pd.read_sql(query, con=c)
finally:
    c.close()


### Lexicality effect over epoch

In [None]:
# 'pho' output rows at time ticks 8-12, averaged within each
# (code_name, batch_size, learning_rate, epoch, cond) group.
df = lex.loc[(lex.timetick.isin(range(8, 13))) & (lex.output_name == 'pho')]
# numeric_only=True: output_name (a string column) is not a grouping key here,
# and pandas >= 2.0 raises a TypeError when asked to average non-numeric
# columns; older versions silently dropped them.
df = (
    df.groupby(['code_name', 'batch_size', 'learning_rate', 'epoch', 'cond'])
    .mean(numeric_only=True)
    .reset_index()
)

# Numeric code for the condition: +0.5 for 'word', -0.5 for everything else.
df['lex_num'] = df.cond.apply(lambda x: 0.5 if x == 'word' else -0.5)

# Accuracy over epoch per condition, faceted by learning rate and batch size.
alt.Chart(df).mark_line(point=True).encode(
    x='epoch:Q',
    y='mean(acc):Q',
    column='learning_rate:Q',
    row='batch_size:Q',
    color='cond:N'
).save('lexicality_over_epoch.html')


### NW vs. W

In [None]:
# Wide layout: one row per (code_name, batch_size, learning_rate, epoch) and
# one accuracy column per value of `cond`.
pdf = pd.pivot_table(
    df,
    values='acc',
    index=['code_name', 'batch_size', 'learning_rate', 'epoch'],
    columns='cond',
).reset_index()

In [None]:
# Scatter of nonword vs. word accuracy (both axes fixed to [0, 1]), coloured
# by code_name and faceted by learning rate (columns) and batch size (rows).
alt.Chart(pdf).mark_point().encode(
    alt.X('word:Q', scale=alt.Scale(domain=[0,1])),
    alt.Y('nonword:Q', scale=alt.Scale(domain=[0,1])),
    alt.Column('learning_rate:Q'),
    alt.Row('batch_size:Q'),
    alt.Color('code_name:N'),
).save('nonword_word.html')

In [None]:
def epoch_to_mean_acc(batch_size, learning_rate, epoch):
    """Mean 'pho' accuracy stored in ``acc_df`` for the given training cell.

    Filters the module-level ``acc_df`` on ``epoch``, ``batch_size``,
    ``learning_rate`` and ``output_name == 'pho'`` and averages the ``acc``
    column of the matching rows (NaN when nothing matches).
    """
    is_cell = (
        (acc_df.epoch == epoch)
        & (acc_df.output_name == 'pho')
        & (acc_df.batch_size == batch_size)
        & (acc_df.learning_rate == learning_rate)
    )
    matching_acc = acc_df.loc[is_cell, 'acc']
    return matching_acc.mean()

In [None]:
def get_lex_beta(df: pd.DataFrame, batch_size:int, learning_rate:float, epoch:int, metric='acc', standardize=False) -> 'pd.DataFrame | None':
    """Fit ``metric ~ lex_num`` on one (batch_size, learning_rate, epoch) cell.

    Args:
        df: table with ``batch_size``, ``learning_rate``, ``epoch``,
            ``lex_num`` and the ``metric`` column.
        batch_size, learning_rate, epoch: values selecting the rows to fit.
        metric: name of the response column.
        standardize: when true, the response is wrapped in ``zscore(...)``
            inside the formula.

    Returns:
        A one-row DataFrame holding the fitted coefficients plus
        ``batch_size``, ``learning_rate``, ``epoch`` and ``acc`` (the value of
        ``epoch_to_mean_acc`` for the same cell), or ``None`` when the model
        cannot be fitted (a RuntimeWarning is emitted in that case).
    """
    import warnings

    # Parse the dataframe to get the parameters
    cell = df.loc[(df.batch_size == batch_size) & (df.learning_rate == learning_rate) & (df.epoch == epoch)]
    y = f'zscore({metric})' if standardize else metric  # pick y

    # Only the fit is best-effort; failures elsewhere should surface normally.
    try:
        m = smf.glm(formula=f"{y} ~ lex_num", data=cell).fit()
    except Exception as e:
        warnings.warn(
            f"GLM fit failed for batch_size={batch_size}, learning_rate={learning_rate}, "
            f"epoch={epoch}, y={y}: {e!r}",
            RuntimeWarning,
            stacklevel=2,
        )
        return None

    # Copy so the extra entries are not written into the fitted model's params.
    p = m.params.copy()
    p['batch_size'] = batch_size
    p['learning_rate'] = learning_rate
    p['epoch'] = epoch
    p['acc'] = epoch_to_mean_acc(batch_size, learning_rate, epoch)
    return pd.DataFrame(p).T

In [None]:
# Distinct grid values found in the lexicality table; run_lex loops over them.
epochs = [*df['epoch'].unique()]
batch_sizes = [*df['batch_size'].unique()]
learning_rates = [*df['learning_rate'].unique()]

def run_lex(metric, standardize):
    """Fit the lexicality GLM on every grid cell and chart the betas against accuracy.

    Iterates over the module-level ``epochs`` x ``batch_sizes`` x
    ``learning_rates`` grid, collecting the rows returned by ``get_lex_beta``
    for the module-level ``df``.

    Args:
        metric: response column passed to ``get_lex_beta``.
        standardize: whether the response is z-scored in the GLM.

    Returns:
        An Altair line chart of beta vs. ``acc``, coloured by parameter and
        faceted by learning rate (columns) and batch size (rows).

    Raises:
        ValueError: if no cell produced a fitted model.
    """
    fitted = []

    for epoch in tqdm(epochs):
        for batch_size in batch_sizes:
            for learning_rate in learning_rates:
                beta = get_lex_beta(df, batch_size, learning_rate, epoch, metric, standardize)
                # get_lex_beta yields None for cells it could not fit.
                if beta is not None:
                    fitted.append(beta)

    if not fitted:
        raise ValueError(f"No GLM could be fitted for metric={metric!r}, standardize={standardize!r}")

    # One concat replaces the repeated DataFrame.append calls (removed in
    # pandas 2.0, and quadratic in the number of cells).
    bdf = pd.concat(fitted)

    mdf = bdf.melt(id_vars=['batch_size', 'learning_rate', 'acc'],
        value_vars=['Intercept', 'lex_num'], var_name='param', value_name='beta')

    # The title names this analysis; it previously carried the copy-pasted
    # "Taraban" label.
    return alt.Chart(mdf).mark_line(point=True).encode(
        y='beta:Q',
        x='acc:Q',
        column='learning_rate:O',
        row='batch_size:O',
        color='param:N'
    ).properties(title=f'Beta in Lexicality (z:{standardize}, y:{metric}) ')

In [None]:
# Save one lexicality beta chart per (metric, standardize) combination.
for metric, standardize, filename in (
    ('acc', False, 'Lexicality_beta_acc.html'),
    ('acc', True, 'Lexicality_zbeta_acc.html'),
    ('csse', False, 'Lexicality_beta_csse.html'),
    ('csse', True, 'Lexicality_zbeta_csse.html'),
):
    run_lex(metric, standardize).save(filename)

# Imageability

In [None]:
# Load the complete imageability table of the batch into a DataFrame.
query = """
SELECT * FROM imageability
"""

# sqlite3's connection context manager only scopes a transaction and leaves
# the connection open, so close it explicitly once the table has been read.
c = sqlite3.connect(f"models/{batch_name}/results.db")
try:
    img = pd.read_sql(query, con=c)
finally:
    c.close()


In [None]:
# Rows at time ticks 8-12, averaged within each
# (code_name, batch_size, learning_rate, epoch, cond, output_name) group.
df = img.loc[(img.timetick.isin(range(8, 13)))]
# numeric_only=True: pandas >= 2.0 raises a TypeError when asked to average
# non-numeric columns; older versions silently dropped them.
df = (
    df.groupby(['code_name', 'batch_size', 'learning_rate', 'epoch', 'cond', 'output_name'])
    .mean(numeric_only=True)
    .reset_index()
)

# `cond` is split on "_" into three parts: freq, op and img.
df[["freq", "op", "img"]] = df.cond.str.split("_", expand=True)
# First five characters of `cond`.
df["fc"] = df.cond.apply(lambda x: x[:5])
# Numeric +0.5 / -0.5 codes for each factor ("hf", "ls" and "hi" map to +0.5).
df["freq_num"] = df.freq.apply(lambda x: 0.5 if x == "hf" else -0.5)
df["op_num"] = df.op.apply(lambda x: 0.5 if x == "ls" else -0.5)
df["img_num"] = df.img.apply(lambda x: 0.5 if x == "hi" else -0.5)


In [None]:
def get_img_beta(df: pd.DataFrame, output_name: str, batch_size:int, learning_rate:float, epoch:int, metric='acc', standardize=False) -> 'pd.DataFrame | None':
    """Fit ``metric ~ img_num`` on one (output_name, batch_size, learning_rate, epoch) cell.

    Args:
        df: table with ``output_name``, ``batch_size``, ``learning_rate``,
            ``epoch``, ``img_num`` and the ``metric`` column.
        output_name: value of the ``output_name`` column to keep.
        batch_size, learning_rate, epoch: values selecting the rows to fit.
        metric: name of the response column.
        standardize: when true, the response is wrapped in ``zscore(...)``
            inside the formula.

    Returns:
        A one-row DataFrame holding the fitted coefficients plus
        ``batch_size``, ``learning_rate``, ``epoch`` and ``acc`` (the value of
        ``epoch_to_mean_acc`` for the same batch_size / learning_rate / epoch,
        regardless of ``output_name``), or ``None`` when the model cannot be
        fitted (a RuntimeWarning is emitted in that case).
    """
    import warnings

    # Parse the dataframe to get the parameters
    cell = df.loc[(df.batch_size == batch_size) & (df.learning_rate == learning_rate) & (df.epoch == epoch) & (df.output_name == output_name)]
    y = f'zscore({metric})' if standardize else metric  # pick y

    # Only the fit is best-effort; failures elsewhere should surface normally.
    try:
        m = smf.glm(formula=f"{y} ~ img_num", data=cell).fit()
    except Exception as e:
        warnings.warn(
            f"GLM fit failed for output_name={output_name}, batch_size={batch_size}, "
            f"learning_rate={learning_rate}, epoch={epoch}, y={y}: {e!r}",
            RuntimeWarning,
            stacklevel=2,
        )
        return None

    # Copy so the extra entries are not written into the fitted model's params.
    p = m.params.copy()
    p['batch_size'] = batch_size
    p['learning_rate'] = learning_rate
    p['epoch'] = epoch
    p['acc'] = epoch_to_mean_acc(batch_size, learning_rate, epoch)
    return pd.DataFrame(p).T

In [None]:
# Distinct grid values found in the imageability table; run_img loops over them.
epochs = list(df['epoch'].unique())
batch_sizes = list(df['batch_size'].unique())
learning_rates = list(df['learning_rate'].unique())

def run_img(output_name, metric, standardize):
    """Fit the imageability GLM on every grid cell and chart the betas against accuracy.

    Iterates over the module-level ``epochs`` x ``batch_sizes`` x
    ``learning_rates`` grid, collecting the rows returned by ``get_img_beta``
    for the module-level ``df``.

    Args:
        output_name: value of ``output_name`` passed to ``get_img_beta``.
        metric: response column passed to ``get_img_beta``.
        standardize: whether the response is z-scored in the GLM.

    Returns:
        An Altair line chart of beta vs. ``acc``, coloured by parameter and
        faceted by learning rate (columns) and batch size (rows).

    Raises:
        ValueError: if no cell produced a fitted model.
    """
    fitted = []

    for epoch in tqdm(epochs):
        for batch_size in batch_sizes:
            for learning_rate in learning_rates:
                beta = get_img_beta(df, output_name, batch_size, learning_rate, epoch, metric, standardize)
                # get_img_beta yields None for cells it could not fit.
                if beta is not None:
                    fitted.append(beta)

    if not fitted:
        raise ValueError(
            f"No GLM could be fitted for output_name={output_name!r}, "
            f"metric={metric!r}, standardize={standardize!r}"
        )

    # One concat replaces the repeated DataFrame.append calls (removed in
    # pandas 2.0, and quadratic in the number of cells).
    bdf = pd.concat(fitted)

    mdf = bdf.melt(id_vars=['batch_size', 'learning_rate', 'acc'],
        value_vars=['Intercept', 'img_num'], var_name='param', value_name='beta')

    return alt.Chart(mdf).mark_line(point=True).encode(
        y='beta:Q',
        x='acc:Q',
        column='learning_rate:O',
        row='batch_size:O',
        color='param:N'
    ).properties(title=f'Beta in IMG (out: {output_name} z:{standardize}, y:{metric}) ')

In [None]:
# Save the 'pho' imageability beta charts, one per (metric, standardize) pair.
for metric, standardize, filename in (
    ('acc', False, 'pho_IMG_beta_acc.html'),
    ('acc', True, 'pho_IMG_zbeta_acc.html'),
    ('csse', False, 'pho_IMG_beta_csse.html'),
    ('csse', True, 'pho_IMG_zbeta_csse.html'),
):
    run_img(output_name='pho', metric=metric, standardize=standardize).save(filename)

In [None]:
# Save the 'sem' imageability beta charts, one per (metric, standardize) pair.
for metric, standardize, filename in (
    ('acc', False, 'SEM_IMG_beta_acc.html'),
    ('acc', True, 'SEM_IMG_zbeta_acc.html'),
    ('csse', False, 'SEM_IMG_beta_csse.html'),
    ('csse', True, 'SEM_IMG_zbeta_csse.html'),
):
    run_img(output_name='sem', metric=metric, standardize=standardize).save(filename)