# Examine the effects of stimulus properties
On PHO:
- F + OP + IMG + OP x F + OP x IMG + F x IMG 

On SEM:
- IMG x F

Steps:
1. Get output at tick 12
2. Fit a regression model (lm) for each rng_seed
    - Logistic regression for accuracy
    - Linear regression for SSE
3. Extract all betas
4. Average the betas over rng_seed
5. Plot developmental and performance space
    - add zero horizontal line
    - add epoch info
    - overlay SEM accuracy on the PHO output plots, and vice versa
6. Make an interactive heatmap if time permits

In [None]:
# Utilities
%load_ext google.cloud.bigquery
import sqlite3
import json
import meta
from tqdm import tqdm

# Tidy and visualize
import pandas as pd
import numpy as np
import altair as alt

# Statistics
from scipy.stats.mstats import zscore
import statsmodels.formula.api as smf
import statsmodels.api as sm

# Get PHO betas

In [None]:
# %%bigquery df
# SELECT
#   code_name, epoch, word, acc, sse 
# FROM
#   `majestic-camp-303620.station_3.train`
# WHERE
#   timetick = 12
#   AND output_name = 'pho'

In [None]:
# df.to_csv("models/station_3/pho_lasttick.csv")

In [None]:
# Load the cached item-level PHO output; this is the file the commented-out
# BigQuery / to_csv cells above would produce.
df = pd.read_csv("models/station_3/pho_lasttick.csv", index_col=0)

In [None]:
json_file = "models/station_3/batch_config.json"

with open(json_file) as f:
    batch_cfgs = json.load(f)

# One frame per run configuration.
# NOTE(review): every config is kept. An earlier guard compared
# `type(params.values())` against `list`, which can never match a dict view,
# so it never filtered anything. Confirm whether some configs should be skipped.
all_params = [pd.DataFrame(cfg["params"]) for cfg in batch_cfgs]
cfgs = pd.concat(all_params, ignore_index=True)

# Select the needed columns BEFORE averaging so that non-numeric
# hyper-parameters cannot make `.mean()` fail (pandas >= 2.0 raises on them).
cfgs = cfgs[['code_name', 'batch_size', 'learning_rate', 'rng_seed']]
cfgs = cfgs.groupby(['code_name', 'batch_size', 'learning_rate'], as_index=False).mean()

In [None]:
# word -> OP predictor, taken from the "uncond.surprisal" column
surprisal = pd.read_csv("/home/jupyter/triangle_model/corpus/noam_surprisal.csv")
word2op_dict = dict(zip(surprisal.word, surprisal["uncond.surprisal"]))

# word -> raw word frequency
df_train = pd.read_csv("/home/jupyter/triangle_model/dataset/df_train.csv")
word2wf_dict = dict(zip(df_train.word, df_train.wf))

# The first row's img is treated as the dataset's mean-replacement value;
# words carrying exactly that value are left out of the imageability lookup.
img_replacement_value = df_train.img[0]
word2img_dict = {
    word: img
    for word, img in zip(df_train.word, df_train.img)
    if img != img_replacement_value
}


def word2op(word):
    """Return the OP value stored for ``word`` in ``word2op_dict``.

    Returns ``None`` when the word has no entry. Only a missing key is treated
    as "no value"; other errors are no longer swallowed.
    """
    return word2op_dict.get(word)

def word2wf(word):
    """Return ``log10(wf + 1)`` for ``word``, using ``word2wf_dict``.

    Returns ``None`` when the word has no entry. Only a missing key is treated
    as "no value"; a malformed frequency now raises instead of silently
    becoming ``None``.
    """
    try:
        wf = word2wf_dict[word]
    except KeyError:
        return None
    return np.log10(wf + 1)

def word2img(word):
    """Return the img value stored for ``word`` in ``word2img_dict``.

    Returns ``None`` when the word has no entry (including words that were
    excluded from the lookup when it was built).
    """
    return word2img_dict.get(word)

# Words that have all three predictors (OP, WF and IMG) available
selected_words = set(word2op_dict) & set(word2wf_dict) & set(word2img_dict)

## Examine correlations between stimulus properties 

In [None]:
# One row per selected word with the three predictors used in the regressions
has_all_predictors = df_train.word.isin(selected_words)
cor_df = df_train.loc[has_all_predictors, ['word', 'wf', 'img']].copy()
cor_df['op'] = cor_df.word.apply(word2op)
# Same transform as word2wf: log10(wf + 1)
cor_df['lwf'] = cor_df.wf.apply(lambda freq: np.log10(freq + 1))

In [None]:
# Number of words that have all three predictors
len(cor_df)

In [None]:
# Pairwise correlations between log frequency, OP and imageability
cor_df[['lwf', 'op', 'img']].corr()

In [None]:
# Keep only the words that have OP, WF and IMG values.
# .copy() detaches the filtered frame so the column assignments below write to
# an independent frame rather than a slice (avoids SettingWithCopyWarning).
df = df[df.word.isin(selected_words)].copy()

# csse: SSE kept only for items with acc == 1, NaN elsewhere
df['csse'] = df.sse.where(df.acc == 1)

# Get wf, op and img for each word
df['wf'] = df.word.apply(word2wf)
df['op'] = df.word.apply(word2op)
df['img'] = df.word.apply(word2img)

# Get batch size and learning rate
df = df.merge(cfgs, on=['code_name'], how='left')

# checkpoint
df.to_csv("models/station_3/parsed_pho_df.csv")

In [None]:
def get_pho_beta(df: pd.DataFrame, code_name:str, epoch:int, metric:str) -> pd.DataFrame:
    """Fit one GLM on PHO output for a single (code_name, epoch) and return its betas.

    The model is ``y ~ op*wf + op*img + wf*img`` without intercept, with all
    predictors z-scored. For ``metric == 'acc'`` a binomial GLM is fitted on
    raw accuracy; otherwise a Gaussian GLM is fitted on the z-scored metric.

    Args:
        df: item-level data with columns code_name, epoch, word, op, wf, img,
            batch_size, learning_rate and the requested metric column.
        code_name: model run to select.
        epoch: epoch to select.
        metric: one of 'acc', 'sse', 'csse'.

    Returns:
        A one-row DataFrame holding the fitted parameters plus epoch,
        code_name, batch_size, learning_rate and metric; ``None`` when there is
        no data for this (code_name, epoch) or the model cannot be fitted.
    """
    import warnings

    assert metric in ('acc', 'sse', 'csse')

    sdf = df.loc[(df.epoch == epoch) & (df.code_name == code_name)]
    if sdf.empty:
        # Nothing recorded for this combination; skip instead of raising IndexError.
        return None

    batch_size = sdf.batch_size.unique()[0]
    learning_rate = sdf.learning_rate.unique()[0]

    # Complete cases only (csse is NaN for incorrect items)
    sdf = sdf[['word', metric, 'op', 'wf', 'img']].dropna()

    rhs = "zscore(op) * zscore(wf) + zscore(op) * zscore(img) + zscore(wf) * zscore(img) + 0"

    try:
        if metric == 'acc':
            m = smf.glm(formula=f"acc ~ {rhs}", family=sm.families.Binomial(), data=sdf).fit()
        else:
            # Use the requested metric; a hard-coded `csse` made metric='sse' always fail.
            m = smf.glm(formula=f"zscore({metric}) ~ {rhs}", data=sdf).fit()
    except Exception as err:
        # Best effort: one failed fit must not abort the whole sweep, but say so.
        warnings.warn(
            f"GLM failed for code_name={code_name}, epoch={epoch}, metric={metric}: {err!r}",
            RuntimeWarning,
        )
        return None

    # Copy so the bookkeeping fields are not written into the model's own params
    p = m.params.copy()
    p['epoch'] = epoch
    p['code_name'] = code_name
    p['batch_size'] = batch_size
    p['learning_rate'] = learning_rate
    p['metric'] = metric

    return pd.DataFrame(p).T


In [None]:

def make_beta_df(df, func, acc_label:str):
    """Make a long dataframe of all betas for each code_name, epoch and metric (acc, csse).

    Args:
        df: item-level raw data dataframe.
        func: function that fits one model and returns a one-row dataframe of
            betas, or None on failure (e.g., get_pho_beta, get_sem_beta).
        acc_label: name for the column holding mean accuracy at a given
            (code_name, epoch).

    Returns:
        Long-format dataframe with one row per (code_name, epoch, metric, param),
        the fitted ``beta`` and the mean-accuracy column; rows with any missing
        value are dropped.

    Raises:
        ValueError: if no model could be fitted at all.
    """
    # Average accuracy only: averaging every column fails on the string
    # `word` column in pandas >= 2.0.
    epoch_acc_map = (
        df.groupby(['code_name', 'epoch'], as_index=False)['acc']
        .mean()
        .rename(columns={'acc': acc_label})
    )

    code_names = sorted(df.code_name.unique())
    epochs = sorted(df.epoch.unique())
    metrics = ['acc', 'csse']

    # Do the job; skip the fits that returned None
    rows = []
    for code_name in tqdm(code_names):
        for epoch in epochs:
            for metric in metrics:
                row = func(df, code_name, epoch, metric)
                if row is not None:
                    rows.append(row)

    if not rows:
        raise ValueError("No model could be fitted; nothing to concatenate.")

    beta_df = pd.concat(rows, ignore_index=True)
    beta_df = beta_df.melt(id_vars=['code_name', 'epoch', 'batch_size', 'learning_rate', 'metric'], var_name='param', value_name='beta')
    beta_df = pd.merge(beta_df, epoch_acc_map, on=['code_name', 'epoch'], how='left').dropna()

    return beta_df


In [None]:
# Fit all PHO models and checkpoint the long-format betas
pho_beta = make_beta_df(df, get_pho_beta, acc_label='pho_acc')
pho_beta.to_csv("models/station_3/pho_beta.csv")

# Get SEM betas

In [None]:
# %%bigquery df
# SELECT
#   code_name, epoch, word, acc, sse 
# FROM
#   `majestic-camp-303620.station_3.train`
# WHERE
#   timetick = 12
#   AND output_name = 'sem'

In [None]:
# The query cell above is commented out, so `df` still holds the parsed PHO
# frame here. Saving it would overwrite the SEM cache with PHO data; load the
# cached SEM output instead (mirrors the PHO section).
# df.to_csv("models/station_3/sem_lasttick.csv")
df = pd.read_csv("models/station_3/sem_lasttick.csv", index_col=0)

In [None]:
# Keep only the words that have OP, WF and IMG values.
# .copy() detaches the filtered frame so the column assignments below write to
# an independent frame rather than a slice (avoids SettingWithCopyWarning).
df = df[df.word.isin(selected_words)].copy()

# csse: SSE kept only for items with acc == 1, NaN elsewhere
df['csse'] = df.sse.where(df.acc == 1)

# Get wf, op and img for each word
df['wf'] = df.word.apply(word2wf)
df['op'] = df.word.apply(word2op)
df['img'] = df.word.apply(word2img)

# Get batch size and learning rate
df = df.merge(cfgs, on=['code_name'], how='left')

# checkpoint
df.to_csv("models/station_3/parsed_sem_df.csv")

In [None]:

def get_sem_beta(df: pd.DataFrame, code_name:str, epoch:int, metric:str) -> pd.DataFrame:
    """Fit one GLM on SEM output for a single (code_name, epoch) and return its betas.

    The model is ``y ~ wf*img`` without intercept, with z-scored predictors.
    For ``metric == 'acc'`` a binomial GLM is fitted on raw accuracy; otherwise
    a Gaussian GLM is fitted on the z-scored metric.

    Args:
        df: item-level data with columns code_name, epoch, word, op, wf, img,
            batch_size, learning_rate and the requested metric column.
        code_name: model run to select.
        epoch: epoch to select.
        metric: one of 'acc', 'sse', 'csse'.

    Returns:
        A one-row DataFrame holding the fitted parameters plus epoch,
        code_name, batch_size, learning_rate and metric; ``None`` when there is
        no data for this (code_name, epoch) or the model cannot be fitted.
    """
    import warnings

    assert metric in ('acc', 'sse', 'csse')

    sdf = df.loc[(df.epoch == epoch) & (df.code_name == code_name)]
    if sdf.empty:
        # Nothing recorded for this combination; skip instead of raising IndexError.
        return None

    batch_size = sdf.batch_size.unique()[0]
    learning_rate = sdf.learning_rate.unique()[0]

    # Complete cases only. `op` is not in the formula but stays in the
    # selection so the fitted sample is unchanged.
    sdf = sdf[['word', metric, 'op', 'wf', 'img']].dropna()

    rhs = "zscore(wf) * zscore(img) + 0"

    try:
        if metric == 'acc':
            m = smf.glm(formula=f"acc ~ {rhs}", family=sm.families.Binomial(), data=sdf).fit()
        else:
            # Use the requested metric; a hard-coded `csse` made metric='sse' always fail.
            m = smf.glm(formula=f"zscore({metric}) ~ {rhs}", data=sdf).fit()
    except Exception as err:
        # Best effort: one failed fit must not abort the whole sweep, but say so.
        warnings.warn(
            f"GLM failed for code_name={code_name}, epoch={epoch}, metric={metric}: {err!r}",
            RuntimeWarning,
        )
        return None

    # Copy so the bookkeeping fields are not written into the model's own params
    p = m.params.copy()
    p['epoch'] = epoch
    p['code_name'] = code_name
    p['batch_size'] = batch_size
    p['learning_rate'] = learning_rate
    p['metric'] = metric

    return pd.DataFrame(p).T

In [None]:
# Fit all SEM models and checkpoint the long-format betas.
# acc_label is required by make_beta_df; 'sem_acc' is the column the
# accuracy-exchange step below reads from this file.
sem_beta = make_beta_df(df, get_sem_beta, acc_label='sem_acc')
sem_beta.to_csv("models/station_3/sem_beta.csv")

# Exchange mean accuracy between PHO and SEM

In [None]:
# Reload both beta checkpoints so this section can run without refitting
sem_beta = pd.read_csv("models/station_3/sem_beta.csv", index_col=0)
pho_beta = pd.read_csv("models/station_3/pho_beta.csv", index_col=0)

In [None]:
# Mean accuracy per (code_name, epoch). Only the accuracy column is averaged:
# averaging every column fails on the string columns (metric, param) in
# pandas >= 2.0.
sem_acc_map = sem_beta.groupby(['code_name', 'epoch'], as_index=False)['sem_acc'].mean()
pho_acc_map = pho_beta.groupby(['code_name', 'epoch'], as_index=False)['pho_acc'].mean()

In [None]:
# Attach the other output layer's mean accuracy to each beta table.
# NOTE(review): re-running this cell on frames that already carry the merged
# column would produce suffixed duplicates (_x/_y) — run once per reload.
pho_beta = pho_beta.merge(sem_acc_map, on=['code_name', 'epoch'], how='left')
sem_beta = sem_beta.merge(pho_acc_map, on=['code_name', 'epoch'], how='left')

In [None]:
# Overwrite the beta checkpoints with the versions that include both accuracies
pho_beta.to_csv("models/station_3/pho_beta.csv")
sem_beta.to_csv("models/station_3/sem_beta.csv")

# Plotting

In [None]:
def plot_beta(df, x:str, metric:str, additional_acc: str):
    """Build a faceted chart of betas against ``x`` for one metric.

    Layers, drawn on the rows of ``df`` where ``metric`` matches:
      - a horizontal rule at y = 0 for reference,
      - one line per ``param`` (toggle visibility by clicking the legend),
      - a red vertical rule at the x position of epoch 50,
      - a black line showing the mean of ``additional_acc``.

    The chart is faceted by batch_size (rows) and learning_rate (columns) and
    is returned to the caller; nothing is saved here.
    """
    metric_df = df.loc[(df.metric == metric)]
    x_enc = f"{x}:Q"

    legend_pick = alt.selection_multi(fields=['param'], bind='legend')

    # Betas: unselected params are hidden by setting opacity to 0
    beta_lines = (
        alt.Chart()
        .mark_line(point=True)
        .encode(
            x=x_enc,
            y="beta:Q",
            color="param:N",
            opacity=alt.condition(legend_pick, alt.value(1), alt.value(0.)),
        )
        .add_selection(legend_pick)
    )

    # Mean accuracy of the other output layer
    acc_line = (
        alt.Chart()
        .mark_line(color='black')
        .encode(x=x_enc, y=f"mean({additional_acc}):Q")
    )

    # Vertical marker where epoch == 50
    epoch50_rule = (
        alt.Chart()
        .transform_filter(alt.datum.epoch == 50)
        .mark_rule(color='red')
        .encode(x=x_enc)
    )

    # Horizontal zero line; the `zero` field is computed on the layered chart below
    zero_rule = alt.Chart().mark_rule().encode(y='zero:Q')

    layered = alt.layer(zero_rule, beta_lines, epoch50_rule, acc_line, data=metric_df)
    faceted = (
        layered
        .transform_calculate(zero="0")
        .facet(row="batch_size:O", column="learning_rate:O")
        .interactive()
    )
    return faceted.properties(title=f"{metric}_by_{x}. Red vertical line indicate epoch == 50")
    



## Plot PHO

In [None]:
pho_beta = pd.read_csv("models/station_3/pho_beta.csv", index_col=0)
# Average over runs within each hyper-parameter cell. numeric_only=True keeps
# the string `code_name` column out of the mean (pandas >= 2.0 raises otherwise).
pho_beta = pho_beta.groupby(['epoch', 'batch_size', 'learning_rate', 'metric', 'param']).mean(numeric_only=True).reset_index()

In [None]:
# "dev" files plot betas against epoch, "per" files against PHO accuracy;
# SEM accuracy is overlaid as the black line in every chart.
plot_beta(pho_beta, x='epoch', metric='acc', additional_acc='sem_acc').save("models/station_3/pho_beta_dev_acc.html")
plot_beta(pho_beta, x='epoch', metric='csse', additional_acc='sem_acc').save("models/station_3/pho_beta_dev_csse.html")
plot_beta(pho_beta, x='pho_acc', metric='acc', additional_acc='sem_acc').save("models/station_3/pho_beta_per_acc.html")
plot_beta(pho_beta, x='pho_acc', metric='csse', additional_acc='sem_acc').save("models/station_3/pho_beta_per_csse.html")

## Plot SEM

In [None]:
sem_beta = pd.read_csv("models/station_3/sem_beta.csv", index_col=0)
# Average over runs within each hyper-parameter cell. numeric_only=True keeps
# the string `code_name` column out of the mean (pandas >= 2.0 raises otherwise).
sem_beta = sem_beta.groupby(['epoch', 'batch_size', 'learning_rate', 'metric', 'param']).mean(numeric_only=True).reset_index()

In [None]:
# "dev" files plot betas against epoch, "per" files against SEM accuracy;
# PHO accuracy is overlaid as the black line in every chart.
plot_beta(sem_beta, x='epoch', metric='acc', additional_acc='pho_acc').save("models/station_3/sem_beta_dev_acc.html")
plot_beta(sem_beta, x='epoch', metric='csse', additional_acc='pho_acc').save("models/station_3/sem_beta_dev_csse.html")
plot_beta(sem_beta, x='sem_acc', metric='acc', additional_acc='pho_acc').save("models/station_3/sem_beta_per_acc.html")
plot_beta(sem_beta, x='sem_acc', metric='csse', additional_acc='pho_acc').save("models/station_3/sem_beta_per_csse.html")

# Lexicality

In [None]:
from contextlib import closing

batch_name = 'station_3'

query = """
SELECT code_name, batch_size, learning_rate, epoch, acc, sse, csse FROM LEXICALITY
WHERE testset = 'glushko' AND timetick = 12
"""

# sqlite3's own context manager only commits or rolls back; it does not close
# the connection. closing() guarantees the database handle is released.
with closing(sqlite3.connect(f"models/{batch_name}/results.db")) as c:
    nonword = pd.read_sql(query, con=c)

nonword['cond'] = 'nonword'

In [None]:
# Reuse the hyper-parameters recorded with the Glushko results:
# one (batch_size, learning_rate) row per code_name.
settings = nonword.groupby('code_name', as_index=False)[['batch_size', 'learning_rate']].mean()

In [None]:
# Item-level PHO output on training words, tagged with each run's hyper-parameters
word = (
    pd.read_csv("models/station_3/pho_lasttick.csv", index_col=0)
    .merge(settings, on='code_name', how='left')
)
# SSE kept only for items with acc == 1, NaN elsewhere
word['csse'] = word.sse.where(word.acc == 1)
# Same column layout as `nonword`
lex_columns = ['code_name', 'batch_size', 'learning_rate', 'epoch', 'acc', 'sse', 'csse']
word = word[lex_columns].assign(cond='word')

In [None]:
# Stack word and nonword rows; `cond` tells them apart
df = pd.concat([word, nonword], axis=0)
# Drop the two source frames now that they are combined
del word, nonword

In [None]:
# Collapse items to one mean row per run, epoch and condition.
# NOTE(review): groupby drops rows whose key is NaN by default, so word rows
# from runs with no entry in `settings` would be left out — confirm this is intended.
df = df.groupby(['code_name', 'batch_size', 'learning_rate', 'epoch', 'cond']).mean().reset_index()

In [None]:
# Attach mean accuracy on words by (code_name, epoch).
# Only `acc` is averaged: averaging every column fails on the string `cond`
# column in pandas >= 2.0.
word_acc = (
    df.loc[df.cond == 'word']
    .groupby(['code_name', 'epoch'], as_index=False)['acc']
    .mean()
    .rename(columns={'acc': 'word_acc'})
)
df = df.merge(word_acc, on=['code_name', 'epoch'], how='left')
# Numeric lexicality predictor: 1 for words, 0 for nonwords
df['lex_num'] = (df.cond == 'word').astype(int)

## Lexicality over epoch

In [None]:
# Mean accuracy over epochs, one line per condition, faceted by hyper-parameters.
# NOTE(review): this file is written to the current working directory, unlike
# the other charts that go to models/station_3/ — confirm the location.
alt.Chart(df).mark_line(point=True).encode(
    x='epoch:Q',
    y='mean(acc):Q',
    column='learning_rate:Q',
    row='batch_size:Q',
    color='cond:N'
).save('lexicality_over_epoch_211012.html')

## Betas approach

In [None]:
def get_lex_beta(df: pd.DataFrame, batch_size:int, learning_rate:float, epoch:int, metric='acc', standardize=False) -> pd.DataFrame:
    """Fit ``metric ~ lex_num`` for one (batch_size, learning_rate, epoch) cell.

    Args:
        df: run-level data with columns batch_size, learning_rate, epoch,
            lex_num, word_acc and the requested metric column.
        batch_size, learning_rate, epoch: cell to select.
        metric: name of the dependent-variable column.
        standardize: z-score the dependent variable when True.

    Returns:
        A one-row DataFrame with the fitted parameters (Intercept, lex_num)
        plus batch_size, learning_rate, epoch and the cell's mean word_acc;
        ``None`` when the cell is empty or the model cannot be fitted.
    """
    import warnings

    # Parse the dataframe to get the parameters
    sdf = df.loc[(df.batch_size == batch_size) & (df.learning_rate == learning_rate) & (df.epoch == epoch)]
    if sdf.empty:
        return None

    # Taken from the whole cell, before any row is dropped for the fit
    word_acc = sdf.word_acc.mean()
    y = f'zscore({metric})' if standardize else metric  # pick y

    try:
        # Complete cases only, so zscore() never sees a missing metric value
        # (mirrors the .dropna() in get_pho_beta / get_sem_beta).
        fit_df = sdf.dropna(subset=[metric, 'lex_num'])
        m = smf.glm(formula=f"{y} ~ lex_num", data=fit_df).fit()
    except Exception as err:
        # Best effort: one failed fit must not abort the whole sweep, but say so.
        warnings.warn(
            f"GLM failed for batch_size={batch_size}, learning_rate={learning_rate}, "
            f"epoch={epoch}, metric={metric}: {err!r}",
            RuntimeWarning,
        )
        return None

    # Copy so the bookkeeping fields are not written into the model's own params
    p = m.params.copy()
    p['batch_size'] = batch_size
    p['learning_rate'] = learning_rate
    p['epoch'] = epoch
    p['word_acc'] = word_acc

    return pd.DataFrame(p).T

In [None]:
# Grid of cells to fit; run_lex_dev / run_lex_per read these module-level lists
epochs = list(df.epoch.unique())
batch_sizes = list(df.batch_size.unique())
learning_rates = list(df.learning_rate.unique())

def run_lex_dev(metric, standardize):
    """Chart lexicality betas over epochs (developmental space).

    Fits get_lex_beta for every (epoch, batch_size, learning_rate) combination
    from the module-level ``epochs``, ``batch_sizes`` and ``learning_rates``
    on the module-level ``df``, then plots Intercept and lex_num betas against
    epoch, faceted by batch_size (rows) and learning_rate (columns).

    Raises:
        ValueError: if no cell could be fitted.
    """
    # Collect rows and concatenate once: DataFrame.append was removed in
    # pandas 2.0 and re-copied the whole frame on every call.
    rows = []
    for epoch in tqdm(epochs):
        for batch_size in batch_sizes:
            for learning_rate in learning_rates:
                row = get_lex_beta(df, batch_size, learning_rate, epoch, metric, standardize)
                if row is not None:  # failed or empty cells are skipped
                    rows.append(row)

    if not rows:
        raise ValueError("No lexicality model could be fitted.")

    beta_lex = pd.concat(rows)

    mdf = beta_lex.melt(id_vars=['batch_size', 'learning_rate', 'epoch'], 
        value_vars=['Intercept', 'lex_num'], var_name='param', value_name='beta')

    return alt.Chart(mdf).mark_line(point=True).encode(
        y='beta:Q',
        x='epoch:Q',
        column='learning_rate:O',
        row='batch_size:O',
        color='param:N'
    ).properties(title=f'Beta in Taraban (z:{standardize}, y:{metric}) ')

def run_lex_per(metric, standardize):
    """Chart lexicality betas against word accuracy (performance space).

    Fits get_lex_beta for every (epoch, batch_size, learning_rate) combination
    from the module-level ``epochs``, ``batch_sizes`` and ``learning_rates``
    on the module-level ``df``, then plots Intercept and lex_num betas against
    word_acc, faceted by batch_size (rows) and learning_rate (columns).

    Raises:
        ValueError: if no cell could be fitted.
    """
    # Collect rows and concatenate once: DataFrame.append was removed in
    # pandas 2.0 and re-copied the whole frame on every call.
    rows = []
    for epoch in tqdm(epochs):
        for batch_size in batch_sizes:
            for learning_rate in learning_rates:
                row = get_lex_beta(df, batch_size, learning_rate, epoch, metric, standardize)
                if row is not None:  # failed or empty cells are skipped
                    rows.append(row)

    if not rows:
        raise ValueError("No lexicality model could be fitted.")

    beta_lex = pd.concat(rows)

    mdf = beta_lex.melt(id_vars=['batch_size', 'learning_rate', 'word_acc'], 
        value_vars=['Intercept', 'lex_num'], var_name='param', value_name='beta')

    return alt.Chart(mdf).mark_line(point=True).encode(
        y='beta:Q',
        x='word_acc:Q',
        column='learning_rate:O',
        row='batch_size:O',
        color='param:N'
    ).properties(title=f'Beta in Taraban (z:{standardize}, y:{metric})')

In [None]:
# Raw betas for accuracy, standardized (z) betas for csse; "dev" = over epoch,
# "per" = over word accuracy.
run_lex_dev('acc', False).save('models/station_3/Lex_beta_dev_acc.html')
run_lex_dev('csse', True).save('models/station_3/Lex_zbeta_dev_csse.html')
run_lex_per('acc', False).save('models/station_3/Lex_beta_per_acc.html')
run_lex_per('csse', True).save('models/station_3/Lex_zbeta_per_csse.html')