In [None]:
%load_ext lab_black

Simulate the SSE boundary (the largest SSE) at which ACC = 1 is still achieved

In [None]:
# Library from evaluate.py
import numpy as np
import pandas as pd


def gen_pkey(p_file="../common/patterns/mappingv2.txt"):
    """Read the phonological patterns from the mapping file into a dict.

    See Harm & Seidenberg PDF file.

    Args:
        p_file: path to a header-less, whitespace-delimited text file. The
            first field of each row is used as the key and the remaining
            fields of that row form the pattern.

    Returns:
        dict mapping each first-column value to the list of the remaining
        values on its row.
    """
    # sep=r"\s+" is the documented replacement for delim_whitespace=True,
    # which is deprecated in recent pandas releases.
    mapping = pd.read_table(p_file, header=None, sep=r"\s+")
    # Index by the symbol column, then transpose so each symbol becomes a
    # column whose values are collected into a list.
    m_dict = mapping.set_index(0).T.to_dict("list")
    return m_dict


def get_pronunciation_fast(act, phon_key):
    """Decode one activation vector into a phoneme string.

    The vector is cut into consecutive slots, each as long as one pattern in
    ``phon_key``. Every slot is assigned the phoneme whose pattern has the
    smallest summed absolute difference from the slot (first match wins on
    ties). The number of phonemes, slots and features is taken from the
    inputs rather than being fixed at 38 / 10 / 25.

    Args:
        act: 1-D array-like of length ``n_slots * n_features``.
        phon_key: dict mapping phoneme symbol -> pattern (all patterns of the
            same length), e.g. the dict returned by ``gen_pkey``.

    Returns:
        str: the selected symbols, one per slot, concatenated in slot order.

    Raises:
        ValueError: if the length of ``act`` is not a multiple of the
            pattern length.
    """
    phonemes = list(phon_key.keys())
    patterns = np.asarray(list(phon_key.values()))  # (n_phonemes, n_features)
    n_features = patterns.shape[1]

    act = np.asarray(act)
    if act.size % n_features != 0:
        raise ValueError(
            "activation length {} is not a multiple of the pattern length {}".format(
                act.size, n_features
            )
        )
    slots = act.reshape(-1, n_features)  # (n_slots, n_features)

    # Broadcast every pattern against every slot -> (n_phonemes, n_slots, n_features),
    # then collapse the feature axis to get one distance per (phoneme, slot).
    dist = np.abs(patterns[:, np.newaxis, :] - slots[np.newaxis, :, :]).sum(axis=2)
    best = dist.argmin(axis=0)
    return "".join(phonemes[i] for i in best)


def get_all_pronunciations_fast(act, phon_key):
    """Decode every activation vector along axis 1 of ``act``.

    Runs ``get_pronunciation_fast`` on each 1-D slice taken along axis 1 and
    returns the array of decoded strings produced by ``np.apply_along_axis``.
    """
    return np.apply_along_axis(
        func1d=get_pronunciation_fast, axis=1, arr=act, phon_key=phon_key
    )


def get_accuracy(output, target):
    """Return 1 where ``output`` equals ``target`` and 0 elsewhere (element-wise)."""
    is_match = np.array(output == target)
    # Multiplying by 1 turns the boolean matches into integers.
    return np.multiply(1, is_match)


def get_mean_accuracy(output, target):
    """Return the proportion of elements where ``output`` equals ``target``."""
    item_accuracy = get_accuracy(output, target)
    return np.mean(item_accuracy)


def get_sse(output, target):
    """Sum of squared errors over the last axis (item level)."""
    residual = output - target
    squared_error = np.square(residual)
    return np.sum(squared_error, axis=-1)


def get_mean_sse(output, target):
    """Return the mean of the item-level SSE computed by ``get_sse``."""
    item_sse = get_sse(output, target)
    return np.mean(item_sse)

In [None]:
# NOTE(review): this cell reads `y_strain` and `pkey`, which in the visible
# notebook are only assigned in the "Load Strain Dataset" cell further down,
# so it fails on a fresh kernel unless that cell is run first.
# Decode the clean teaching signal.
ys = get_all_pronunciations_fast(y_strain, pkey)
# Add uniform noise in [0, 0.51) to every output value (np.random.rand is uniform).
y_noisy = y_strain + 0.51 * np.random.rand(*y_strain.shape)
# Decode the noisy signal.
ysn = get_all_pronunciations_fast(y_noisy, pkey)
# Per-item 1/0: does the noisy decoding match the clean decoding?
get_accuracy(ys, ysn)

### Load Strain Dataset

In [None]:
# Phoneme -> pattern dictionary used to decode output activations.
pkey = gen_pkey("../../common/patterns/mappingv2.txt")
input_path = "../../common/input/"
# Strain item table; its `pho` column is used as the target pronunciation
# by the simulation functions below.
df_strain = pd.read_csv(input_path + "df_strain.csv", index_col=0)
# Teaching signal for the Strain items, stored under the "data" key of the .npz file.
y_strain = np.load(input_path + "y_strain.npz")["data"]

### Main simulation function
1. Add noise to teaching signal
2. Convert the noisy signal to P (pronunciation) with the mapping file (`mappingv2.txt`)
3. Calculate SSE and ACC

In [None]:
def my_sim(noise_lv):
    """Evaluate mean ACC and SSE of the Strain set when noise is added to the teaching signal.

    noise_lv: multiplier that scales uniform [0, 1) noise drawn with np.random.rand()

    Returns (acc, sse):
        acc: mean accuracy of the pronunciations decoded from the noisy signal
             after clipping it to [0, 1], compared with df_strain.pho
        sse: mean SSE between the unclipped noisy signal and y_strain
    """
    perturbation = noise_lv * np.random.rand(*y_strain.shape)
    noisy_y = y_strain + perturbation

    # Only the decoding sees the clipped signal; SSE uses the raw noisy values.
    decoded = get_all_pronunciations_fast(np.clip(noisy_y, 0, 1), pkey)

    mean_sse = get_mean_sse(noisy_y, y_strain)
    mean_acc = get_mean_accuracy(decoded, df_strain.pho)
    return mean_acc, mean_sse


def my_sim2():
    """Check Jay's hypothesis... SSE at ZER = 2.5 (caught a bug in get_mean_sse, since fixed).

    The teaching signal is rescaled so that 0 becomes 0.1 and 1 becomes 0.9,
    then decoded and scored against the original signal.

    Returns (acc, sse): mean accuracy against df_strain.pho and mean SSE
    against y_strain.
    """
    squashed_y = y_strain * 0.8 + 0.1
    decoded = get_all_pronunciations_fast(squashed_y, pkey)

    mean_sse = get_mean_sse(squashed_y, y_strain)
    mean_acc = get_mean_accuracy(decoded, df_strain.pho)
    return mean_acc, mean_sse


def my_sim3(noise_lv):
    """Work-in-progress simulation that pushes the teaching signal close to flipping.

    noise_lv: multiplier that scales uniform [0, 1) noise from np.random.rand()

    NOTE(review): unfinished as written -- `uni_noisy` is computed but never
    used, nothing is decoded or scored, and the function returns None.
    Confirm the intended noise step and return value before relying on it.
    """

    # Push to almost flipping
    uni_noisy = noise_lv * np.random.rand(*y_strain.shape)
    # Rescale so that 0 -> 0.26 and 1 -> 0.78
    tipping_y = y_strain * 0.52 + 0.52 / 2
    # NOTE(review): the noise drawn above is not applied here
    noisy_y = tipping_y
    

Since too much noise will just wipe out accuracy, the simulation keeps the noise level between 0.5 and 0.51, i.e. near the boundary at each output unit.

In [None]:
# Sweep the noise multiplier from 0.5 to 0.51 and record mean ACC / SSE at each level.
sims_acc, sims_sse = [], []
for noise in np.linspace(0.5, 0.51, 10000):
    acc_at_noise, sse_at_noise = my_sim(noise)
    sims_acc.append(acc_at_noise)
    sims_sse.append(sse_at_noise)

In [None]:
# NOTE(review): my_sim3 has no return statement in its current form, so this
# call produces no output.
my_sim3(1)

Plot results

In [None]:
# Gather the sweep results into one frame and plot ACC against SSE.
df = pd.DataFrame({"acc": sims_acc, "sse": sims_sse})
df.plot.scatter(x="sse", y="acc")

### Results
- Simulated max SSE for 100% accuracy in Strain data set = 0.134

In [None]:
# Largest SSE among the simulations that still reached ACC = 1.
df[df["acc"].eq(1)]["sse"].max()


In [None]:
# NOTE(review): at this point in the visible notebook `df` holds only the
# "acc" and "sse" columns built above; the frame with an "epoch" column is
# loaded in a later cell, so this line depends on out-of-order execution.
df.epoch.unique()

# SSE by ACC

In [None]:
# Its `ID` column is used below to select the matching `code_name` rows of `df`.
old_df = pd.read_csv("1250_sims.csv")

In [None]:
# Load the long-format results table and give its columns short names.
# NOTE: this rebinds `df`, replacing the simulation frame built earlier.
column_names = [
    "code_name",
    "epoch",
    "hidden",
    "cleanup",
    "pnoise",
    "lr",
    "cond",
    "measure",
    "score",
]
df = pd.read_csv("df_1M_200609.csv")
df.columns = column_names

# Keep only the SSE-type measures for runs whose code_name also appears in old_df.ID.
is_sse_measure = df["measure"].isin(["SSE", "CorrSSE", "IncorrSSE"])
in_old_sims = df["code_name"].isin(old_df["ID"].unique())
sdf = df[is_sse_measure & in_old_sims]

In [None]:
import altair as alt

# Turn off Altair's maximum-rows check so the full results table can be charted.
alt.data_transformers.disable_max_rows()

- The shift in max and min values over training is interesting--relative to the end of training, the SSE of both correct and incorrect responses is much larger at the beginning of training.  
    - Same as loss... 
- I was a little surprised by the magnitude of SSE for correct responses at the beginning.  
- Also the overlap between the SSE distributions for correct and incorrect responses is interesting.   
- The max SSE for correct responses is always substantially higher than the min SSE for incorrect responses. 

SSE Mean shift over epoch

In [None]:
# Mean score for every epoch / cond / measure combination.
plotdf_mean = (
    sdf.groupby(["epoch", "cond", "measure"])["score"].mean().reset_index()
)

# One panel per measure, one line per cond.
(
    alt.Chart(plotdf_mean)
    .mark_line()
    .encode(
        x="epoch:Q",
        y="score",
        column="measure",
        color="cond",
        tooltip="score",
    )
    .properties(title="Mean SSE over epoch")
)

In [None]:
# Mean score for every epoch / pnoise / measure combination.
plotdf_mean = (
    sdf.groupby(["epoch", "pnoise", "measure"])["score"].mean().reset_index()
)

# One panel per measure, one line per pnoise level (treated as ordinal).
(
    alt.Chart(plotdf_mean)
    .mark_line()
    .encode(
        x="epoch:Q",
        y="score",
        column="measure",
        color="pnoise:O",
        tooltip="score",
    )
    .properties(title="Mean SSE over epoch")
)

In [None]:
# Mean score for every epoch / hidden / measure combination.
plotdf_mean = (
    sdf.groupby(["epoch", "hidden", "measure"])["score"].mean().reset_index()
)

# One panel per measure, one line per hidden value (treated as ordinal).
(
    alt.Chart(plotdf_mean)
    .mark_line()
    .encode(
        x="epoch:Q",
        y="score",
        column="measure",
        color="hidden:O",
        tooltip="score",
    )
    .properties(title="Mean SSE over epoch")
)

In [None]:
# Mean score for every epoch / lr / measure combination.
plotdf_mean = (
    sdf.groupby(["epoch", "lr", "measure"])["score"].mean().reset_index()
)

# One panel per measure, one line per lr value (treated as ordinal).
(
    alt.Chart(plotdf_mean)
    .mark_line()
    .encode(
        x="epoch:Q",
        y="score",
        column="measure",
        color="lr:O",
        tooltip="score",
    )
    .properties(title="Mean SSE over epoch")
)

In [None]:
# Mean score for every epoch / cleanup / measure combination.
plotdf_mean = (
    sdf.groupby(["epoch", "cleanup", "measure"])["score"].mean().reset_index()
)

# One panel per measure, one line per cleanup value (treated as ordinal).
(
    alt.Chart(plotdf_mean)
    .mark_line()
    .encode(
        x="epoch:Q",
        y="score",
        column="measure",
        color="cleanup:O",
        tooltip="score",
    )
    .properties(title="Mean SSE over epoch")
)

In [None]:
# Min / max / mean score per epoch and measure, reshaped to long format
# (one row per epoch, measure and statistic) for plotting.
plotdf = (
    sdf.groupby(["epoch", "measure"])["score"]
    .agg(["min", "max", "mean"])
    .reset_index()
    .melt(id_vars=["epoch", "measure"], value_vars=["min", "mean", "max"])
)

In [None]:
# One panel per measure, one line per statistic (min / mean / max).
(
    alt.Chart(plotdf)
    .mark_line()
    .encode(
        x="epoch:Q",
        y="value",
        column="measure",
        color="variable",
        tooltip="value:Q",
    )
    .properties(title="Min/Max SSE over epoch")
)