In [None]:
%load_ext lab_black

Simulate the SSE boundary: the largest SSE at which accuracy (ACC) is still 1

In [None]:
# Library from evaluate.py
import numpy as np
import pandas as pd


def gen_pkey(p_file="../common/patterns/mappingv2.txt"):
    """Read the phonological patterns from the mapping file into a dict.

    The file is parsed as a header-less, whitespace-separated table. The
    first column is used as the key and the remaining columns of the same
    row become that key's pattern (see the Harm & Seidenberg PDF file).

    Args:
        p_file: Path to the mapping file.

    Returns:
        dict mapping each first-column label to the list of the remaining
        values on its row, in file order.
    """
    # `sep=r"\s+"` is the documented replacement for the deprecated
    # `delim_whitespace=True`; it splits on any run of whitespace.
    mapping = pd.read_table(p_file, header=None, sep=r"\s+")
    return mapping.set_index(0).T.to_dict("list")


def get_pronunciation_fast(act, phon_key):
    """Decode one activation vector into a pronunciation string.

    `act` is treated as consecutive slots, each as wide as one pattern in
    `phon_key`. Every slot is matched to the phoneme whose pattern has the
    smallest summed absolute difference from it; ties go to the phoneme
    that appears first in `phon_key`.

    The number of phonemes, the pattern width and the number of slots are
    derived from the inputs, so the original 38 x 25 x 10 layout is just one
    supported case.

    Args:
        act: 1-D array-like whose length is a multiple of the pattern width.
        phon_key: dict mapping phoneme label -> pattern (equal-length lists).

    Returns:
        str: the matched phoneme labels concatenated in slot order.

    Raises:
        ValueError: if `phon_key` is empty or `act` cannot be split into
            whole slots.
    """
    phonemes = list(phon_key.keys())
    if not phonemes:
        raise ValueError("phon_key must contain at least one phoneme")

    # (n_phonemes, n_features) table of reference patterns.
    patterns = np.asarray(list(phon_key.values()))
    n_features = patterns.shape[1]

    flat_act = np.asarray(act).ravel()
    if n_features == 0 or flat_act.size % n_features != 0:
        raise ValueError(
            "activation length %d is not a multiple of the pattern width %d"
            % (flat_act.size, n_features)
        )

    # (n_slots, n_features): one row per phoneme slot.
    slots = flat_act.reshape(-1, n_features)

    # Broadcast to (n_phonemes, n_slots, n_features) and sum over features to
    # get the L1 distance of every slot to every phoneme pattern.
    distances = np.abs(patterns[:, np.newaxis, :] - slots[np.newaxis, :, :]).sum(
        axis=2
    )

    # No squeeze here: the (n_phonemes, n_slots) shape must survive even when
    # there is a single slot, so argmin always yields one index per slot.
    best = np.argmin(distances, axis=0)
    return "".join(phonemes[idx] for idx in best)


def get_all_pronunciations_fast(act, phon_key):
    """Decode every 1-D slice along axis 1 of `act` into a pronunciation.

    Each slice is handed to `get_pronunciation_fast` together with
    `phon_key`; the per-slice results are assembled by
    `np.apply_along_axis` and returned as-is.
    """

    def decode_row(row):
        # Same call the original passed through apply_along_axis' extra args.
        return get_pronunciation_fast(row, phon_key)

    decoded = np.apply_along_axis(decode_row, axis=1, arr=act)
    return decoded


def get_accuracy(output, target):
    """Score each produced pronunciation against its target.

    Args:
        output: iterable of produced pronunciations.
        target: object exposing `.tolist()` (e.g. an array or Series) with
            the expected pronunciation at the same position.

    Returns:
        np.ndarray with 1 where the pronunciation equals the target at the
        same position and 0 otherwise, one entry per element of `output`.
    """
    expected = target.tolist()
    # Index into `expected` by position so that an `output` longer than the
    # target still raises IndexError.
    hits = [
        int(pronunciation == expected[position])
        for position, pronunciation in enumerate(output)
    ]
    return np.array(hits)


def get_mean_accuracy(output, target):
    """Return the mean of the 0/1 scores produced by `get_accuracy`."""
    per_item_hits = get_accuracy(output, target)
    return np.mean(per_item_hits)


def get_sse(output, target):
    """Compute the sum of squared errors for each item.

    Args:
        output: indexable collection of produced vectors (e.g. a 2-D array).
        target: object exposing `.tolist()` holding the matching targets.

    Returns:
        np.ndarray with one sum of squared differences per entry of `output`.
    """
    expected = target.tolist()
    n_items = len(output)
    return np.array(
        [np.sum(np.square(output[row] - expected[row])) for row in range(n_items)]
    )


def get_mean_sse(output, target):
    """Return the mean per-item SSE divided by the number of items.

    NOTE(review): `np.mean` already averages over the items, so the extra
    division by `len(output)` normalises by the item count a second time.
    Kept unchanged so values stay on the same scale as before; confirm
    whether this scale is intended.
    """
    per_item_sse = get_sse(output, target)
    n_items = len(output)
    return np.mean(per_item_sse) / n_items

### Load Strain Dataset

In [None]:
# Phoneme -> pattern lookup read from the mapping file; used later to decode
# output vectors into pronunciations.
pkey = gen_pkey("../../common/patterns/mappingv2.txt")
# Directory holding the Strain inputs, relative to this notebook.
input_path = "../../common/input/"
# Strain item table; the first CSV column becomes the index.
df_strain = pd.read_csv(input_path + "df_strain.csv", index_col=0)
# Array stored under the "data" key of the .npz archive; the simulation below
# adds noise to it and scores the result against it.
y_strain = np.load(input_path + "y_strain.npz")["data"]

### Main simulation function
1. Add noise to teaching signal
2. Convert the noisy signal to a pronunciation (P) using the mapping file (mappingv2.txt)
3. Calculate SSE and ACC

In [None]:
def my_sim(noise_lv):
    """Run one noisy-teaching-signal simulation on the Strain set.

    Adds `noise_lv * U[0, 1)` noise to every element of the global
    `y_strain` (the noise is therefore never negative), decodes the noisy
    signal with the global `pkey`, and scores it.

    Args:
        noise_lv: scale factor applied to the uniform noise.

    Returns:
        tuple (acc, sse): mean accuracy of the decoded pronunciations against
        `df_strain.pho`, and `get_mean_sse` of the noisy signal against
        `y_strain`.
    """
    perturbation = noise_lv * np.random.rand(*y_strain.shape)
    noisy_y = y_strain + perturbation

    decoded = get_all_pronunciations_fast(noisy_y, pkey)
    mean_sse = get_mean_sse(noisy_y, y_strain)
    mean_acc = get_mean_accuracy(decoded, df_strain.pho)
    return mean_acc, mean_sse

Since too much noise simply wipes out accuracy, the simulation keeps the noise level between 0.5 and 0.51, which is close to the decision boundary at each output unit.

In [None]:
# Seed NumPy's global generator (the one `np.random.rand` draws from) so that
# Restart & Run All reproduces the same sweep and the same boundary value.
np.random.seed(0)

# One simulation per noise level on a fine grid just above 0.5.
sims_acc = []
sims_sse = []
for noise in np.linspace(0.5, 0.51, 10000):
    sim_acc, sim_sse = my_sim(noise)
    sims_acc.append(sim_acc)
    sims_sse.append(sim_sse)

Plot results

In [None]:
# One row per simulation run: mean accuracy and the matching SSE value.
df = pd.DataFrame({"acc": sims_acc, "sse": sims_sse})
# Scatter of accuracy against SSE across all runs.
df.plot.scatter(x="sse", y="acc")

### Results
- Simulated max SSE for 100% accuracy in Strain data set = 0.134

In [None]:
# Largest SSE among the runs whose mean accuracy is exactly 1.
df.loc[df.acc == 1, "sse"].max()