In [None]:
%load_ext lab_black
import pandas as pd
import numpy as np

### Essential libraries

In [None]:
def get_accuracy(output, target):
    """Score predictions against targets.

    Compares `output` and `target` with `==` and converts the boolean result
    to integers: 1 where they are equal, 0 where they differ.
    """
    is_match = np.array(output == target)
    return is_match * 1


def get_slot_sse(output, target, slot_len=25):
    """Get slot-based sum of squared errors (SSE).

    The last axis of `output` and `target` is cut into consecutive slots of
    `slot_len` units and the squared error is summed within each slot.

    Args:
        output: array of model outputs.
        target: array of target values; `output - target` must broadcast.
        slot_len: number of units in each slot (default 25).

    Returns:
        Array whose first axis indexes the slot and whose remaining axes are
        the leading axes of the inputs. For 2-D inputs of shape
        (n_items, n_units) the result has shape
        (n_units // slot_len, n_items).

    Raises:
        ValueError: if `slot_len` is not positive, or if the last axis of
            `target` is not an exact multiple of `slot_len`.
    """
    output = np.asarray(output)
    target = np.asarray(target)

    if slot_len <= 0:
        raise ValueError(f"slot_len must be positive, got {slot_len}")

    # Integer arithmetic: a float section count would be silently truncated by
    # np.array_split, which can yield slots that are NOT slot_len wide
    # (e.g. 120 units with slot_len=25 -> 4 slots of 30 units).
    segments, leftover = divmod(target.shape[-1], slot_len)
    if leftover:
        raise ValueError(
            f"last axis of length {target.shape[-1]} is not a multiple of "
            f"slot_len={slot_len}"
        )
    segments = int(segments)

    squared_error = np.square(output - target)
    # Stacking the equal-width pieces puts the slot index on axis 0; summing
    # the last axis collapses the units inside each slot.
    return np.sum(np.array_split(squared_error, segments, axis=-1), axis=-1)


def gen_pkey(p_file="../common/patterns/mappingv2.txt"):
    """Read the phonological pattern mapping file into a dictionary.

    The file is read as a header-less, whitespace-separated table. The first
    column of each row becomes a dictionary key and the remaining columns of
    that row, in order, become its value list.

    Args:
        p_file: path to the mapping file.

    Returns:
        dict mapping each first-column entry to the list of the other values
        on its row.
    """
    # read phonological patterns from the mapping file
    # See Harm & Seidenberg PDF file
    # sep=r"\s+" is the documented replacement for the deprecated
    # delim_whitespace=True option.
    mapping = pd.read_table(p_file, header=None, sep=r"\s+")
    m_dict = mapping.set_index(0).T.to_dict("list")
    return m_dict


def get_pronunciation_fast(act, phon_key):
    """Decode one activation vector into a pronunciation string.

    `act` is treated as consecutive slots, each as wide as one pattern in
    `phon_key`. For every slot, the phoneme whose pattern has the smallest
    summed absolute difference from the slot's activations is selected (the
    first one wins on ties), and the selected phonemes are concatenated in
    slot order.

    The number of phonemes, pattern width, and slot count are derived from
    the inputs rather than fixed at 38 / 25 / 10.

    Args:
        act: 1-D activation vector whose length is a multiple of the pattern
            width.
        phon_key: dict mapping phoneme symbol -> pattern (equal-length
            sequences of numbers), e.g. as returned by `gen_pkey`.

    Returns:
        str with one phoneme symbol per slot.

    Raises:
        ValueError: if `phon_key` does not form a 2-D pattern table or the
            length of `act` is not a multiple of the pattern width.
    """
    phonemes = list(phon_key.keys())
    patterns = np.asarray(list(phon_key.values()))
    if patterns.ndim != 2:
        raise ValueError("phon_key must map to equal-length, non-empty patterns")
    n_features = patterns.shape[1]

    act = np.asarray(act).ravel()
    n_slots, leftover = divmod(act.size, n_features)
    if leftover:
        raise ValueError(
            f"activation length {act.size} is not a multiple of the "
            f"pattern width {n_features}"
        )
    slots = act.reshape(n_slots, n_features)

    # Broadcast to (n_phonemes, n_slots, n_features), then collapse the
    # feature axis to get an L1 distance per (phoneme, slot) pair.
    distance = np.abs(patterns[:, None, :] - slots[None, :, :]).sum(axis=2)
    nearest = distance.argmin(axis=0)
    return "".join(phonemes[idx] for idx in nearest)


def get_all_pronunciations_fast(act, phon_key):
    """Decode every row of `act` into a pronunciation string.

    Runs `get_pronunciation_fast` along axis 1 of `act`, handing the same
    `phon_key` to every call, and returns the array NumPy assembles from
    the per-row results.
    """
    decoded = np.apply_along_axis(
        get_pronunciation_fast, 1, act, phon_key=phon_key
    )
    return decoded


# Phoneme -> pattern lookup read from the default mapping file
p_key = gen_pkey()
# Item table; the first CSV column is used as the index
df_strain = pd.read_csv("../common/input/df_strain.csv", index_col=0)
# Open the .npz archive in a context manager so its file handle is closed
# once the "data" array has been read into memory.
with np.load("../common/input/y_strain.npz") as strain_npz:
    y_strain = strain_npz["data"]
# Pronunciation strings decoded from the target patterns themselves
y_true = get_all_pronunciations_fast(y_strain, p_key)

### New function for getting output matrix from JZ export

In [None]:
def get_output(model_id, n_items=160, n_units=250):
    """Fast way to read a model's exported output with numpy.

    Reads the second column (index 1) of `sse/{model_id}_literal.txt` and
    reshapes the values row-major into an (n_items, n_units) matrix.

    Args:
        model_id: identifier used to build the file name.
        n_items: number of rows in the returned matrix (default 160).
        n_units: number of columns in the returned matrix (default 250).

    Returns:
        ndarray of shape (n_items, n_units).

    Raises:
        ValueError: (from reshape) if the file does not contain exactly
            n_items * n_units values.
    """
    a = np.genfromtxt(f"sse/{model_id}_literal.txt", usecols=1)
    return a.reshape((n_items, n_units))


# Safer way to read the data: load it with pandas and look up each item by key (used to cross-check get_output; check PASSED)

# df = pd.read_csv(f"sse/{model_id}_literal.txt", delimiter="\t", header=None)
# df.columns = ["item", "output", "trash"]
# item_dict = {k: df.loc[df.item == k, "output"] for k in df.item.unique()}

# c = np.empty((160, 250))
# for i, key in enumerate(df.item.unique()):
#     c[i,] = item_dict[key]

### Parse all output from JZ

In [None]:
# Model run ids and the noise level of each run (paired by position)
model_list = ["64195158", "64195283", "64195408", "64195658"]
noise_level = [0, 1, 2, 4]
assert len(model_list) == len(noise_level), "each model needs a noise level"

# Collect one evaluated frame per model and concatenate once at the end,
# instead of re-copying a growing frame on every iteration.
per_model_eval = []

for m, noise in zip(model_list, noise_level):

    # Work on a copy: plain assignment would alias df_strain, so every
    # iteration would add/overwrite columns on the shared item table.
    item_eval = df_strain.copy()
    x = get_output(m)
    y_pred = get_all_pronunciations_fast(x, p_key)

    item_eval["output"] = y_pred
    item_eval["acc"] = get_accuracy(y_pred, y_true)

    # SSE related: slot_sse has one row per slot, so row k-1 becomes
    # column "sse_slot{k}" (sse_slot1, sse_slot2, ...).
    slot_sse = get_slot_sse(x, y_strain)
    for slot_no, slot_values in enumerate(slot_sse, start=1):
        item_eval[f"sse_slot{slot_no}"] = slot_values
    item_eval["sse"] = slot_sse.sum(axis=0)

    item_eval["model"] = m
    item_eval["noise"] = noise

    per_model_eval.append(item_eval)

all_out = pd.concat(per_model_eval)

In [None]:
# Save the combined per-item results of all models to a CSV file in the
# current working directory.
all_out.to_csv("sse_parsed.csv")