# Create MN train r100 testset
MikeNet uses a slightly different test set, so we need to wrap it in our usual format to reuse our code base for testing.

In [1]:
import os
import data_wrangling
import numpy as np
# Move the working directory up one level; the relative paths used further
# down (e.g. "mikenet/..." and "dataset/testsets/...") are resolved against it.
# NOTE: the path is relative, so re-running this cell moves up another level.
os.chdir("../")

# Parser functions

In [10]:
def get_pattern(lines: list, line_start: int, line_end: int) -> list:
    """Get the pattern (representation) vector from lines.

    Args:
        lines: Lines of the pattern file, as a list of strings.
        line_start: Index of the first line to read (inclusive).
        line_end: Index one past the last line to read (exclusive).

    Returns:
        A flat list of ints with every unit value found on the selected
        lines, in reading order.

    Raises:
        IndexError: If the requested range runs past the end of ``lines``.
        ValueError: If a token is neither empty, a lone comma, nor an integer.
    """
    pattern = []
    for i in range(line_start, line_end):
        # Index explicitly rather than slicing, so a truncated block raises
        # IndexError instead of silently producing a short pattern.
        tokens = lines[i].strip().split(" ")
        # Empty tokens come from repeated spaces; lone commas are skipped
        # (presumably separators in the file format).
        pattern.extend(int(unit) for unit in tokens if unit not in ("", ","))
    return pattern


def parse_pattern(
    file: str,
    *,
    ort_rows: int = 14,
    pho_rows: int = 8,
    sem_rows: int = 1,
) -> tuple:
    """Parse a MikeNet pattern file into items and their representations.

    The file is scanned line by line. A line starting with ``"TAG Word: "``
    opens a new item. Each ``CLAMP Ortho``, ``TARGET Phono`` or
    ``TARGET Semantics`` header line is followed by a fixed number of value
    lines, which are read with ``get_pattern`` and stored under the most
    recently seen word. If a word occurs more than once, later patterns
    overwrite earlier ones in the dictionaries.

    Args:
        file: Path of the pattern file.
        ort_rows: Number of value lines after a ``CLAMP Ortho`` header.
        pho_rows: Number of value lines after a ``TARGET Phono`` header.
        sem_rows: Number of value lines after a ``TARGET Semantics`` header.

    Returns:
        A tuple ``(items, ort, pho, ort_pattern, pho_pattern,
        sparse_sem_pattern)``: the words in file order, their human readable
        orthography and phonology strings (parallel lists), and three dicts
        mapping each word to its list of ints for that representation.

    Raises:
        ValueError: If a pattern header appears before any ``TAG Word:`` line.
        IndexError: If a ``TAG Word:`` line has fewer than seven
            space-separated fields, or a pattern block runs past end of file.
    """
    items = []
    ort = []
    pho = []
    ort_pattern = {}
    pho_pattern = {}
    sparse_sem_pattern = {}

    # Header prefix -> (destination dict, number of value lines that follow).
    sections = (
        ("CLAMP Ortho", ort_pattern, ort_rows),
        ("TARGET Phono", pho_pattern, pho_rows),
        ("TARGET Semantics", sparse_sem_pattern, sem_rows),
    )

    with open(file, "r") as f:
        lines = f.readlines()

    word = None  # most recent item; pattern blocks are attached to it
    for i, line in enumerate(lines):
        line = line.strip()

        if line.startswith("TAG Word: "):
            content = line.split(" ")
            word = content[2]
            ort.append(content[6])  # human readable ort
            pho.append(content[4])  # human readable pho
            items.append(word)  # Record item
            print(f"word:{word}, ort:{ort[-1]}, pho:{pho[-1]}")
            continue

        for prefix, store, n_rows in sections:
            if not line.startswith(prefix):
                continue
            if word is None:
                # Fail with a clear message instead of an unbound-variable
                # error when the file does not start with a TAG line.
                raise ValueError(
                    f"{file}: '{prefix}' block on line {i + 1} "
                    "appears before any 'TAG Word:' line"
                )
            store[word] = get_pattern(lines, i + 1, i + 1 + n_rows)
            break

    return items, ort, pho, ort_pattern, pho_pattern, sparse_sem_pattern


def sparse_to_dense(representation, units=2446) -> np.ndarray:
    """Convert sparse (semantic) representation to dense representation.

    Args:
        representation: Iterable of indices of the active units. Each entry
            is passed through ``int()`` before being used as an index.
        units: Length of the dense vector.

    Returns:
        A 1-D float array of length ``units`` holding 1 at every listed
        index and 0 everywhere else.

    Raises:
        IndexError: If an index falls outside the dense vector.
    """
    dense = np.zeros(units)
    # Build an explicit integer index array so an empty representation is
    # handled the same way as a populated one.
    active = np.fromiter((int(unit) for unit in representation), dtype=np.intp)
    dense[active] = 1
    return dense


# Parse MikeNet pattern file

In [11]:
# Parse the MikeNet pattern file (path is relative to the current working
# directory). parse_pattern prints one line per parsed word, so this cell
# produces a long output.
items, ort, pho, ort_pattern, pho_pattern, sparse_sem_pattern = parse_pattern(
    "mikenet/englishdict_randcon.pat.txt"
)


word:a, ort:____a_________, pho:___^______
word:ace, ort:____a_ce______, pho:___es_____
word:ache, ort:____a_che_____, pho:___ek_____
word:ached, ort:____a_ched____, pho:___ekt____
word:aches, ort:____a_ches____, pho:___eks____
word:act, ort:____a_ct______, pho:___@kt____
word:acts, ort:____a_cts_____, pho:___@kts___
word:ad, ort:____a_d_______, pho:___@d_____
word:add, ort:____a_dd______, pho:___@d_____
word:adds, ort:____a_dds_____, pho:___@dz____
word:ads, ort:____a_ds______, pho:___@dz____
word:adze, ort:____a_dze_____, pho:___@dz____
word:aft, ort:____a_ft______, pho:___@ft____
word:age, ort:____a_ge______, pho:___eJ_____
word:aid, ort:____aid_______, pho:___ed_____
word:aide, ort:____aide______, pho:___ed_____
word:aides, ort:____aides_____, pho:___edz____
word:aids, ort:____aids______, pho:___edz____
word:ail, ort:____ail_______, pho:___el_____
word:ailed, ort:____ailed_____, pho:___eld____
word:ails, ort:____ails______, pho:___elz____
word:aim, ort:____aim_______, pho:___em____

## Checking if it makes sense

In [5]:
def dense_to_sparse(dense: np.ndarray) -> list:
    """Convert dense representation to sparse representation.

    Args:
        dense: 1-D sequence of unit values. Only iteration and comparison
            with 1 are used, so a plain list works as well as an array.

    Returns:
        The positions (as Python ints, in ascending order) of every unit
        whose value equals 1.
    """
    return [i for i, unit in enumerate(dense) if unit == 1]

def check_pattern(word: str):
    """Print the active units of ``word`` in each parsed representation.

    Orthography and phonology are stored dense, so they are converted to
    index lists first; the semantic pattern is already a list of indices
    and is printed as stored.
    """
    active_ort = dense_to_sparse(ort_pattern[word])
    print(f"ORT on nodes: {active_ort}")

    active_pho = dense_to_sparse(pho_pattern[word])
    print(f"PHO on nodes: {active_pho}")

    active_sem = sparse_sem_pattern[word]
    print(f"SEM on nodes: {active_sem}")

# Spot-check a single word: prints the active unit indices of its
# orthographic, phonological and semantic patterns.
check_pattern("close")

ORT on nodes: [54, 89, 118, 174, 186]
PHO on nodes: [29, 31, 52, 60, 62, 87, 90, 92, 94, 96, 102, 107]
SEM on nodes: [813, 46, 29, 913, 236, 209, 49, 4, 6, 1148, 104]


# Pack into my TF data format
IMPORTANT: the order of the items must be the same as the order of the inputs.

NOTE: items with multiple meanings are not accounted for.

In [16]:
train_r100 = data_wrangling.load_testset("train_r100")

# Take the row count from the loaded item list instead of hard-coding 100:
# with a fixed size, a shorter list would leave trailing all-zero rows without
# any error, and a longer one would fail with an IndexError part-way through.
n_items = len(train_r100["item"])

# Width (number of units) of each representation. Assigning a pattern of a
# different length to a row raises a ValueError, so these double as a check
# on the parsed data.
n_ort_units = 364
n_pho_units = 200
n_sem_units = 2446

mn_r100 = {}
mn_r100["item"] = train_r100["item"]
# Word -> human readable phonology string; for a repeated word the last
# occurrence in the pattern file wins.
phoneme_dict = dict(zip(items, pho))

np_ort = np.zeros(shape=(n_items, n_ort_units))
np_pho = np.zeros(shape=(n_items, n_pho_units))
np_sem = np.zeros(shape=(n_items, n_sem_units))
phoneme = []

# Fill row by row in the order of train_r100["item"], so that row idx of every
# array belongs to the idx-th item. A word missing from the parsed MikeNet
# patterns raises a KeyError here.
for idx, item in enumerate(train_r100["item"]):
    np_ort[idx, :] = np.array(ort_pattern[item])
    np_pho[idx, :] = np.array(pho_pattern[item])
    np_sem[idx, :] = sparse_to_dense(sparse_sem_pattern[item], n_sem_units)
    phoneme.append(phoneme_dict[item])

mn_r100["ort"] = np_ort
mn_r100["pho"] = np_pho
mn_r100["sem"] = np_sem
mn_r100["phoneme"] = phoneme
# NOTE(review): "cond" is left empty; presumably the usual test set format
# expects the key to exist - confirm against data_wrangling.
mn_r100["cond"] = None

data_wrangling.save_testset(mn_r100, "dataset/testsets/mn_r100.pkl.gz")