In [None]:
%load_ext lab_black
import pandas as pd
from random import sample

# Create head and body sound mapping with 6kdict
- Since column *ort* is vowel-centered at position 5, we can use this column to split a word into its head and body
- Column *pho* is aligned with *ort*, so we can map between ort and pho

Load 6k dictionary from file 

In [None]:
# The dictionary file is tab-separated with no header row, so the column
# names are supplied here. na_filter=False keeps every field as the literal
# text found in the file instead of converting some of them to NaN.
dict_columns = ["word", "ort", "pho", "wf"]
df = pd.read_csv(
    "../patterns/6ktraining_v2.dict",
    sep="\t",
    header=None,
    names=dict_columns,
    na_filter=False,
)

### Split head and body, create O to P mapping

In [None]:
# Orthography: the first 4 slots hold the head, slot 5 holds the first body
# letter, and the rest of the body follows. Stripping "_" removes the padding.
ort = df["ort"]
df["ort_head"] = ort.str[:4].str.strip("_")
df["ort_b1"] = ort.str[4:5].str.strip("_")
df["ort_b2"] = ort.str[5:].str.strip("_")
df["ort_body"] = df["ort_b1"] + df["ort_b2"]

# Phonology: the first 3 slots hold the head, everything after is the body.
pho = df["pho"]
df["pho_head"] = pho.str[:3].str.strip("_")
df["pho_body"] = pho.str[3:].str.strip("_")

# "<orthography>_<phonology>" keys used later to count distinct O-to-P pairs.
df["map_head"] = df["ort_head"] + "_" + df["pho_head"]
df["map_body"] = df["ort_body"] + "_" + df["pho_body"]
df.sample(5)

### Create unique mapping pairs and count statistics for head mapping

In [None]:
# Count how often each "<ort_head>_<pho_head>" pair occurs in the dictionary.
head_map = pd.DataFrame(df.map_head.value_counts())
head_map.columns = ["n"]
head_map["map"] = head_map.index
# Use bracket access for the "map" column: on pandas >= 2.1 the attribute
# `head_map.map` resolves to the DataFrame.map method, not the column.
# n=1 splits on the first "_" only, so the result always has exactly two
# columns (orthography, phonology), matching the body mapping cell.
head_map[["o", "p"]] = head_map["map"].str.split("_", expand=True, n=1)
head_map.sample(5)

### Create unique mapping pairs and count statistics for body mapping

In [None]:
# Count how often each "<ort_body>_<pho_body>" pair occurs in the dictionary.
body_map = pd.DataFrame(df.map_body.value_counts())
body_map.columns = ["n"]
body_map["map"] = body_map.index
# Use bracket access for the "map" column: on pandas >= 2.1 the attribute
# `body_map.map` resolves to the DataFrame.map method, not the column.
# n=1 splits on the first "_" only, so the result always has exactly two
# columns (orthography, phonology).
body_map[["o", "p"]] = body_map["map"].str.split("_", expand=True, n=1)
body_map.sample(5)

# Glushko

In [None]:
# Raw Glushko stimulus list; na_filter=False keeps every field as text
# rather than letting pandas turn some strings into NaN.
gk = pd.read_csv(
    '../patterns/glushko_raw.csv',
    na_filter=False,
)

### Since Glushko only provides words, we need to split head and body by looking for the first vowel

In [None]:
def first_vowel_loc(word):
    """
    Return the index of the earliest vowel (a, e, i, o, u, y) in `word`.

    Returns 0 when the word contains none of these letters, so callers that
    slice on the result get an empty head and the whole word as body.
    """
    hits = [idx for idx in map(word.find, 'aeiouy') if idx != -1]
    return min(hits, default=0)


def get_head(word):
    """Return the part of `word` that precedes its first vowel (may be empty)."""
    cut = first_vowel_loc(word)
    return word[:cut]


def get_body(word):
    """Return `word` from its first vowel to the end."""
    cut = first_vowel_loc(word)
    return word[cut:]

In [None]:
# Split every real word and every pseudoword at its first vowel.
gk['word_head'] = [get_head(w) for w in gk.word]
gk['word_body'] = [get_body(w) for w in gk.word]
gk['pseudoword_head'] = [get_head(w) for w in gk.pseudoword]
gk['pseudoword_body'] = [get_body(w) for w in gk.pseudoword]
gk.sample(5)

### Create pronunciation using 6k dictionary mapping

In [None]:
def get_p(unit, mapping, best=False):
    """
    Look up the phonology candidates for an orthographic unit.

    Heads do not vary by design, so callers should ask for the single most
    frequent mapping by passing best=True.

    unit: orthographic head or body to look up
    mapping: pandas DataFrame with the following columns:
        n: count of occurrences
        o: orthography
        p: phonology
    best: if True, return only the phonology with the highest count
          (the first one wins on ties)

    Returns a list of phonology strings: every non-empty match in row order,
    or a list of at most one element when best=True. The list is empty when
    `unit` has no non-empty phonology in `mapping`.
    """
    # Iterate over the column values directly instead of indexing with the
    # enumerate() counter: the mapping frames are indexed by string labels,
    # so `mapping.p[i]` relied on the deprecated positional fallback.
    candidates = [
        (pho, count)
        for ort, pho, count in zip(mapping["o"], mapping["p"], mapping["n"])
        if ort == unit and pho != ''
    ]

    if not best:
        return [pho for pho, _ in candidates]

    if not candidates:
        return []

    # max() keeps the first maximal element, so the result does not depend on
    # the mapping being pre-sorted by count.
    best_pho, _ = max(candidates, key=lambda pair: pair[1])
    return [best_pho]

### Compile Glushko word pronunciation

For word

In [None]:
# Attach the dictionary pronunciation (head and body) to each Glushko word.
# A left merge keeps every Glushko row even when the word is not in the dict.
df_wpmap = df[['word', 'pho_head', 'pho_body']]
gk_compiled = gk.merge(df_wpmap, how='left', on='word').rename(
    columns={
        'pho_head': 'word_pho_head',
        'pho_body': 'word_pho_body',
    }
)

Since the word "been" is missing from the 6k dict, manually create its pronunciation...

In [None]:
# Inspect the row(s) for "been" before patching the pronunciation by hand.
gk_compiled[gk_compiled['word'] == 'been']

In [None]:
# Fill in the head and body phonology for "been" by hand.
is_been = gk_compiled['word'] == 'been'
gk_compiled.loc[is_been, 'word_pho_head'] = 'b'
gk_compiled.loc[is_been, 'word_pho_body'] = 'in'

For pseudoword (i.e., nonword)

In [None]:
# The lookups are built from `gk` but stored on `gk_compiled`, so this relies
# on both frames having the same number of rows in the same order.
# NOTE(review): that holds only if the left merge did not duplicate rows,
# i.e. each Glushko word matches at most one dictionary entry. Confirm.

# Heads: keep only the most frequent pronunciation.
gk_compiled['nw_pho_head'] = [
    get_p(head, head_map, best=True) for head in gk.pseudoword_head
]

# Bodies: keep every pronunciation found in the dictionary.
gk_compiled['nw_pho_body'] = [
    get_p(body, body_map) for body in gk.pseudoword_body
]

# Same lookups for the real words, kept as a cross-check.
gk_compiled['word_pho_head_chk'] = [
    get_p(head, head_map, best=True) for head in gk.word_head
]

gk_compiled['word_pho_body_chk'] = [
    get_p(body, body_map) for body in gk.word_body
]

### Create word and nonword pronunciation

Create custom pad_merge function

In [None]:
def pad_merge(onset, body, mode):
    """
    Convert an onset and a body into the padded, vowel-centered 6k dictionary format.

    onset: head of a form
    body: body of a form
    mode: 'o' (orthography) or 'p' (phonology)

    Returns the padded string: 14 characters in orthography mode and 10 in
    phonology mode, provided the onset and body fit in their reserved slots.
    Raises ValueError if `mode` is neither 'o' nor 'p'.

    The conversion logic was reverse engineered from the 6k dictionary and
    reproduces every entry in it.

    Phonology mode: the first 3 slots are reserved for the onset
    (right-aligned) and the remaining 7 slots for the body (left-aligned).

    Orthography mode: the first 4 slots are reserved for the onset
    (right-aligned) and the remaining 10 for the body. If the 2nd character
    of the body is not a vowel ('a', 'e', 'i', 'o', 'u', 'y', 'w'), one
    padding slot is inserted between the 1st and 2nd body characters.
    E.g. for the word "thank" the onset is "th" and the body is "ank"; since
    "n" is not a vowel, the orthography is "__tha_nk______".
    Exception: if the body starts with "ah" (e.g. bl-ah), no padding is
    inserted even though "h" is not a vowel (applies to 3 cases in the
    training set).
    """
    if mode == 'o':
        vowels = ('a', 'e', 'i', 'o', 'u', 'y', 'w')
        head = onset.rjust(4, '_')
        # A gap slot is needed only when the body has a 2nd character that is
        # neither a vowel nor part of the special-cased "ah" prefix.
        needs_gap = len(body) > 1 and not (
            body[:2] == 'ah' or body[1] in vowels
        )
        if needs_gap:
            tail = body[0] + '_' + body[1:].ljust(8, '_')
        else:
            tail = body.ljust(10, '_')
        return head + tail

    if mode == 'p':
        return onset.rjust(3, '_') + body.ljust(7, '_')

    # Previously an unknown mode fell through to `return out` with `out`
    # unbound and surfaced as an UnboundLocalError.
    raise ValueError("mode must be 'o' or 'p', got {!r}".format(mode))

### Validate pad_merge() by all training dict samples

In [None]:
# Rebuild every phonological form from its head and body and compare it with
# the original dictionary column.
chk_pho = [
    pad_merge(head, body, mode='p')
    for head, body in zip(df.pho_head, df.pho_body)
]

# Element-wise comparison; True marks a row whose rebuilt form differs.
pho_mismatch = df.pho != chk_pho

print('All pho conversion pass? {}'.format(not pho_mismatch.any()))

if pho_mismatch.any():
    print('Failed phonological conversions:')
    # A bare expression inside an `if` block is not rendered by the notebook,
    # so the failing rows must be displayed explicitly.
    display(df.loc[pho_mismatch])

In [None]:
# Rebuild every orthographic form from its head and body and compare it with
# the original dictionary column.
chk_ort = [
    pad_merge(head, body, mode='o')
    for head, body in zip(df.ort_head, df.ort_body)
]

# Element-wise comparison; True marks a row whose rebuilt form differs.
ort_mismatch = df.ort != chk_ort

print('All ortho conversion pass? {}'.format(not ort_mismatch.any()))

if ort_mismatch.any():
    print('Failed orthographic conversions:')
    # A bare expression inside an `if` block is not rendered by the notebook,
    # so the failing rows must be displayed explicitly.
    display(df.loc[ort_mismatch])

### Pad and Merge nonword 

In [None]:
# For each nonword, combine its (single, most frequent) head pronunciation
# with every candidate body pronunciation, giving one padded form per body.
nwp = [
    [pad_merge(heads[0], body_pho, mode='p') for body_pho in bodies]
    for heads, bodies in zip(gk_compiled.nw_pho_head, gk_compiled.nw_pho_body)
]

gk_compiled['nw_all_p'] = nwp

In [None]:
# Show each pseudoword with its head/body candidates and the padded results.
nonword_cols = ['pseudoword', 'nw_pho_head', 'nw_pho_body', 'nw_all_p']
gk_compiled[nonword_cols]

In [None]:
# Padded orthography of each real word.
wo = [
    pad_merge(head, body, mode='o')
    for head, body in zip(gk_compiled.word_head, gk_compiled.word_body)
]

# Padded phonology of each real word (pronunciation taken from the dictionary).
wp = [
    pad_merge(head, body, mode='p')
    for head, body in zip(
        gk_compiled.word_pho_head, gk_compiled.word_pho_body
    )
]

# Padded orthography of each pseudoword.
nwo = [
    pad_merge(head, body, mode='o')
    for head, body in zip(
        gk_compiled.pseudoword_head, gk_compiled.pseudoword_body
    )
]

gk_compiled['w_all_o'] = wo
gk_compiled['nw_all_o'] = nwo
gk_compiled['w_all_p'] = wp

In [None]:
# Save the full compiled table; the row index is written as the first column.
gk_compiled.to_csv('../patterns/glushko_all.csv', index=True)

In [None]:
# Export only the nonword columns, under shorter names, without the row index.
gk_nw = gk_compiled.loc[
    :, ['id', 'cond', 'pseudoword', 'nw_all_p', 'nw_all_o']
].rename(
    columns={
        'pseudoword': 'nonword',
        'nw_all_p': 'p',
        'nw_all_o': 'o',
    }
)
gk_nw.to_csv('../patterns/glushko_nonword.csv', index=False)