# Preprocess training set
This notebook preprocesses the training set from the raw data files.

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from data_wrangling import (
    get_used_slots, 
    trim_unused_slots, 
    get_duplicates, 
    ort_to_binary,
    pho_to_binary,
)

Load training set raw data

In [2]:
train = pd.read_csv(
    "issues/preprocessing/6ktraining_v2.dict",
    sep="\t",
    header=None,
    names=["word", "ort", "pho", "wf"],
    na_filter=False,
)

train

Unnamed: 0,word,ort,pho,wf
0,a,____a_________,___^______,1100290.0
1,ace,____a_ce______,___es_____,117.0
2,ache,____a_che_____,___ek_____,27.0
3,ached,____a_ched____,___ekt____,3.0
4,aches,____a_ches____,___eks____,23.0
...,...,...,...,...
5827,zoo,___zoo________,__zu______,172.0
5828,zoom,___zoom_______,__zum_____,38.0
5829,zoomed,___zoomed_____,__zumd____,44.0
5830,zooms,___zooms______,__zumz____,18.0


Remove unused slots

In [3]:
train['ort'] = trim_unused_slots(train.ort)
train['pho'] = trim_unused_slots(train.pho)

We have these slots: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
Removing unused slots: [0, 11, 12, 13]
We have these slots: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
Removing unused slots: []


Some slots are rarely used; check which ones.

In [4]:
{slot: set([x[slot] for x in train.pho]) for slot in range(len(train.pho[0]))}

{0: {'_', 's'},
 1: {'S', 'T', '_', 'b', 'd', 'f', 'g', 'k', 'p', 's', 't'},
 2: {'C',
  'D',
  'J',
  'S',
  'T',
  '_',
  'b',
  'd',
  'f',
  'g',
  'h',
  'k',
  'l',
  'm',
  'n',
  'p',
  'r',
  's',
  't',
  'v',
  'w',
  'y',
  'z'},
 3: {'@', 'A', 'E', 'I', 'O', 'U', 'W', 'Y', '^', 'a', 'e', 'i', 'o', 'u'},
 4: {'C',
  'D',
  'J',
  'S',
  'T',
  'Z',
  '_',
  'b',
  'd',
  'f',
  'g',
  'k',
  'l',
  'm',
  'n',
  'p',
  'r',
  's',
  't',
  'v',
  'z'},
 5: {'C',
  'J',
  'S',
  'T',
  '_',
  'b',
  'd',
  'f',
  'g',
  'k',
  'l',
  'm',
  'n',
  'p',
  's',
  't',
  'v',
  'z'},
 6: {'J', 'T', '_', 'd', 's', 't', 'z'},
 7: {'_', 's', 't', 'z'},
 8: {'E', '_'},
 9: {'_', 's'}}

Slots 0, 8, and 9 are rarely used (each holds only one possible phoneme other than empty). They are candidates for removal. Before removing them, let's make sure this won't affect too many words.

In [5]:
remove_phos = []

for pho in train.pho:
    used_slots = get_used_slots(pho)
    clashed_slots = used_slots.intersection({0, 8, 9})
    if clashed_slots:
        remove_phos.append(pho)
        print(f"Overlap in {pho} at {clashed_slots}")

Overlap in _broC___Es at {8, 9}
Overlap in _kloz___Es at {8, 9}
Overlap in skr@m_____ at {0}
Overlap in skr@p_____ at {0}
Overlap in skrep_____ at {0}
Overlap in skrept____ at {0}
Overlap in skreps____ at {0}
Overlap in skr@ps____ at {0}
Overlap in skr@tC____ at {0}
Overlap in skr@tCt___ at {0}
Overlap in skral_____ at {0}
Overlap in skrald____ at {0}
Overlap in skralz____ at {0}
Overlap in skrim_____ at {0}
Overlap in skrimd____ at {0}
Overlap in skrimz____ at {0}
Overlap in skri______ at {0}
Overlap in skriC_____ at {0}
Overlap in skrid_____ at {0}
Overlap in skrin_____ at {0}
Overlap in skrinz____ at {0}
Overlap in skru______ at {0}
Overlap in skruz_____ at {0}
Overlap in skrAb_____ at {0}
Overlap in skrImp____ at {0}
Overlap in skrIp_____ at {0}
Overlap in skrIpt____ at {0}
Overlap in skrIpts___ at {0}
Overlap in skrWnJ____ at {0}
Overlap in skr^b_____ at {0}
Overlap in skr^bd____ at {0}
Overlap in skr^bz____ at {0}
Overlap in skr^f_____ at {0}
Overlap in skr^m_____ at {0}
Overlap 

Slots 8 and 9 are occupied by only 2 words:
- Overlap in _broC___Es at {8, 9}
- Overlap in _kloz___Es at {8, 9}

Slot 0 is occupied by 141 words, for example:
- Overlap in skr@m_____ at {0}
- Overlap in skr@p_____ at {0}

Just to be safe, remove the words that use slots 0, 8, or 9. The data set will then have new unused slots (across the entire data set), which we can remove with the same function `trim_unused_slots`.

In [6]:
train = train.loc[~train.pho.isin(remove_phos)].copy()
train['pho'] = trim_unused_slots(train.pho)

We have these slots: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
Removing unused slots: [0, 8, 9]


Indeed, slots 0, 8, and 9 are removed.

### Merge wordnet

In [7]:
def load_wordnet_dict(dict_file):
    """Load the wordnet dictionary file into a DataFrame.

    Args:
        dict_file: Path to a headerless text file with four fields per line
            (word, ort, pho, wf), separated by a tab or a single space.

    Returns:
        DataFrame with columns word, ort, pho and wf, plus wn_idx, which is a
        copy of the row index (the row's position in the file).
    """

    df = pd.read_csv(
        dict_file,
        # Tab or space. Inside a character class "|" is a literal pipe, not
        # alternation, so it must not be listed as a delimiter.
        sep=r"[\t ]",
        header=None,
        names=["word", "ort", "pho", "wf"],
        na_filter=False,  # keep strings such as "null" as text instead of NaN
        engine="python",  # regex separators need the python parser
    )

    # Keep the original row position as an explicit column.
    df["wn_idx"] = df.index

    return df

wn_dict = load_wordnet_dict("issues/preprocessing/6kdict")
wn_repr = np.genfromtxt("issues/preprocessing/wordNet_6229.csv", delimiter=",")

In [8]:
homographs = get_duplicates(wn_dict)
homographs

{'bass': 2,
 'beat': 2,
 'bet': 2,
 'cast': 2,
 'close': 2,
 'closed': 2,
 'corps': 2,
 'cut': 2,
 'deer': 2,
 'elk': 2,
 'fish': 2,
 'fit': 2,
 'fowl': 2,
 'french': 2,
 'hit': 2,
 'hops': 3,
 'hurt': 2,
 'knit': 2,
 'leaves': 2,
 'left': 2,
 'let': 2,
 'lives': 2,
 'moose': 2,
 'nuts': 2,
 'pants': 3,
 'put': 2,
 'quit': 2,
 'red': 2,
 'scours': 2,
 'set': 2,
 'shed': 2,
 'sheep': 2,
 'shrimp': 2,
 'shut': 2,
 'sioux': 2,
 'squash': 2,
 'thrust': 2,
 'was': 2,
 'wed': 2}

39 homographs are found in wordnet. We will merge wordnet into the training set, then adjust the word frequency of the duplicated words.

In [9]:
train = train.merge(wn_dict[['word', 'wn_idx']], on="word", how="left")
train = train.dropna(subset=["wn_idx"])
train = train.reset_index()

Show an example to make sure the wordnet is merged correctly.

In [10]:
train.loc[train.word=='shut']

Unnamed: 0,index,word,ort,pho,wf,wn_idx
4181,4189,shut,_shu_t____,_S^t___,1316.0,4496.0
4182,4190,shut,_shu_t____,_S^t___,1316.0,4497.0


Now, proceed to adjust word frequency to avoid oversampling the duplicated words.

In [11]:
def adjust_wf(row):
    """Return the row's word frequency, split evenly across homograph entries.

    If `row.word` is a key of the module-level `homographs` mapping, `row.wf`
    is divided by the count stored for that word; otherwise `row.wf` is
    returned unchanged.
    """
    is_homograph = row.word in homographs
    return row.wf / homographs[row.word] if is_homograph else row.wf
        
train['adjusted_wf'] = train.apply(adjust_wf, axis=1)

Print some examples to make sure it is working as expected.

In [12]:
train.loc[train.word.isin(['shut', 'man', 'swim'])]

Unnamed: 0,index,word,ort,pho,wf,wn_idx,adjusted_wf
2836,2842,man,__ma_n____,_m@n___,8417.0,3014.0,8417.0
4181,4189,shut,_shu_t____,_S^t___,1316.0,4496.0,658.0
4182,4190,shut,_shu_t____,_S^t___,1316.0,4497.0,658.0
4813,4821,swim,_swi_m____,swIm___,120.0,5265.0,120.0


Now we are ready for packaging the training set. 

In [13]:
train.to_csv('dataset/df_train_211209.csv', index=False)

## Encode training set

In [14]:
train_package = {
    "id": train.word.index.tolist(),
    "item": train.word.tolist(),
    "wf": train.adjusted_wf.tolist(),
    "ort": tf.convert_to_tensor(ort_to_binary(train.ort), dtype=tf.float32),
    "pho": tf.convert_to_tensor(pho_to_binary(train.pho), dtype=tf.float32),
    "sem": tf.convert_to_tensor(wn_repr[[int(x) for x in train.wn_idx],], dtype=tf.float32),
    "graphems": train.ort.tolist(),
    "phonemes": train.pho.tolist(),
}


Token count: defaultdict(<class 'int'>, {'_': 5641, 'c': 4, 'p': 2, 's': 32, 't': 40})
Token count: defaultdict(<class 'int'>, {'_': 3525, 'b': 185, 'c': 334, 'h': 70, 'd': 71, 'f': 166, 'g': 165, 'k': 35, 'p': 147, 'q': 49, 'r': 4, 's': 700, 't': 173, 'w': 95})
Token count: defaultdict(<class 'int'>, {'_': 202, 'b': 287, 'l': 752, 'r': 927, 'c': 283, 'h': 620, 'z': 24, 'd': 215, 'w': 290, 'f': 202, 'g': 164, 'n': 197, 'j': 89, 'k': 77, 'm': 267, 'p': 354, 's': 225, 'u': 54, 't': 356, 'v': 74, 'y': 60})
Token count: defaultdict(<class 'int'>, {'a': 1507, 'e': 1048, 'i': 1029, 'o': 1386, 'u': 691, 'y': 58})
Token count: defaultdict(<class 'int'>, {'_': 3988, 'i': 255, 'u': 231, 'w': 210, 'y': 79, 'a': 404, 'e': 332, 'h': 3, 'o': 217})
Token count: defaultdict(<class 'int'>, {'_': 206, 'c': 288, 'd': 325, 'f': 141, 'g': 249, 'l': 644, 'm': 349, 'r': 721, 's': 548, 'n': 847, 'p': 319, 't': 511, 'e': 34, 'x': 34, 'b': 156, 'k': 170, 'u': 4, 'z': 48, 'v': 115, 'q': 4, 'h': 4, 'w': 2})
Token

Verify dimensions are correct

In [15]:
# List-valued fields: one entry per training item.
print(len(train_package["id"]))
print(len(train_package["item"]))
print(len(train_package["wf"]))
# Tensor-valued fields: first dimension should equal the number of items.
print(train_package["ort"].shape)
print(train_package["pho"].shape)
print(train_package["sem"].shape)
# Raw string representations; check both keys rather than "graphems" twice.
print(len(train_package["graphems"]))
print(len(train_package["phonemes"]))

5719
5719
5719
(5719, 119)
(5719, 175)
(5719, 2446)
5719
5719


Export

In [17]:
import gzip, pickle

with gzip.open("dataset/train.pkl.gz", "wb") as f:
    pickle.dump(train_package, f)

TODO: Make sparse tensor later

# Tidy source files to keys csv

In [None]:
%reload_ext lab_black
import pandas as pd
import numpy as np
import gzip
from IPython.display import clear_output

# Read training file
train_file = "issues/preprocessing/6ktraining_v2.dict"

strain_file = "issues/preprocessing/strain.txt"
strain_key_file = "issues/preprocessing/strain_key.txt"

grain_file = "issues/preprocessing/grain_nws.dict"
grain_key_file = "issues/preprocessing/grain_key.txt"

# Imageability
cortese = pd.read_csv(
    "issues/preprocessing/cortese2004norms.csv", skiprows=9, na_filter=False
)
img_map = cortese[["item", "rating"]]
img_map.columns = ["word", "img"]

# Zeno norm
zeno = pd.read_csv("issues/preprocessing/EWFG.csv", na_values="", keep_default_na=False)
zeno["gr14"] = pd.to_numeric(zeno.f, errors="coerce")  # Stage 14 is adult frequency
# Remove the sfi, d, u and f columns (f has already been converted into gr14).
zeno = zeno.drop(columns=["sfi", "d", "u", "f"])
clear_output()

# Chang training set (from Chang 2019 github)
y_wordnet = np.genfromtxt("issues/preprocessing/wordNet_6229.csv", delimiter=",")

wordnet_dict = pd.read_csv(
    "issues/preprocessing/6kdict",
    sep="\t",
    header=None,
    names=["word", "ort", "pho", "wf"],
    na_filter=False,  # Bug fix: incorrectly treated null as missing value in the corpus
)

## Copy index for key
wordnet_dict["wn_idx"] = wordnet_dict.index

## Drop wordnet duplicates (Do not drop)
## HS04: There were 39 words in which a single spelling was associated with two or more meanings (mainly words such as SHEEP, FISH, or HIT, whose plural or past tense morphological inflection involves no change from the stem).
# wordnet_dict.drop_duplicates(subset=['word'], inplace=True)

In [None]:
# Merge Zeno and IMG into train
train = pd.read_csv(
    train_file,
    sep="\t",
    header=None,
    names=["word", "ort", "pho", "wf"],
    na_filter=False,  # Bug fix: incorrectly treated null as missing value in the corpus
)

train = pd.merge(train, zeno, on="word", how="left", validate="1:1")

# Assume Zeno missing = 0: after the left merge, any NaN in the grade columns
# gr1..gr14 is replaced by a frequency of zero.
zeno_grade_cols = [f"gr{grade}" for grade in range(1, 15)]
train[zeno_grade_cols] = train[zeno_grade_cols].fillna(0)

train = pd.merge(train, img_map, on="word", how="left", validate="1:1")


# Merge Chang
wnid = wordnet_dict.loc[:, ["word", "wn_idx"]]
train = train.merge(wnid, how="inner", on="word")
print(f"Words in training set: {len(train)}")

In [None]:
# Strain stimuli: tab-separated, no header row.
strain = pd.read_csv(
    strain_file, sep="\t", header=None, names=["word", "ort", "pho", "wf"]
)

# Condition labels for each Strain word, whitespace-separated.
# sep=r"\s+" is the documented replacement for the deprecated
# delim_whitespace=True.
strain_key = pd.read_table(
    strain_key_file,
    header=None,
    sep=r"\s+",
    names=["word", "frequency", "pho_consistency", "imageability"],
)

# No `on=` given: pandas joins on the columns the two frames share.
strain = pd.merge(strain, strain_key)
# Left merge keeps every Strain word; words without a rating get NaN in img.
strain = pd.merge(strain, img_map, on="word", how="left")
strain.sample(5)

In [None]:
# numeric_only=True: strain also holds string columns (word, ort, pho, ...),
# and pandas >= 2.0 raises on those instead of silently dropping them.
strain.groupby("frequency").mean(numeric_only=True)

In [None]:
# Grain nonwords: tab-separated, no header row, two pronunciations per item.
grain = pd.read_csv(
    grain_file,
    sep='\t',
    header=None,
    names=['word', 'ort', 'pho_large', 'pho_small']
)

# Condition label for each nonword, whitespace-separated.
# sep=r"\s+" is the documented replacement for the deprecated
# delim_whitespace=True.
grain_key = pd.read_table(
    grain_key_file,
    header=None,
    sep=r"\s+",
    names=['word', 'condition']
)

# Recode: 'critical' -> 'ambiguous', every other label -> 'unambiguous'.
grain_key['condition'] = np.where(
    grain_key['condition'] == 'critical', 'ambiguous', 'unambiguous'
)

# No `on=` given: pandas joins on the columns the two frames share.
grain = pd.merge(grain, grain_key)

# Imageability and word frequency are set to 0 for every grain item.
grain['img'] = 0
grain['wf'] = 0
grain.sample(5)

In [None]:
taraban = pd.read_csv("issues/preprocessing/taraban.csv")
taraban.columns = ["id", "cond", "word", "ort", "pho", "wf"]
taraban = pd.merge(taraban, img_map, on="word", how="left")
taraban.sample(5)

In [None]:
glushko = pd.read_csv("issues/preprocessing/glushko_nonword.csv")
glushko.columns = ["id", "cond", "word", "pho", "ort"]

glushko["img"] = 0
glushko["wf"] = 0
glushko.sample(5)

### Check raw data integrity

In [None]:
# Check all representations follow the 14 ort, 10 pho format
assert all([len(x) == 14 for x in train.ort])
assert all([len(x) == 14 for x in strain.ort])
assert all([len(x) == 14 for x in grain.ort])
assert all([len(x) == 14 for x in taraban.ort])
assert all([len(x) == 14 for x in glushko.ort])

assert all([len(x) == 10 for x in train.pho])
assert all([len(x) == 10 for x in strain.pho])
assert all([len(x) == 10 for x in grain.pho_small])
assert all([len(x) == 10 for x in grain.pho_large])
assert all([len(x) == 10 for x in taraban.pho])

from ast import literal_eval
for pho in glushko.pho:
    ps = literal_eval(pho)
    for p in ps:
        assert len(p) == 10

# Check all fulfill trim_ort criteria
locs = [0, 11, 12, 13]

for l in locs:
    assert all([x == '_' for x in train.ort.str.get(l)])
    assert all([x == '_' for x in strain.ort.str.get(l)])
    assert all([x == '_' for x in grain.ort.str.get(l)])
    assert all([x == '_' for x in taraban.ort.str.get(l)])
    assert all([x == '_' for x in glushko.ort.str.get(l)])

# No missing data in critical variables
assert sum(train.ort.isna()) == 0
assert sum(train.pho.isna()) == 0
assert sum(train.wf.isna()) == 0

assert sum(strain.ort.isna()) == 0
assert sum(strain.pho.isna()) == 0
assert sum(strain.wf.isna()) == 0

assert sum(grain.ort.isna()) == 0
assert sum(grain.pho_small.isna()) == 0
assert sum(grain.pho_large.isna()) == 0

assert sum(taraban.ort.isna()) == 0
assert sum(taraban.pho.isna()) == 0

assert sum(glushko.ort.isna()) == 0
assert sum(glushko.pho.isna()) == 0

In [None]:
def trim_ort(t):
    """Cut every `ort` string in `t` down to characters 1 through 10.

    The `ort` column is overwritten on the DataFrame that was passed in, and
    that same DataFrame object is returned.
    """
    # Slot 0 and slots 11-13 are empty in this source dataset (6ktraining.dict)
    def _keep_used_slots(ort_string):
        return ort_string[1:11]

    t['ort'] = t.ort.map(_keep_used_slots)
    return t


df_train = trim_ort(train)
df_strain = trim_ort(strain)
df_grain = trim_ort(grain)
df_taraban = trim_ort(taraban)
df_glushko = trim_ort(glushko)

# Imageability missing data replacement

In [None]:
def chk_missing(df, var):
    """Print how many values of column `var` are missing in a DataFrame.

    `df` is the *name* (a string) of a module-level DataFrame and is resolved
    through `globals()`. The printed line has the form
    "<var> missing in <df>: <missing count>/<row count>".
    """
    frame = globals()[df]
    n_missing = sum(frame[var].isna())
    n_rows = len(frame)
    print('{} missing in {}: {}/{}'.format(var, df, n_missing, n_rows))

chk_missing('df_train', 'img')
chk_missing('df_strain', 'img')
chk_missing('df_grain', 'img')
chk_missing('df_taraban', 'img')
chk_missing('df_glushko', 'img')

In [None]:
# Fill missing value to mean img rating
mean_img = df_train.img.mean()
df_train['img'] = df_train.img.fillna(mean_img)

# Fill missing value to condition mean img rating
mean_strain_hi_img = df_strain.loc[df_strain.imageability == "HI", 'img'].mean()
mean_strain_lo_img = df_strain.loc[df_strain.imageability == "LI", 'img'].mean()

df_strain.loc[df_strain.imageability == "HI",
              "img"] = df_strain.loc[df_strain.imageability == "HI",
                                     "img"].fillna(mean_strain_hi_img)

df_strain.loc[df_strain.imageability == "LI",
              "img"] = df_strain.loc[df_strain.imageability == "LI",
                                     "img"].fillna(mean_strain_lo_img)

# Since Taraban does not manipulate img, just replace with the training set mean
df_taraban['img'] = df_taraban.img.fillna(mean_img)

# Handle homograph

From Jay (201217):
I'm guessing they just split the real frequency in two.  If it's possible to check that, even approximately, that would be good.  (If we don't have the WJ frequencies independently of these training sets, we could just ball park it--do the frequencies in the file look comparable, or half of, the frequencies of words that are more-or-less the same frequency in some other norms.  (I'm not sure if that's clear.  If not, we can talk about it.)

- Split all frequency into n_dup

In [None]:
# Build a dictionary for looking up word:n_dup

tmp_count = train.groupby('word').agg('count').reset_index().loc[:,['word','wn_idx']]
tmp_dups = tmp_count.loc[tmp_count.wn_idx>1,]
dups_dict = dict(zip(tmp_dups.word, tmp_dups.wn_idx))



In [None]:
# Check dups in testset
strain.loc[strain.word.isin(dups_dict.keys()),]


In [None]:
taraban.loc[taraban.word.isin(dups_dict.keys()),]

In [None]:
def adjust_wf(row):
    """Divide a duplicated word's frequency by its number of entries.

    Looks `row.word` up in the module-level `dups_dict` (word -> n_dup).
    Words that are not in the dictionary keep their `wf` as is.
    """
    if row.word not in dups_dict:
        return row.wf
    n_dup = dups_dict[row.word]
    return row.wf / n_dup
        
train['wf'] = train.apply(adjust_wf, axis=1)

In [None]:
df_train.to_csv('dataset/df_train.csv')
df_strain.to_csv('dataset/df_strain.csv')
df_grain.to_csv('dataset/df_grain.csv')
df_taraban.to_csv('dataset/df_taraban.csv')
df_glushko.to_csv('dataset/df_glushko.csv')

# Save semantics

In [None]:
# Export wordnet semantic representation (n=5821) for training set
sem_train = y_wordnet[train.wn_idx,]
print(f'Shape of selected semantic representation: {sem_train.shape}')
np.savez_compressed('dataset/sem_train.npz', data=sem_train)

In [None]:
# Export Strain semantic
strain_word_idx = [df_train.loc[df_train.word==w,].index[0] for w in strain.word]
sem_strain = sem_train[strain_word_idx,]
np.savez_compressed('dataset/sem_strain.npz', data=sem_strain)


# Encode input and output

In [None]:
# Encode orthographic representation
def ort2bin(o_col, trimMode=True, verbose=True):
    """One-hot encode slot-based orthographic strings, one dictionary per slot.

    Replicates support.py (o_char). Wraps `Tokenizer.texts_to_matrix` so that
    every character position (slot) is encoded with its own, independent
    dictionary. The per-slot matrices are then joined side by side.

    Args:
        o_col: Equal-length strings addressable by integer label 0..n-1
            (e.g. a Series with a default RangeIndex).
        trimMode: If truthy, drop the first column of every slot's matrix.
        verbose: If truthy, print each slot's dictionary details.

    Returns:
        DataFrame with one row per word and the columns of all slots in slot
        order, renumbered from 0.
    """
    from tensorflow.keras.preprocessing.text import Tokenizer

    nSlot = len(o_col[0])
    nWord = len(o_col)

    # Collect one frame per slot and concatenate once at the end, instead of
    # growing a DataFrame inside the loop.
    slotFrames = []

    for slotId in range(nSlot):
        # The character every word has in this slot.
        slotData = [o_col[wordId][slotId] for wordId in range(nWord)]

        t = Tokenizer(filters='', lower=False)
        t.fit_on_texts(slotData)

        thisSlotBinData = t.texts_to_matrix(slotData)
        if trimMode:
            # Remove the first bit of the slot.
            # NOTE(review): presumably column 0 is always zero because
            # Tokenizer indices start at 1 -- confirm against the Keras docs.
            thisSlotBinData = thisSlotBinData[:, 1:]

        # Print dictionary details
        if verbose:
            print(
                'Slot {} (n = {}, unique token = {}) {} \n'.format(
                    slotId, t.document_count, len(t.word_index.items()),
                    t.word_docs
                )
            )

        slotFrames.append(pd.DataFrame(thisSlotBinData))

    if not slotFrames:
        # Zero-length strings: nothing to encode.
        return pd.DataFrame()

    return pd.concat(slotFrames, axis=1, ignore_index=True)

def ort2bin_v2(o_col):
    """Encode `o_col` with a single character-level Tokenizer.

    Fits one Tokenizer (char_level=True, no filtering, case preserved) on all
    strings, prints its dictionary, and returns `texts_to_matrix(o_col)`.
    Intended for a letter-level recurrent model.

    NOTE(review): the original comment says this gives the same encoding as
    the slot-based version plus extra zero columns -- confirm, since one
    dictionary is shared by all positions here.
    """
    from tensorflow.keras.preprocessing.text import Tokenizer

    char_tokenizer = Tokenizer(filters='', lower=False, char_level=True)
    char_tokenizer.fit_on_texts(o_col)
    print('dictionary:', char_tokenizer.word_index)

    encoded = char_tokenizer.texts_to_matrix(o_col)
    return encoded


# Merge all 3 ortho representation
all_word = pd.concat(
    [
        df_train.word, df_strain.word, df_grain.word, df_taraban.word,
        df_glushko.word
    ],
    ignore_index=True
)

all_ort = pd.concat(
    [df_train.ort, df_strain.ort, df_grain.ort, df_taraban.ort, df_glushko.ort],
    ignore_index=True
)

# Encoding orthographic representation
all_ort_bin = ort2bin(all_ort, verbose=True)

In [None]:
splitId_strain = len(df_train)
splitId_grain = splitId_strain + len(df_strain)
splitId_taraban = splitId_grain + len(df_grain)
splitId_glushko = splitId_taraban + len(df_taraban)

ort_train = np.array(all_ort_bin[0:splitId_strain])
ort_strain = np.array(all_ort_bin[splitId_strain:splitId_grain])
ort_grain = np.array(all_ort_bin[splitId_grain:splitId_taraban])
ort_taraban = np.array(all_ort_bin[splitId_taraban:splitId_glushko])
ort_glushko = np.array(all_ort_bin[splitId_glushko::])

# Save to disk
np.savez_compressed('dataset/ort_train.npz', data=ort_train)
np.savez_compressed('dataset/ort_strain.npz', data=ort_strain)
np.savez_compressed('dataset/ort_grain.npz', data=ort_grain)
np.savez_compressed('dataset/ort_taraban.npz', data=ort_taraban)
np.savez_compressed('dataset/ort_glushko.npz', data=ort_glushko)

print('==========Orthographic representation==========')
print('all shape:', all_ort_bin.shape)
print('ort_train shape:', ort_train.shape)
print('ort_strain shape:', ort_strain.shape)
print('ort_grain shape:', ort_grain.shape)
print('ort_taraban shape:', ort_taraban.shape)
print('ort_glushko shape:', ort_glushko.shape)

In [None]:
def pho2bin_v2(p_col, p_key):
    """Encode slot-based phoneme strings as concatenated feature vectors.

    Args:
        p_col: pandas Series of equal-length phoneme strings.
        p_key: dict mapping each phoneme character (including '_') to its
            feature vector. All vectors must have the length of p_key['_'].

    Returns:
        Float array of shape (len(p_col), n_slots * width), where width is
        len(p_key['_']) and slot k fills columns [k * width, (k + 1) * width).
        An empty `p_col` gives an array of shape (0, 0).
    """
    binLength = len(p_key['_'])

    if len(p_col) == 0:
        return np.empty([0, 0])

    # Positional access, so a Series whose index does not start at 0 works too.
    nPhoChar = len(p_col.iloc[0])

    p_output = np.empty([len(p_col), binLength * nPhoChar])

    for slot in range(nPhoChar):
        # One character per word for this slot, mapped to its feature vector.
        slotSeries = p_col.str.slice(start=slot, stop=slot + 1)
        out = slotSeries.map(p_key).to_list()
        # Slot width comes from the key, not from a hard-coded 25.
        p_output[:, slot * binLength:(slot + 1) * binLength] = out
    return p_output


from src.data_wrangling import gen_pkey
phon_key = gen_pkey()
pho_train = pho2bin_v2(train.pho, phon_key)
pho_strain = pho2bin_v2(strain.pho, phon_key)
pho_large_grain = pho2bin_v2(grain.pho_large, phon_key)
pho_small_grain = pho2bin_v2(grain.pho_small, phon_key)
pho_taraban = pho2bin_v2(taraban.pho, phon_key)

# Save to disk
np.savez_compressed('dataset/pho_train.npz', data=pho_train)
np.savez_compressed('dataset/pho_strain.npz', data=pho_strain)
np.savez_compressed('dataset/pho_large_grain.npz', data=pho_large_grain)
np.savez_compressed('dataset/pho_small_grain.npz', data=pho_small_grain)
np.savez_compressed('dataset/pho_taraban.npz', data=pho_taraban)

print('\n==========Phonological representation==========')
print(len(phon_key), ' phonemes: ', phon_key.keys())
print('pho_train shape:', pho_train.shape)
print('pho_strain shape:', pho_strain.shape)
print('pho_large_grain shape:', pho_large_grain.shape)
print('pho_small_grain shape:', pho_small_grain.shape)
print('pho_taraban shape:', pho_taraban.shape)

### Decoding check

In [None]:
from evaluate import get_all_pronunciations_fast as get_p
assert all(get_p(pho_train, phon_key) == df_train.pho)
assert all(get_p(pho_strain, phon_key) == df_strain.pho)
assert all(get_p(pho_large_grain, phon_key) == df_grain.pho_large)
assert all(get_p(pho_small_grain, phon_key) == df_grain.pho_small)
assert all(get_p(pho_taraban, phon_key) == df_taraban.pho)

## Special format for Glushko PHO (due to multiple correct answer with different length)

In [None]:
import ast, pickle

# Glushko pho dictionary
phonology_glushko = {
    x: ast.literal_eval(df_glushko.loc[i, 'pho'])
    for i, x in enumerate(df_glushko.word)
}

# Glushko one-hot encoded output dictionary
pho_glushko = {}
for k, v in phonology_glushko.items():
    ys = []
    for pho in v:
        y = []
        for char in pho:
            y += phon_key[char]
        ys.append(y)
    pho_glushko[k] = ys

with open('dataset/pho_glushko.pkl', 'wb') as f:
    pickle.dump(pho_glushko, f)

print('y_glushko dimension: {}'.format(len(pho_glushko['beed'][0])))

# Testing and evaluating new sampling probability

In [None]:
import pandas as pd
import numpy as np

df_train = pd.read_csv("dataset/df_train.csv", index_col=0)

# Plot sampling conversion graph
import matplotlib.pyplot as plt
import data_wrangling

plot_f = df_train.sort_values("wf")

fig, ax = plt.subplots(facecolor="w")
(line1,) = ax.plot(
    plot_f.wf,
    data_wrangling.Sampling.get_sampling_probability(plot_f, "log"),
    label="Log",
)
(line2,) = ax.plot(
    plot_f.wf,
    data_wrangling.Sampling.get_sampling_probability(plot_f, "hs04"),
    label="HS04",
)
(line3,) = ax.plot(
    plot_f.wf,
    data_wrangling.Sampling.get_sampling_probability(plot_f, "jay"),
    label="JAY",
)

ax.legend(loc="lower right")
plt.xlabel("Word frequency")
# plt.xlim((0, 200))
# plt.ylim((0, .0006))
plt.ylabel("Sampling probability")
# plt.xlim([0,100])
plt.title("Tested sampling p vs. word frequency")
plt.show()

In [None]:
fig, ax = plt.subplots(facecolor="w")
line1, = ax.plot(plot_f.wf, data_wrangling.Sampling.get_sampling_probability(plot_f, "log"), label='Log')
line2, = ax.plot(plot_f.wf, data_wrangling.Sampling.get_sampling_probability(plot_f, "hs04"), label='HS04')
line3, = ax.plot(plot_f.wf, data_wrangling.Sampling.get_sampling_probability(plot_f, "jay"), label='JAY')

ax.legend(loc='lower right')
plt.xlabel('Word frequency')
plt.ylabel('Sampling probability')
plt.xlim((0, 100))
plt.ylim((0, .0002))
plt.title('Tested sampling p vs. word frequency')
plt.show()

# Create new dictionary style representation (hash table)
Each word contains a maximum of 3 representations:
- orthography(ort): trimmed slot-based one-hot letter encoding 
- phonology(pho): phonological features from Jason Zevin based on HS04 and Harm, 1998
- semantic(sem): based on wordnet, clone from [Chang, 19 JML](https://github.com/JasonLo/Chang_Monaghan_Welbourne_AoA_Paper_for_JML)

In [None]:
word_representation_mapping = dict(zip(train.word, train.index))

ort, pho, sem = {}, {}, {}

for word in df_train.word:
    word_idx = word_representation_mapping[word]
    ort[word] = ort_train[word_idx]
    pho[word] = pho_train[word_idx]
    sem[word] = sem_train[word_idx]

representation = {"ort": ort, "pho": pho, "sem": sem}
print(f"Total no. of training items: {len(ort.keys())}")

In [None]:
# Usage: representation["ort" or "pho" or "sem"]["word"]
print(f'Representations of "cat" are:\n')
print(f'ort: {representation["ort"]["cat"]} \n with shape = {representation["ort"]["cat"].shape}\n')
print(f'pho: {representation["pho"]["cat"]} \n with shape = {representation["pho"]["cat"].shape}\n')
print(f'sem: {representation["sem"]["cat"]} \n with shape = {representation["sem"]["cat"].shape}\n')

In [None]:
# Export to pkl.gz
import pickle, gzip

with gzip.open('dataset/representation_dictionary.pkl.gz', 'wb') as f:
    pickle.dump(representation, f)

# New test set pickle format
a dictionary with 4 keys
- item: maybe word or nonword string, easy to human eye
- ort: orthographic representation
- pho: phonological representation
- sem: semantic representation

If all items are within the training set, use data_wrangling.MyData.create_testset_from_train_idx() to create the dictionary. 

Otherwise, create it manually

In [None]:
import pandas as pd
import data_wrangling
import gzip, pickle
from importlib import reload

reload(data_wrangling)
data = data_wrangling.MyData()

# Imageability
cortese = pd.read_csv(
    "issues/preprocessing/cortese2004norms.csv", skiprows=9, na_filter=False
)
img_map = cortese[["item", "rating"]]
img_map.columns = ["word", "img"]

In [None]:
# Strain (all)
strain_items = data.df_strain.word.unique()
strain_items = strain_items[strain_items != "cut"]  # Remove homographs
strain_items_idx = list(data.df_train.loc[data.df_train.word.isin(strain_items)].index)
strain_dict = data.create_testset_from_train_idx(strain_items_idx)

In [None]:
strain_159 = data.df_strain.loc[data.df_strain.word != "cut"]
strain_dict["cond"] = list(
    strain_159.frequency
    + "_"
    + strain_159.pho_consistency
    + "_"
    + strain_159.imageability
)

In [None]:
strain_dict["freq"] = list(strain_159.frequency)
strain_dict["cons"] = list(strain_159.pho_consistency)
strain_dict["img"] = list(strain_159.imageability)

In [None]:
with gzip.open("dataset/testsets/strain.pkl.gz", "wb") as f:
    pickle.dump(strain_dict, f)

In [None]:
[len(x) for x in strain_dict.values()]  # All should be the same

In [None]:
# Strain by each condition

def make_strain_sub_testsets(df, f, c, i, save_file):
    """Save a test set for one frequency x consistency x imageability cell.

    Args:
        df: DataFrame with columns frequency, pho_consistency, imageability
            and word.
        f: Value to match in `df.frequency` (called with "HF" or "LF").
        c: Value to match in `df.pho_consistency` (called with "CON" or "INC").
        i: Value to match in `df.imageability` (called with "HI" or "LI").
        save_file: File stem; output goes to
            dataset/testsets/<save_file>.pkl.gz.

    Uses the module-level `data` object for the training set and for building
    the test-set dictionary. Returns nothing.
    """
    in_condition = (
        (df.frequency == f) & (df.pho_consistency == c) & (df.imageability == i)
    )
    words = df.loc[in_condition, "word"].unique()

    # Every training row whose word is in this condition.
    # NOTE(review): a word that occurs more than once in df_train contributes
    # all of its rows here -- confirm that is intended.
    idx = list(data.df_train.loc[data.df_train.word.isin(words)].index)
    testset_dict = data.create_testset_from_train_idx(idx)

    # Named out_file so the frequency argument `f` is not shadowed.
    with gzip.open(f"dataset/testsets/{save_file}.pkl.gz", "wb") as out_file:
        pickle.dump(testset_dict, out_file)


make_strain_sub_testsets(
    data.df_strain, f="HF", c="CON", i="HI", save_file="strain_hf_con_hi"
)
make_strain_sub_testsets(
    data.df_strain, f="HF", c="CON", i="LI", save_file="strain_hf_con_li"
)
make_strain_sub_testsets(
    data.df_strain, f="HF", c="INC", i="HI", save_file="strain_hf_inc_hi"
)
make_strain_sub_testsets(
    data.df_strain, f="HF", c="INC", i="LI", save_file="strain_hf_inc_li"
)
make_strain_sub_testsets(
    data.df_strain, f="LF", c="CON", i="HI", save_file="strain_lf_con_hi"
)
make_strain_sub_testsets(
    data.df_strain, f="LF", c="CON", i="LI", save_file="strain_lf_con_li"
)
make_strain_sub_testsets(
    data.df_strain, f="LF", c="INC", i="HI", save_file="strain_lf_inc_hi"
)
make_strain_sub_testsets(
    data.df_strain, f="LF", c="INC", i="LI", save_file="strain_lf_inc_li"
)

In [None]:
# Train (Removed all one ort, multi sem words)
train_count = data.df_train.groupby("word").count().reset_index()
word_with_dup = list(train_count.loc[train_count.pho > 1, "word"])
train_no_dup_idx = list(data.df_train.loc[~data.df_train.word.isin(word_with_dup)].index)
train_dict = data.create_testset_from_train_idx(train_no_dup_idx)
with gzip.open("dataset/testsets/train.pkl.gz", "wb") as f:
    pickle.dump(train_dict, f)



In [None]:
import numpy as np
np.unique(train_dict['item'])

In [None]:
import os

input_path = '/home/jupyter/tf/dataset'
# TODO: assign `o`, `p_large` and `p_small` here. They were left as bare
# `name =` statements with no value, which is a SyntaxError and kept this
# whole cell (including `import os` and `input_path`) from running.

In [None]:
# Grain all

import numpy as np

d = {
    "item": list(data.df_grain.word),
    "cond": list(data.df_grain.condition),
    "ort": np.load(os.path.join(input_path, "ort_grain.npz"))["data"],
    "pho_large_grain": np.load(os.path.join(input_path, "pho_large_grain.npz"))["data"],
    "pho_small_grain": np.load(os.path.join(input_path, "pho_small_grain.npz"))["data"],
    "sem": np.zeros((len(data.df_grain.word), 2446))
}

with gzip.open("dataset/testsets/grain.pkl.gz", "wb") as f:
    pickle.dump(d, f)


In [None]:
# Grain ambiguous
import numpy as np

nw_amb_idx = list(data.df_grain.loc[data.df_grain.condition=="ambiguous"].index)

grain_ambiguous_dict = {
    "item": list(data.df_grain.word[nw_amb_idx]),
    "ort": data.ort_grain[nw_amb_idx],
    "pho_large_grain": data.pho_large_grain[nw_amb_idx],
    "pho_small_grain": data.pho_small_grain[nw_amb_idx],
    "sem": np.zeros((len(nw_amb_idx), 2446))
}

with gzip.open("dataset/testsets/grain_ambiguous.pkl.gz", "wb") as f:
    pickle.dump(grain_ambiguous_dict, f)


In [None]:
# Grain unambiguous
nw_un_idx = list(data.df_grain.loc[data.df_grain.condition=="unambiguous"].index)

grain_unambiguous_dict = {
    "item": list(data.df_grain.word[nw_un_idx]),
    "ort": data.ort_grain[nw_un_idx],
    "pho_large_grain": data.pho_large_grain[nw_un_idx],
    "pho_small_grain": data.pho_small_grain[nw_un_idx],
    "sem": np.zeros((len(nw_un_idx), 2446))
}

with gzip.open("dataset/testsets/grain_unambiguous.pkl.gz", "wb") as f:
    pickle.dump(grain_unambiguous_dict, f)

In [None]:
# Train img (3 groups)
# Work on an explicit copy: adding a column to a slice of `cortese` triggers
# pandas' SettingWithCopyWarning.
img_map = cortese[["item", "rating"]].copy()
img_map.columns = ["word", "img"]
# Three equal-sized bins by rating, labelled from lowest to highest.
img_map["3gp"] = pd.qcut(img_map.img, 3, labels=["low", "med", "high"])

In [None]:
for g in ["low", "med", "high"]:
    x = img_map.loc[img_map["3gp"]==g, "word"]
    idx = list(data.df_train.loc[data.df_train.word.isin(x)].index)
    testset = data.create_testset_from_train_idx(idx)
    
    with gzip.open(f"dataset/testsets/cortese_3gp_{g}_img.pkl.gz", "wb") as f:
        pickle.dump(testset, f)
    

In [None]:
# Low imageability (median split by Cortese rating)
low_img_cortese_word = list(img_map.loc[img_map.img < 4, "word"])
low_img_cortese_idx = list(
    data.df_train.loc[data.df_train.word.isin(low_img_cortese_word)].index
)
testset_low_img_cortest = data.create_testset_from_train_idx(low_img_cortese_idx)
with gzip.open("dataset/testsets/cortese_low_img.pkl.gz", "wb") as f:
    pickle.dump(testset_low_img_cortest, f)

# Hi imageability (median split by Cortese rating)
hi_img_cortese_word = list(img_map.loc[img_map.img >= 4, "word"])
hi_img_cortese_idx = list(
    data.df_train.loc[data.df_train.word.isin(hi_img_cortese_word)].index
)
testset_hi_img_cortest = data.create_testset_from_train_idx(hi_img_cortese_idx)
with gzip.open("dataset/testsets/cortese_hi_img.pkl.gz", "wb") as f:
    pickle.dump(testset_hi_img_cortest, f)

In [None]:
# Taraban
taraban_name_map = {
    "High-frequency exception": "HF-EXC",
    "High-frequency regular-inconsistent": "HF-REG-INC",
    "Low-frequency exception": "LF-EXC",
    "Low-frequency regular-inconsistent": "LF-REG-INC",
    "Regular control for High-frequency exception": "CTRL-HF-EXC",
    "Regular control for High-frequency regular-inconsistent": "CTRL-HF-REG-INC",
    "Regular control for Low-frequency exception": "CTRL-LF-EXC",
    "Regular control for Low-frequency regular-inconsistent": "CTRL-LF-REG-INC",
}


for c in taraban.cond.unique():
    idx = list(
        data.df_train.loc[
            data.df_train.word.isin(taraban.loc[taraban.cond == c, "word"]),
        ].index
    )
    print(idx)

    with gzip.open(
        f"dataset/testsets/taraban_{taraban_name_map[c].lower()}.pkl.gz", "wb"
    ) as f:
        pickle.dump(data.create_testset_from_train_idx(idx), f)

# Grain multi pho (add multiple answer to axis 0)

In [None]:
import data_wrangling
import numpy as np
import pickle, gzip

def convert_grain(file):
    """Rewrite a grain testset so that 'pho' holds both acceptable answers.

    The testset is read with data_wrangling.load_testset(file), its
    'pho_small_grain' and 'pho_large_grain' entries are stacked on a new
    axis 1 (small grain first, large grain second) and stored under 'pho',
    and the whole testset is pickled back to the same ``file`` path,
    overwriting it.

    args:
        file: path of the gzipped pickle to convert in place
    """
    testset = data_wrangling.load_testset(file)

    # Answer order along axis 1: index 0 = small grain, index 1 = large grain.
    answers = (testset['pho_small_grain'], testset['pho_large_grain'])
    testset['pho'] = np.stack(answers, axis=1)

    print(f"Merged large and small grain pho into multi-answer format, with dim (items, ans, pho units)={testset['pho'].shape}")
    with gzip.open(file, "wb") as out:
        pickle.dump(testset, out)

# Local import: os.path is used below but `os` is not part of this cell's
# import block, so the cell would fail on a fresh kernel started here.
import os

names = ('grain', 'grain_ambiguous', 'grain_unambiguous')
# NOTE(review): absolute, machine-specific directory; the other cells write to
# the relative "dataset/testsets" folder — confirm and unify if they are the same.
files = [os.path.join("/home/jupyter/tf/dataset/testsets", f"{x}.pkl.gz") for x in names]

# Plain loop instead of a list comprehension: convert_grain is called only for
# its side effect (rewriting each file), and the cell no longer echoes a list
# of None values.
for grain_file in files:
    convert_grain(grain_file)



# Loading tests

In [None]:
import pickle, gzip
import pandas as pd 

# Path of the gzip-compressed pickle inspected by the cells below.
testset_file = "dataset/testsets/grain_unambiguous.pkl.gz"

# pickle.load can execute arbitrary code: only open testset files you trust.
with gzip.open(testset_file, "rb") as f:
    testset = pickle.load(f)

In [None]:
# Show which fields the loaded testset contains.
testset.keys()

In [None]:
# Display the 'pho_large_grain' entry of the loaded testset.
testset['pho_large_grain']

1. Get output [epoch, tick, y, item]
2. Get ans key [y, item]
3. Parallel eval is possible, cast to [epoch, tick]
4. Eval routine: acc, sse
5. Eval extra: out1, out0, other diagnostic
6. Combine

# Create a 300-word test set covering 3 levels of difficulty

In [None]:
import data_wrangling
import numpy as np
from typing import List


def sample_within_difficulty(
    low: float, high: float, n: int = 100, random_state=None
) -> tuple:
    """Sample words whose difficulty percentile rank lies within [low, high].

    Difficulty is the percentile rank of the training-set word frequency
    (``df_train.wf``) ranked in descending order, so the most frequent words
    have ranks close to 0 and the least frequent words have rank 1.

    args:
        low: lower bound of difficulty in percentile rank (inclusive)
        high: upper bound of difficulty in percentile rank (inclusive)
        n: number of words to draw without replacement (default 100)
        random_state: seed forwarded to ``Series.sample``; the default None
            keeps the draw non-deterministic, pass an int for a reproducible
            sample
    returns:
        A pair ``(idx, words)``: the df_train index labels of the sampled
        rows as a numpy array, and the matching words as a list.
    raises:
        ValueError: if ``low > high`` or fewer than ``n`` words fall in range
    """
    if low > high:
        raise ValueError(f"low ({low}) must not be greater than high ({high})")

    data = data_wrangling.MyData()
    # ascending=False: the highest-frequency word gets the smallest rank.
    pct = data.df_train.wf.rank(pct=True, ascending=False)
    in_range = pct.loc[(pct >= low) & (pct <= high)]

    if len(in_range) < n:
        raise ValueError(
            f"Only {len(in_range)} words have a percentile rank in "
            f"[{low}, {high}]; cannot sample {n} without replacement"
        )

    sel = in_range.sample(n, random_state=random_state)
    words = data.df_train.loc[sel.index, "word"]
    return sel.index.values, list(words)

In [None]:
import gzip, pickle

def create_testset(idx: List[int], filename: str, conds=("low", "mid", "hi")) -> dict:
    """Create, label and save a testset for a given list of training indices.

    ``idx`` must be made of equally sized consecutive groups, one per entry
    of ``conds`` and in the same order; each item is labelled with the
    condition of its group. With the default ``conds`` and 300 indices this
    yields 100 'low', then 100 'mid', then 100 'hi' labels.

    args:
        idx: list of indices into the training set, grouped by condition
        filename: file stem; the testset is written to
            dataset/testsets/{filename}.pkl.gz
        conds: condition labels, in the order their groups appear in ``idx``
    returns:
        The testset dict, including the added 'cond' list.
    raises:
        ValueError: if ``conds`` is empty or ``len(idx)`` is not a multiple
            of ``len(conds)``, since labels could not line up with the items
    """
    n_items, n_conds = len(idx), len(conds)
    if n_conds == 0 or n_items % n_conds != 0:
        raise ValueError(
            f"Cannot split {n_items} indices into {n_conds} equally sized conditions"
        )
    per_cond = n_items // n_conds

    data = data_wrangling.MyData()
    testset = data.create_testset_from_train_idx(idx)

    # Add condition labels: one label per item, derived from the real group
    # size rather than a fixed 100 so labels and items always align.
    testset['cond'] = [label for label in conds for _ in range(per_cond)]

    full_pickle_file_path = f"dataset/testsets/{filename}.pkl.gz"

    with gzip.open(full_pickle_file_path, "wb") as f:
        pickle.dump(testset, f)

    return testset



In [None]:
# (low, high) percentile-rank bounds of the three difficulty bands.
difficulty_bands = {"low": (0.0, 0.2), "mid": (0.4, 0.6), "hi": (0.8, 1.0)}

low_idx, low_words = sample_within_difficulty(*difficulty_bands["low"])
mid_idx, mid_words = sample_within_difficulty(*difficulty_bands["mid"])
hi_idx, hi_words = sample_within_difficulty(*difficulty_bands["hi"])

In [None]:
# Order matters: create_testset labels the rows low -> mid -> hi.
difficulty_testset_idx = np.concatenate((low_idx, mid_idx, hi_idx), axis=0)
create_testset(filename="train_r300_difficulty", idx=difficulty_testset_idx)

In [None]:
import data_wrangling
import numpy as np
from typing import List

# Add difficulty label to random 100

In [None]:
# Load the existing 'train_r100' testset; a 'cond' entry is attached to it below.
r100 = data_wrangling.load_testset('train_r100')
# MyData exposes df_train, whose 'word' and 'wf' columns drive the labelling.
data = data_wrangling.MyData()

In [None]:
# Median split on training-set word frequency: strictly below the median is
# low frequency ('lf'), everything else is high frequency ('hf').
median_wf = data.df_train.wf.median()
df = data.df_train[['word', 'wf']].copy()
df['group'] = 'hf'
df.loc[df.wf < median_wf, 'group'] = 'lf'

# word -> frequency group lookup; if a word occurs more than once the last
# occurrence wins.
mapper = dict(zip(df.word, df.group))
r100['cond'] = [mapper[item] for item in r100['item']]

In [None]:
import gzip, pickle
# Overwrite the original train_r100 file with the version that now carries the
# 'cond' labels. (Plain literal: the former f-string only interpolated a constant.)
full_pickle_file_path = "dataset/testsets/train_r100.pkl.gz"

with gzip.open(full_pickle_file_path, "wb") as f:
    pickle.dump(r100, f)

# Split train testset into batch

In [None]:
import data_wrangling
import tensorflow as tf
import helper

data = data_wrangling.MyData()
# NOTE: this rebinds `train`, which the first section of the notebook uses for
# the raw training DataFrame; from here on `train` is the loaded testset.
train = data_wrangling.load_testset('train')
# Number of items in the train testset; drives the batching loop below.
n = len(train['item'])


In [None]:


def subset_train(train:dict, begin:int, size:int) -> dict:
    """Build a new testset from a contiguous run of items in ``train``.

    args:
        train: testset dict; 'item' is sliced as a sequence and every other
            entry is sliced with tf.slice along its first axis
            (assumes those entries are 2D — TODO confirm)
        begin: position of the first item to keep; must satisfy
            0 <= begin < number of items (checked with an assert)
        size: number of items to keep; shortened automatically when
            begin + size runs past the last item
    returns:
        A new dict with the sliced entries, 'cond' set to None and 'phoneme'
        recomputed from the sliced 'pho' entry.
    """
    total = len(train['item'])
    assert 0 <= begin < total

    # Clamp the window so it never reaches beyond the last item.
    size = min(size, total - begin)
    stop = begin + size

    new = {}
    for key, values in train.items():
        if key == 'item':
            # 1D: ordinary slice
            new[key] = values[begin:stop]
        else:
            # 2D: take `size` rows starting at `begin`, all columns
            n_cols = values.shape[1]
            new[key] = tf.slice(values, begin=[begin, 0], size=[size, n_cols])

    new['cond'] = None
    new['phoneme'] = helper.get_batch_pronunciations_fast(new['pho'])
    return new

batch_size = 500
# Step through the start offsets directly. The previous range(n // batch_size + 1)
# produced one extra iteration with begin == n whenever n was an exact multiple
# of batch_size (or 0), which trips the `begin < n_train` assert in subset_train.
for i, begin in enumerate(range(0, n, batch_size)):
    print(f"{i}: {begin}, {min(n, begin + batch_size)}")
    tmp = subset_train(train, begin, batch_size)
    data_wrangling.save_testset(tmp, f"dataset/testsets/train_batch_{i}.pkl.gz")

# Repackage homophony testset

In [None]:
import data_wrangling
import numpy as np
from random import sample
import helper

In [None]:
# Local import: this section only imports `sample` from random.
from random import Random

nh = data_wrangling.load_testset('non_homophone')
h = data_wrangling.load_testset('homophone')

# Sample 100 from each condition.
# A dedicated, seeded generator makes the saved homophony testset reproducible
# under Restart & Run All without touching the global random state.
n_per_cond = 100
rng = Random(0)
h_sel_id = rng.sample(range(len(h['item'])), n_per_cond)
nh_sel_id = rng.sample(range(len(nh['item'])), n_per_cond)

In [None]:
# Sampled homophones come first, sampled non-homophones second, in every field.
homophony = {
    'item': [h['item'][i] for i in h_sel_id] + [nh['item'][i] for i in nh_sel_id],
    'cond': ['homophone'] * len(h_sel_id) + ['non_homophone'] * len(nh_sel_id),
}

# Stack the selected rows of each representation in the same order as 'item'.
for key in ('ort', 'pho', 'sem'):
    homophony[key] = np.concatenate([h[key][h_sel_id], nh[key][nh_sel_id]], axis=0)

homophony['phoneme'] = helper.get_batch_pronunciations_fast(homophony['pho'])

data_wrangling.save_testset(homophony, 'dataset/testsets/homophony.pkl.gz')