# Tidy source files to keys csv

In [11]:
%load_ext lab_black
import pandas as pd
import numpy as np

# Read training file
train_file = '../common/patterns/6ktraining.dict'

strain_file = '../common/patterns/strain.txt'
strain_key_file = '../common/patterns/strain_key.txt'

grain_file = '../common/patterns/grain_nws.dict'
grain_key_file = '../common/patterns/grain_key.txt'

# Imageability
cortese = pd.read_csv('../common/patterns/cortese2004norms.csv', skiprows=9)
img_map = cortese[['item', 'rating']]
img_map.columns = ['word', 'img']

train = pd.read_csv(
    train_file,
    sep='\t',
    header=None,
    names=['word', 'ort', 'pho', 'wf'],
    na_filter=False    # Bug fix: incorrectly treated null as missing value in the corpus
)

In [12]:
df_train = pd.merge(train, img_map, on='word', how='left')

strain = pd.read_csv(
    strain_file, sep='\t', header=None, names=['word', 'ort', 'pho', 'wf']
)

strain_key = pd.read_table(
    strain_key_file,
    header=None,
    delim_whitespace=True,
    names=['word', 'frequency', 'pho_consistency', 'imageability']
)

df_strain = pd.merge(strain, strain_key)
df_strain = pd.merge(df_strain, img_map, on='word', how='left')

grain = pd.read_csv(
    grain_file,
    sep='\t',
    header=None,
    names=['word', 'ort', 'pho_large', 'pho_small']
)

grain_key = pd.read_table(
    grain_key_file,
    header=None,
    delim_whitespace=True,
    names=['word', 'condition']
)

grain_key['condition'] = np.where(
    grain_key['condition'] == 'critical', 'ambiguous', 'unambiguous'
)

df_grain = pd.merge(grain, grain_key)

df_grain['img'] = 0
df_grain['wf'] = 0


def prepDF(t):
    # The first bit and last 3 bits are empty in this source dataset (6ktraining.dict)
    t['ort'] = t.ort.apply(lambda x: x[1:11])
    return t


df_train = prepDF(df_train)
df_strain = prepDF(df_strain)
df_grain = prepDF(df_grain)

# Fill missing value to mean img rating
mean_img = df_train.img.mean()
df_train = df_train.fillna(mean_img)
df_strain = df_strain.fillna(mean_img)

df_train.to_csv('../common/input/df_train.csv')
df_strain.to_csv('../common/input/df_strain.csv')
df_grain.to_csv('../common/input/df_grain.csv')

print(df_train.head)
print(df_strain.head)
print(df_grain.head)

<bound method NDFrame.head of         word         ort         pho         wf       img
0          a  ___a______  ___^______  1100290.0  4.199444
1        ace  ___a_ce___  ___es_____      117.0  4.500000
2       ache  ___a_che__  ___ek_____       27.0  3.800000
3      ached  ___a_ched_  ___ekt____        3.0  4.199444
4      aches  ___a_ches_  ___eks____       23.0  4.199444
...      ...         ...         ...        ...       ...
5865     zoo  __zoo_____  __zu______      172.0  6.800000
5866    zoom  __zoom____  __zum_____       38.0  3.500000
5867  zoomed  __zoomed__  __zumd____       44.0  4.199444
5868   zooms  __zooms___  __zumz____       18.0  4.199444
5869    zoos  __zoos____  __zuz_____       23.0  4.199444

[5870 rows x 5 columns]>
<bound method NDFrame.head of       word         ort         pho     wf frequency pho_consistency  \
0     ball  __ba_ll___  __bal_____   1393        HF             INC   
1     bank  __ba_nk___  __b@nk____  53170        HF             CON   
2    

# Encode input and output

In [None]:
# Encode orthographic representation
def ort2bin(o_col, trimMode=True, verbose=True):
    # Replicating support.py (o_char)
    # This function wrap tokenizer.texts_to_matrix to fit on multiple
    # independent slot-based input
    # i.e. one-hot encoding per each slot with independent dictionary

    from tensorflow.keras.preprocessing.text import Tokenizer

    nSlot = len(o_col[0])
    nWord = len(o_col)

    slotData = nWord * [None]
    binData = pd.DataFrame()

    for slotId in range(nSlot):
        for wordId in range(nWord):
            slotData[wordId] = o_col[wordId][slotId]

        t = Tokenizer(filters='', lower=False)
        t.fit_on_texts(slotData)
        seqData = t.texts_to_sequences(
            slotData
        )  # Maybe just use sequence data later

        # Triming first bit in each slot
        if trimMode == True:
            tmp = t.texts_to_matrix(slotData)
            thisSlotBinData = tmp[:, 1::
                                 ]  # Remove the first bit which indicate a separate slot (probably useful in recurrent network)
        elif trimMode == False:
            thisSlotBinData = t.texts_to_matrix(slotData)

        # Print dictionary details
        if verbose == True:
            print('In slot ', slotId, '\t')
            print('token count:', t.word_counts)
            print('word count:', t.document_count)
            print('dictionary:', t.word_index)
            print('token appear in how many words:', t.word_docs)

        # Put binary data into a dataframe
        binData = pd.concat(
            [binData, pd.DataFrame(thisSlotBinData)], axis=1, ignore_index=True
        )

    return binData


def ort2bin_v2(o_col):
    # Use tokenizer instead to acheive same thing, but with extra zeros columns
    # Will be useful for letter level recurrent model
    from tensorflow.keras.preprocessing.text import Tokenizer
    t = Tokenizer(filters='', lower=False, char_level=True)
    t.fit_on_texts(o_col)
    print('dictionary:', t.word_index)
    return t.texts_to_matrix(o_col)


# Merge all 3 ortho representation
all_ort = pd.concat(
    [df_train.ort, df_strain.ort, df_grain.ort], ignore_index=True
)

# Encoding orthographic representation
all_ort_bin = ort2bin(all_ort, verbose=False)
splitId_strain = len(df_train)
splitId_grain = len(df_train) + len(df_strain)

x_train = np.array(all_ort_bin[0:splitId_strain])
x_strain = np.array(all_ort_bin[splitId_strain:splitId_grain])
x_grain = np.array(all_ort_bin[splitId_grain::])

# Save to disk
np.savez_compressed('../common/input/x_train.npz', data=x_train)
np.savez_compressed('../common/input/x_strain.npz', data=x_strain)
np.savez_compressed('../common/input/x_grain.npz', data=x_grain)

print('==========Orthographic representation==========')
print('all shape:', all_ort_bin.shape)
print('x_train shape:', x_train.shape)
print('x_strain shape:', x_strain.shape)
print('x_grain shape:', x_grain.shape)


def pho2bin_v2(p_col, p_key):
    # Vectorize for performance (that no one ask for... )
    binLength = len(p_key['_'])
    n = len(p_col)
    nPhoChar = len(p_col[0])

    p_output = np.empty([n, binLength * nPhoChar])

    for slot in range(len(p_col[0])):
        slotSeries = p_col.str.slice(start=slot, stop=slot + 1)
        outSeries = slotSeries.map(p_key)
        p_output[:, range(slot * 25, (slot + 1) * 25)] = outSeries.to_list()
    return p_output


phon_key = gen_pkey()
y_train = pho2bin_v2(train.pho, phon_key)
y_strain = pho2bin_v2(strain.pho, phon_key)
y_large_grain = pho2bin_v2(grain.pho_large, phon_key)
y_small_grain = pho2bin_v2(grain.pho_small, phon_key)

# Save to disk
np.savez_compressed('../common/input/y_train.npz', data=y_train)
np.savez_compressed('../common/input/y_strain.npz', data=y_strain)
np.savez_compressed('../common/input/y_large_grain.npz', data=y_large_grain)
np.savez_compressed('../common/input/y_small_grain.npz', data=y_small_grain)

print('\n==========Phonological representation==========')
print(len(phon_key), ' phonemes: ', phon_key.keys())
print('y_train shape:', y_train.shape)
print('y_strain shape:', y_strain.shape)
print('y_large_grain shape:', y_large_grain.shape)
print('y_small_grain shape:', y_small_grain.shape)

# Testing and evaluating new sampling probability

In [None]:
import pandas as pd
import numpy as np
df_train = pd.read_csv('../common/input/df_train.csv', index_col=0)

# Plot sampling conversion graph
import matplotlib.pyplot as plt
from data_wrangling import wf_manager

plot_f = df_train.sort_values('wf')
sortwf = wf_manager(plot_f['wf'])

fig, ax = plt.subplots(facecolor="w")
line1, = ax.plot(sortwf.wf, sortwf.samp_log(), label='HS04')
line2, = ax.plot(sortwf.wf, sortwf.samp_hs04(), label='JAY')
line3, = ax.plot(sortwf.wf, sortwf.samp_jay(), label='LOG')
ax.legend(loc='lower right')
plt.xlabel('Word frequency')
plt.ylabel('Sampling probability')
# plt.xlim([0,100])
plt.title('Tested sampling p vs. word frequency')
plt.show()