In [9]:
from src.learner import *
import time
import pandas as pd
from utilities import remove_cols

# Word labels, one per row of the input/output matrices below.
# keep_default_na=False stops pandas from converting word tokens such as
# "nan", "null" or "NA" into float NaN, which would silently corrupt a word list.
words = pd.read_csv('data/harm/labels.csv', header=None, keep_default_na=False)[0].tolist()

# inputs and outputs
# `remove_cols` is a project helper (see utilities); its exact filtering rule is
# not visible in this notebook.
X = remove_cols(np.genfromtxt('data/harm/orth.csv', delimiter=","))
Y = remove_cols(np.genfromtxt('data/harm/phon.csv', delimiter=","))

This script examines whether or not we can obtain perfect predictions for the 8k (Harm) corpus by fitting individual models with specific configurations. We will do this before we move on to more sophisticated tuning. We are just trying to find a set that we can learn perfectly here.

Limited grid search across hyperparameters (HPs): learning rate, batch size, number of epochs, and number of hidden units.

In [None]:
seed = 387

# Column order of the results file; each data row written below follows it.
header = ["hidden_units", "learning_rate", "batch_size", "epochs",
          "loss_train", "accuracy_train", "mse_train",
          "loss_test", "accuracy_test", "mse_test", "time"]

# Exhaustive grid: one freshly built model is fit and scored per combination,
# and one CSV row is appended per model. The `with` block closes the file, so
# no explicit close() is needed afterwards.
with open('outputs/tune_1_harm.csv', 'w') as f:
    f.write(",".join(header) + "\n")
    for learning_rate in [.075, .1, .2, .25]:
        for batch_size in [64, 128, 256]:
            for epochs in [50, 100, 150, 250]:
                for hidden in [100, 150, 200]:

                    model = learner(X, Y, seed=seed, hidden=hidden, optimizer=Adam(learning_rate=learning_rate))

                    # Only the fit itself is timed (model construction and evaluation excluded).
                    start_time = time.time()
                    model.fit(X, Y, epochs=epochs, batch_size=batch_size, verbose=False)
                    end_time = time.time()
                    runtime = end_time - start_time

                    loss_train, accuracy_train, mse_train = model.evaluate(X, Y, verbose=0)
                    # NOTE(review): the "test" metrics are computed on the same X, Y used for
                    # training, so they duplicate the train metrics - confirm this is intended.
                    loss_test, accuracy_test, mse_test = model.evaluate(X, Y, verbose=0)

                    row = [hidden, learning_rate, batch_size, epochs,
                           loss_train, accuracy_train, mse_train,
                           loss_test, accuracy_test, mse_test,
                           runtime]
                    f.write(",".join("{}".format(value) for value in row) + "\n")
                    # Flush after every model so an interrupted sweep keeps the finished rows.
                    f.flush()

In [10]:
seed = 387

# Column order of the results file; each data row written below follows it.
header = ["hidden_units", "learning_rate", "batch_size", "epochs",
          "loss_train", "accuracy_train", "mse_train",
          "loss_test", "accuracy_test", "mse_test", "time"]

# Second, wider grid (larger learning rates and batch sizes). One freshly built
# model is fit and scored per combination, and one CSV row is appended per model.
# The `with` block closes the file, so no explicit close() is needed afterwards.
with open('outputs/tune_1_harm_2.csv', 'w') as f:
    f.write(",".join(header) + "\n")
    for learning_rate in [.1, .2, .3, .4, .5, .6, .7, .8, .9]:
        for batch_size in [256, 384, 512]:
            # NOTE(review): 100 appears twice in this list; the final entry may have been
            # meant to be 1000. It is left unchanged because later cells use the `model`
            # left over from the last iteration of this loop.
            for epochs in [50, 100, 150, 250, 500, 100]:
                for hidden in [100, 150, 200]:

                    model = learner(X, Y, seed=seed, hidden=hidden, optimizer=Adam(learning_rate=learning_rate))

                    # Only the fit itself is timed (model construction and evaluation excluded).
                    start_time = time.time()
                    model.fit(X, Y, epochs=epochs, batch_size=batch_size, verbose=False)
                    end_time = time.time()
                    runtime = end_time - start_time

                    loss_train, accuracy_train, mse_train = model.evaluate(X, Y, verbose=0)
                    # NOTE(review): the "test" metrics are computed on the same X, Y used for
                    # training, so they duplicate the train metrics - confirm this is intended.
                    loss_test, accuracy_test, mse_test = model.evaluate(X, Y, verbose=0)

                    row = [hidden, learning_rate, batch_size, epochs,
                           loss_train, accuracy_train, mse_train,
                           loss_test, accuracy_test, mse_test,
                           runtime]
                    f.write(",".join("{}".format(value) for value in row) + "\n")
                    # Flush after every model so an interrupted sweep keeps the finished rows.
                    f.flush()

In [13]:
# Score the predictions of the current `model` (whatever model object the
# previously executed cell left in the kernel) against the targets, then keep
# the words whose accuracy flag equals True.
predictions = model.predict(X)
accuracies = batch_accuracy(Y, predictions, dichotomous=True)
correct_words = [
    label
    for label, is_correct in zip(words, accuracies)
    if is_correct == True
]



In [14]:
# Number of items whose accuracy flag equals True.
sum(1 for flag in accuracies if flag == True)

1005

In [15]:
# Display the full list of words whose accuracy flag was True (long output).
correct_words

['aft',
 'aid',
 'aids',
 'ailed',
 'al',
 'amp',
 'amps',
 'an',
 'ant',
 'ants',
 'ash',
 'ashe',
 'ay',
 'bal',
 'bang',
 'banks',
 'bash',
 'bashed',
 'bay',
 'bayed',
 'bays',
 'beck',
 'bed',
 'beds',
 'beefs',
 'begs',
 'bel',
 'bells',
 'belt',
 'belts',
 'ben',
 'bend',
 'bent',
 'bents',
 'best',
 'bet',
 'bets',
 'bey',
 'bic',
 'bid',
 'bids',
 'biff',
 'biffed',
 'bin',
 'bing',
 'bins',
 'bit',
 'bits',
 'bix',
 'blaine',
 'bled',
 'blend',
 'blest',
 'blimp',
 'blimps',
 'blink',
 'blip',
 'blub',
 'bluff',
 'bluffs',
 'blunts',
 'blur',
 'book',
 'booked',
 'boor',
 'braid',
 'brain',
 'braise',
 'bray',
 'brayed',
 'brays',
 'bred',
 'breezed',
 'brent',
 'bret',
 'brick',
 'briefs',
 'bring',
 'brink',
 'brinks',
 'brit',
 'brunt',
 'buck',
 'bud',
 'budge',
 'buds',
 'buff',
 'buffed',
 'bug',
 'bugs',
 'bum',
 'bumps',
 'bun',
 'bund',
 'bung',
 'bunk',
 'bunks',
 'buns',
 'bunts',
 'bus',
 'busk',
 'bust',
 'busts',
 'but',
 'buts',
 'caine',
 'cal',
 'camps',
 'ca

In [17]:
# Persist the correctly predicted words, one word per line. The `with` block
# closes the file on exit, so no explicit close() is needed afterwards.
with open('data/correct_words.csv', 'w') as f:
    f.writelines('{}\n'.format(word) for word in correct_words)

The set of correct words was sorted to produce 900 child-appropriate words, drawn from the larger learnable set in `correct_words` above. These 900 words were written to file and are read in below.

Only a portion of the words are learned perfectly...

In [25]:
# keep_default_na=False stops pandas from converting word tokens such as
# "nan", "null" or "NA" into float NaN when reading the word list.
easy_900 = pd.read_csv('data/harm/easy_900.csv', header=None, keep_default_na=False)[0].tolist()

# Use a set for membership tests: O(1) per lookup, where scanning the list
# would cost one pass per word.
easy_900_lookup = set(easy_900)
indices = [i for i, e in enumerate(words) if e in easy_900_lookup]

# Boolean mask over `words` (and therefore over the rows of X and Y): True marks
# a word that appears in easy_900.
# NOTE(review): the original comment said these indices "select the test items
# not the train items", but the fit cell further down trains on X[sample] -
# confirm which role the mask is meant to play.
sample = np.full(len(words), False, dtype=bool)
sample[indices] = True

In [30]:
# Build one model with a fixed configuration and fit it on the rows selected by
# the boolean mask `sample` only; the wall-clock duration of the fit is kept in
# `runtime`. `seed` comes from an earlier cell.
model = learner(
    X,
    Y,
    seed=seed,
    hidden=100,
    optimizer=Adam(learning_rate=.075),
)

start_time = time.time()
model.fit(
    X[sample],
    Y[sample],
    epochs=50,
    batch_size=256,
    verbose=True,
)
end_time = time.time()

runtime = end_time - start_time

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
