In [39]:
import pandas as pd
from keras.optimizers import SGD
from src.learner import *
import time
import random

Some tuning for the `kidwords` set of words.

In [4]:
# Model inputs (X) and targets (Y), read as comma-separated numeric matrices.
# Presumably orthographic and phonological codes, judging by the file names.
X = np.genfromtxt('data/kidwords/orth-kid.csv', delimiter=",")
Y = np.genfromtxt('data/kidwords/phon-kid.csv', delimiter=",")

# The word list is the first column of a header-less CSV.
words_table = pd.read_csv('data/kidwords/kidwords.csv', header=None)
words = words_table.iloc[:, 0].tolist()

For tuning we will use a random sample of the same size that our samples will eventually be. This involves allocating 600 words for test, and the rest for train - but not using our pre-allocated samples for the purpose. This is the same procedure as other tuning efforts here.

In [5]:
np.random.seed(982)

target_train_size = 300

# `train_n` is the total number of items in X; every item beyond the
# `target_train_size` training items is held out for testing.
train_n = len(X)
test_n = train_n - target_train_size

# Boolean mask over all items: True marks a training item, False a test item.
sample = np.ones(train_n, dtype=bool)

# Draw the held-out (test) positions without replacement and switch them
# off in the mask, leaving exactly `target_train_size` True entries.
indices = np.random.choice(train_n, test_n, replace=False)
sample[indices] = False

Search across hyperparameters (learning rate, batch size, epochs, hidden units), all using SGD...

In [23]:
# Base seed: handed to learner(...) in the cells below and used as the first
# of the per-run seeds in the longitudinal loop.
seed = 387
# Seeds Python's built-in `random` module only; NumPy's global generator is
# seeded separately with np.random.seed(...).
random.seed(seed)

In [25]:
# Grid search over SGD hyperparameters.
#
# For every (learning_rate, batch_size, epochs, hidden) combination a fresh
# model is built with learner(...), trained on the training split
# (X[sample]) and evaluated on both the training and the held-out split
# (X[~sample]). One CSV row per configuration is written to
# outputs/tune_kidwords_2_sgd.csv.
#
# NOTE(review): model.evaluate is unpacked as (loss, accuracy, mse); that
# ordering depends on the metrics compiled inside learner(...) -- confirm.
tuning_columns = [
    "hidden_units",
    "learning_rate",
    "batch_size",
    "epochs",
    "loss_train",
    "accuracy_train",
    "mse_train",
    "loss_test",
    "accuracy_test",
    "mse_test",
    "time",
]

# The `with` block closes the file on exit (including on error), so no
# explicit f.close() is needed afterwards.
with open('outputs/tune_kidwords_2_sgd.csv', 'w') as f:
    f.write(",".join(tuning_columns) + "\n")

    for learning_rate in [.01, .025, .05, .075, .1, .15, .2, .25]:
        for batch_size in [16, 32, 64, 96, 128, 256]:
            for epochs in [400, 450, 500, 550, 600]:
                for hidden in [100]:

                    optimizer = SGD(learning_rate=learning_rate)

                    model = learner(X, Y, seed, hidden, optimizer=optimizer)

                    # perf_counter() is monotonic, so the measured training
                    # time cannot be distorted by wall-clock adjustments.
                    start_time = time.perf_counter()

                    model.fit(X[sample], Y[sample], epochs=epochs, batch_size=batch_size, verbose=False)

                    end_time = time.perf_counter()
                    runtime = end_time - start_time

                    loss_train, accuracy_train, mse_train = model.evaluate(X[sample], Y[sample], verbose=0)
                    loss_test, accuracy_test, mse_test = model.evaluate(X[~sample], Y[~sample], verbose=0)

                    row = (
                        hidden,
                        learning_rate,
                        batch_size,
                        epochs,
                        loss_train,
                        accuracy_train,
                        mse_train,
                        loss_test,
                        accuracy_test,
                        mse_test,
                        runtime,
                    )
                    f.write(",".join("{}".format(value) for value in row) + "\n")

                    # The full grid takes a long time; flush after every
                    # configuration so finished rows survive an interruption.
                    f.flush()

Let's look at a specific set of hyperparameters for reference:

In [40]:
# Reference configuration used by the cells below.
# `hidden` is passed as the `hidden` argument of learner(...);
# `learning_rate` is passed to the SGD optimizer.
hidden = 100
learning_rate = 0.25

In [99]:
# Build a fresh model with the reference configuration and train it on the
# training split (X[sample]) for 140 epochs with batches of 256 items.
optimizer = SGD(learning_rate=learning_rate)
model = learner(X, Y, seed, hidden, optimizer=optimizer)                    
model.fit(X[sample], Y[sample], epochs=140, batch_size=256, verbose=True)

Epoch 1/140
Epoch 2/140
Epoch 3/140
Epoch 4/140
Epoch 5/140
Epoch 6/140
Epoch 7/140
Epoch 8/140
Epoch 9/140
Epoch 10/140
Epoch 11/140
Epoch 12/140
Epoch 13/140
Epoch 14/140
Epoch 15/140
Epoch 16/140
Epoch 17/140
Epoch 18/140
Epoch 19/140
Epoch 20/140
Epoch 21/140
Epoch 22/140
Epoch 23/140
Epoch 24/140
Epoch 25/140
Epoch 26/140
Epoch 27/140
Epoch 28/140
Epoch 29/140
Epoch 30/140
Epoch 31/140
Epoch 32/140
Epoch 33/140
Epoch 34/140
Epoch 35/140
Epoch 36/140
Epoch 37/140
Epoch 38/140
Epoch 39/140
Epoch 40/140
Epoch 41/140
Epoch 42/140
Epoch 43/140
Epoch 44/140
Epoch 45/140
Epoch 46/140
Epoch 47/140
Epoch 48/140
Epoch 49/140
Epoch 50/140
Epoch 51/140
Epoch 52/140
Epoch 53/140
Epoch 54/140
Epoch 55/140
Epoch 56/140
Epoch 57/140
Epoch 58/140
Epoch 59/140
Epoch 60/140
Epoch 61/140
Epoch 62/140
Epoch 63/140
Epoch 64/140
Epoch 65/140
Epoch 66/140
Epoch 67/140
Epoch 68/140
Epoch 69/140
Epoch 70/140
Epoch 71/140
Epoch 72/140
Epoch 73/140
Epoch 74/140
Epoch 75/140
Epoch 76/140
Epoch 77/140
Epoch 78

<keras.callbacks.History at 0x7f47f3bd67d0>

Iterate through random samples, then train and save the results in longruns...

In [100]:
# Call fit again on the same `model` object from the previous cell: 70 more
# epochs on the training split, this time with batch_size=1.
model.fit(X[sample], Y[sample], epochs=70, batch_size=1, verbose=True)

Epoch 1/70
Epoch 2/70
Epoch 3/70
Epoch 4/70
Epoch 5/70
Epoch 6/70
Epoch 7/70
Epoch 8/70
Epoch 9/70
Epoch 10/70
Epoch 11/70
Epoch 12/70
Epoch 13/70
Epoch 14/70
Epoch 15/70
Epoch 16/70
Epoch 17/70
Epoch 18/70
Epoch 19/70
Epoch 20/70
Epoch 21/70
Epoch 22/70
Epoch 23/70
Epoch 24/70
Epoch 25/70
Epoch 26/70
Epoch 27/70
Epoch 28/70
Epoch 29/70
Epoch 30/70
Epoch 31/70
Epoch 32/70
Epoch 33/70
Epoch 34/70
Epoch 35/70
Epoch 36/70
Epoch 37/70
Epoch 38/70
Epoch 39/70
Epoch 40/70
Epoch 41/70
Epoch 42/70
Epoch 43/70
Epoch 44/70
Epoch 45/70
Epoch 46/70
Epoch 47/70
Epoch 48/70
Epoch 49/70
Epoch 50/70
Epoch 51/70
Epoch 52/70
Epoch 53/70
Epoch 54/70
Epoch 55/70
Epoch 56/70
Epoch 57/70
Epoch 58/70
Epoch 59/70
Epoch 60/70
Epoch 61/70
Epoch 62/70
Epoch 63/70
Epoch 64/70
Epoch 65/70
Epoch 66/70
Epoch 67/70
Epoch 68/70
Epoch 69/70
Epoch 70/70


<keras.callbacks.History at 0x7f48f4354af0>

In [103]:
# Longitudinal runs: for 20 consecutive seeds, draw a fresh train/test split,
# train a model one epoch at a time and, after every epoch, write a CSV with
# the per-unit correctness of the thresholded predictions for every word.
hidden = 100
learning_rate = 0.25
epochs = 210

# Training schedule: the first 140 epochs use batches of 256, the remaining
# epochs use batch_size=1.
large_batch_epochs = 140
large_batch_size = 256
small_batch_size = 1

# Batch size used only for model.predict. It controls how the inputs are
# chunked for inference, so it is fixed instead of depending on a
# `batch_size` value left over from another cell.
predict_batch_size = 256

# Split sizes do not depend on the seed, so compute them once.
target_train_size = 300
train_n = X.shape[0]
test_n = train_n - target_train_size

snapshot_prefix = 'outputs/kidwords_2_sgd_longitudinal/tune_kidwords_2_sgd_longitudinal_seed_'


def write_accuracy_snapshot(model, split, s, epoch):
    """Predict on all of X, score each output unit against Y and save a CSV.

    Parameters
    ----------
    model : object returned by learner(...); only its predict method is used.
    split : list of 'train'/'test' labels, one per word.
    s : seed of the current run (part of the file name).
    epoch : number of training epochs completed so far (0 = untrained).

    Returns
    -------
    The pandas DataFrame that was written: one row per word, one 1/0 column
    per output unit (columns numbered from 1), plus 'words' and 'train_test'.
    """
    preds = model.predict(X, batch_size=predict_batch_size)
    preds_binary = (preds > 0.5).astype(int)
    accuracies = (preds_binary == Y).astype(int)

    df = pd.DataFrame(accuracies.tolist())
    df.columns = [colname + 1 for colname in df.columns]
    df['words'] = words
    df['train_test'] = split

    fname = snapshot_prefix + str(s) + '_epoch_' + str(epoch) + '.csv'
    df.to_csv(fname, index=False)
    return df


for s in range(seed, seed+20): # one run per seed, starting with `seed` from above

    # np.random.choice below draws from NumPy's global generator, which
    # random.seed() does not touch; seed both so each split is reproducible.
    random.seed(s)
    np.random.seed(s)

    # Boolean mask over all items: True = train, False = test.
    sample = np.full(train_n, True, dtype=bool)
    indices = np.random.choice(train_n, test_n, replace=False)
    sample[indices] = False

    # One 'train'/'test' label per word, in word order.
    split = np.where(sample, 'train', 'test').tolist()

    model = learner(X, Y, seed=s, hidden=hidden, optimizer=SGD(learning_rate=learning_rate))

    # Snapshot of the untrained model, stored as epoch 0.
    df = write_accuracy_snapshot(model, split, s, epoch=0)

    # Snapshots after training are numbered by completed epochs (1..epochs),
    # so the first trained epoch no longer overwrites the epoch-0 baseline.
    for epoch in range(1, epochs + 1):

        if epoch <= large_batch_epochs:
            batch_size = large_batch_size
        else:
            batch_size = small_batch_size

        model.fit(X[sample], Y[sample], epochs=1, batch_size=batch_size, verbose=True)

        df = write_accuracy_snapshot(model, split, s, epoch)



In [33]:
# Train one more model with Keras' default SGD settings (SGD() with no
# arguments) for 400 epochs on the current training split.
# SGD is already imported in the notebook's first cell, so it is not
# re-imported here.
# NOTE(review): `s`, `hidden`, `batch_size` and `sample` are whatever the
# previously executed cells left in the kernel -- confirm these are the
# intended values before relying on this run.
model = learner(X, Y, seed=s, hidden=hidden, optimizer=SGD())

model.fit(X[sample], Y[sample], epochs=400, batch_size=batch_size, verbose=True)

Epoch 1/400
Epoch 2/400
Epoch 3/400
Epoch 4/400
Epoch 5/400
Epoch 6/400
Epoch 7/400
Epoch 8/400
Epoch 9/400
Epoch 10/400
Epoch 11/400
Epoch 12/400
Epoch 13/400
Epoch 14/400
Epoch 15/400
Epoch 16/400
Epoch 17/400
Epoch 18/400
Epoch 19/400
Epoch 20/400
Epoch 21/400
Epoch 22/400
Epoch 23/400
Epoch 24/400
Epoch 25/400
Epoch 26/400
Epoch 27/400
Epoch 28/400
Epoch 29/400
Epoch 30/400
Epoch 31/400
Epoch 32/400
Epoch 33/400
Epoch 34/400
Epoch 35/400
Epoch 36/400
Epoch 37/400
Epoch 38/400
Epoch 39/400
Epoch 40/400
Epoch 41/400
Epoch 42/400
Epoch 43/400
Epoch 44/400
Epoch 45/400
Epoch 46/400
Epoch 47/400
Epoch 48/400
Epoch 49/400
Epoch 50/400
Epoch 51/400
Epoch 52/400
Epoch 53/400
Epoch 54/400
Epoch 55/400
Epoch 56/400
Epoch 57/400
Epoch 58/400
Epoch 59/400
Epoch 60/400
Epoch 61/400
Epoch 62/400
Epoch 63/400
Epoch 64/400
Epoch 65/400
Epoch 66/400
Epoch 67/400
Epoch 68/400
Epoch 69/400
Epoch 70/400
Epoch 71/400
Epoch 72/400
Epoch 73/400
Epoch 74/400
Epoch 75/400
Epoch 76/400
Epoch 77/400
Epoch 78

<keras.callbacks.History at 0x7fbb671241c0>

In [None]:
# Display the most recent accuracy table (`df` is assigned in an earlier cell).
df

In [None]:
# Score the model's predictions for all of X against Y.
# NOTE(review): batch_accuracy is not defined in this notebook (presumably it
# comes from src.learner); the meaning of dichotomous=False and the shape of
# the result should be confirmed there.
accuracy = batch_accuracy(Y, model.predict(X), dichotomous=False)

In [None]:
# Count the entries of `accuracy` that compare equal to True.
sum(1 for acc in accuracy if acc == True)

In [None]:
# Binarise the model outputs for all of X: 1 where the output exceeds 0.5, else 0.
preds = (model.predict(X) > .5).astype(int)

In [None]:
# Write a 1/0 matrix marking where the binarised predictions match Y.
# NOTE(review): the file name mentions "tune_3k_1a" rather than kidwords --
# confirm this is the intended output path for this notebook.
np.savetxt('outputs/tune_3k_1a_prediction_accuracies_sample_run.csv', (preds == Y).astype(int), delimiter = ',')

Now let's take this configuration and run over a set of random seeds and generate some longitudinal data to examine. 