In [1]:
import json
import math
import os
import time

import numpy as np
import pandas as pd
from keras import backend as K

from src.learner import *

def scale(x, K):
    """Return the natural logarithm of ``x`` multiplied by the constant ``K``.

    Used later in this notebook to turn a word frequency into a sample weight.

    Args:
        x: Positive number to take the logarithm of; ``math.log`` raises
            ``ValueError`` for values <= 0.
        K: Multiplier applied to the logarithm. Inside this function the name
            shadows the module-level Keras backend alias ``K``.

    Returns:
        ``K * ln(x)``.
    """
    log_x = math.log(x)
    return K * log_x

# Boolean masks loaded from CSV. Later cells take one column at a time
# (e.g. samples[:, sample]) and use it to select rows of X, Y and the weights.
samples, holdouts, tests = (
    np.genfromtxt(mask_path, delimiter=",").astype(bool)
    for mask_path in ('data/samples.csv', 'data/holdouts.csv', 'data/tests.csv')
)



## Purpose
Here we take the 20 best and 20 worst models from each of the 10- and 100-hidden-unit conditions and retrain them longitudinally, saving predictions, accuracies, and error for all words (train and test alike) after every training epoch. The learners to re-run are read from a saved CSV file below.

In [3]:
# Load the table of learners to re-run; a later cell reads its `model_id` and
# `hidden_units` columns to decide which samples belong to which condition.
# NOTE(review): the preview of this frame shows an `Unnamed: 0` column, which usually
# means the CSV was written with its index — presumably `index_col=0` would drop it; confirm.
target_learners = pd.read_csv('data/top_and_bottom_20_learners_10_and_100_hidden_units.csv')

In [4]:
# Preview the first rows to check that the `model_id` and `hidden_units` columns are present.
target_learners.head()

Unnamed: 0,model_id,hidden_units
0,2287,10
1,5070,10
2,5434,10
3,2675,10
4,1305,10


In [7]:
# Split the selected learners by hidden-unit condition.
# Boolean-mask selection replaces a row-by-row iterrows() loop: it is the idiomatic
# pandas form, avoids per-row Series construction, and keeps the column's own dtype
# (iterrows() can upcast values when columns have mixed dtypes).
hidden_10 = target_learners.loc[target_learners['hidden_units'] == 10, 'model_id'].tolist()

hidden_100 = target_learners.loc[target_learners['hidden_units'] == 100, 'model_id'].tolist()

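Hyperparameters used in all brute-force implementations (in this notebook the number of hidden units is set per condition to 10 or 100, and the learning rate, batch size, and number of epochs are read from `data/params.json`):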

Hidden units: 100  
Learning rate: 0.01  
Batch size: 16  
Epochs: 50  

## Inputs, outputs, cfg

In [11]:
# Run configuration. The keys read in this notebook are 'K', 'seed',
# 'learning_rate', 'epochs' and 'batch_size'.
with open('data/params.json', 'r') as f:
    cfg = json.load(f)

# Input (orthography) and target (phonology) matrices. The training cells index X, Y
# and the per-word weights with the same row mask, so rows are assumed to follow the
# order of `words` — TODO confirm against how the CSVs were generated.
X = np.genfromtxt('data/kidwords/orth-kid.csv', delimiter=",")
Y = np.genfromtxt('data/kidwords/phon-kid.csv', delimiter=",")

# keep_default_na=False: by default read_csv converts tokens such as "null", "nan",
# "NA" or "None" into float NaN, which would silently replace real words in the list.
words = pd.read_csv('data/kidwords/kidwords.csv', header=None, keep_default_na=False)[0].tolist()

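Obtain word frequencies (HAL frequency from the ELP, plus one) for the frequency-weighting operation, just like the brute-force implementation. Words that are missing from the ELP get a frequency of 1, and each training weight is `K * log(frequency)`.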

In [12]:
# NOTE(review): this path points into a home directory, so the cell only runs on a
# machine that has the ELP export at that location.
elp = pd.read_csv('~/research/words/elp/elp_full_5.27.16.csv')

# Build a word -> Freq_HAL lookup once instead of filtering the whole ELP frame for
# every word. Rows with a missing word are dropped (they can never equal a word), and
# the first row per word is kept, which is the row a "first match" selection returns.
elp_first = elp.dropna(subset=['Word']).drop_duplicates(subset='Word', keep='first')
hal_by_word = dict(zip(elp_first['Word'].values, elp_first['Freq_HAL'].values))

# Frequency per word: HAL frequency + 1 when the word is in the ELP, otherwise 1.
# Adding 1 means a HAL frequency of 0 still has a defined logarithm.
frequencies = {}

for word in words:
    if word in hal_by_word:
        frequencies[word] = hal_by_word[word] + 1
    else:
        frequencies[word] = 1

# Re-expand to one entry per row of `words` (the dict collapses repeated words),
# then convert each frequency to a sample weight K * ln(frequency).
frequencies_ = [frequencies[word] for word in words]
weights = np.array([scale(frequency, cfg["K"]) for frequency in frequencies_])

## Top and bottom learners: 10 hidden units

In [13]:
PATH = 'outputs/top_and_bottom_20_learners/'
hidden = 10

# Create the output directory up front so np.savetxt cannot fail on a missing
# folder after training has already started.
os.makedirs(PATH, exist_ok=True)

start = time.time()

for sample in range(samples.shape[1]):
    # Only the learners selected for the 10-hidden-unit condition are re-run.
    if sample not in hidden_10:
        continue

    # The training mask and the training subset do not change across epochs,
    # so slice them once per learner instead of once per epoch.
    train_mask = samples[:, sample]
    X_train = X[train_mask]
    Y_train = Y[train_mask]
    weights_train = weights[train_mask]

    model = learner(X, Y, cfg['seed'], hidden, optimizer=Adam(learning_rate=cfg['learning_rate']))

    for epoch in range(cfg['epochs']):

        # File names keep the existing double underscore before the suffix so that
        # anything reading previously written outputs still finds the same names.
        stem = 'sample_' + str(sample) + '_hidden_' + str(hidden) + '_' + str(epoch) + '_'
        pfn = stem + '_preds.csv'
        afn = stem + '_accuracies.csv'
        efn = stem + '_error.csv'

        # One epoch per call so that outputs can be saved after every epoch.
        # The per-split model.evaluate calls were dropped: their results were
        # never stored or written anywhere.
        model.fit(X_train, Y_train, epochs=1, batch_size=cfg['batch_size'], verbose=False, sample_weight=weights_train)

        # Predict for all rows of X (not only the training subset).
        outputs = model.predict(X)

        # Thresholded 0/1 predictions, one row per row of X.
        preds = (outputs > .5).astype(int)
        np.savetxt(PATH + pfn, preds, fmt='%d', delimiter=',')

        # Per-row accuracy: proportion of output units matching Y.
        accuracies = (preds == Y).astype(int)
        np.savetxt(PATH + afn, np.mean(accuracies, axis=1), delimiter=',', fmt='%0.5f')

        # Per-row mean squared error, computed from the raw predictions. Computing it
        # from the thresholded predictions makes it equal to 1 - accuracy whenever Y
        # is binary, which would make this file redundant with the accuracy file.
        mse = np.mean(np.square(outputs - Y), axis=1)
        np.savetxt(PATH + efn, mse, delimiter=',', fmt='%0.5f')

    print(sample, "...done")

end = time.time()
print(round(end-start, 4), "seconds elapsed")

NameError: name 'start' is not defined

## Top and bottom learners: 100 hidden units

In [15]:
PATH = 'outputs/top_and_bottom_20_learners/'
hidden = 100

# Create the output directory up front so np.savetxt cannot fail on a missing
# folder after training has already started.
os.makedirs(PATH, exist_ok=True)

start = time.time()

for sample in range(samples.shape[1]):
    # Only the learners selected for the 100-hidden-unit condition are re-run.
    if sample not in hidden_100:
        continue

    # The training mask and the training subset do not change across epochs,
    # so slice them once per learner instead of once per epoch.
    train_mask = samples[:, sample]
    X_train = X[train_mask]
    Y_train = Y[train_mask]
    weights_train = weights[train_mask]

    model = learner(X, Y, cfg['seed'], hidden, optimizer=Adam(learning_rate=cfg['learning_rate']))

    for epoch in range(cfg['epochs']):

        # File names keep the existing double underscore before the suffix so that
        # anything reading previously written outputs still finds the same names.
        stem = 'sample_' + str(sample) + '_hidden_' + str(hidden) + '_' + str(epoch) + '_'
        pfn = stem + '_preds.csv'
        afn = stem + '_accuracies.csv'
        efn = stem + '_error.csv'

        # One epoch per call so that outputs can be saved after every epoch.
        # The per-split model.evaluate calls were dropped: their results were
        # never stored or written anywhere.
        model.fit(X_train, Y_train, epochs=1, batch_size=cfg['batch_size'], verbose=False, sample_weight=weights_train)

        # Predict for all rows of X (not only the training subset).
        outputs = model.predict(X)

        # Thresholded 0/1 predictions, one row per row of X.
        preds = (outputs > .5).astype(int)
        np.savetxt(PATH + pfn, preds, fmt='%d', delimiter=',')

        # Per-row accuracy: proportion of output units matching Y.
        accuracies = (preds == Y).astype(int)
        np.savetxt(PATH + afn, np.mean(accuracies, axis=1), delimiter=',', fmt='%0.5f')

        # Per-row mean squared error, computed from the raw predictions. Computing it
        # from the thresholded predictions makes it equal to 1 - accuracy whenever Y
        # is binary, which would make this file redundant with the accuracy file.
        mse = np.mean(np.square(outputs - Y), axis=1)
        np.savetxt(PATH + efn, mse, delimiter=',', fmt='%0.5f')

    print(sample, "...done")

end = time.time()
print(round(end-start, 4), "seconds elapsed")

47 ...done
233 ...done
388 ...done
1073 ...done
1134 ...done
1221 ...done
1387 ...done
1472 ...done
2008 ...done
2089 ...done
2159 ...done
2224 ...done
2619 ...done
2700 ...done
2907 ...done
3009 ...done
3061 ...done
3378 ...done
3450 ...done
4549 ...done
4605 ...done
4782 ...done
4908 ...done
5139 ...done
5441 ...done
5686 ...done
6645 ...done
6650 ...done
6844 ...done
7032 ...done
7205 ...done
8095 ...done
8191 ...done
8316 ...done
8658 ...done
8777 ...done
9128 ...done
9175 ...done
9293 ...done
9463 ...done
1583.2011 seconds elapsed
