## Overview
We will tune and pilot with the 500 most frequent monosyllabic words from TASA.

In [3]:
import numpy as np
import json
import pandas as pd
import time
import tensorflow as tf

import matplotlib.pyplot as plt
from keras import backend as K


from src.learner import *
from utilities import *

# data
kidwords = pd.read_csv('data/kidwords/kidwords.csv', header=None)[0].tolist()

top_500 = pd.read_csv('data/top_500.csv')

words, X, Y = subset_kidwords(top_500.word.tolist(), kidwords, np.genfromtxt('data/kidwords/orth.csv', delimiter=","), np.genfromtxt('data/kidwords/phon.csv', delimiter=","), remove_null_columns=True)

GPU check

In [2]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  0


## Tune

In [5]:
seed = 323


from keras.optimizers import Adam
with open('outputs/tune_top_500_v1.csv', 'w') as f:
    f.write("{},{},{},{},{},{},{},{},{},{},{}\n".format(
                                            "hidden_units",
                                            "learning_rate",
                                             "batch_size",
                                             "epochs",
                                             "loss_train",
                                             "accuracy_train",
                                             "mse_train",
                                             "loss_test",
                                             "accuracy_test",
                                             "mse_test",
                                             "time"))
    for learning_rate in [.001, .005, .01, .025, None]: 
        for batch_size in [10, 20, 30, 40, 50]:
            for epochs in [20, 40, 60]:
                for hidden in [8, 12, 16, 20]:
                    
                    print("Configuration currently training:", learning_rate, batch_size, epochs, hidden)

                    if learning_rate is not None:
                        optimizer = Adam(learning_rate=learning_rate)
                    if learning_rate is None:
                        optimzer = None

                    model = learner(X, Y, seed, hidden, optimizer=None)
                    
                    start_time = time.time()

                    model.fit(X, Y, epochs=epochs, batch_size=batch_size, verbose=True)

                    end_time = time.time()
                    runtime = end_time - start_time

                    loss_train, accuracy_train, mse_train = model.evaluate(X, Y, verbose=0) 

                    f.write("{},{},{},{},{},{},{},{}\n".format(
                                                    hidden,
                                                    learning_rate,
                                                    batch_size,
                                                    epochs,
                                                    loss_train,
                                                    accuracy_train,
                                                    mse_train,
                                                    runtime))
f.close()

Configuration currently training: 0.001 10 20 8
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


IndexError: Replacement index 8 out of range for positional args tuple

In [15]:
def calculate_error(model, X, Y):
    predictions = model.predict(X)
    mse = np.mean((predictions - Y) ** 2, axis=1)
    return mse

In [20]:
frequencies = {}

for word in words:
    rowmatch = top_500[top_500['word']==word]
    if not rowmatch.empty:
        frequencies[word] = rowmatch['frequency'].values[0]+1
    # all frequencies should be present, but just in case...
    else:
        frequencies[word] = 1


In [17]:
def scale_frequencies(frequencies):
    """
    Scales the given word frequencies between 0 and 1.
    
    Parameters:
    frequencies (dict): A dictionary where keys are words and values are their frequencies.
    
    Returns:
    dict: A dictionary with scaled frequencies.
    """
    max_freq = max(frequencies.values())
    scaled_frequencies = {word: freq / max_freq for word, freq in frequencies.items()}
    return scaled_frequencies

In [25]:
scaled_frequencies = scale_frequencies(frequencies)
frequency_weights = np.array([scaled_frequencies[word] for word in words])

# Training loop

In [31]:
mse[sorted_indices][0:30]

array([0.09044007, 0.09044013, 0.09043867, 0.09042422, 0.0904216 ,
       0.09042036, 0.09042026, 0.09046778, 0.09046796, 0.09040911,
       0.09040576, 0.09040471, 0.09040409, 0.09040304, 0.09040214,
       0.09040205, 0.09039956, 0.09039343, 0.09038391, 0.09038004,
       0.09037878, 0.09037737, 0.09037563, 0.0903739 , 0.09037122,
       0.09037036, 0.09036517, 0.09036351, 0.09036239, 0.09035881])

In [28]:
# Training loop
epochs = 500  # Set the number of epochs
N = 20  # Number of items to select
seed = 349
model = learner(X, Y, seed, hidden=15, optimizer=Adam(learning_rate=.001))


for epoch in range(epochs):
    mse = calculate_error(model, X, Y)
    median_mse = np.median(mse)
    
    # Find indices of the 20 items around the median MSE
    sorted_indices = np.argsort(np.abs(mse - median_mse))
    selected_indices = sorted_indices[:N]
    
    x = X[selected_indices]
    y = Y[selected_indices]
    
    # Train the model on the selected items
    model.fit(x, y, epochs=1, verbose=1)

