## Overview
We will tune and pilot with the 500 most frequent monosyllabic words from TASA.

In [35]:
# python: base (3.11.4)

import numpy as np
import json
import pandas as pd
import time
import tensorflow as tf

from tensorflow.keras import backend as K


from src.learner import *
from utilities import *


# data
# Vocabulary: column 0 of the header-less kidwords file, as a plain list.
kidwords = pd.read_csv('data/kidwords/kidwords.csv', header=None)[0].tolist()

top_500 = pd.read_csv('data/top_500.csv')
bottom_500 = pd.read_csv('data/infrequent_500.csv')

# Positions in `kidwords` of every word that also occurs in the 'word' column
# of the respective list.
top_words = top_500['word'].values
bottom_words = bottom_500['word'].values
train_word_indices = np.array(
    [position for position, word in enumerate(kidwords) if word in top_words]
)
control_word_indices = np.array(
    [position for position, word in enumerate(kidwords) if word in bottom_words]
)

# Raw input (orth) and target (phon) matrices.
XX = np.genfromtxt('data/kidwords/orth.csv', delimiter=",")
YY = np.genfromtxt('data/kidwords/phon.csv', delimiter=",")

# Keep only the columns that are non-zero in at least one row.
non_zero_a = (XX != 0).any(axis=0)
non_zero_b = (YY != 0).any(axis=0)
X = XX[:, non_zero_a]
Y = YY[:, non_zero_b]

# configs
with open('data/config.json', "r") as f:
    cfg = json.load(f)


GPU check

In [2]:
# Report how many GPU devices TensorFlow detects.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

Num GPUs Available:  1


I0000 00:00:1723233508.564497   39425 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1723233508.585530   39425 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1723233508.585656   39425 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355


In [38]:
def calculate_error(model, X, Y):
    """Return the mean squared error of ``model`` for each row of ``X``.

    Parameters
    ----------
    model : object with a ``predict(X)`` method
        Its output is subtracted from ``Y``, so both must be broadcastable.
    X : array
        Inputs passed to ``model.predict``.
    Y : array
        Targets compared against the predictions.

    Returns
    -------
    numpy.ndarray
        Squared differences averaged over axis 1 (one value per row).
    """
    residuals = model.predict(X) - Y
    return np.mean(residuals ** 2, axis=1)

def sample_N_to_quartiles_min(x, N, y, return_indices=True):
    """Select the N entries of ``x`` closest to a summary statistic of ``x``.

    Parameters
    ----------
    x : array-like
        One-dimensional numeric values (e.g. per-item errors).
    N : int
        Number of entries to select.
    y : {"q1", "q2", "q3", "min", "max"}
        Statistic to sample around: the 25th, 50th or 75th percentile,
        the minimum, or the maximum of ``x``.
    return_indices : bool, default True
        If True, return positions into ``x``; otherwise return the
        selected values themselves.

    Returns
    -------
    numpy.ndarray
        Up to N indices (or values), ordered from closest to farthest
        from the chosen statistic.

    Raises
    ------
    ValueError
        If ``y`` is not one of the supported statistic names.
    """
    x = np.asarray(x)

    # Lazily evaluated so only the requested statistic is computed.
    statistics = {
        "q1": lambda: np.percentile(x, 25),
        "q2": lambda: np.percentile(x, 50),
        "q3": lambda: np.percentile(x, 75),
        "min": lambda: np.min(x),
        "max": lambda: np.max(x),
    }
    if y not in statistics:
        raise ValueError(
            "y must be one of {}, got {!r}".format(sorted(statistics), y)
        )

    target = statistics[y]()
    # Rank every entry by its absolute distance to the target statistic.
    indices = np.argsort(np.abs(x - target))[:N]
    return indices if return_indices else x[indices]


In [20]:
# Build a word -> (frequency + 1) lookup.
# NOTE(review): the words iterated come from `bottom_500` but the counts are
# looked up in `top_500` — confirm this cross-lookup is intended.
frequencies = {}

for word in bottom_500['word']:
    matches = top_500.loc[top_500['word'] == word, 'frequency']
    # all frequencies should be present, but just in case a word has no
    # match, fall back to a count of 1.
    frequencies[word] = 1 if matches.empty else matches.values[0] + 1


In [21]:
def scale_frequencies(frequencies):
    """
    Scales the given word frequencies by the largest frequency.

    Each value is divided by the maximum value, so the most frequent
    word maps to 1.

    Parameters:
    frequencies (dict): A dictionary where keys are words and values are their frequencies.

    Returns:
    dict: A dictionary with scaled frequencies; empty if `frequencies` is empty.
    """
    # max() raises ValueError on an empty iterable; nothing to scale in that case.
    if not frequencies:
        return {}

    max_freq = max(frequencies.values())
    return {word: freq / max_freq for word, freq in frequencies.items()}

In [16]:
# Divide every count by the largest count to get per-word weights.
scaled_frequencies = scale_frequencies(frequencies)
# NOTE(review): `words` is not defined in any cell shown in this notebook —
# presumably it comes from a star import or stale kernel state. Confirm which
# word list (and ordering) these weights should follow; any word missing from
# `frequencies` raises KeyError here.
frequency_weights = np.array([scaled_frequencies[word] for word in words])

## Representational domain #1: top 500 most frequent words
Condition 1: on each epoch, train on the N items whose MSE is closest to the maximum of the MSE distribution (i.e., the hardest condition).

In [24]:
# Inspect the positions in `kidwords` that were matched as training words.
# NOTE(review): the recorded output holds a single index, while the overview
# describes 500 training words — confirm the word lists matched as expected.
train_word_indices

array([2800])

In [None]:
# Training loop
#
# Condition 1: on every epoch, compute the per-item MSE over the training
# words, pick the cfg['N'] items closest to the maximum MSE, train one epoch
# on just those items, then log train / held-out metrics as one CSV row.

hidden = cfg['hidden_units']
learning_rate = cfg['learning_rate']
# NOTE(review): 'batch_size' and 'results_path' are optional keys introduced
# here; confirm the names against data/config.json. A batch_size of None lets
# Keras use its default.
batch_size = cfg.get('batch_size')
results_path = cfg.get('results_path', 'results_condition_max.csv')

# Reference Adam through `tf` so the cell does not rely on a star import.
model = learner(X, Y, cfg['seed'], hidden=hidden,
                optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate))

# Held-out items are everything that is not a training word. A boolean mask is
# required: `~` applied to an integer index array is a bitwise NOT (it yields
# negative indices), not a set complement.
is_train = np.zeros(X.shape[0], dtype=bool)
is_train[train_word_indices] = True

X_train, Y_train = X[train_word_indices], Y[train_word_indices]
X_test, Y_test = X[~is_train], Y[~is_train]

with open(results_path, 'w') as results_file:
    results_file.write(
        "hidden,learning_rate,batch_size,epoch,"
        "loss_train,accuracy_train,mse_train,"
        "loss_test,accuracy_test,mse_test,runtime\n"
    )

    for epoch in range(cfg['epochs']):
        start = time.perf_counter()

        mse = calculate_error(model, X_train, Y_train)

        # Positions (within the training subset) of the N items whose MSE is
        # closest to the current maximum, i.e. the hardest items.
        hardest_in_subset = np.argsort(np.abs(mse - mse.max()))[:cfg['N']]
        # Map subset positions back to row numbers of the full X / Y matrices.
        selected_indices = train_word_indices[hardest_in_subset]

        # Train the model on the selected items only
        model.fit(X[selected_indices], Y[selected_indices],
                  epochs=1, batch_size=batch_size, verbose=1)

        # Seconds spent on selection + fitting for this epoch.
        runtime = time.perf_counter() - start

        # NOTE(review): unpacking three values assumes the model was compiled
        # with exactly two metrics (accuracy, mse) — confirm in src.learner.
        loss_train, accuracy_train, mse_train = model.evaluate(X_train, Y_train, verbose=0)
        loss_test, accuracy_test, mse_test = model.evaluate(X_test, Y_test, verbose=0)

        results_file.write("{},{},{},{},{},{},{},{},{},{},{}\n".format(
            hidden,
            learning_rate,
            batch_size,
            epoch + 1,
            loss_train,
            accuracy_train,
            mse_train,
            loss_test,
            accuracy_test,
            mse_test,
            runtime))
