In [None]:
# python: base (3.11.4)

import numpy as np
import json
import pandas as pd
import time
import random

import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import Callback

from sklearn.metrics.pairwise import cosine_similarity

from tensorflow.keras.models import Model
import pandas as pd

from src.learner import *
from utilities import *

class CustomCallback(Callback):
    """Keras callback that requests an early stop once a logged accuracy metric reaches a threshold.

    Parameters
    ----------
    accuracy_criteria (float): Threshold the logged metric is compared against with `>=`. Defaults to 1.0.

    accuracy_metric_name (str): Key looked up in the per-epoch `logs` dict. Defaults to 'binary_accuracy'.
    """

    def __init__(self, accuracy_criteria=1.0, accuracy_metric_name='binary_accuracy'):
        super().__init__()
        self.accuracy_criteria = accuracy_criteria
        self.accuracy_metric_name = accuracy_metric_name

    def on_epoch_end(self, epoch, logs=None):
        """Print the tracked metric for this epoch and flag the model to stop if the threshold is met.

        Parameters
        ----------
        epoch (int): Zero-based epoch index; printed as `epoch + 1`.

        logs (dict or None): Metric values for the epoch. When None (the declared default) it is
            treated as an empty dict, so the "not found" message is printed instead of raising.
        """
        # `logs` may legitimately be None per the signature; guard before calling .get on it.
        logs = logs or {}
        accuracy = logs.get(self.accuracy_metric_name)
        if accuracy is not None:
            print(f"Epoch {epoch+1}: Accuracy = {accuracy}")
            if accuracy >= self.accuracy_criteria:
                print(f"Reached {self.accuracy_criteria * 100}% accuracy, stopping training!")
                self.model.stop_training = True
        else:
            print(f"Epoch {epoch+1}: Accuracy not found in logs.")


# data
# Word list: first column of a headerless CSV, as a plain Python list.
kidwords = pd.read_csv('data/kidwords/kidwords.csv', header=None)[0].tolist()
# some words from kidwords aren't represented so we have to impute 0
kidwords_frequencies_from_tasa = pd.read_csv('data/kidword_frequencies_from_tasa.csv')

top_500 = pd.read_csv('data/top_500.csv')
bottom_500 = pd.read_csv('data/infrequent_500.csv')

# Positions within `kidwords` of the words that appear in each file's 'word' column.
train_word_indices = np.array(
    [position for position, word in enumerate(kidwords) if word in top_500['word'].values]
)
control_word_indices = np.array(
    [position for position, word in enumerate(kidwords) if word in bottom_500['word'].values]
)

# Raw input (orth) and target (phon) matrices.
XX = np.genfromtxt('data/kidwords/orth.csv', delimiter=",")
YY = np.genfromtxt('data/kidwords/phon.csv', delimiter=",")

# Keep only the columns that contain at least one non-zero entry.
non_zero_a = (XX != 0).any(axis=0)
X = XX[:, non_zero_a]

non_zero_b = (YY != 0).any(axis=0)
Y = YY[:, non_zero_b]

# configs
with open('data/config.json', "r") as f:
    cfg = json.load(f)

# may not work depending on kernel
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

In [None]:

def find_nearest_neighbors(x, indices, N):
    """Return, for each row of a similarity matrix, the labels of its N most similar columns.

    Parameters
    ----------
    x (array-like): A 2D similarity matrix; larger values mean more similar.

    indices (array-like): Labels for the columns of `x`; column position j is reported as `indices[j]`.

    N (int): The number of nearest neighbors to return per row.

    Returns
    -------
    numpy.ndarray: A 2D array with one row per row of `x`, holding the entries of `indices` for the
    columns ranked 2nd through (N+1)th by descending similarity.

    Notes
    -----
    The top-ranked column of every row is dropped on the assumption that it is the row's own
    self-similarity.
    """
    scores = np.array(x)

    # argsort is ascending, so rank on the negated scores to get most-similar first
    ranking = np.argsort(-scores, axis=1)

    # skip rank 0 (taken to be the element itself) and keep the next N
    keep = slice(1, N + 1)
    neighbor_columns = ranking[:, keep]

    # translate column positions back into the caller's labels
    labels = np.array(indices)
    return labels[neighbor_columns]


In [None]:
# `learner` is provided by one of the wildcard imports at the top of the notebook.
# It is constructed from the full X/Y matrices and the configured seed and hidden-unit count.
model = learner(
    X,
    Y,
    cfg['seed'],
    cfg['hidden_units'],
    optimizer='adam',
)

# Fit on the training-word rows only.
model.fit(
    X[train_word_indices],
    Y[train_word_indices],
    epochs=cfg['epochs'],
    batch_size=cfg['batch_size'],
    verbose=True,
)

In [None]:
# Create a new model that outputs the hidden layer activations
hidden_layer_model = Model(
    inputs=model.input,
    outputs=model.layers[0].output,  # assumes layers[0] is the hidden layer — TODO confirm against `learner`
)

# Run every row of X through the truncated model
hidden_activations = hidden_layer_model.predict(X)

# Convert the activations to a DataFrame and write them to disk (the DataFrame index is written too)
df = pd.DataFrame(hidden_activations)
df.to_csv('outputs/pilot_1_hidden_unit_activations.csv')


In [None]:
# Compute the similarity matrix
similarities = cosine_similarity(hidden_activations)

# Cosine similarities are fractional values; an integer format ('%d') would truncate each one
# to a whole number, so write them as fixed-point decimals instead.
np.savetxt('outputs/pilot_1_hidden_unit_activation_similarities.csv', similarities, delimiter=",", fmt='%.8f')