In [1]:
import os
import numpy as np
import pandas as pd


# Set the print options: suppress=True makes NumPy print floats in
# fixed-point notation rather than scientific notation
np.set_printoptions(suppress=True)


In [2]:
# Load the three membership tables and cast them to bool so that a column can
# be used directly as a boolean row mask.
# NOTE(review): presumably one column per model and one row per item -- confirm
# against the code that wrote these files.
samples, holdouts, tests = (
    np.genfromtxt(mask_path, delimiter=",").astype(bool)
    for mask_path in ('data/samples.csv', 'data/holdouts.csv', 'data/tests.csv')
)

Extract the accuracy data for each sample and write it to file. This takes a minute because it requires iterating through every accuracy file that was written out. Note that the rows of the accuracy summary file follow the order in which the files are listed from the directory, which is arbitrary, so sort the rows (e.g., by `model_id` and `hidden`) if you need them in a particular order.

In [3]:
# Tabulate, for every "*accuracies.csv" file in `directory`, the proportion of
# items scored exactly 1 within the train, test, and holdout masks of the model
# named in the filename, then write one summary row per file.

# Specify the directory
directory = 'outputs/brute_force_2/'

# Prepare an empty list to store the results
results = []

# Iterate over every file in the specified directory
for filename in os.listdir(directory):
    if not filename.endswith("accuracies.csv"):
        continue

    # Extract the ID and level from the filename: the ID is the text between
    # "sample_" and "hidden", the level is the text between "hidden_" and
    # "accuracies" (underscores stripped from both)
    id_start = filename.find("sample_") + len("sample_")
    id_end = filename.find("hidden")
    ID = int(filename[id_start:id_end].replace("_", ""))

    level_start = filename.find("hidden_") + len("hidden_")
    level_end = filename.find("accuracies")
    level = int(filename[level_start:level_end].replace("_", ""))

    # Read the csv file
    data = np.genfromtxt(os.path.join(directory, filename), delimiter=',')

    # An item counts as correct only when its value is exactly 1
    correct = data == 1

    # Proportion correct within each mask; column `ID` of each mask array
    # selects the rows belonging to this model. NumPy's own .sum() is used so
    # the masks are not iterated element by element in Python.
    train_mask = samples[:, ID]
    test_mask = tests[:, ID]
    holdout_mask = holdouts[:, ID]

    train = correct[train_mask].sum() / train_mask.sum()
    test = correct[test_mask].sum() / test_mask.sum()
    holdout = correct[holdout_mask].sum() / holdout_mask.sum()

    # Save the results
    results.append([ID, level, train, test, holdout])

# Keep the float array form for any later cell that indexes it by position
results_array = np.array(results)

# Build the table from the raw rows rather than from the float array so that
# model_id and hidden are written as integers (e.g. "12", not "12.0"); naming
# the columns up front also lets an empty result set write a header-only file.
tabulated = pd.DataFrame(
    results,
    columns=["model_id", "hidden", "train", "test", "holdout"],
).astype({"model_id": "int64", "hidden": "int64"})

tabulated.to_csv(os.path.join(directory, 'tabulated_accuracy.csv'), index=False)


Generate a random sample of 200 model IDs to build the subset of data for the linear mixed-effects model (LMEM), which DK is running.

In [4]:
import random
# Seed Python's RNG before any draw is made
random.seed(345)

# The first entry of each tabulated row is the model ID
IDs = [row[0] for row in results_array]

# Draw 200 entries and render each one as a zero-padded, 4-character string.
# NOTE(review): `IDs` holds one entry per tabulated row, so a model that was
# tabulated at several hidden levels can be drawn more than once; also,
# `subset_IDs` is not read anywhere in the visible cells -- confirm whether
# this draw is still needed.
subset_models = random.sample(IDs, 200)
subset_IDs = [f"{int(model):04d}" for model in subset_models]

In [5]:
# Pick 200 model IDs at random and collect the accuracy files for those models
# at hidden levels 20 and 100.

# Candidate IDs are the unpadded decimal strings "0" .. "9999"
numbers = [str(i) for i in range(10000)]
subset = random.sample(numbers, 200)

# Set membership replaces a scan over all 200 IDs for every file
subset_lookup = set(subset)
wanted_levels = {"20", "100"}

filenames = []
for filename in os.listdir(directory):
    if not filename.startswith("sample_") or "accuracies" not in filename:
        continue

    # The model ID is the token between "sample_" and the next underscore;
    # this is the same condition as startswith("sample_" + ID + "_")
    model_token, separator, remainder = filename[len("sample_"):].partition("_")
    if not separator or model_token not in subset_lookup:
        continue

    # Compare the whole level token (text between "hidden_" and "accuracies",
    # underscores stripped) so that e.g. "hidden_200" is not mistaken for
    # "hidden_20" the way a plain substring test would
    hidden_at = filename.find("hidden_")
    if hidden_at == -1:
        continue
    level_token = filename[hidden_at + len("hidden_"):filename.find("accuracies")].replace("_", "")
    if level_token in wanted_levels:
        filenames.append(filename)

In [9]:
import pandas as pd
# Read the word list; because `names` is given without `header`, the first
# line of the file is treated as data rather than as a header row
words = pd.read_csv('data/kidwords/kidwords.csv', names = ["word"])

In [11]:
# Scratch check on the training-mask column for a single model.
# NOTE(review): this relies on `ID` left over from an earlier loop, and the
# original expression ended in a bare "." (a syntax error). `.ndim` is a guess
# that matches the recorded output of 1 -- confirm which attribute was intended.
tmp = int(ID)

samples[:,tmp].ndim

1

In [13]:
# Check the dimensions of the test mask array
tests.shape

(2869, 10000)

In [14]:
import pandas as pd

# Stack the selected accuracy files into one long table, tagging every row
# with its word, model ID, hidden level, and train/test/holdout membership.
dfs = []

for filename in filenames:

    # Model ID: text between "sample_" and "hidden", underscores stripped
    ID = int(
        filename[filename.find("sample_") + len("sample_"):filename.find("hidden")]
        .replace("_", "")
    )

    # Hidden level: text between "hidden_" and "accuracies", underscores stripped
    level = int(
        filename[filename.find("hidden_") + len("hidden_"):filename.find("accuracies")]
        .replace("_", "")
    )

    # One value per line goes into "mse"; the remaining columns are attached
    # in the same left-to-right order as the keyword arguments
    one_file = pd.read_csv(f'outputs/brute_force_2/{filename}', names = ["mse"]).assign(
        word=words['word'],
        model_id=ID,
        hidden=level,
        train=samples[:,ID],
        test=tests[:,ID],
        holdout=holdouts[:,ID],
    )
    dfs.append(one_file)

df = pd.concat(dfs, ignore_index=True)

# accuracy is 0 when the value is below 1 and 1 otherwise.
# NOTE(review): "otherwise" includes NaN, which therefore codes as 1 --
# confirm that is intended.
df['accuracy'] = (~(df['mse'] < 1)).astype('int64')

In [17]:
# Write the combined table to disk, leaving out the row index
df.to_csv('outputs/brute_force_2/subset_data_for_lmem.csv', index=False)