In [2]:
import os
import string 
import json
import numpy as np
import pandas as pd 
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm


In [3]:
# Root of the NAS mount that is scanned for result files.
# Set the NAS_DATA_PATH environment variable to point the notebook at another
# location without editing this cell; the default is the original mount point.
# abspath() guarantees PATH stays absolute even if the override is relative.
PATH = os.path.abspath(os.environ.get('NAS_DATA_PATH', '/home/ge84yes/data'))

In [4]:
# Util functions 

def list_all_subfolders(start_path):
    """Return the path of every directory found beneath ``start_path``.

    The tree is traversed recursively with ``os.walk``. Each returned entry is
    the walked root joined with the directory name, so every result starts
    with ``start_path``. ``start_path`` itself is not part of the result.
    """
    return [
        os.path.join(parent, name)
        for parent, child_dirs, _ in os.walk(start_path)
        for name in child_dirs
    ]



def list_direct_files(folder_path):
    """Return the paths of the regular files located directly in ``folder_path``.

    Sub-directories are not descended into and are not part of the result.
    Each returned path is ``folder_path`` joined with the file name.

    ``os.scandir`` is used instead of ``os.listdir`` + ``os.path.isfile``
    because it can usually answer "is this a file?" from the directory listing
    itself, avoiding one extra stat call per entry.

    Raises:
        OSError: if ``folder_path`` cannot be listed (e.g. it does not exist
            or is not a directory).
    """
    found = []
    with os.scandir(folder_path) as entries:
        for entry in entries:
            try:
                is_regular_file = entry.is_file()
            except OSError:
                # os.path.isfile() reports False for entries it cannot stat;
                # keep that behaviour instead of aborting the whole listing.
                is_regular_file = False
            if is_regular_file:
                found.append(entry.path)
    return found

# 2) Per-file worker: it hands its values back as a tuple, so worker threads
#    never append to a shared list themselves.
def parse_one(file):
    """Load one JSON result file and return the series stored under ``long_run``.

    Args:
        file: path of the JSON file to read.

    Returns:
        A 6-tuple, in this order: ``pruning_rate_ee``, ``pruning_rate_ie``,
        ``big_weights_ee``, ``big_weights_ie``, ``rates_e``, ``rates_i``
        (the values found under those keys of ``long_run``).

    Raises:
        KeyError: if ``long_run`` or one of the six keys is missing.
    """
    wanted_keys = (
        "pruning_rate_ee",
        "pruning_rate_ie",
        "big_weights_ee",
        "big_weights_ie",
        "rates_e",
        "rates_i",
    )
    # json.load accepts a binary file object, so no text decoding layer is needed.
    with open(file, "rb") as handle:
        long_run = json.load(handle)["long_run"]
    return tuple(long_run[key] for key in wanted_keys)

We extract all possible combinations of folders

In [5]:
# Every directory below PATH, at any depth, as a full path.
level_1_folders = list_all_subfolders(PATH)
# Candidate sub-folder names: the 26 lowercase ASCII letters, in fixed a..z
# order so the scan order (and everything derived from it) is the same on
# every run.
level_2_folders = list(string.ascii_lowercase)

In [None]:
# 1) Build list of files once
# Entries of level_1_folders are already full paths, so the letter folder is
# joined onto them directly.
dirs = [Path(lv1) / lv2 for lv1 in level_1_folders for lv2 in level_2_folders]
print(len(dirs))
# Keep only candidates that really are directories: a regular file that
# happens to be named like a letter would make the listing below fail.
dirs = [d for d in tqdm(dirs) if d.is_dir()]
print(len(dirs))
files = []
for d in tqdm(dirs):
    # Collect the files sitting directly inside each letter folder.
    files.extend(list_direct_files(str(d)))

We parallelize the reading of files because the NAS is very slow when it comes to I/O operations.

In [None]:
# 3) Run in parallel and aggregate
# One accumulator list per quantity returned by parse_one.
all_pruning_ee = []
all_pruning_ie = []
all_bigweights_ee = []
all_bigweights_ie = []
all_rates_e = []
all_rates_i = []

In [None]:
# Parse all files with a thread pool (at most 32 workers).
# Executor.map yields results in the order of `files`, so entry i of every
# accumulator belongs to files[i]. No `chunksize` is passed: it has no effect
# on a ThreadPoolExecutor.
with ThreadPoolExecutor(max_workers=min(32, (os.cpu_count() or 4) * 5)) as ex:
    parsed = list(tqdm(ex.map(parse_one, files), total=len(files)))

# Rebuild the accumulators from this run's results instead of appending to
# existing lists, so re-running the cell never duplicates entries.
if parsed:
    (all_pruning_ee, all_pruning_ie,
     all_bigweights_ee, all_bigweights_ie,
     all_rates_e, all_rates_i) = (list(column) for column in zip(*parsed))
else:
    # zip(*[]) yields nothing to unpack, so handle "no files" explicitly.
    all_pruning_ee, all_pruning_ie = [], []
    all_bigweights_ee, all_bigweights_ie = [], []
    all_rates_e, all_rates_i = [], []

We create a pandas DataFrame holding the last value of each recorded series, one row per file.

In [None]:
# Turn the per-file results into arrays; the first axis follows the order of `files`.
# NOTE(review): the [:, -1] indexing below needs arrays with at least two
# dimensions, i.e. every file must contribute a series of the same length —
# TODO confirm this holds for all result files.
pruning_rate_ie, pruning_rate_ee = np.array(all_pruning_ie), np.array(all_pruning_ee)
rate_e, rate_i = np.array(all_rates_e), np.array(all_rates_i)
big_ee, big_ie = np.array(all_bigweights_ee), np.array(all_bigweights_ie)
files = np.array(files)

print(files.shape)
print(pruning_rate_ee.shape)

# One column per quantity, holding only the last entry along the second axis.
df_data = {
    column: values[:, -1]
    for column, values in [
        ("last_pruning_rate_ie", pruning_rate_ie),
        ("last_pruning_rate_ee", pruning_rate_ee),
        ("last_rate_e", rate_e),
        ("last_rate_i", rate_i),
        ("last_big_ee", big_ee),
        ("last_big_ie", big_ie),
    ]
}
# The file path travels with each row so rows can be mapped back to disk.
df_data["files"] = files

df = pd.DataFrame(df_data)


We select the good simulations; every other file is marked as bad.

In [None]:
# A run counts as good when all four final values are below their thresholds:
# both pruning rates under 200 and both big-weight values under 0.2.
good = df.loc[
    df['last_pruning_rate_ie'].lt(200)
    & df['last_pruning_rate_ee'].lt(200)
    & df['last_big_ee'].lt(0.2)
    & df['last_big_ie'].lt(0.2)
]
good_files = set(good['files'].tolist())
# Everything that was read but did not pass the filter.
bad_files = set(files).difference(good_files)
print(f" Found {len(good_files)} examples so far ...")
print(f" Found {len(bad_files)} example that can be deleted ...")

Remove the bad files from disk

In [None]:
# Permanently delete every file listed in bad_files.
for bad in bad_files:
    try:
        os.remove(bad)
    except FileNotFoundError:
        # Already deleted (e.g. by an earlier, partially completed run of this
        # cell); skipping keeps the cell safe to re-run.
        continue

Save the good files into a txt file

In [None]:
# Write one path per line. good_files is a set, so it is sorted first to make
# the content of good.txt identical from run to run.
with open('good.txt', 'w') as f:
    f.write("\n".join(sorted(good_files)))