In [8]:
import os  
from pathlib import Path 
import re 

In [15]:
# Directory whose entries are treated as experiment names by the cells below.
base_path = Path('./experiments/MNIST_STRATIFIED_CLASSIFIERS_MADGAN_NEW/')
# os.listdir() returns entries in arbitrary order; sort them so that anything
# depending on position (e.g. "look at the first experiment") is reproducible
# across kernels and machines.
experiments = sorted(os.listdir(base_path))

In [16]:

# Generator counts to iterate over when enumerating expected experiments.
trained_generators = [3, 5, 7, 10]

# Pairs of per-class image counts, presumably (n_real, n_fake) -- TODO confirm
# against the `images_real_<r>_gen_<f>` suffix of the experiment names.
n_images_per_class = [
    # First sweep: the two counts always sum to 5000, first count rising 0 -> 5000.
    *((count, 5000 - count) for count in range(0, 5001, 1000)),
    # Second sweep: first count fixed at 5000, second count rising 1000 -> 5000.
    *((5000, count) for count in range(1000, 5001, 1000)),
]


In [17]:
def extract_info_from_experiment_name(exp: str) -> dict:
    """
    Extract run metadata from an experiment name.

    The name is searched for the following parts, in this order:

    * ``_<DATASET>_`` -- an upper-case/digit token delimited by underscores,
    * optionally ``<n_gen>_`` directly after the dataset token,
    * optionally ``used_generator_<k>_``,
    * optionally ``trained_generators_<n>_``,
    * ``images_real_<r>_gen_<f>`` (required).

    Because every generator-related part is optional, names that carry only
    the dataset and the image counts (e.g. baselines) are parsed as well; they
    yield ``n_gen == 0`` and ``used_gen == 0`` and a warning is printed.

    Args:
        exp: Experiment name to parse.

    Returns:
        A dict with the keys ``dataset`` (str or None), ``n_gen`` (int,
        ``trained_generators`` value if present, else the early value, else 0),
        ``used_gen`` (int, 0 if absent), ``n_real`` and ``n_fake`` (int, -1 if
        the name could not be parsed) and ``parse_success`` (bool). When the
        name cannot be parsed the defaults are returned with
        ``parse_success=False`` and a diagnostic line is printed.
    """
    # Returned unchanged when the name cannot be parsed.
    info = {'dataset': None, 'n_gen': 0, 'used_gen': 0, 'n_real': -1, 'n_fake': -1, 'parse_success': False}

    # Pattern attempts to find n_gen in two places, plus used_gen, n_real, n_fake
    pattern = (
        r".*?_"
        r"(?P<dataset>[A-Z0-9]+)_+" # Dataset (e.g., MNIST)
        r"(?:(?P<n_gen_early>\d+)_+)?" # Optional early n_gen (e.g., __7_)
        r"(?:.*?" # Separator
           r"used_generator_(?P<used_gen>\d+)_+"
        r")?" # Optional used_gen block
        r"(?:.*?" # Separator
           r"trained_generators_(?P<n_gen_trained>\d+)_+"
        r")?" # Optional trained_generators block
        r".*?" # Separator
        r"images_real_(?P<n_real>\d+)_+" # n_real
        r"gen_(?P<n_fake>\d+)" # n_fake
        r".*$" # Rest of string
    )
    match = re.search(pattern, exp)

    if match is None:
        # With all optional blocks skipped, the pattern above is already the
        # minimal "<DATASET> ... images_real_<r>_gen_<f>" form, so there is no
        # simpler pattern left to try: the name is either unrelated or malformed.
        if "images_real_" not in exp or "_gen_" not in exp:
            print(f"Skipping (no image info?): {exp}")
        else:
            print(f"ALARM: Could not parse experiment name: '{exp}'")
        return info

    info['parse_success'] = True
    info['dataset'] = match['dataset']

    # Prefer the explicit 'trained_generators_<n>' value over the early number.
    # Captured groups are either None or a non-empty digit string.
    n_gen_str = match['n_gen_trained'] or match['n_gen_early']
    if n_gen_str:
        info['n_gen'] = int(n_gen_str)
    else:
        # Neither n_gen location matched; keep the default of 0.
        print(f"Warning: Could not determine n_gen for '{exp}'. Defaulting to 0.")

    info['used_gen'] = int(match['used_gen'] or 0)  # 0 when the block is absent
    # These two groups are mandatory parts of the pattern, so always present here.
    info['n_real'] = int(match['n_real'])
    info['n_fake'] = int(match['n_fake'])

    # Sanity check only: used_gen is expected to index into n_gen generators.
    # The result is still reported as a successful parse.
    if info['n_gen'] > 0 and info['used_gen'] >= info['n_gen']:
        print(f"Warning: Inconsistent parse? used_gen ({info['used_gen']}) >= n_gen ({info['n_gen']}) for '{exp}'")

    return info

In [18]:
# Smoke test of the name parser: print the first listed experiment name and
# the dict parsed from it. `infos` is initialised here but not filled by
# this cell.
infos = []
for exp in experiments[:1]:
    print(exp)
    d = extract_info_from_experiment_name(exp)
    print(d)

2025-04-18_Stratified_classifierExperiment_MNIST___MADGAN__used_generator_0__trained_generators_10__images_real_1000_gen_4000
{'dataset': 'MNIST', 'n_gen': 10, 'used_gen': 0, 'n_real': 1000, 'n_fake': 4000, 'parse_success': True}


In [22]:
# Enumerate every expected (trained_gen, used_gen, count_0, count_1) combination:
# for each generator count, each generator index below it, and each pair of
# per-class image counts. The pair is presumably (n_real, n_fake) -- TODO confirm.
# Built as a single comprehension so no loop variables leak into the notebook
# namespace.
expected_experiments = [
    (trained_gen, used_gen, n_real, n_fake)
    for trained_gen in trained_generators
    for used_gen in range(trained_gen)
    for n_real, n_fake in n_images_per_class
]
len(expected_experiments)

275