In [1]:
import sys
import os
import pandas as pd

IN_PATH = "all_metrics_standarized.csv"
OUT_A = "input_pca_set_A_.csv"
OUT_B = "input_pca_set_B_.csv"

desired_A_display = [
    "spike sync",
    "assembly variety",
    "cv",
    "cv2",
    "fano factor",
    "isi distance",
    "largest assembly",
    "max cluster",
    "mean firing rate",
    "spike contrast",
]

desired_B_display = [
    "assembly variety",
    "cv",
    "fano factor",
    "isi distance",
    "largest assembly",
    "max cluster",
    "mean firing rate",
    "spike contrast",
]

In [2]:
def _norm(s: str) -> str:
    return "".join(ch for ch in s.lower() if ch.isalnum())

alias_normalized = {
    "pikesync": "spikesync",
}

In [3]:
def _build_actual_lookup(actual_cols):
    norm_to_actual = {}
    for c in actual_cols:
        n = _norm(c)
        norm_to_actual.setdefault(n, c)
    return norm_to_actual

In [4]:
def _resolve_columns(desired_display, actual_cols):
    norm_to_actual = _build_actual_lookup(actual_cols)
    resolved = []
    missing = []

    for name in desired_display:
        n = _norm(name)
        n = _norm(alias_normalized.get(n, n))
        if n in norm_to_actual:
            resolved.append(norm_to_actual[n])
        else:
            snake_guess = name.strip().lower().replace(" ", "_")
            n2 = _norm(snake_guess)
            if n2 in norm_to_actual:
                resolved.append(norm_to_actual[n2])
            else:
                missing.append(name)

    return resolved, missing

In [5]:
def _verify_columns(filepath, desired_display):
    df_check = pd.read_csv(filepath, nrows=0)  # read only header
    cols = list(df_check.columns)
    resolved_against_written, missing = _resolve_columns(desired_display, cols)

    order_match = resolved_against_written == cols
    content_match = set(_norm(c) for c in resolved_against_written) == set(_norm(c) for c in cols)

    return {
        "file": filepath,
        "written_columns": cols,
        "resolved_from_desired": resolved_against_written,
        "missing_from_desired": missing,
        "order_match": order_match,
        "content_match": content_match,
    }

if not os.path.exists(IN_PATH):
    print(f"ERROR: Input file not found: {IN_PATH}", file=sys.stderr)
    sys.exit(1)

df = pd.read_csv(IN_PATH)

In [6]:
resolved_A, missing_A = _resolve_columns(desired_A_display, df.columns)
if missing_A:
    print("ERROR: The following requested Set A columns could not be found in the input (after normalization/aliases):")
    for m in missing_A:
        print(f"  - {m}")
    print("\nAvailable columns in input:")
    for c in df.columns:
        print(f"  - {c}")
    sys.exit(2)

df_A = df[resolved_A].copy()
df_A.to_csv(OUT_A, index=False)
print(f"Saved {OUT_A} with columns (in order): {resolved_A}")

Saved input_pca_set_A_.csv with columns (in order): ['spike_sync', 'assembly_variety', 'cv', 'cv2', 'fano_factor', 'isi_distance', 'largest_assembly', 'max_cluster', 'mean_firing_rate', 'spike_contrast']


In [7]:
resolved_B, missing_B = _resolve_columns(desired_B_display, df.columns)
if missing_B:
    print("ERROR: The following requested Set B columns could not be found in the input (after normalization/aliases):")
    for m in missing_B:
        print(f"  - {m}")
    print("\nAvailable columns in input:")
    for c in df.columns:
        print(f"  - {c}")
    sys.exit(3)

df_B = df[resolved_B].copy()
df_B.to_csv(OUT_B, index=False)
print(f"Saved {OUT_B} with columns (in order): {resolved_B}")

Saved input_pca_set_B_.csv with columns (in order): ['assembly_variety', 'cv', 'fano_factor', 'isi_distance', 'largest_assembly', 'max_cluster', 'mean_firing_rate', 'spike_contrast']


In [8]:
print("\nVerifying written files against the requested lists...")

report_A = _verify_columns(OUT_A, desired_A_display)
report_B = _verify_columns(OUT_B, desired_B_display)

def _print_report(tag, report):
    print(f"\n[{tag}] {report['file']}")
    print(f"  Columns written: {report['written_columns']}")
    if report["missing_from_desired"]:
        print(f"  Missing (unresolved) from requested list: {report['missing_from_desired']}")
    print(f"  Order matches requested list? {'YES' if report['order_match'] else 'NO'}")
    print(f"  Content matches requested set (ignoring order)? {'YES' if report['content_match'] else 'NO'}")
    if not report["order_match"]:
        print("  NOTE: If this says NO but 'content' says YES, the file still has all requested columns but order differs.")

_print_report("Set A", report_A)
_print_report("Set B", report_B)

exit_code = 0
if (not report_A["content_match"]) or (not report_B["content_match"]) or (not report_A["order_match"]) or (not report_B["order_match"]):
    exit_code = 4

sys.exit(exit_code)


Verifying written files against the requested lists...

[Set A] input_pca_set_A_.csv
  Columns written: ['spike_sync', 'assembly_variety', 'cv', 'cv2', 'fano_factor', 'isi_distance', 'largest_assembly', 'max_cluster', 'mean_firing_rate', 'spike_contrast']
  Order matches requested list? YES
  Content matches requested set (ignoring order)? YES

[Set B] input_pca_set_B_.csv
  Columns written: ['assembly_variety', 'cv', 'fano_factor', 'isi_distance', 'largest_assembly', 'max_cluster', 'mean_firing_rate', 'spike_contrast']
  Order matches requested list? YES
  Content matches requested set (ignoring order)? YES


SystemExit: 0

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
