In [None]:
from pathlib import Path
import pandas as pd
import re
import sys

# Input root: one sub-folder per genotype, each holding one sub-folder per replicate.
raw_dir = Path(r"E:\Todd\OSR\251128\OSR compiled data all genotypes\data\raw")
raw_dir.mkdir(parents=True, exist_ok=True)

# Output root for the compiled CSVs.
compiled_dir = Path(r"E:\Todd\OSR\251128\OSR compiled data all genotypes\data\compiled")
compiled_dir.mkdir(parents=True, exist_ok=True)

# Aliases used by the cells below: files are searched for under raw_dir
# (genotype folders one level down, replicate folders one level further)
# and results are written to compiled_dir.
search_dir, out_dir = raw_dir, compiled_dir

# Extracts the metric name from a CSV filename. An optional leading
# '<anything>compiled_' prefix is dropped, so both 'compiled_area.csv' and
# 'area.csv' yield 'area'. Matching is case-insensitive.
pattern = re.compile(
    r'^(?:.*?compiled_)?(?P<metric>.+?)\.csv$',
    flags=re.IGNORECASE,
)


def gather_files(dirpath: Path):
    """Collect replicate-level CSV files under *dirpath*, grouped by genotype.

    The expected layout is ``dirpath/<genotype>/<replicate>/*.csv``; anything
    that is not a directory at the genotype or replicate level is ignored.

    Returns a dict mapping each genotype folder name to a list of
    ``(csv_path, metric)`` tuples. The metric comes from the module-level
    ``pattern`` regex, falling back to the file stem when the name does not
    match. An empty dict is returned when *dirpath* does not exist, and a
    genotype with no CSV files gets no entry.
    """
    found = {}
    if not dirpath.exists():
        return found

    genotype_dirs = [d for d in sorted(dirpath.iterdir()) if d.is_dir()]
    for genotype_dir in genotype_dirs:
        replicate_dirs = [d for d in sorted(genotype_dir.iterdir()) if d.is_dir()]
        for replicate_dir in replicate_dirs:
            for csv_path in replicate_dir.glob('*.csv'):
                match = pattern.match(csv_path.name)
                metric = match.group('metric').strip() if match else csv_path.stem
                found.setdefault(genotype_dir.name, []).append((csv_path, metric))
    return found


def _sanitize_label(text: str) -> str:
    """Replace every character outside ``[0-9A-Za-z_-]`` with an underscore."""
    return re.sub(r"[^0-9A-Za-z_-]", "_", text)


def compile_tracks_side_by_side(genotype: str, files_metrics, metric_keyword: str, out_dir: Path):
    """Combine a genotype's CSVs for one metric by placing all track columns side-by-side.

    Every file whose metric contains *metric_keyword* (case-insensitive) is
    read with its first line skipped. ``Frame`` and ``Time`` are taken from the
    first readable file; every other column is treated as a track and appended
    as a new column. Rows are aligned by position, so a file with fewer rows
    is padded with missing values. A Frame/Time mismatch is only warned about.

    Track columns are renamed ``<file stem>_<column>``. When several selected
    files share the same stem (for example the same filename in different
    replicate folders), the replicate folder name is added to the prefix
    (``<replicate>_<file stem>_<column>``) so that columns from different
    replicates stay distinguishable. Column names are guaranteed unique.

    Args:
        genotype: Genotype label, used as the output filename prefix.
        files_metrics: Iterable of ``(path, metric)`` pairs for this genotype.
        metric_keyword: Substring selecting which metrics to merge (e.g. ``'area'``).
        out_dir: Folder receiving ``<genotype>_compiled_<metric>.csv``;
            created if it does not exist.

    Returns:
        The path of the written CSV, or ``None`` when no selected file
        contributed any track column.
    """
    keyword = metric_keyword.lower()
    metric_files = sorted(f for f, metric in files_metrics if keyword in metric.lower())
    if not metric_files:
        return None

    # Count sanitized stems so that we only lengthen the prefix when needed;
    # files with a unique stem keep the plain '<stem>_<column>' naming.
    stems = [_sanitize_label(f.stem) for f in metric_files]
    stem_counts = {}
    for stem in stems:
        stem_counts[stem] = stem_counts.get(stem, 0) + 1

    ref_ft = None
    tracks_list = []
    seen_columns = {'Frame', 'Time'}

    for f, stem in zip(metric_files, stems):
        # Best-effort: an unreadable file is reported and skipped so that one
        # bad export does not abort the whole compilation.
        try:
            df = pd.read_csv(f, skiprows=1)
        except Exception as e:
            print(f'Warning reading {f}: {e}', file=sys.stderr)
            continue

        if 'Frame' not in df.columns or 'Time' not in df.columns:
            print(f'Warning: {f} missing Frame/Time columns', file=sys.stderr)
            continue

        ft = df[['Frame', 'Time']].reset_index(drop=True)
        if ref_ft is None:
            ref_ft = ft
        elif not (ft['Frame'].equals(ref_ft['Frame']) and ft['Time'].equals(ref_ft['Time'])):
            print(f'Warning: Frame/Time mismatch in {f}', file=sys.stderr)

        # track columns are all columns except Frame and Time
        track_cols = [c for c in df.columns if c not in ('Frame', 'Time')]
        if not track_cols:
            continue

        prefix = stem
        if stem_counts[stem] > 1:
            # Same filename in several replicate folders: disambiguate with
            # the folder name, otherwise the columns would collide.
            prefix = f"{_sanitize_label(f.parent.name)}_{stem}"

        renamed = {}
        for c in track_cols:
            base = f"{prefix}_{c}"
            name = base
            suffix = 2
            # Last-resort numeric suffix; duplicate labels would make the
            # output ambiguous.
            while name in seen_columns:
                name = f"{base}_{suffix}"
                suffix += 1
            seen_columns.add(name)
            renamed[c] = name

        tracks_list.append(df[track_cols].rename(columns=renamed).reset_index(drop=True))

    if not tracks_list or ref_ft is None:
        return None

    # Frame and Time come first because ref_ft is the first frame concatenated.
    out_df = pd.concat([ref_ft] + tracks_list, axis=1)

    safe_metric = _sanitize_label(metric_keyword.lower())
    out_dir.mkdir(parents=True, exist_ok=True)
    out_path = out_dir / f"{genotype}_compiled_{safe_metric}.csv"
    out_df.to_csv(out_path, index=False)
    return out_path


# Discover the input CSVs (grouped by genotype) and report what was found.
groups = gather_files(search_dir)
print(
    f'Raw data dir: {search_dir}',
    f'Output dir: {out_dir}',
    f'Found {len(groups)} genotypes: {sorted(groups.keys())}',
    sep='\n',
)

Raw data dir: E:\Todd\OSR\251128\OSR compiled data all genotypes\data\raw
Output dir: E:\Todd\OSR\251128\OSR compiled data all genotypes\data\compiled
Found 6 genotypes: ['N2', 'dr170', 'dr170_dr180', 'dr180', 'osm6_p811', 'p811_dr170']


In [None]:
# Compile all discovered genotypes and write outputs (one file per genotype per metric)
results = {}   # genotype -> {metric -> path of the compiled CSV}
rows = []      # one record per compiled file, used for the summary table
metrics = ['area', 'length', 'fit']
for g, files in sorted(groups.items()):
    results[g] = {}
    for metric in metrics:
        p = compile_tracks_side_by_side(g, files, metric, out_dir)
        if p is None:
            print(f'No {metric} data saved for {g}')
            continue
        print(f"Saved compiled file for {g} metric {metric}: {p}")
        rows.append({'genotype': g, 'metric': metric, 'compiled_path': str(p)})
        results[g][metric] = p

# Pin the columns so the summary keeps its header even when nothing was compiled
# (an empty list of rows would otherwise give a DataFrame with no columns).
summary = pd.DataFrame(rows, columns=['genotype', 'metric', 'compiled_path'])
display(summary)
# write a small summary CSV into the compiled folder
summary_path = out_dir / 'compilation_summary.csv'
summary.to_csv(summary_path, index=False)
print('Wrote summary to', summary_path)

Saved compiled file for N2 metric WT 550 0003 Area: E:\Todd\OSR\251128\OSR compiled data all genotypes\data\compiled\N2_compiled_WT_550_0003_Area.csv
Saved compiled file for N2 metric WT 550 0003 Fit: E:\Todd\OSR\251128\OSR compiled data all genotypes\data\compiled\N2_compiled_WT_550_0003_Fit.csv
Saved compiled file for N2 metric WT 550 0003 Length: E:\Todd\OSR\251128\OSR compiled data all genotypes\data\compiled\N2_compiled_WT_550_0003_Length.csv
Saved compiled file for N2 metric WT 550 0003 Width: E:\Todd\OSR\251128\OSR compiled data all genotypes\data\compiled\N2_compiled_WT_550_0003_Width.csv
Saved compiled file for N2 metric WT2 5500001 Fit: E:\Todd\OSR\251128\OSR compiled data all genotypes\data\compiled\N2_compiled_WT2_5500001_Fit.csv
Saved compiled file for N2 metric WT2 5500001 Length-1: E:\Todd\OSR\251128\OSR compiled data all genotypes\data\compiled\N2_compiled_WT2_5500001_Length-1.csv
Saved compiled file for N2 metric WT2 5500001 Width: E:\Todd\OSR\251128\OSR compiled data 



Saved compiled file for p811_dr170 metric Area: E:\Todd\OSR\251128\OSR compiled data all genotypes\data\compiled\p811_dr170_compiled_Area.csv
Saved compiled file for p811_dr170 metric Fit: E:\Todd\OSR\251128\OSR compiled data all genotypes\data\compiled\p811_dr170_compiled_Fit.csv
Saved compiled file for p811_dr170 metric Length: E:\Todd\OSR\251128\OSR compiled data all genotypes\data\compiled\p811_dr170_compiled_Length.csv
Saved compiled file for p811_dr170 metric Position: E:\Todd\OSR\251128\OSR compiled data all genotypes\data\compiled\p811_dr170_compiled_Position.csv


Unnamed: 0,genotype,metric,compiled_path
0,N2,WT 550 0003 Area,E:\Todd\OSR\251128\OSR compiled data all genot...
1,N2,WT 550 0003 Fit,E:\Todd\OSR\251128\OSR compiled data all genot...
2,N2,WT 550 0003 Length,E:\Todd\OSR\251128\OSR compiled data all genot...
3,N2,WT 550 0003 Width,E:\Todd\OSR\251128\OSR compiled data all genot...
4,N2,WT2 5500001 Fit,E:\Todd\OSR\251128\OSR compiled data all genot...
...,...,...,...
59,osm6_p811,osm6p811 5500003 Width,E:\Todd\OSR\251128\OSR compiled data all genot...
60,p811_dr170,Area,E:\Todd\OSR\251128\OSR compiled data all genot...
61,p811_dr170,Fit,E:\Todd\OSR\251128\OSR compiled data all genot...
62,p811_dr170,Length,E:\Todd\OSR\251128\OSR compiled data all genot...


Wrote summary to E:\Todd\OSR\251128\OSR compiled data all genotypes\data\compiled\compilation_summary.csv


## Notes
- To change the input folder, modify the `raw_dir` value in the first code cell; the output folder is set by `compiled_dir` in the same cell.
- The code extracts the metric from each filename with the `pattern` regex, which accepts both `'<prefix>compiled_<metric>.csv'` and plain `'<metric>.csv'`. If your files use a different convention, update the `pattern` regex.
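- Each saved CSV contains the shared `Frame` and `Time` columns (taken from the first readable file) followed by every track column, with each track column's name prefixed by the stem of the file it came from.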