In [1]:
from pathlib import Path
import pandas as pd

def list_paths_and_stats(
    measurement: str,
    root: str | Path = "../dataset_clean",
    validation_root: str | Path = "../dataset_clean_validation",
    *,
    min_ok: float = 49.75,
    max_ok: float = 50.25,
    treat_missing_as_corrupted: bool = True,
):
    """
    Scan measurement files, attach stats (min/max/mean) from the validation CSVs,
    classify each file as validated or corrupted according to min_ok/max_ok limits,
    and print a final summary of ALL, VALIDATED, and CORRUPTED files.

    Parameters
    ----------
    measurement : str
        Measurement name (e.g., "Freq"). Function searches for */<measurement>/*_<measurement>.csv.xz
    root : str | Path
        Root folder containing the cleaned dataset (default: ../dataset_clean)
    validation_root : str | Path
        Root folder containing per-site per-measurement stats CSVs (default: ../dataset_clean_validation)
    min_ok : float
        Lower bound (inclusive) for valid values of the measurement (default: 49.75)
    max_ok : float
        Upper bound (inclusive) for valid values of the measurement (default: 50.25)
    treat_missing_as_corrupted : bool
        If True, files with missing stats (no row match or missing columns) are marked corrupted.

    Returns
    -------
    all_measurement_paths : list[Path]
        Every matched file path.
    validated_measurement : list[dict]
        Each entry: {'path': Path, 'min': float|None, 'max': float|None, 'mean': float|None}
    corrupted_measurement : list[dict]
        Same shape as validated, plus diagnostic flags:
        {'path', 'min', 'max', 'mean', 'reason': {'stats_missing': bool, 'min_below': bool, 'max_above': bool}}
    """
    root = Path(root)
    validation_root = Path(validation_root)

    # --- 1) Discover all measurement files under root ---
    paths = sorted(root.rglob(f"*/{measurement}/*_{measurement}.csv.xz"))
    print(f"Found {len(paths)} file(s) for '{measurement}':")

    # Containers for results
    validated = []
    corrupted = []

    # --- 2) Iterate files, read matching stats row, classify ---
    for p in paths:
        # Derive stats CSV location and expected row's file_path (strict, Windows-style, relative to dataset_clean).
        site = p.parents[1].name                  # .../<site>/<measurement>/<file>
        year = p.stem.split("_", 1)[0]            # "<year>_<measurement>.csv.xz" -> "<year>"
        stats_path = validation_root / site / measurement / f"{year}_{measurement}_stats.csv"
        expected_rel = f"dataset_clean\\{site}\\{measurement}\\{year}_{measurement}.csv.xz"

        # Defaults if we cannot find stats
        min_v = max_v = mean_v = None
        stats_missing = True

        # Try load stats and extract the exact row by file_path (strict match, backslashes)
        if stats_path.exists():
            df = pd.read_csv(stats_path, dtype={"file_path": "string"})
            if "file_path" in df.columns:
                fp_series = df["file_path"].astype("string").str.replace("/", "\\", regex=False)
                row = df[fp_series == expected_rel]
                if not row.empty:
                    # Pull columns if present; keep None if absent
                    min_v  = row["min"].iloc[0]  if "min"  in row.columns else None
                    max_v  = row["max"].iloc[0]  if "max"  in row.columns else None
                    mean_v = row["mean"].iloc[0] if "mean" in row.columns else None
                    stats_missing = any(
                        k not in row.columns for k in ("min", "max", "mean")
                    )

                    # If present but NaN, treat as missing
                    for val in (min_v, max_v, mean_v):
                        try:
                            # Coerce pandas NA to Python None-like via float() check
                            _ = float(val)
                        except Exception:
                            stats_missing = True
                            break

        # Classification rules:
        # - If stats are missing and treat_missing_as_corrupted=True => corrupted(stats_missing=True)
        # - Else check thresholds on min and max:
        #     min_ok <= min_v AND max_v <= max_ok => validated
        #     otherwise => corrupted with reason flags
        min_below = max_above = False
        reason = {"stats_missing": False, "min_below": False, "max_above": False}

        if stats_missing and treat_missing_as_corrupted:
            reason["stats_missing"] = True
            corrupted.append({
                "path": p,
                "min": min_v,
                "max": max_v,
                "mean": mean_v,
                "reason": reason
            })
        else:
            # Only check limits if we actually have numeric min/max
            try:
                min_val = float(min_v) if min_v is not None else float("nan")
                max_val = float(max_v) if max_v is not None else float("nan")
            except Exception:
                # Non-numeric stats encountered -> treat as missing
                reason["stats_missing"] = True
                corrupted.append({
                    "path": p,
                    "min": min_v,
                    "max": max_v,
                    "mean": mean_v,
                    "reason": reason
                })
                continue

            # Evaluate limit flags (NaN will fail these comparisons and trigger corruption)
            try:
                min_below = not (min_val >= min_ok)
                max_above = not (max_val <= max_ok)
            except Exception:
                min_below = True
                max_above = True

            if (not min_below) and (not max_above):
                validated.append({"path": p, "min": min_v, "max": max_v, "mean": mean_v})
            else:
                reason["min_below"] = min_below
                reason["max_above"] = max_above
                corrupted.append({
                    "path": p,
                    "min": min_v,
                    "max": max_v,
                    "mean": mean_v,
                    "reason": reason
                })

        # Per-file one-line echo (keeps your current preview; NA if None)
        disp_path = str(p).replace("/", "\\")
        fmt = lambda x: "NA" if x is None else f"{float(x):.6g}"
        print(f"{disp_path}  -  Min {fmt(min_v)}  Max {fmt(max_v)}  Mean {fmt(mean_v)}")

    # --- 3) Print final summary (ALL, VALIDATED, CORRUPTED) ---
    def _format_entry(d):
        disp = str(d["path"]).replace("/", "\\")
        mmin = "NA" if d["min"]  is None else f"{float(d['min']):.6g}"
        mmax = "NA" if d["max"]  is None else f"{float(d['max']):.6g}"
        mavg = "NA" if d["mean"] is None else f"{float(d['mean']):.6g}"
        return f"{disp}  -  Min {mmin}  Max {mmax}  Mean {mavg}"

    #print("\n=== SUMMARY ===")
    #print(f"ALL ({len(paths)}):")
    #for p in paths:
    #    print(str(p).replace("/", "\\"))

    #print(f"\nVALIDATED ({len(validated)}):")
    #for d in validated:
    #    print(_format_entry(d))

    print(f"\nCORRUPTED ({len(corrupted)}):")
    #for d in corrupted:
    #    line = _format_entry(d)
    #    r = d.get("reason", {})
    #    flags = []
    #    if r.get("stats_missing"): flags.append("stats_missing")
    #    if r.get("min_below"):     flags.append(f"min<{min_ok}")
    #    if r.get("max_above"):     flags.append(f"max>{max_ok}")
    #    if flags:
    #        line += "  [" + ", ".join(flags) + "]"
    #    print(line)

    # --- 4) Return the three lists the caller may want to use programmatically ---
    return paths, validated, corrupted

paths, validated, corrupted = list_paths_and_stats("U1_RMS_fund", min_ok=-100000, max_ok=100000)

Found 11 file(s) for 'U1_RMS_fund':
..\dataset_clean\TEC_48S\U1_RMS_fund\2024_U1_RMS_fund.csv.xz  -  Min 0  Max 241.806  Mean 236.197
..\dataset_clean\TEC_CFST161\U1_RMS_fund\2024_U1_RMS_fund.csv.xz  -  Min 0  Max 240.782  Mean 235.881
..\dataset_clean\TEC_Chiron800\U1_RMS_fund\2024_U1_RMS_fund.csv.xz  -  Min 0  Max 241.876  Mean 236.196
..\dataset_clean\TEC_CTX800TC\U1_RMS_fund\2024_U1_RMS_fund.csv.xz  -  Min 0  Max 241.896  Mean 236.165
..\dataset_clean\TEC_DMF3008\U1_RMS_fund\2024_U1_RMS_fund.csv.xz  -  Min 0  Max 241.74  Mean 237.02
..\dataset_clean\TEC_DMU125MB\U1_RMS_fund\2024_U1_RMS_fund.csv.xz  -  Min 0  Max 241.784  Mean 237.042
..\dataset_clean\TEC_DNG50evo\U1_RMS_fund\2024_U1_RMS_fund.csv.xz  -  Min 0  Max 241.932  Mean 236.204
..\dataset_clean\TEC_E110\U1_RMS_fund\2024_U1_RMS_fund.csv.xz  -  Min 0  Max 241.666  Mean 236.954
..\dataset_clean\TEC_E30D2\U1_RMS_fund\2024_U1_RMS_fund.csv.xz  -  Min 0  Max 241.835  Mean 236.205
..\dataset_clean\TEC_JWA24\U1_RMS_fund\2024_U1_RMS_f

### Delete

In [35]:
import pandas as pd

df = pd.read_csv(r"../dataset_clean\EPI_TotalLoad\PF_total\2020_PF_total_vec.csv.xz")
df.describe()

Unnamed: 0,PF_total_vec
count,4504045.0
mean,0.6356666
std,0.6778203
min,-1.0
25%,0.877
50%,0.905
75%,0.936
max,1.0


In [1]:
[
    "I1_fund", "I2_fund", "I3_fund", "IN_fund", "U1_fund", "U2_fund", "U3_fund", "U12_fund", "U23_fund", "U31_fund",
    "cos_phi1", "cos_phi1_f", "cos_phi2", "cos_phi2_f", "cos_phi3", "cos_phi3_f",
    "Angle_UI1", "Angle_UI1_f", "Angle_UI2", "Angle_UI2_f", "Angle_UI3", "Angle_UI3_f",
    "PF_total_arith", "PF_total_arith_f",
    'Q_total_arith', 'Q_total_arith_f', 'S_total_arith', 'S_total_arith_f',
]



#Delete: PF_total_arith, PF_total_arith_f 


['I1_fund',
 'I2_fund',
 'I3_fund',
 'IN_fund',
 'U1_fund',
 'U2_fund',
 'U3_fund',
 'U12_fund',
 'U23_fund',
 'U31_fund',
 'cos_phi1',
 'cos_phi1_f',
 'cos_phi2',
 'cos_phi2_f',
 'cos_phi3',
 'cos_phi3_f']

In [27]:
import os
import shutil

def _unique_dest_dir(dest_dir: str) -> str:
    """
    If dest_dir exists, return a variant with _bakN appended to the last path segment.
    """
    if not os.path.exists(dest_dir):
        return dest_dir
    parent = os.path.dirname(dest_dir)
    base = os.path.basename(dest_dir)
    k = 1
    while True:
        cand = os.path.join(parent, f"{base}_bak{k}")
        if not os.path.exists(cand):
            return cand
        k += 1

def _move_dir(src_dir: str, backup_root: str, dry_run: bool) -> bool:
    """
    Move the entire src_dir into backup_root while mirroring the relative structure
    starting from the machine directory (i.e., <machine>/<measurement>).
    Returns True if moved (or would move in dry-run), False if skipped.
    """
    if not os.path.isdir(src_dir):
        return False

    # Build <backup_root>/<machine>/<measurement> destination
    # src_dir looks like: <root>/<machine>/<measurement>
    machine = os.path.basename(os.path.dirname(src_dir))
    measurement = os.path.basename(src_dir)
    dest_parent = os.path.join(backup_root, machine)
    os.makedirs(dest_parent, exist_ok=True)
    dest_dir = os.path.join(dest_parent, measurement)

    # Ensure uniqueness
    dest_dir = _unique_dest_dir(dest_dir)

    if dry_run:
        print(f"[DRY] MOVE DIR  {src_dir}  ->  {dest_dir}")
        return True

    try:
        shutil.move(src_dir, dest_dir)
        print(f"[OK]  MOVED DIR {src_dir}  ->  {dest_dir}")
        return True
    except Exception as e:
        print(f"[FAIL] MOVE DIR {src_dir}  (error: {e})")
        return False

def move_measurement_folders_for_epi_and_tec(
    measurements,
    root_clean: str = "../dataset_clean",
    root_stats: str = "../dataset_clean_validation",
    backup_clean_root: str = "./backup_clean_dirs",
    backup_stats_root: str = "./backup_validation_dirs",
    dry_run: bool = True,
):
    """
    For each measurement in `measurements`, move the ENTIRE measurement folder
    from both data and validation trees to backup roots, for all machines whose
    names contain 'EPI' or 'TEC' (case-insensitive) and do NOT contain 'PV'.

    Example:
      Move:
        ../dataset_clean/TEC_48S/IN_fund/         -> ./backup_clean_dirs/TEC_48S/IN_fund[_bakN]
        ../dataset_clean_validation/TEC_48S/IN_fund/ -> ./backup_validation_dirs/TEC_48S/IN_fund[_bakN]

    Notes:
      - Folders are moved (not deleted), preserving all files like:
          2024_<measurement>.csv.xz, 2024_<measurement>_missing.csv, 2024_<measurement>_stats.csv, etc.
      - Never overwrites: if destination exists, a suffix _bakN is appended to the *folder name*.
      - `dry_run=True` prints planned moves only.
    """
    if not os.path.isdir(root_clean):
        print(f"[ERROR] Data root not found: {root_clean}")
        return
    if not os.path.isdir(root_stats):
        print(f"[ERROR] Validation root not found: {root_stats}")
        return

    os.makedirs(backup_clean_root, exist_ok=True)
    os.makedirs(backup_stats_root, exist_ok=True)

    # Discover eligible machines (EPI or TEC, not PV)
    machines = sorted(
        d for d in os.listdir(root_clean)
        if os.path.isdir(os.path.join(root_clean, d))
        and ("pv" not in d.lower())
        and (("epi" in d.lower()) or ("tec" in d.lower()))
    )

    if not machines:
        print("[INFO] No EPI/TEC machines found (excluding PV).")
        return

    processed_clean = 0
    processed_stats = 0
    skipped_clean = 0
    skipped_stats = 0

    for m in machines:
        for meas in measurements:
            src_clean_dir = os.path.join(root_clean, m, meas)
            src_stats_dir = os.path.join(root_stats, m, meas)

            # Move DATA directory
            if os.path.isdir(src_clean_dir):
                if _move_dir(src_clean_dir, backup_clean_root, dry_run):
                    processed_clean += 1
            else:
                skipped_clean += 1

            # Move VALIDATION directory
            if os.path.isdir(src_stats_dir):
                if _move_dir(src_stats_dir, backup_stats_root, dry_run):
                    processed_stats += 1
            else:
                skipped_stats += 1

    print(
        f"\nSummary (folders): "
        f"data moved={processed_clean}, data missing/skip={skipped_clean}, "
        f"validation moved={processed_stats}, validation missing/skip={skipped_stats}"
    )


# --- Example usage (uncomment to run) ---
to_remove = [
    "I1_fund", "I2_fund", "I3_fund", "IN_fund", "U1_fund", "U2_fund", "U3_fund", "U12_fund", "U23_fund", "U31_fund",
    "cos_phi1", "cos_phi1_f", "cos_phi2", "cos_phi2_f", "cos_phi3", "cos_phi3_f",
    "Angle_UI1", "Angle_UI1_f", "Angle_UI2", "Angle_UI2_f", "Angle_UI3", "Angle_UI3_f",
    "PF_total_arith", "PF_total_arith_f",
    'Q_total_arith', 'Q_total_arith_f', 'S_total_arith', 'S_total_arith_f',
    'Q_total_vec', 'S_total_vec'
    ]
# Preview what would be moved:
move_measurement_folders_for_epi_and_tec(to_remove,root_clean="../dataset_clean",root_stats="../dataset_clean_validation",
                                        backup_clean_root="./backup_clean_dirs", backup_stats_root="./backup_validation_dirs",dry_run=False,)


[OK]  MOVED DIR ../dataset_clean_validation\EPI_ChipSaw\Q_total_vec  ->  ./backup_validation_dirs\EPI_ChipSaw\Q_total_vec
[OK]  MOVED DIR ../dataset_clean_validation\EPI_ChipSaw\S_total_vec  ->  ./backup_validation_dirs\EPI_ChipSaw\S_total_vec
[OK]  MOVED DIR ../dataset_clean_validation\EPI_HighTempOven\Q_total_vec  ->  ./backup_validation_dirs\EPI_HighTempOven\Q_total_vec
[OK]  MOVED DIR ../dataset_clean_validation\EPI_HighTempOven\S_total_vec  ->  ./backup_validation_dirs\EPI_HighTempOven\S_total_vec
[OK]  MOVED DIR ../dataset_clean_validation\EPI_PumpStation1\Q_total_vec  ->  ./backup_validation_dirs\EPI_PumpStation1\Q_total_vec
[OK]  MOVED DIR ../dataset_clean_validation\EPI_PumpStation1\S_total_vec  ->  ./backup_validation_dirs\EPI_PumpStation1\S_total_vec
[OK]  MOVED DIR ../dataset_clean_validation\EPI_PumpStation2\Q_total_vec  ->  ./backup_validation_dirs\EPI_PumpStation2\Q_total_vec
[OK]  MOVED DIR ../dataset_clean_validation\EPI_PumpStation2\S_total_vec  ->  ./backup_validatio

### Rename

In [24]:
from pathlib import Path
import shutil
import lzma
import os

def rename_measurement(root_dir, old_meas_name, new_meas_name, test=True):
    """
    Rename a measurement folder and all contained files for every machine,
    and update the compressed CSV header from old_meas_name -> new_meas_name.

    Layout:
        <root>/<MACHINE>/<MEASUREMENT>/<YEAR>_<MEASUREMENT>.csv.xz

    CSVs have exactly two columns: 'WsDateTime' and '<MEASUREMENT>'.

    Parameters
    ----------
    root_dir : str | Path
        Root folder containing machine subfolders.
    old_meas_name : str
        Old measurement name (e.g., "PF_total_vec").
    new_meas_name : str
        New measurement name (e.g., "PF_total").
    test : bool
        If True, only reads and prints planned changes + stats (no filesystem writes).
    """
    root = Path(root_dir).resolve()
    if not root.exists() or not root.is_dir():
        raise ValueError(f"Root directory not found or not a directory: {root}")

    # Find all measurement directories named exactly old_meas_name
    meas_dirs = [p for p in root.rglob(old_meas_name) if p.is_dir()]
    if not meas_dirs:
        print(f"No measurement directories named '{old_meas_name}' found under {root}.")
        return

    print(f"{'DRY-RUN' if test else 'EXECUTE'}: Renaming '{old_meas_name}' -> '{new_meas_name}' under {root}")
    print("-" * 100)

    for mdir in meas_dirs:
        machine_dir = mdir.parent
        target_mdir = machine_dir / new_meas_name

        # 1) Process files inside the measurement directory
        for f in sorted(mdir.iterdir()):
            if not f.is_file():
                continue

            old_name = f.name
            new_name = old_name.replace(f"_{old_meas_name}", f"_{new_meas_name}")
            new_path_same_dir = f.with_name(new_name)

            print(f"\nFILE:")
            print(f"  path : {f}")
            if new_name != old_name:
                print(f"  rename -> {new_path_same_dir}")

            # Stream read + (in real mode) stream write with updated header
            cnt = 0
            ssum = 0.0
            vmin = None
            vmax = None

            with lzma.open(f, mode="rt", encoding="utf-8", newline="") as fin:
                header = fin.readline()
                if not header:
                    raise ValueError(f"Empty file or missing header: {f}")

                new_header = header.replace(old_meas_name, new_meas_name, 1)
                print(f"  header: '{header.strip()}'")
                print(f"  header-> '{new_header.strip()}'")

                if not test:
                    # Write to a temporary file in the same directory for atomic replace
                    tmp_out = f.with_name(new_name + ".tmp")
                    with lzma.open(tmp_out, mode="wt", encoding="utf-8", newline="") as fout:
                        fout.write(new_header)
                        for line in fin:
                            fout.write(line)
                            if not line or line == "\n":
                                continue
                            parts = line.rstrip("\n").split(",", 1)
                            if len(parts) != 2:
                                continue
                            val_str = parts[1]
                            if not val_str or val_str.isspace():
                                continue
                            try:
                                val = float(val_str)
                            except ValueError:
                                continue
                            cnt += 1
                            ssum += val
                            vmin = val if vmin is None else min(vmin, val)
                            vmax = val if vmax is None else max(vmax, val)
                else:
                    # Test mode: just compute stats
                    for line in fin:
                        if not line or line == "\n":
                            continue
                        parts = line.rstrip("\n").split(",", 1)
                        if len(parts) != 2:
                            continue
                        val_str = parts[1]
                        if not val_str or val_str.isspace():
                            continue
                        try:
                            val = float(val_str)
                        except ValueError:
                            continue
                        cnt += 1
                        ssum += val
                        vmin = val if vmin is None else min(vmin, val)
                        vmax = val if vmax is None else max(vmax, val)

            # IMPORTANT: only touch files after closing 'fin' to avoid Windows locks
            mean = (ssum / cnt) if cnt > 0 else float("nan")
            print(f"  stats : count={cnt}  min={vmin}  max={vmax}  mean={mean}")

            if not test:
                tmp_out = f.with_name(new_name + ".tmp")
                final_target = new_path_same_dir
                # Move temp to final target atomically (overwrites if exists)
                os.replace(tmp_out, final_target)
                # If final name differs from source, remove the original now that it's closed
                if final_target != f:
                    f.unlink()

        # 2) Rename or merge the measurement directory itself (no swallowing of errors)
        print(f"\nDIR : {mdir}  ->  {target_mdir}")
        if not test:
            if target_mdir.exists():
                for item in mdir.iterdir():
                    shutil.move(str(item), str(target_mdir / item.name))
                mdir.rmdir()  # hard-fail if not empty
            else:
                mdir.rename(target_mdir)

    print("\n" + "-" * 100)
    print("Done.")

# --- Example usage ---
# DRY RUN:
rename_measurement("../dataset_clean", old_meas_name="Q_total_vec", new_meas_name="Q_total", test=False)

EXECUTE: Renaming 'Q_total_vec' -> 'Q_total' under D:\EnergyDataset\dataset_clean
----------------------------------------------------------------------------------------------------

FILE:
  path : D:\EnergyDataset\dataset_clean\EPI_ChipPress\Q_total_vec\2018_Q_total_vec.csv.xz
  rename -> D:\EnergyDataset\dataset_clean\EPI_ChipPress\Q_total_vec\2018_Q_total.csv.xz
  header: 'WsDateTime,Q_total_vec'
  header-> 'WsDateTime,Q_total'
  stats : count=5640235  min=-13710.0  max=13990.0  mean=-31.08678095859481

FILE:
  path : D:\EnergyDataset\dataset_clean\EPI_ChipPress\Q_total_vec\2019_Q_total_vec.csv.xz
  rename -> D:\EnergyDataset\dataset_clean\EPI_ChipPress\Q_total_vec\2019_Q_total.csv.xz
  header: 'WsDateTime,Q_total_vec'
  header-> 'WsDateTime,Q_total'
  stats : count=4295049  min=-4290.0  max=13620.0  mean=55.351938941790884

FILE:
  path : D:\EnergyDataset\dataset_clean\EPI_ChipPress\Q_total_vec\2020_Q_total_vec.csv.xz
  rename -> D:\EnergyDataset\dataset_clean\EPI_ChipPress\Q_tota

In [25]:
from pathlib import Path
import lzma
import os
from array import array
import math

def build_validation_files(
    root_data_dir,
    validation_root_dir,
    old_meas_name,
    new_meas_name,
    step_seconds: int = 5,
    test: bool = True,
):
    """
    Rebuild per-file validation CSVs for the *renamed* measurement.

    For every file at:
        <root_data_dir>/<MACHINE>/<new_meas_name>/<YEAR>_<new_meas_name>.csv.xz

    produce two outputs at:
        <validation_root_dir>/<MACHINE>/<new_meas_name>/<YEAR>_<new_meas_name>_missing.csv
        <validation_root_dir>/<MACHINE>/<new_meas_name>/<YEAR>_<new_meas_name>_stats.csv

    If the target validation folder already exists, rename it to *_old* (hard-fail if that exists).

    test=True:
      - Does NOT write anything.
      - Prints whether dir would be renamed to *_old* and created.
      - Prints the full CSV (header + row) that would be saved for each file.

    Assumptions:
      - CSV has exactly two columns: 'WsDateTime,<MEAS_NEW>'.
      - Sampling step = 5 sec by default (configurable).
    """
    root_data = Path(root_data_dir).resolve()
    root_val  = Path(validation_root_dir).resolve()

    if not root_data.exists() or not root_data.is_dir():
        raise ValueError(f"Data root not found or not a directory: {root_data}")
    if not root_val.exists():
        if test:
            print(f"[TEST] Would create validation root: {root_val}")
        else:
            root_val.mkdir(parents=True, exist_ok=False)

    # Find all measurement directories named exactly new_meas_name
    meas_dirs = [p for p in root_data.rglob(new_meas_name) if p.is_dir()]
    if not meas_dirs:
        print(f"No measurement directories named '{new_meas_name}' found under {root_data}.")
        return

    print(f"{'DRY-RUN' if test else 'EXECUTE'}: Building validation for '{new_meas_name}'")
    print("-" * 100)

    # Helper: percentile from sorted array('d')
    def percentile_from_sorted(sorted_vals, q: float) -> float:
        """q in [0,100]. Uses linear interpolation between closest ranks."""
        n = len(sorted_vals)
        if n == 0:
            return float('nan')
        if n == 1:
            return sorted_vals[0]
        pos = (q / 100.0) * (n - 1)
        lo = int(math.floor(pos))
        hi = int(math.ceil(pos))
        if lo == hi:
            return sorted_vals[lo]
        frac = pos - lo
        return sorted_vals[lo] * (1.0 - frac) + sorted_vals[hi] * frac

    # Boundaries in steps
    ONE_STEP = 1
    steps_30s = int(30 // step_seconds)
    steps_1m  = int(60 // step_seconds)
    steps_15m = int(15 * 60 // step_seconds)
    steps_1h  = int(60 * 60 // step_seconds)
    steps_24h = int(24 * 60 * 60 // step_seconds)
    steps_7d  = int(7 * 24 * 60 * 60 // step_seconds)
    steps_1mo = int(30 * 24 * 60 * 60 // step_seconds)  # 30-day month

    def gap_bucket(L):
        """Return bucket name for a NaN run of length L steps."""
        if L == ONE_STEP:
            return "nan_1_step"
        if 2 <= L <= steps_30s:
            return "nan_5s_30s"
        if steps_30s < L <= steps_1m:
            return "nan_30s_1m"
        if steps_1m < L <= steps_15m:
            return "nan_1m_15m"
        if steps_15m < L <= steps_1h:
            return "nan_15m_1h"
        if steps_1h < L <= steps_24h:
            return "nan_1h_24h"
        if steps_24h < L <= steps_7d:
            return "nan_1d_7d"
        if steps_7d < L <= steps_1mo:
            return "nan_1w_1mo"
        return "nan_1mo_inf"

    # Constant headers (match your examples)
    missing_header = ",".join([
        "file_path","n_total","n_nans","n_zeros","n_nans_start","n_nans_end","n_nans_middle",
        "n_missing_gaps","missing_gap_total_sec",
        "nan_1_step_count","nan_1_step_steps","nan_1_step_pct",
        "nan_5s_30s_count","nan_5s_30s_steps","nan_5s_30s_pct",
        "nan_30s_1m_count","nan_30s_1m_steps","nan_30s_1m_pct",
        "nan_1m_15m_count","nan_1m_15m_steps","nan_1m_15m_pct",
        "nan_15m_1h_count","nan_15m_1h_steps","nan_15m_1h_pct",
        "nan_1h_24h_count","nan_1h_24h_steps","nan_1h_24h_pct",
        "nan_1d_7d_count","nan_1d_7d_steps","nan_1d_7d_pct",
        "nan_1w_1mo_count","nan_1w_1mo_steps","nan_1w_1mo_pct",
        "nan_1mo_inf_count","nan_1mo_inf_steps","nan_1mo_inf_pct",
    ])
    stats_header = ",".join([
        "file_path","min","max","mean","std","q01","q05","q25","q50","q75","q95","q99"
    ])

    for mdir in meas_dirs:
        machine = mdir.parent.name
        out_dir = root_val / machine / new_meas_name
        out_dir_old = out_dir.with_name(out_dir.name + "_old")

        # Ensure validation directory state
        if out_dir.exists():
            if test:
                print(f"[TEST] Validation dir exists for {machine}/{new_meas_name}: {out_dir}")
                print(f"[TEST] Would rename -> {out_dir_old}")
            else:
                out_dir.rename(out_dir_old)
        if not out_dir.exists():
            if test:
                print(f"[TEST] Would create: {out_dir}")
            else:
                out_dir.mkdir(parents=True, exist_ok=False)

        # Iterate files
        for f in sorted(mdir.glob("*.csv.xz")):
            stem_no_ext = f.name[:-7]  # remove ".csv.xz"
            out_missing = out_dir / f"{stem_no_ext}_missing.csv"
            out_stats   = out_dir / f"{stem_no_ext}_stats.csv"

            # Compute streaming stats/missing
            file_rel = Path(root_data.name) / f.parent.relative_to(root_data) / f.name
            file_rel_str = file_rel.as_posix()  # match your style

            n_total = 0
            n_nans  = 0
            n_zeros = 0

            nan_run_lengths = []
            in_nan_run = False
            cur_run_len = 0

            # Welford for mean/std
            mean = 0.0
            M2   = 0.0
            n_valid = 0

            vmin = None
            vmax = None
            vals = array('d')

            with lzma.open(f, mode="rt", encoding="utf-8", newline="") as fin:
                header = fin.readline()
                if not header:
                    raise ValueError(f"Empty file or missing header: {f}")
                hdr = header.strip().split(",")
                if len(hdr) != 2 or hdr[0] != "WsDateTime" or hdr[1] != new_meas_name:
                    raise ValueError(f"Unexpected header in {f}: '{header.strip()}'")

                for line in fin:
                    if not line:
                        continue
                    parts = line.rstrip("\n").split(",", 1)
                    if len(parts) != 2:
                        continue
                    n_total += 1
                    val_str = parts[1]
                    try:
                        val = float(val_str)
                    except ValueError:
                        val = float('nan')

                    if math.isnan(val):
                        n_nans += 1
                        if in_nan_run:
                            cur_run_len += 1
                        else:
                            in_nan_run = True
                            cur_run_len = 1
                        continue

                    if in_nan_run:
                        nan_run_lengths.append(cur_run_len)
                        in_nan_run = False
                        cur_run_len = 0

                    if val == 0.0:
                        n_zeros += 1

                    n_valid += 1
                    delta = val - mean
                    mean += delta / n_valid
                    M2   += delta * (val - mean)

                    if vmin is None or val < vmin:
                        vmin = val
                    if vmax is None or val > vmax:
                        vmax = val

                    vals.append(val)

                if in_nan_run:
                    nan_run_lengths.append(cur_run_len)

            # Determine NaNs at start/end
            n_nans_start = 0
            n_nans_end   = 0
            n_nans_middle = n_nans

            if nan_run_lengths:
                # Check first row
                with lzma.open(f, mode="rt", encoding="utf-8", newline="") as fin:
                    _ = fin.readline()
                    first_line = fin.readline()
                    first_nan = False
                    if first_line:
                        parts = first_line.rstrip("\n").split(",", 1)
                        try:
                            first_nan = (len(parts) == 2 and math.isnan(float(parts[1])))
                        except ValueError:
                            first_nan = True

                # Check last row
                last_nan = False
                with lzma.open(f, mode="rt", encoding="utf-8", newline="") as fin:
                    _ = fin.readline()
                    last_val_str = None
                    for line in fin:
                        if not line or line == "\n":
                            continue
                        parts = line.rstrip("\n").split(",", 1)
                        if len(parts) == 2:
                            last_val_str = parts[1]
                    if last_val_str is not None:
                        try:
                            last_nan = math.isnan(float(last_val_str))
                        except ValueError:
                            last_nan = True

                if first_nan:
                    n_nans_start = nan_run_lengths[0]
                if last_nan:
                    n_nans_end = nan_run_lengths[-1]
                n_nans_middle = n_nans - n_nans_start - n_nans_end

            n_missing_gaps = len(nan_run_lengths)
            missing_gap_total_steps = sum(nan_run_lengths)
            missing_gap_total_sec = missing_gap_total_steps * step_seconds

            bucket_names = [
                "nan_1_step","nan_5s_30s","nan_30s_1m","nan_1m_15m",
                "nan_15m_1h","nan_1h_24h","nan_1d_7d","nan_1w_1mo","nan_1mo_inf"
            ]
            bucket_counts = {k: 0 for k in bucket_names}
            bucket_steps  = {k: 0 for k in bucket_names}
            for L in nan_run_lengths:
                b = gap_bucket(L)
                bucket_counts[b] += 1
                bucket_steps[b]  += L

            def pct(steps):
                return (steps * 100.0 / n_nans) if n_nans > 0 else 0.0

            if n_valid > 1:
                variance = M2 / n_valid
                std = math.sqrt(variance)
            else:
                std = float('nan')

            vals_sorted = sorted(vals)
            q01 = percentile_from_sorted(vals_sorted, 1.0)
            q05 = percentile_from_sorted(vals_sorted, 5.0)
            q25 = percentile_from_sorted(vals_sorted, 25.0)
            q50 = percentile_from_sorted(vals_sorted, 50.0)
            q75 = percentile_from_sorted(vals_sorted, 75.0)
            q95 = percentile_from_sorted(vals_sorted, 95.0)
            q99 = percentile_from_sorted(vals_sorted, 99.0)

            # Compose CSV rows (as strings)
            missing_row = ",".join(map(str, [
                file_rel_str, n_total, n_nans, n_zeros, n_nans_start, n_nans_end, n_nans_middle,
                n_missing_gaps, missing_gap_total_sec,
                bucket_counts["nan_1_step"], bucket_steps["nan_1_step"], round(pct(bucket_steps["nan_1_step"]), 2),
                bucket_counts["nan_5s_30s"], bucket_steps["nan_5s_30s"], round(pct(bucket_steps["nan_5s_30s"]), 2),
                bucket_counts["nan_30s_1m"], bucket_steps["nan_30s_1m"], round(pct(bucket_steps["nan_30s_1m"]), 2),
                bucket_counts["nan_1m_15m"], bucket_steps["nan_1m_15m"], round(pct(bucket_steps["nan_1m_15m"]), 2),
                bucket_counts["nan_15m_1h"], bucket_steps["nan_15m_1h"], round(pct(bucket_steps["nan_15m_1h"]), 2),
                bucket_counts["nan_1h_24h"], bucket_steps["nan_1h_24h"], round(pct(bucket_steps["nan_1h_24h"]), 2),
                bucket_counts["nan_1d_7d"], bucket_steps["nan_1d_7d"], round(pct(bucket_steps["nan_1d_7d"]), 2),
                bucket_counts["nan_1w_1mo"], bucket_steps["nan_1w_1mo"], round(pct(bucket_steps["nan_1w_1mo"]), 2),
                bucket_counts["nan_1mo_inf"], bucket_steps["nan_1mo_inf"], round(pct(bucket_steps["nan_1mo_inf"]), 2),
            ]))

            stats_row = ",".join(map(str, [
                file_rel_str,
                vmin if vmin is not None else float('nan'),
                vmax if vmax is not None else float('nan'),
                mean if n_valid > 0 else float('nan'),
                std,
                q01,q05,q25,q50,q75,q95,q99
            ]))

            if test:
                print(f"\n[TEST] Would create files for: {machine}/{new_meas_name}/{f.name}")
                print(f"[TEST] MISSING path -> {out_missing}")
                print(f"[TEST] STATS   path -> {out_stats}")
                print("[TEST] --- MISSING CSV CONTENT ---")
                display(missing_header)
                display(missing_row)
                print("[TEST] --- STATS CSV CONTENT ---")
                display(stats_header)
                display(stats_row)
                continue

            # REAL MODE: write atomically
            tmp_missing = out_missing.with_suffix(out_missing.suffix + ".tmp")
            tmp_stats   = out_stats.with_suffix(out_stats.suffix + ".tmp")

            with open(tmp_missing, "w", encoding="utf-8", newline="") as fo:
                fo.write(missing_header + "\n")
                fo.write(missing_row + "\n")
            with open(tmp_stats, "w", encoding="utf-8", newline="") as fo:
                fo.write(stats_header + "\n")
                fo.write(stats_row + "\n")

            os.replace(tmp_missing, out_missing)
            os.replace(tmp_stats, out_stats)

            print(f"OK  -> {out_missing.name} and {out_stats.name} for {machine}/{new_meas_name}")

    print("\n" + "-" * 100)
    print("Done.")

# -------- Example usage --------
# TEST (no writes): prints planned renames/paths AND full CSV contents that would be saved
build_validation_files(
    root_data_dir=r"D:\EnergyDataset\dataset_clean",
    validation_root_dir=r"D:\EnergyDataset\dataset_clean_validation",
    old_meas_name="Q_total_vec",
    new_meas_name="Q_total",
    step_seconds=5,
    test=False,
)


EXECUTE: Building validation for 'Q_total'
----------------------------------------------------------------------------------------------------
OK  -> 2018_Q_total_missing.csv and 2018_Q_total_stats.csv for EPI_ChipPress/Q_total
OK  -> 2019_Q_total_missing.csv and 2019_Q_total_stats.csv for EPI_ChipPress/Q_total
OK  -> 2020_Q_total_missing.csv and 2020_Q_total_stats.csv for EPI_ChipPress/Q_total
OK  -> 2021_Q_total_missing.csv and 2021_Q_total_stats.csv for EPI_ChipPress/Q_total
OK  -> 2022_Q_total_missing.csv and 2022_Q_total_stats.csv for EPI_ChipPress/Q_total
OK  -> 2023_Q_total_missing.csv and 2023_Q_total_stats.csv for EPI_ChipPress/Q_total
OK  -> 2024_Q_total_missing.csv and 2024_Q_total_stats.csv for EPI_ChipPress/Q_total
OK  -> 2018_Q_total_missing.csv and 2018_Q_total_stats.csv for EPI_ChipSaw/Q_total
OK  -> 2019_Q_total_missing.csv and 2019_Q_total_stats.csv for EPI_ChipSaw/Q_total
OK  -> 2020_Q_total_missing.csv and 2020_Q_total_stats.csv for EPI_ChipSaw/Q_total
OK  -> 2021_

In [16]:
from pathlib import Path
import lzma
import os
from array import array
import math

def build_validation_files(
    root_data_dir,
    validation_root_dir,
    old_meas_name,
    new_meas_name,
    step_seconds: int = 5,
    test: bool = True,
):
    """
    Rebuild per-file validation CSVs for the *renamed* measurement.

    For every file at:
        <root_data_dir>/<MACHINE>/<new_meas_name>/<YEAR>_<new_meas_name>.csv.xz

    produce two outputs at:
        <validation_root_dir>/<MACHINE>/<new_meas_name>/<YEAR>_<new_meas_name>_missing.csv
        <validation_root_dir>/<MACHINE>/<new_meas_name>/<YEAR>_<new_meas_name>_stats.csv

    If the target validation folder already exists, rename it to *_old* (hard-fail if that exists).

    Parameters
    ----------
    root_data_dir : str | Path
        Root of the (renamed) data, e.g. r"D:\\EnergyDataset\\dataset_clean"
    validation_root_dir : str | Path
        Root for validation outputs, e.g. r"D:\\EnergyDataset\\dataset_clean_validation"
    old_meas_name : str
        Previous measurement name (not used for scanning; here for bookkeeping).
    new_meas_name : str
        Current measurement name (directories must match this).
    step_seconds : int
        Sampling period in seconds (default 5).
    test : bool
        If True, do not write anything; only print what would happen.
    """
    root_data = Path(root_data_dir).resolve()
    root_val  = Path(validation_root_dir).resolve()

    if not root_data.exists() or not root_data.is_dir():
        raise ValueError(f"Data root not found or not a directory: {root_data}")
    if not root_val.exists():
        if test:
            print(f"[TEST] Would create validation root: {root_val}")
        else:
            root_val.mkdir(parents=True, exist_ok=False)

    # Find all measurement directories named exactly new_meas_name
    meas_dirs = [p for p in root_data.rglob(new_meas_name) if p.is_dir()]
    if not meas_dirs:
        print(f"No measurement directories named '{new_meas_name}' found under {root_data}.")
        return

    print(f"{'DRY-RUN' if test else 'EXECUTE'}: Building validation for '{new_meas_name}'")
    print("-" * 100)

    # Helper: percentile from sorted array('d')
    def percentile_from_sorted(sorted_vals, q: float) -> float:
        """q in [0,100]. Uses linear interpolation between closest ranks."""
        n = len(sorted_vals)
        if n == 0:
            return float('nan')
        if n == 1:
            return sorted_vals[0]
        # position in 0..n-1
        pos = (q / 100.0) * (n - 1)
        lo = int(math.floor(pos))
        hi = int(math.ceil(pos))
        if lo == hi:
            return sorted_vals[lo]
        frac = pos - lo
        return sorted_vals[lo] * (1.0 - frac) + sorted_vals[hi] * frac

    # Buckets (in *steps*) for NaN gap lengths; inclusive of lower bound, inclusive of upper for the upper symbol where noted
    ONE_STEP = 1
    steps_30s = int(30 // step_seconds)          # 30s boundary (e.g., 6 for 5s)
    steps_1m  = int(60 // step_seconds)          # 1 minute
    steps_15m = int(15 * 60 // step_seconds)
    steps_1h  = int(60 * 60 // step_seconds)
    steps_24h = int(24 * 60 * 60 // step_seconds)
    steps_7d  = int(7 * 24 * 60 * 60 // step_seconds)
    steps_1mo = int(30 * 24 * 60 * 60 // step_seconds)  # 30-day month

    def gap_bucket(L):
        """Return (bucket_name) for a NaN run of length L steps."""
        if L == ONE_STEP:
            return "nan_1_step"
        if 2 <= L <= steps_30s:
            return "nan_5s_30s"
        if steps_30s < L <= steps_1m:
            return "nan_30s_1m"
        if steps_1m < L <= steps_15m:
            return "nan_1m_15m"
        if steps_15m < L <= steps_1h:
            return "nan_15m_1h"
        if steps_1h < L <= steps_24h:
            return "nan_1h_24h"
        if steps_24h < L <= steps_7d:
            return "nan_1d_7d"
        if steps_7d < L <= steps_1mo:
            return "nan_1w_1mo"
        return "nan_1mo_inf"

    # Process each measurement directory (per machine)
    for mdir in meas_dirs:
        machine = mdir.parent.name
        # target validation dir for this machine+measurement
        out_dir = root_val / machine / new_meas_name
        out_dir_old = out_dir.with_name(out_dir.name + "_old")

        # Ensure target directory state
        if out_dir.exists():
            if test:
                print(f"[TEST] Validation dir exists for {machine}/{new_meas_name}: {out_dir}")
                print(f"[TEST] Would rename -> {out_dir_old}")
            else:
                out_dir.rename(out_dir_old)
        if not out_dir.exists():
            if test:
                print(f"[TEST] Would create: {out_dir}")
            else:
                out_dir.mkdir(parents=True, exist_ok=False)

        # Iterate all .csv.xz files inside this measurement directory
        for f in sorted(mdir.glob("*.csv.xz")):
            # Expect filename YEAR_MEAS.csv.xz
            stem_no_ext = f.name[:-7]  # strip ".csv.xz"
            # Output files
            out_missing = out_dir / f"{stem_no_ext}_missing.csv"
            out_stats   = out_dir / f"{stem_no_ext}_stats.csv"

            # Report in test mode
            if test:
                print(f"\n[TEST] Would create:")
                print(f"  MISSING -> {out_missing}")
                print(f"  STATS   -> {out_stats}")
                continue

            # ---------- REAL MODE: compute everything streaming ----------
            # Relative path string like 'dataset_clean\\MACHINE\\MEAS\\file.csv.xz'
            file_rel = Path(root_data.name) / f.parent.relative_to(root_data) / f.name
            file_rel_str = str(file_rel).replace("/", "\\")  # match windows-backslash examples

            # Stats accumulators
            n_total = 0           # total data rows
            n_nans  = 0           # total NaN steps in measurement column
            n_zeros = 0           # total zeros (non-NaN and == 0)

            # For start/end/middle counts we need contiguous runs of NaN
            nan_run_lengths = []  # list of run lengths (steps)
            in_nan_run = False
            cur_run_len = 0

            # For mean/std (Welford)
            mean = 0.0
            M2   = 0.0
            n_valid = 0

            # For min/max and quantiles
            vmin = None
            vmax = None
            vals = array('d')  # compact storage for exact quantiles

            with lzma.open(f, mode="rt", encoding="utf-8", newline="") as fin:
                header = fin.readline()
                if not header:
                    raise ValueError(f"Empty file or missing header: {f}")
                # Quick sanity: ensure new column present
                hdr = header.strip().split(",")
                if len(hdr) != 2 or hdr[0] != "WsDateTime" or hdr[1] != new_meas_name:
                    raise ValueError(f"Unexpected header in {f}: '{header.strip()}'")

                for line in fin:
                    if not line:
                        continue
                    # Fast split into timestamp + value
                    parts = line.rstrip("\n").split(",", 1)
                    if len(parts) != 2:
                        continue  # skip malformed row
                    n_total += 1
                    val_str = parts[1]

                    # Parse float; treat non-parsable as NaN
                    try:
                        val = float(val_str)
                    except ValueError:
                        val = float('nan')

                    if math.isnan(val):
                        n_nans += 1
                        if in_nan_run:
                            cur_run_len += 1
                        else:
                            in_nan_run = True
                            cur_run_len = 1
                        continue

                    # non-NaN value -> close any open nan run
                    if in_nan_run:
                        nan_run_lengths.append(cur_run_len)
                        in_nan_run = False
                        cur_run_len = 0

                    # zeros
                    if val == 0.0:
                        n_zeros += 1

                    # Welford updates
                    n_valid += 1
                    delta = val - mean
                    mean += delta / n_valid
                    M2   += delta * (val - mean)

                    # min/max
                    if vmin is None or val < vmin:
                        vmin = val
                    if vmax is None or val > vmax:
                        vmax = val

                    # exact quantiles storage
                    vals.append(val)

                # If file ends with an open NaN run, close it
                if in_nan_run:
                    nan_run_lengths.append(cur_run_len)

            # Start/end/middle NaNs
            n_nans_start = 0
            n_nans_end   = 0
            n_nans_middle = n_nans

            if nan_run_lengths:
                # We need to know if the *first rows* and *last rows* were NaN;
                # to deduce that, re-scan first and last lines quickly (cheap: open again but read minimal)
                with lzma.open(f, mode="rt", encoding="utf-8", newline="") as fin:
                    _ = fin.readline()  # header
                    first_line = fin.readline()
                    first_nan = False
                    if first_line:
                        parts = first_line.rstrip("\n").split(",", 1)
                        try:
                            first_nan = (len(parts) == 2 and math.isnan(float(parts[1])))
                        except ValueError:
                            first_nan = True

                # To detect trailing NaNs, read last non-empty line:
                # (we will scan once more but only keeping last few chars)
                last_nan = False
                with lzma.open(f, mode="rt", encoding="utf-8", newline="") as fin:
                    _ = fin.readline()
                    last_val_str = None
                    for line in fin:
                        if not line or line == "\n":
                            continue
                        parts = line.rstrip("\n").split(",", 1)
                        if len(parts) == 2:
                            last_val_str = parts[1]
                    if last_val_str is not None:
                        try:
                            last_nan = math.isnan(float(last_val_str))
                        except ValueError:
                            last_nan = True

                # Assign start/end using the first and last run if applicable
                if first_nan:
                    n_nans_start = nan_run_lengths[0]
                if last_nan:
                    n_nans_end = nan_run_lengths[-1]
                n_nans_middle = n_nans - n_nans_start - n_nans_end

            n_missing_gaps = len(nan_run_lengths)
            missing_gap_total_steps = sum(nan_run_lengths)
            missing_gap_total_sec = missing_gap_total_steps * step_seconds

            # Bucketize NaN runs
            bucket_names = [
                "nan_1_step","nan_5s_30s","nan_30s_1m","nan_1m_15m",
                "nan_15m_1h","nan_1h_24h","nan_1d_7d","nan_1w_1mo","nan_1mo_inf"
            ]
            bucket_counts = {k: 0 for k in bucket_names}
            bucket_steps  = {k: 0 for k in bucket_names}

            for L in nan_run_lengths:
                b = gap_bucket(L)
                bucket_counts[b] += 1
                bucket_steps[b]  += L

            # Percentages relative to total missing steps (NaN steps)
            def pct(steps):
                return (steps * 100.0 / n_nans) if n_nans > 0 else 0.0

            # Final stats
            if n_valid > 1:
                variance = M2 / n_valid  # population variance
                std = math.sqrt(variance)
            else:
                std = float('nan')

            # Quantiles (exact) — sort the compact array
            vals_sorted = sorted(vals)
            q01 = percentile_from_sorted(vals_sorted, 1.0)
            q05 = percentile_from_sorted(vals_sorted, 5.0)
            q25 = percentile_from_sorted(vals_sorted, 25.0)
            q50 = percentile_from_sorted(vals_sorted, 50.0)
            q75 = percentile_from_sorted(vals_sorted, 75.0)
            q95 = percentile_from_sorted(vals_sorted, 95.0)
            q99 = percentile_from_sorted(vals_sorted, 99.0)

            # ---------- WRITE OUTPUT FILES ----------
            # Missing file (single-row CSV with header)
            missing_header = ",".join([
                "file_path","n_total","n_nans","n_zeros","n_nans_start","n_nans_end","n_nans_middle",
                "n_missing_gaps","missing_gap_total_sec",
                "nan_1_step_count","nan_1_step_steps","nan_1_step_pct",
                "nan_5s_30s_count","nan_5s_30s_steps","nan_5s_30s_pct",
                "nan_30s_1m_count","nan_30s_1m_steps","nan_30s_1m_pct",
                "nan_1m_15m_count","nan_1m_15m_steps","nan_1m_15m_pct",
                "nan_15m_1h_count","nan_15m_1h_steps","nan_15m_1h_pct",
                "nan_1h_24h_count","nan_1h_24h_steps","nan_1h_24h_pct",
                "nan_1d_7d_count","nan_1d_7d_steps","nan_1d_7d_pct",
                "nan_1w_1mo_count","nan_1w_1mo_steps","nan_1w_1mo_pct",
                "nan_1mo_inf_count","nan_1mo_inf_steps","nan_1mo_inf_pct",
            ])

            missing_row = ",".join(map(str, [
                file_rel_str, n_total, n_nans, n_zeros, n_nans_start, n_nans_end, n_nans_middle,
                n_missing_gaps, missing_gap_total_sec,
                bucket_counts["nan_1_step"], bucket_steps["nan_1_step"], round(pct(bucket_steps["nan_1_step"]), 2),
                bucket_counts["nan_5s_30s"], bucket_steps["nan_5s_30s"], round(pct(bucket_steps["nan_5s_30s"]), 2),
                bucket_counts["nan_30s_1m"], bucket_steps["nan_30s_1m"], round(pct(bucket_steps["nan_30s_1m"]), 2),
                bucket_counts["nan_1m_15m"], bucket_steps["nan_1m_15m"], round(pct(bucket_steps["nan_1m_15m"]), 2),
                bucket_counts["nan_15m_1h"], bucket_steps["nan_15m_1h"], round(pct(bucket_steps["nan_15m_1h"]), 2),
                bucket_counts["nan_1h_24h"], bucket_steps["nan_1h_24h"], round(pct(bucket_steps["nan_1h_24h"]), 2),
                bucket_counts["nan_1d_7d"], bucket_steps["nan_1d_7d"], round(pct(bucket_steps["nan_1d_7d"]), 2),
                bucket_counts["nan_1w_1mo"], bucket_steps["nan_1w_1mo"], round(pct(bucket_steps["nan_1w_1mo"]), 2),
                bucket_counts["nan_1mo_inf"], bucket_steps["nan_1mo_inf"], round(pct(bucket_steps["nan_1mo_inf"]), 2),
            ]))

            # Stats file (single-row CSV with header)
            stats_header = ",".join([
                "file_path","min","max","mean","std","q01","q05","q25","q50","q75","q95","q99"
            ])
            stats_row = ",".join(map(str, [
                file_rel_str,
                vmin if vmin is not None else float('nan'),
                vmax if vmax is not None else float('nan'),
                mean if n_valid > 0 else float('nan'),
                std,
                q01,q05,q25,q50,q75,q95,q99
            ]))

            # Ensure parent exists (it does), then write both files atomically
            tmp_missing = out_missing.with_suffix(out_missing.suffix + ".tmp")
            tmp_stats   = out_stats.with_suffix(out_stats.suffix + ".tmp")

            with open(tmp_missing, "w", encoding="utf-8", newline="") as fo:
                fo.write(missing_header + "\n")
                fo.write(missing_row + "\n")
            with open(tmp_stats, "w", encoding="utf-8", newline="") as fo:
                fo.write(stats_header + "\n")
                fo.write(stats_row + "\n")

            os.replace(tmp_missing, out_missing)
            os.replace(tmp_stats, out_stats)

            print(f"OK  -> {out_missing.name} and {out_stats.name} for {machine}/{new_meas_name}")

    print("\n" + "-" * 100)
    print("Done.")

# -------- Example usage --------
# TEST (no writes): just shows renames/paths it would create
build_validation_files(
    root_data_dir=r"D:\EnergyDataset\dataset_clean",
    validation_root_dir=r"D:\EnergyDataset\dataset_clean_validation",
    old_meas_name="PF_total_vec",
    new_meas_name="PF_total",
    step_seconds=5,
    test=True,
)

# REAL (writes validation CSVs, renaming existing validation dirs to *_old* first):
# build_validation_files(
#     root_data_dir=r"D:\EnergyDataset\dataset_clean",
#     validation_root_dir=r"D:\EnergyDataset\dataset_clean_validation",
#     old_meas_name="PF_total_vec",
#     new_meas_name="PF_total",
#     step_seconds=5,
#     test=False,
# )


DRY-RUN: Building validation for 'PF_total'
----------------------------------------------------------------------------------------------------
[TEST] Validation dir exists for EPI_ChipPress/PF_total: D:\EnergyDataset\dataset_clean_validation\EPI_ChipPress\PF_total
[TEST] Would rename -> D:\EnergyDataset\dataset_clean_validation\EPI_ChipPress\PF_total_old

[TEST] Would create:
  MISSING -> D:\EnergyDataset\dataset_clean_validation\EPI_ChipPress\PF_total\2018_PF_total_missing.csv
  STATS   -> D:\EnergyDataset\dataset_clean_validation\EPI_ChipPress\PF_total\2018_PF_total_stats.csv

[TEST] Would create:
  MISSING -> D:\EnergyDataset\dataset_clean_validation\EPI_ChipPress\PF_total\2019_PF_total_missing.csv
  STATS   -> D:\EnergyDataset\dataset_clean_validation\EPI_ChipPress\PF_total\2019_PF_total_stats.csv

[TEST] Would create:
  MISSING -> D:\EnergyDataset\dataset_clean_validation\EPI_ChipPress\PF_total\2020_PF_total_missing.csv
  STATS   -> D:\EnergyDataset\dataset_clean_validation\EPI_

In [15]:
import pandas as pd
df = pd.read_csv(r"../dataset_clean\TEC_E30D2\PF_total\2024_PF_total.csv.xz")
df

Unnamed: 0,WsDateTime,PF_total
0,2024-01-01 00:00:00,-0.065
1,2024-01-01 00:00:05,-0.064
2,2024-01-01 00:00:10,-0.064
3,2024-01-01 00:00:15,-0.064
4,2024-01-01 00:00:20,-0.064
...,...,...
6324475,2024-12-31 23:59:35,-0.065
6324476,2024-12-31 23:59:40,-0.065
6324477,2024-12-31 23:59:45,-0.065
6324478,2024-12-31 23:59:50,-0.065


In [19]:
import pandas as pd
df = pd.read_csv(r"../dataset_clean\EPI_ChipPress\PF_total\2018_PF_total.csv.xz")
df

Unnamed: 0,WsDateTime,PF_total
0,2018-01-01 00:00:00,1.0
1,2018-01-01 00:00:05,1.0
2,2018-01-01 00:00:10,1.0
3,2018-01-01 00:00:15,1.0
4,2018-01-01 00:00:20,1.0
...,...,...
6307195,2018-12-31 23:59:35,
6307196,2018-12-31 23:59:40,
6307197,2018-12-31 23:59:45,
6307198,2018-12-31 23:59:50,


In [16]:
import os
import csv
import glob
import lzma
import shutil
from typing import List, Tuple

def rename_pf_total_vec(
    root_clean: str = "../dataset_clean",
    root_stats: str = "../dataset_clean_validation",
    backup_clean_root: str = "./backup_clean_dirs",
    backup_stats_root: str = "./backup_validation_dirs",
    dry_run: bool = True,
) -> None:
    """
    Rename measurement PF_total_vec -> PF_total across dataset_clean and dataset_clean_validation.

    What happens per machine that has <root_clean>/<machine>/PF_total_vec:
      1) BACKUP: move the entire measurement folder (both trees) into backup roots
         (mirror <machine>/<measurement>), with automatic _bakN suffix if needed.
      2) DATA: create <root_clean>/<machine>/PF_total and write an xz-compressed CSV with the
         header 'PF_total_vec' -> 'PF_total' (streamed; does not load whole file).
      3) VALIDATION: create <root_stats>/<machine>/PF_total; copy the two CSVs while updating
         their file_path column to point to dataset_clean/<machine>/PF_total/2024_PF_total.csv.xz
         and rename the files to ..._PF_total_missing.csv / ..._PF_total_stats.csv.

    Test mode:
      - If dry_run=True, nothing is moved or written; you get a precise preview of all planned ops.
    """
    # --- helpers --------------------------------------------------------------
    def _unique_dest_dir(dest_dir: str) -> str:
        if not os.path.exists(dest_dir):
            return dest_dir
        parent = os.path.dirname(dest_dir)
        base = os.path.basename(dest_dir)
        k = 1
        while True:
            cand = os.path.join(parent, f"{base}_bak{k}")
            if not os.path.exists(cand):
                return cand
            k += 1

    def _move_dir(src_dir: str, backup_root: str, dry: bool) -> Tuple[bool, str]:
        """Move whole dir to backup_root/<machine>/<measurement[_bakN]>. Return (planned/moved, dest_path)."""
        if not os.path.isdir(src_dir):
            return False, ""
        machine = os.path.basename(os.path.dirname(src_dir))
        meas = os.path.basename(src_dir)
        dest_parent = os.path.join(backup_root, machine)
        os.makedirs(dest_parent, exist_ok=True)
        dest_dir = _unique_dest_dir(os.path.join(dest_parent, meas))
        if dry:
            print(f"[DRY] MOVE DIR  {src_dir}  ->  {dest_dir}")
            return True, dest_dir
        try:
            shutil.move(src_dir, dest_dir)
            print(f"[OK ] MOVED DIR {src_dir}  ->  {dest_dir}")
            return True, dest_dir
        except Exception as e:
            print(f"[FAIL] MOVE DIR {src_dir} (error: {e})")
            return False, ""

    def _ensure_dir(path: str, dry: bool):
        if dry:
            print(f"[DRY] MKDIR     {path}")
        else:
            os.makedirs(path, exist_ok=True)

    def _write_xz_stream_with_header_rename(src_xz: str, dst_xz: str, old_col: str, new_col: str, dry: bool):
        if dry:
            print(f"[DRY] REWRITE XZ HEADER {src_xz} -> {dst_xz} ({old_col} -> {new_col})")
            return
        with lzma.open(src_xz, mode="rt", newline="") as fin, lzma.open(dst_xz, mode="wt", newline="") as fout:
            reader = csv.reader(fin)
            writer = csv.writer(fout)
            try:
                header = next(reader)
            except StopIteration:
                writer.writerow([])
                return
            header = [new_col if h == old_col else h for h in header]
            writer.writerow(header)
            for row in reader:
                writer.writerow(row)

    def _stream_rewrite_csv_file_path(src_csv: str, dst_csv: str, dry: bool):
        """
        Rewrites the 'file_path' column inside validation CSVs so that:
          .../dataset_clean/<machine>/PF_total_vec/2024_PF_total_vec.csv.xz
        becomes:
          .../dataset_clean/<machine>/PF_total/2024_PF_total.csv.xz

        Works regardless of slash-style (handles / and \).
        """
        if dry:
            print(f"[DRY] REWRITE CSV file_path {src_csv} -> {dst_csv}")
            return

        def replace_fp(val: str) -> str:
            # replace filename first, then the measurement segment
            out = val.replace("2024_PF_total_vec.csv.xz", "2024_PF_total.csv.xz")
            out = out.replace("\\PF_total_vec\\", "\\PF_total\\").replace("/PF_total_vec/", "/PF_total/")
            # fallbacks if edge cases
            out = out.replace("PF_total_vec/2024_PF_total.csv.xz", "PF_total/2024_PF_total.csv.xz")
            out = out.replace("PF_total_vec\\2024_PF_total.csv.xz", "PF_total\\2024_PF_total.csv.xz")
            return out

        with open(src_csv, "r", newline="") as fin, open(dst_csv, "w", newline="") as fout:
            reader = csv.DictReader(fin)
            fieldnames = reader.fieldnames or []
            writer = csv.DictWriter(fout, fieldnames=fieldnames)
            writer.writeheader()
            if "file_path" not in fieldnames:
                for row in reader:
                    writer.writerow(row)
                return
            for row in reader:
                row["file_path"] = replace_fp(row.get("file_path", ""))
                writer.writerow(row)

    # --- sanity checks --------------------------------------------------------
    if not os.path.isdir(root_clean):
        print(f"[ERROR] Data root not found: {root_clean}")
        return
    if not os.path.isdir(root_stats):
        print(f"[ERROR] Validation root not found: {root_stats}")
        return

    os.makedirs(backup_clean_root, exist_ok=True)
    os.makedirs(backup_stats_root, exist_ok=True)

    # discover machines that actually have PF_total_vec in dataset_clean
    machines: List[str] = sorted(
        d for d in os.listdir(root_clean)
        if os.path.isdir(os.path.join(root_clean, d)) and os.path.isdir(os.path.join(root_clean, d, "PF_total_vec"))
    )

    if not machines:
        print("[INFO] No machines with PF_total_vec found.")
        return

    total_done = 0
    for machine in machines:
        print(f"\n=== MACHINE {machine} ===")
        data_src_meas_dir = os.path.join(root_clean, machine, "PF_total_vec")
        stats_src_meas_dir = os.path.join(root_stats, machine, "PF_total_vec")

        # 1) Backup moves (or planned moves)
        moved_clean, clean_backup_dir = (False, "")
        moved_stats, stats_backup_dir = (False, "")

        if os.path.isdir(data_src_meas_dir):
            moved_clean, clean_backup_dir = _move_dir(data_src_meas_dir, backup_clean_root, dry_run)
        else:
            print(f"[WARN] No data measurement dir: {data_src_meas_dir}")

        if os.path.isdir(stats_src_meas_dir):
            moved_stats, stats_backup_dir = _move_dir(stats_src_meas_dir, backup_stats_root, dry_run)
        else:
            print(f"[WARN] No validation measurement dir: {stats_src_meas_dir}")

        if not moved_clean and not moved_stats:
            print("[INFO] Nothing to process for this machine.")
            continue

        # 2) DATA: Create PF_total folder and plan/perform rewrite
        data_dst_meas_dir = os.path.join(root_clean, machine, "PF_total")
        _ensure_dir(data_dst_meas_dir, dry_run)

        # Where to scan for source .xz? If dry_run, source is the original dir (not yet moved).
        # If real run, source is the backup dir (already moved).
        clean_scan_dir = data_src_meas_dir if dry_run else clean_backup_dir
        xz_candidates = []
        if clean_scan_dir:
            xz_candidates = sorted(glob.glob(os.path.join(clean_scan_dir, "*PF_total_vec*.csv.xz")))
        if not xz_candidates:
            # If still nothing, just inform (but don't warn in dry run if folder likely has files)
            msg_tag = "[INFO]" if dry_run else "[WARN]"
            print(f"{msg_tag} No .csv.xz found in {'source' if dry_run else 'backup'} data dir: {clean_scan_dir}")
        for src_xz in xz_candidates:
            base = os.path.basename(src_xz)
            if "_PF_total_vec" in base:
                dst_base = base.replace("_PF_total_vec", "_PF_total").replace("PF_total_vec.csv.xz", "PF_total.csv.xz")
            else:
                dst_base = base.replace("PF_total_vec", "PF_total")
            dst_xz = os.path.join(data_dst_meas_dir, dst_base)
            _write_xz_stream_with_header_rename(
                src_xz, dst_xz, old_col="PF_total_vec", new_col="PF_total", dry=dry_run
            )

        # 3) VALIDATION: Create PF_total folder and plan/perform CSV rewrites + renames
        stats_dst_meas_dir = os.path.join(root_stats, machine, "PF_total")
        _ensure_dir(stats_dst_meas_dir, dry_run)

        stats_scan_dir = stats_src_meas_dir if dry_run else stats_backup_dir
        csv_files = []
        if stats_scan_dir:
            csv_files = sorted(glob.glob(os.path.join(stats_scan_dir, "*.csv")))
        if not csv_files:
            msg_tag = "[INFO]" if dry_run else "[WARN]"
            print(f"{msg_tag} No .csv found in {'source' if dry_run else 'backup'} validation dir: {stats_scan_dir}")
        for src_csv in csv_files:
            base = os.path.basename(src_csv)
            dst_base = (base
                        .replace("_PF_total_vec_", "_PF_total_")
                        .replace("PF_total_vec_", "PF_total_")
                        .replace("_PF_total_vec", "_PF_total")
                        .replace("PF_total_vec", "PF_total"))
            dst_csv = os.path.join(stats_dst_meas_dir, dst_base)
            _stream_rewrite_csv_file_path(src_csv, dst_csv, dry=dry_run)

        total_done += 1

    print(f"\n[SUMMARY] Machines processed: {total_done} | dry_run={dry_run}")


# --- Example execution ---
# Test mode (no changes, full preview):
rename_pf_total_vec(
    root_clean="../dataset_clean",
    root_stats="../dataset_clean_validation",
    backup_clean_root="./backup_clean_dirs",
    backup_stats_root="./backup_validation_dirs",
    dry_run=False,
)



=== MACHINE EPI_ChipPress ===
[OK ] MOVED DIR ../dataset_clean\EPI_ChipPress\PF_total_vec  ->  ./backup_clean_dirs\EPI_ChipPress\PF_total_vec_bak2
[OK ] MOVED DIR ../dataset_clean_validation\EPI_ChipPress\PF_total_vec  ->  ./backup_validation_dirs\EPI_ChipPress\PF_total_vec_bak2

=== MACHINE EPI_ChipSaw ===
[OK ] MOVED DIR ../dataset_clean\EPI_ChipSaw\PF_total_vec  ->  ./backup_clean_dirs\EPI_ChipSaw\PF_total_vec
[OK ] MOVED DIR ../dataset_clean_validation\EPI_ChipSaw\PF_total_vec  ->  ./backup_validation_dirs\EPI_ChipSaw\PF_total_vec

=== MACHINE EPI_HighTempOven ===
[OK ] MOVED DIR ../dataset_clean\EPI_HighTempOven\PF_total_vec  ->  ./backup_clean_dirs\EPI_HighTempOven\PF_total_vec
[OK ] MOVED DIR ../dataset_clean_validation\EPI_HighTempOven\PF_total_vec  ->  ./backup_validation_dirs\EPI_HighTempOven\PF_total_vec

=== MACHINE EPI_PumpStation1 ===
[OK ] MOVED DIR ../dataset_clean\EPI_PumpStation1\PF_total_vec  ->  ./backup_clean_dirs\EPI_PumpStation1\PF_total_vec
[OK ] MOVED DIR ../d

KeyboardInterrupt: 

### Voltage Harmonics and THD

In [None]:
import os
import re
import shutil
import pandas as pd
import numpy as np

def rescale_U_THD(measurement: str, root: str = "../dataset_clean", backup_dir: str = "./backup_original_csv_xz", dry_run: bool = False):
    """
    For all machines in `root` whose folder name contains 'TEC' (case-insensitive) and NOT 'PV',
    find ALL years available for the given `measurement` (e.g., 'THD_U1', 'THD_U2'), and
    divide the first numeric column by 10 (e.g., 400 -> 40), cast to float32, and save back
    to the ORIGINAL PATH with the same filename & compression (.csv.xz), after first MOVING
    the original file to a backup folder in the CURRENT directory.

    Guarantees:
      - Never overwrites: if a backup path already exists, it appends a numeric suffix.
      - New file uses the EXACT same relative path/filename under `root`, same column names,
        same compression ("xz"), but with rescaled first numeric column (float32).
      - If `dry_run=True`, it only prints what it *would* do.

    Prints a log line per file processed.
    """
    if not os.path.isdir(root):
        print(f"[ERROR] Root not found: {root}")
        return

    # discover TEC (non-PV) machines
    machines = sorted(
        d for d in os.listdir(root)
        if os.path.isdir(os.path.join(root, d)) and ("tec" in d.lower()) and ("pv" not in d.lower())
    )

    if not machines:
        print("[INFO] No TEC machines found (excluding PV).")
        return

    os.makedirs(backup_dir, exist_ok=True)
    pat = re.compile(rf"^(\d{{4}})_{re.escape(measurement)}\.csv\.xz$", re.IGNORECASE)

    processed = 0
    skipped = 0
    errors = 0

    for m in machines:
        m_dir = os.path.join(root, m, measurement)
        if not os.path.isdir(m_dir):
            continue

        for fname in os.listdir(m_dir):
            if not pat.match(fname):
                continue

            src_path = os.path.join(m_dir, fname)

            # Read CSV
            try:
                df = pd.read_csv(src_path)
            except Exception as e:
                print(f"[SKIP] {src_path}  (read error: {e})")
                errors += 1
                continue

            num = df.select_dtypes("number")
            if num.empty:
                print(f"[SKIP] {src_path}  (no numeric columns)")
                skipped += 1
                continue

            # Identify first numeric column name
            col = num.columns[0]

            # Prepare backup destination (unique, do not overwrite)
            # Mirror relative path under backup_dir for clarity (optional)
            rel_path = os.path.relpath(src_path, start=os.getcwd())
            safe_rel = rel_path.replace("..", "__up__")
            backup_subdir = os.path.join(backup_dir, os.path.dirname(safe_rel))
            os.makedirs(backup_subdir, exist_ok=True)
            backup_path = os.path.join(backup_subdir, os.path.basename(src_path))

            # If backup_path exists, add suffix
            if os.path.exists(backup_path):
                base, ext = os.path.splitext(backup_path)  # ext is ".xz"
                base_csv, _ = os.path.splitext(base)       # remove ".csv" -> base_csv
                k = 1
                while True:
                    candidate = f"{base_csv}_bak{k}.csv.xz"
                    if not os.path.exists(candidate):
                        backup_path = candidate
                        break
                    k += 1

            # Build new DataFrame with rescaled values (÷10), float32
            df_new = df.copy()
            try:
                scaled = (df_new[col].astype(float) / 10.0).astype(np.float32)
                df_new[col] = scaled
            except Exception as e:
                print(f"[SKIP] {src_path}  (scaling error in column '{col}': {e})")
                errors += 1
                continue

            # Move original to backup, then write new file at original path
            try:
                if dry_run:
                    print(f"[DRY] MOVE  {src_path}  ->  {backup_path}")
                    print(f"[DRY] WRITE {src_path}  (rescaled {measurement}.{col} ÷10, float32, compression='xz')")
                    try:
                        old_max = float(pd.to_numeric(df[col], errors='coerce').max())
                        new_max = float(pd.to_numeric(df_new[col], errors='coerce').max())
                        print(f"OLD max {old_max} -> NEW max {new_max}")
                    except Exception:
                        pass
                else:
                    os.makedirs(os.path.dirname(backup_path), exist_ok=True)
                    shutil.move(src_path, backup_path)
                    df_new.to_csv(src_path, index=False, compression="xz")

                print(f"[OK]   {src_path}  | col='{col}'  ÷10  float32  (backup: {backup_path})")
                processed += 1
            except Exception as e:
                print(f"[FAIL] {src_path}  (I/O error: {e})")
                errors += 1
                # Attempt to restore original if move already happened and write failed
                if not dry_run and (not os.path.exists(src_path)) and os.path.exists(backup_path):
                    try:
                        shutil.move(backup_path, src_path)
                        print(f"[RESTORE] Original restored from backup for {src_path}")
                    except Exception as e2:
                        print(f"[RESTORE-FAIL] Could not restore original for {src_path}: {e2}")

    print(f"\nSummary: processed={processed}, skipped={skipped}, errors={errors}")


# --- Example usage (uncomment to run) ---
rescale_needed = [
    "THD_U1", "THD_U12", "THD_U2", "THD_U23", "THD_U3", "THD_U31",
    "U1_h2", "U1_h3","U1_h4","U1_h5", "U12_h2", "U12_h3","U12_h4","U12_h5",
    "U2_h2", "U2_h3","U2_h4","U2_h5","U23_h2", "U23_h3","U23_h4","U23_h5",
    "U3_h2", "U3_h3","U3_h4","U3_h5","U31_h2", "U31_h3","U31_h4","U31_h5",
                  ]
for meas in rescale_needed:
    rescale_U_THD(meas, root="../dataset_clean", backup_dir="./backup_original_csv_xz", dry_run=True)


[DRY] MOVE  ../dataset_clean\TEC_48S\THD_U1\2024_THD_U1.csv.xz  ->  ./backup_original_csv_xz\__up__\dataset_clean\TEC_48S\THD_U1\2024_THD_U1.csv.xz
[DRY] WRITE ../dataset_clean\TEC_48S\THD_U1\2024_THD_U1.csv.xz  (rescaled THD_U1.THD_U1 ÷10, float32, compression='xz')
OLD max 102.0 -> NEW max 10.199999809265137
[OK]   ../dataset_clean\TEC_48S\THD_U1\2024_THD_U1.csv.xz  | col='THD_U1'  ÷10  float32  (backup: ./backup_original_csv_xz\__up__\dataset_clean\TEC_48S\THD_U1\2024_THD_U1.csv.xz)
[DRY] MOVE  ../dataset_clean\TEC_CFST161\THD_U1\2024_THD_U1.csv.xz  ->  ./backup_original_csv_xz\__up__\dataset_clean\TEC_CFST161\THD_U1\2024_THD_U1.csv.xz
[DRY] WRITE ../dataset_clean\TEC_CFST161\THD_U1\2024_THD_U1.csv.xz  (rescaled THD_U1.THD_U1 ÷10, float32, compression='xz')
OLD max 74.0 -> NEW max 7.400000095367432
[OK]   ../dataset_clean\TEC_CFST161\THD_U1\2024_THD_U1.csv.xz  | col='THD_U1'  ÷10  float32  (backup: ./backup_original_csv_xz\__up__\dataset_clean\TEC_CFST161\THD_U1\2024_THD_U1.csv.xz)


In [28]:
import os
import re
import shutil
import pandas as pd
import numpy as np

def rescale_stats_div10_for_tec(
    measurement: str,
    root_clean: str = "../dataset_clean",
    root_stats: str = "../dataset_clean_validation",
    backup_dir: str = "./backup_stats_csv",
    dry_run: bool = False
):
    """
    For all machines in `root_clean` whose folder name contains 'TEC' (case-insensitive) and NOT 'PV',
    find ALL years available for the given `measurement` by data filenames:
        <root_clean>/<machine>/<measurement>/<year>_<measurement>.csv.xz
    then locate the corresponding stats CSV:
        <root_stats>/<machine>/<measurement>/<year>_<measurement>_stats.csv
    and divide ALL numeric stats columns by 10 (e.g., 400 -> 40). The 'file_path' column is left unchanged.

    Steps per stats file:
      1) Move the ORIGINAL stats CSV to a backup folder in the CURRENT directory (mirroring subpaths).
         - Never overwrite existing backups; add a _bakN suffix if needed.
      2) Write the UPDATED stats CSV at the original path (no compression), keeping the same columns/order.

    Notes
    -----
    - Only updates stats files that already exist; missing stats are skipped.
    - The numeric columns expected (if present) are:
        min, max, mean, std, q01, q05, q25, q50, q75, q95, q99
      Any subset found will be scaled.
    - Does not modify the 'file_path' column.
    """

    if not os.path.isdir(root_clean):
        print(f"[ERROR] Data root not found: {root_clean}")
        return
    if not os.path.isdir(root_stats):
        print(f"[ERROR] Stats root not found: {root_stats}")
        return

    # discover TEC (non-PV) machines
    machines = sorted(
        d for d in os.listdir(root_clean)
        if os.path.isdir(os.path.join(root_clean, d)) and ("tec" in d.lower()) and ("pv" not in d.lower())
    )
    if not machines:
        print("[INFO] No TEC machines found (excluding PV).")
        return

    os.makedirs(backup_dir, exist_ok=True)
    # Match data files to discover years present for this measurement
    pat = re.compile(rf"^(\d{{4}})_{re.escape(measurement)}\.csv\.xz$", re.IGNORECASE)

    processed = 0
    skipped = 0
    errors = 0

    for m in machines:
        data_dir = os.path.join(root_clean, m, measurement)
        if not os.path.isdir(data_dir):
            continue

        # Discover all years available for this measurement by data filename
        year_list = []
        for fname in os.listdir(data_dir):
            mo = pat.match(fname)
            if mo:
                year_list.append(int(mo.group(1)))
        if not year_list:
            continue

        # For each year, locate the corresponding stats file and update
        for year in sorted(set(year_list)):
            stats_path = os.path.join(root_stats, m, measurement, f"{year}_{measurement}_stats.csv")
            if not os.path.exists(stats_path):
                # No stats to update, skip quietly
                continue

            try:
                df = pd.read_csv(stats_path)
            except Exception as e:
                print(f"[SKIP] {stats_path}  (read error: {e})")
                errors += 1
                continue

            # Identify numeric stats columns present (scale any subset that exists)
            numeric_cols = ["min","max","mean","std","q01","q05","q25","q50","q75","q95","q99"]
            present_numeric = [c for c in numeric_cols if c in df.columns]

            if not present_numeric:
                print(f"[SKIP] {stats_path}  (no numeric stats columns found)")
                skipped += 1
                continue

            # Coerce to numeric (in case of strings), scale by ÷10, keep dtype float32
            df_new = df.copy()
            old_preview = {}
            new_preview = {}

            for c in present_numeric:
                series_num = pd.to_numeric(df_new[c], errors="coerce")
                old_preview[c] = series_num.max()
                series_scaled = (series_num / 10.0).astype(np.float32)
                df_new[c] = series_scaled
                new_preview[c] = df_new[c].max()

            # Prepare backup path (mirror substructure under backup_dir, unique name)
            rel = os.path.relpath(stats_path, start=os.getcwd())
            safe_rel = rel.replace("..", "__up__")
            backup_subdir = os.path.join(backup_dir, os.path.dirname(safe_rel))
            os.makedirs(backup_subdir, exist_ok=True)
            backup_path = os.path.join(backup_subdir, os.path.basename(stats_path))

            if os.path.exists(backup_path):
                base, ext = os.path.splitext(backup_path)  # ext == ".csv"
                k = 1
                while True:
                    candidate = f"{base}_bak{k}{ext}"
                    if not os.path.exists(candidate):
                        backup_path = candidate
                        break
                    k += 1

            try:
                if dry_run:
                    print(f"[DRY] MOVE  {stats_path}  ->  {backup_path}")
                    print(f"[DRY] WRITE {stats_path}  (scale ÷10 on {len(present_numeric)} columns: {present_numeric})")
                    # show a quick before/after max preview on a couple of columns
                    show_cols = present_numeric[:3]
                    preview = ", ".join([f"{c}: {old_preview[c]} → {new_preview[c]}" for c in show_cols])
                    print(f"[DRY] PREVIEW max: {preview}")
                else:
                    # Move original stats to backup (do not overwrite existing backup)
                    shutil.move(stats_path, backup_path)
                    # Write updated stats back to original path (no compression)
                    df_new.to_csv(stats_path, index=False)

                print(f"[OK]   {stats_path}  | scaled columns={present_numeric} (backup: {backup_path})")
                processed += 1

            except Exception as e:
                print(f"[FAIL] {stats_path}  (I/O error: {e})")
                errors += 1
                # Try to restore if move happened but write failed
                if not dry_run and (not os.path.exists(stats_path)) and os.path.exists(backup_path):
                    try:
                        shutil.move(backup_path, stats_path)
                        print(f"[RESTORE] Original restored for {stats_path}")
                    except Exception as e2:
                        print(f"[RESTORE-FAIL] Could not restore original for {stats_path}: {e2}")

    print(f"\nSummary (stats ÷10): processed={processed}, skipped={skipped}, errors={errors}")

# --- Example execution (uncomment to run) ---
rescale_needed = [
    "THD_I1", "THD_I2", "THD_I3", "THD_IN",
    "I1_h2", "I1_h3", "I1_h4", "I1_h5", "I1_h7", "I1_h9", "I1_h11", "I1_h13", "I1_h15", "I1_h17", "I1_h19", "I1_h21", "I1_h23", "I1_h25", "I1_h27", "I1_h29", "I1_h31",
    "I2_h2", "I2_h3", "I2_h4", "I2_h5", "I2_h7", "I2_h9", "I2_h11", "I2_h13", "I2_h15", "I2_h17", "I2_h19", "I2_h21", "I2_h23", "I2_h25", "I2_h27", "I2_h29", "I2_h31",
    "I3_h2", "I3_h3", "I3_h4", "I3_h5", "I3_h7", "I3_h9", "I3_h11", "I3_h13", "I3_h15", "I3_h17", "I3_h19", "I3_h21", "I3_h23", "I3_h25", "I3_h27", "I3_h29", "I3_h31",
    "IN_h2", "IN_h3", "IN_h4", "IN_h5", "IN_h7", "IN_h9", "IN_h11", "IN_h13", "IN_h15", "IN_h17", "IN_h19", "IN_h21", "IN_h23", "IN_h25", "IN_h27", "IN_h29", "IN_h31",
]
for meas in rescale_needed:
    rescale_stats_div10_for_tec(meas, root_clean="../dataset_clean", root_stats="../dataset_clean_validation", backup_dir="./backup_stats_csv", dry_run=True)

[DRY] MOVE  ../dataset_clean_validation\TEC_48S\THD_I1\2024_THD_I1_stats.csv  ->  ./backup_stats_csv\__up__\dataset_clean_validation\TEC_48S\THD_I1\2024_THD_I1_stats.csv
[DRY] WRITE ../dataset_clean_validation\TEC_48S\THD_I1\2024_THD_I1_stats.csv  (scale ÷10 on 11 columns: ['min', 'max', 'mean', 'std', 'q01', 'q05', 'q25', 'q50', 'q75', 'q95', 'q99'])
[DRY] PREVIEW max: min: 0.0 → 0.0, max: 99.3 → 9.930000305175781, mean: 1.1162829 → 0.11162828654050827
[OK]   ../dataset_clean_validation\TEC_48S\THD_I1\2024_THD_I1_stats.csv  | scaled columns=['min', 'max', 'mean', 'std', 'q01', 'q05', 'q25', 'q50', 'q75', 'q95', 'q99'] (backup: ./backup_stats_csv\__up__\dataset_clean_validation\TEC_48S\THD_I1\2024_THD_I1_stats.csv)
[DRY] MOVE  ../dataset_clean_validation\TEC_CFST161\THD_I1\2024_THD_I1_stats.csv  ->  ./backup_stats_csv\__up__\dataset_clean_validation\TEC_CFST161\THD_I1\2024_THD_I1_stats.csv
[DRY] WRITE ../dataset_clean_validation\TEC_CFST161\THD_I1\2024_THD_I1_stats.csv  (scale ÷10 on 1

### Current THD and Harmoncis

In [1]:
import os
import re
import shutil
import pandas as pd
import numpy as np

def rescale_I_THD_for_tec(
    measurement: str,
    root: str = "../dataset_clean",
    backup_dir: str = "./backup_original_csv_xz",
    dry_run: bool = False
):
    """
    For all machines in `root` whose folder name contains 'EPI' or 'TEC' (case-insensitive) and NOT 'PV',
    find ALL years available for the given `measurement` (e.g., 'THD_U1', 'THD_U2'), and
    divide the first numeric column by 10 (e.g., 400 -> 40), cast to float32, and save back
    to the ORIGINAL PATH with the same filename & compression (.csv.xz), after first MOVING
    the original file to a backup folder in the CURRENT directory.

    Guarantees:
      - Never overwrites: if a backup path already exists, it appends a numeric suffix.
      - New file uses the EXACT same relative path/filename under `root`, same column names,
        same compression ("xz"), but with rescaled first numeric column (float32).
      - If `dry_run=True`, it only prints what it *would* do.

    Prints a log line per file processed.
    """
    if not os.path.isdir(root):
        print(f"[ERROR] Root not found: {root}")
        return

    # discover EPI or TEC (non-PV) machines
    machines = sorted(
        d for d in os.listdir(root)
        if os.path.isdir(os.path.join(root, d))
        and ("pv" not in d.lower())
        and ("TEC_MV2400R" in d.lower())
    )

    if not machines:
        print("[INFO] No EPI/TEC machines found (excluding PV).")
        return

    os.makedirs(backup_dir, exist_ok=True)
    pat = re.compile(rf"^(\d{{4}})_{re.escape(measurement)}\.csv\.xz$", re.IGNORECASE)

    processed = 0
    skipped = 0
    errors = 0

    for m in machines:
        m_dir = os.path.join(root, m, measurement)
        if not os.path.isdir(m_dir):
            continue

        for fname in os.listdir(m_dir):
            if not pat.match(fname):
                continue

            src_path = os.path.join(m_dir, fname)

            # Read CSV
            try:
                df = pd.read_csv(src_path)
            except Exception as e:
                print(f"[SKIP] {src_path}  (read error: {e})")
                errors += 1
                continue

            num = df.select_dtypes("number")
            if num.empty:
                print(f"[SKIP] {src_path}  (no numeric columns)")
                skipped += 1
                continue

            # Identify first numeric column name
            col = num.columns[0]

            # Prepare backup destination (unique, do not overwrite)
            # Mirror relative path under backup_dir for clarity (optional)
            rel_path = os.path.relpath(src_path, start=os.getcwd())
            safe_rel = rel_path.replace("..", "__up__")
            backup_subdir = os.path.join(backup_dir, os.path.dirname(safe_rel))
            os.makedirs(backup_subdir, exist_ok=True)
            backup_path = os.path.join(backup_subdir, os.path.basename(src_path))

            # If backup_path exists, add suffix
            if os.path.exists(backup_path):
                base, ext = os.path.splitext(backup_path)  # ext is ".xz"
                base_csv, _ = os.path.splitext(base)       # remove ".csv" -> base_csv
                k = 1
                while True:
                    candidate = f"{base_csv}_bak{k}.csv.xz"
                    if not os.path.exists(candidate):
                        backup_path = candidate
                        break
                    k += 1

            # Build new DataFrame with rescaled values (÷10), float32
            df_new = df.copy()
            try:
                scaled = (df_new[col].astype(float) / 10.0).astype(np.float32)
                df_new[col] = scaled
            except Exception as e:
                print(f"[SKIP] {src_path}  (scaling error in column '{col}': {e})")
                errors += 1
                continue

            # Move original to backup, then write new file at original path
            try:
                if dry_run:
                    print(f"[DRY] MOVE  {src_path}  ->  {backup_path}")
                    print(f"[DRY] WRITE {src_path}  (rescaled {measurement}.{col} ÷10, float32, compression='xz')")
                    try:
                        old_max = float(pd.to_numeric(df[col], errors='coerce').max())
                        new_max = float(pd.to_numeric(df_new[col], errors='coerce').max())
                        print(f"OLD max {old_max} -> NEW max {new_max}")
                    except Exception:
                        pass
                else:
                    os.makedirs(os.path.dirname(backup_path), exist_ok=True)
                    shutil.move(src_path, backup_path)
                    df_new.to_csv(src_path, index=False, compression="xz")

                print(f"[OK]   {src_path}  | col='{col}'  ÷10  float32  (backup: {backup_path})")
                processed += 1
            except Exception as e:
                print(f"[FAIL] {src_path}  (I/O error: {e})")
                errors += 1
                # Attempt to restore original if move already happened and write failed
                if not dry_run and (not os.path.exists(src_path)) and os.path.exists(backup_path):
                    try:
                        shutil.move(backup_path, src_path)
                        print(f"[RESTORE] Original restored from backup for {src_path}")
                    except Exception as e2:
                        print(f"[RESTORE-FAIL] Could not restore original for {src_path}: {e2}")

    print(f"\nSummary: processed={processed}, skipped={skipped}, errors={errors}")

# --- Example usage (uncomment to run) ---
rescale_needed = [
    "THD_I1", "THD_I2", "THD_I3", "THD_IN",
    "I1_h2", "I1_h3", "I1_h4", "I1_h5", "I1_h7", "I1_h9", "I1_h11", "I1_h13", "I1_h15", "I1_h17", "I1_h19", "I1_h21", "I1_h23", "I1_h25", "I1_h27", "I1_h29", "I1_h31",
    "I2_h2", "I2_h3", "I2_h4", "I2_h5", "I2_h7", "I2_h9", "I2_h11", "I2_h13", "I2_h15", "I2_h17", "I2_h19", "I2_h21", "I2_h23", "I2_h25", "I2_h27", "I2_h29", "I2_h31",
    "I3_h2", "I3_h3", "I3_h4", "I3_h5", "I3_h7", "I3_h9", "I3_h11", "I3_h13", "I3_h15", "I3_h17", "I3_h19", "I3_h21", "I3_h23", "I3_h25", "I3_h27", "I3_h29", "I3_h31",
    "IN_h2", "IN_h3", "IN_h4", "IN_h5", "IN_h7", "IN_h9", "IN_h11", "IN_h13", "IN_h15", "IN_h17", "IN_h19", "IN_h21", "IN_h23", "IN_h25", "IN_h27", "IN_h29", "IN_h31",
]
for meas in rescale_needed:
    rescale_I_THD_for_tec(meas, root="../dataset_clean", backup_dir="./backup_original_csv_xz", dry_run=False)


[OK]   ../dataset_clean\TEC_48S\THD_I1\2024_THD_I1.csv.xz  | col='THD_I1'  ÷10  float32  (backup: ./backup_original_csv_xz\__up__\dataset_clean\TEC_48S\THD_I1\2024_THD_I1.csv.xz)
[OK]   ../dataset_clean\TEC_CFST161\THD_I1\2024_THD_I1.csv.xz  | col='THD_I1'  ÷10  float32  (backup: ./backup_original_csv_xz\__up__\dataset_clean\TEC_CFST161\THD_I1\2024_THD_I1.csv.xz)
[OK]   ../dataset_clean\TEC_CTX800TC\THD_I1\2024_THD_I1.csv.xz  | col='THD_I1'  ÷10  float32  (backup: ./backup_original_csv_xz\__up__\dataset_clean\TEC_CTX800TC\THD_I1\2024_THD_I1.csv.xz)
[OK]   ../dataset_clean\TEC_Chiron800\THD_I1\2024_THD_I1.csv.xz  | col='THD_I1'  ÷10  float32  (backup: ./backup_original_csv_xz\__up__\dataset_clean\TEC_Chiron800\THD_I1\2024_THD_I1.csv.xz)
[OK]   ../dataset_clean\TEC_DMF3008\THD_I1\2024_THD_I1.csv.xz  | col='THD_I1'  ÷10  float32  (backup: ./backup_original_csv_xz\__up__\dataset_clean\TEC_DMF3008\THD_I1\2024_THD_I1.csv.xz)
[OK]   ../dataset_clean\TEC_DMU125MB\THD_I1\2024_THD_I1.csv.xz  | c

In [2]:
import os
import re
import shutil
import pandas as pd
import numpy as np

def rescale_stats_div10_for_tec(
    measurement: str,
    root_clean: str = "../dataset_clean",
    root_stats: str = "../dataset_clean_validation",
    backup_dir: str = "./backup_stats_csv",
    dry_run: bool = False
):
    """
    For all machines in `root_clean` whose folder name contains 'EPI' or 'TEC' (case-insensitive) and NOT 'PV',
    find ALL years available for the given `measurement` by data filenames:
        <root_clean>/<machine>/<measurement>/<year>_<measurement>.csv.xz
    then locate the corresponding stats CSV:
        <root_stats>/<machine>/<measurement>/<year>_<measurement>_stats.csv
    and divide ALL numeric stats columns by 10 (e.g., 400 -> 40). The 'file_path' column is left unchanged.

    Steps per stats file:
      1) Move the ORIGINAL stats CSV to a backup folder in the CURRENT directory (mirroring subpaths).
         - Never overwrite existing backups; add a _bakN suffix if needed.
      2) Write the UPDATED stats CSV at the original path (no compression), keeping the same columns/order.

    Notes
    -----
    - Only updates stats files that already exist; missing stats are skipped.
    - The numeric columns expected (if present) are:
        min, max, mean, std, q01, q05, q25, q50, q75, q95, q99
      Any subset found will be scaled.
    - Does not modify the 'file_path' column.
    """

    if not os.path.isdir(root_clean):
        print(f"[ERROR] Data root not found: {root_clean}")
        return
    if not os.path.isdir(root_stats):
        print(f"[ERROR] Stats root not found: {root_stats}")
        return

    # discover EPI/TEC (non-PV) machines
    machines = sorted(
        d for d in os.listdir(root_clean)
        if os.path.isdir(os.path.join(root_clean, d))
        and ("pv" not in d.lower())
        and ("tec" in d.lower())
    )
    if not machines:
        print("[INFO] No EPI/TEC machines found (excluding PV).")
        return

    os.makedirs(backup_dir, exist_ok=True)
    # Match data files to discover years present for this measurement
    pat = re.compile(rf"^(\d{{4}})_{re.escape(measurement)}\.csv\.xz$", re.IGNORECASE)

    processed = 0
    skipped = 0
    errors = 0

    for m in machines:
        data_dir = os.path.join(root_clean, m, measurement)
        if not os.path.isdir(data_dir):
            continue

        # Discover all years available for this measurement by data filename
        year_list = []
        for fname in os.listdir(data_dir):
            mo = pat.match(fname)
            if mo:
                year_list.append(int(mo.group(1)))
        if not year_list:
            continue

        # For each year, locate the corresponding stats file and update
        for year in sorted(set(year_list)):
            stats_path = os.path.join(root_stats, m, measurement, f"{year}_{measurement}_stats.csv")
            if not os.path.exists(stats_path):
                # No stats to update, skip quietly
                continue

            try:
                df = pd.read_csv(stats_path)
            except Exception as e:
                print(f"[SKIP] {stats_path}  (read error: {e})")
                errors += 1
                continue

            # Identify numeric stats columns present (scale any subset that exists)
            numeric_cols = ["min","max","mean","std","q01","q05","q25","q50","q75","q95","q99"]
            present_numeric = [c for c in numeric_cols if c in df.columns]

            if not present_numeric:
                print(f"[SKIP] {stats_path}  (no numeric stats columns found)")
                skipped += 1
                continue

            # Coerce to numeric (in case of strings), scale by ÷10, keep dtype float32
            df_new = df.copy()
            old_preview = {}
            new_preview = {}

            for c in present_numeric:
                series_num = pd.to_numeric(df_new[c], errors="coerce")
                old_preview[c] = series_num.max()
                series_scaled = (series_num / 10.0).astype(np.float32)
                df_new[c] = series_scaled
                new_preview[c] = df_new[c].max()

            # Prepare backup path (mirror substructure under backup_dir, unique name)
            rel = os.path.relpath(stats_path, start=os.getcwd())
            safe_rel = rel.replace("..", "__up__")
            backup_subdir = os.path.join(backup_dir, os.path.dirname(safe_rel))
            os.makedirs(backup_subdir, exist_ok=True)
            backup_path = os.path.join(backup_subdir, os.path.basename(stats_path))

            if os.path.exists(backup_path):
                base, ext = os.path.splitext(backup_path)  # ext == ".csv"
                k = 1
                while True:
                    candidate = f"{base}_bak{k}{ext}"
                    if not os.path.exists(candidate):
                        backup_path = candidate
                        break
                    k += 1

            try:
                if dry_run:
                    print(f"[DRY] MOVE  {stats_path}  ->  {backup_path}")
                    print(f"[DRY] WRITE {stats_path}  (scale ÷10 on {len(present_numeric)} columns: {present_numeric})")
                    # show a quick before/after max preview on a couple of columns
                    show_cols = present_numeric[:3]
                    preview = ", ".join([f"{c}: {old_preview[c]} → {new_preview[c]}" for c in show_cols])
                    print(f"[DRY] PREVIEW max: {preview}")
                else:
                    # Move original stats to backup (do not overwrite existing backup)
                    shutil.move(stats_path, backup_path)
                    # Write updated stats back to original path (no compression)
                    df_new.to_csv(stats_path, index=False)

                print(f"[OK]   {stats_path}  | scaled columns={present_numeric} (backup: {backup_path})")
                processed += 1

            except Exception as e:
                print(f"[FAIL] {stats_path}  (I/O error: {e})")
                errors += 1
                # Try to restore if move happened but write failed
                if not dry_run and (not os.path.exists(stats_path)) and os.path.exists(backup_path):
                    try:
                        shutil.move(backup_path, stats_path)
                        print(f"[RESTORE] Original restored for {stats_path}")
                    except Exception as e2:
                        print(f"[RESTORE-FAIL] Could not restore original for {stats_path}: {e2}")

    print(f"\nSummary (stats ÷10, EPI+TEC): processed={processed}, skipped={skipped}, errors={errors}")


# --- Example execution (uncomment to run) ---
rescale_needed = [
    "THD_I1", "THD_I2", "THD_I3", "THD_IN",
    "I1_h2", "I1_h3", "I1_h4", "I1_h5", "I1_h7", "I1_h9", "I1_h11", "I1_h13", "I1_h15", "I1_h17", "I1_h19", "I1_h21", "I1_h23", "I1_h25", "I1_h27", "I1_h29", "I1_h31",
    "I2_h2", "I2_h3", "I2_h4", "I2_h5", "I2_h7", "I2_h9", "I2_h11", "I2_h13", "I2_h15", "I2_h17", "I2_h19", "I2_h21", "I2_h23", "I2_h25", "I2_h27", "I2_h29", "I2_h31",
    "I3_h2", "I3_h3", "I3_h4", "I3_h5", "I3_h7", "I3_h9", "I3_h11", "I3_h13", "I3_h15", "I3_h17", "I3_h19", "I3_h21", "I3_h23", "I3_h25", "I3_h27", "I3_h29", "I3_h31",
    "IN_h2", "IN_h3", "IN_h4", "IN_h5", "IN_h7", "IN_h9", "IN_h11", "IN_h13", "IN_h15", "IN_h17", "IN_h19", "IN_h21", "IN_h23", "IN_h25", "IN_h27", "IN_h29", "IN_h31",
]
for meas in rescale_needed:
    rescale_stats_div10_for_tec(meas, root_clean="../dataset_clean", root_stats="../dataset_clean_validation", backup_dir="./backup_stats_csv", dry_run=False)


[OK]   ../dataset_clean_validation\TEC_48S\THD_I1\2024_THD_I1_stats.csv  | scaled columns=['min', 'max', 'mean', 'std', 'q01', 'q05', 'q25', 'q50', 'q75', 'q95', 'q99'] (backup: ./backup_stats_csv\__up__\dataset_clean_validation\TEC_48S\THD_I1\2024_THD_I1_stats.csv)
[OK]   ../dataset_clean_validation\TEC_CFST161\THD_I1\2024_THD_I1_stats.csv  | scaled columns=['min', 'max', 'mean', 'std', 'q01', 'q05', 'q25', 'q50', 'q75', 'q95', 'q99'] (backup: ./backup_stats_csv\__up__\dataset_clean_validation\TEC_CFST161\THD_I1\2024_THD_I1_stats.csv)
[OK]   ../dataset_clean_validation\TEC_CTX800TC\THD_I1\2024_THD_I1_stats.csv  | scaled columns=['min', 'max', 'mean', 'std', 'q01', 'q05', 'q25', 'q50', 'q75', 'q95', 'q99'] (backup: ./backup_stats_csv\__up__\dataset_clean_validation\TEC_CTX800TC\THD_I1\2024_THD_I1_stats.csv)
[OK]   ../dataset_clean_validation\TEC_Chiron800\THD_I1\2024_THD_I1_stats.csv  | scaled columns=['min', 'max', 'mean', 'std', 'q01', 'q05', 'q25', 'q50', 'q75', 'q95', 'q99'] (backu

### Power

In [2]:
import os
import re
import shutil
import pandas as pd
import numpy as np

def rescale_kw_to_w_for_epi(measurement: str, root: str = "../dataset_clean", backup_dir: str = "./backup_original_csv_xz", dry_run: bool = False):
    """
    For all machines in `root` whose folder name contains 'EPI' (case-insensitive) and NOT 'PV',
    find ALL years available for the given `measurement` (e.g., 'P_total', 'P1'), and
    rescale the first numeric column from kW to W (×1000), cast to float32, and save back
    to the ORIGINAL PATH with the same filename & compression (.csv.xz), after first MOVING
    the original file to a backup folder in the CURRENT directory.

    Guarantees:
      - Never overwrites: if a backup path already exists, it appends a numeric suffix.
      - New file uses the EXACT same relative path/filename under `root`, same column names,
        same compression ("xz"), but with rescaled first numeric column (float32).
      - If `dry_run=True`, it only prints what it *would* do.

    Prints a log line per file processed.
    """
    if not os.path.isdir(root):
        print(f"[ERROR] Root not found: {root}")
        return

    # discover EPI (non-PV) machines
    machines = sorted(
        d for d in os.listdir(root)
        if os.path.isdir(os.path.join(root, d)) and ("epi" in d.lower()) and ("pv" not in d.lower())
    )

    if not machines:
        print("[INFO] No EPI machines found (excluding PV).")
        return

    os.makedirs(backup_dir, exist_ok=True)
    pat = re.compile(rf"^(\d{{4}})_{re.escape(measurement)}\.csv\.xz$", re.IGNORECASE)

    processed = 0
    skipped = 0
    errors = 0

    for m in machines:
        m_dir = os.path.join(root, m, measurement)
        if not os.path.isdir(m_dir):
            continue

        for fname in os.listdir(m_dir):
            if not pat.match(fname):
                continue

            src_path = os.path.join(m_dir, fname)

            # Read CSV
            try:
                df = pd.read_csv(src_path)
            except Exception as e:
                print(f"[SKIP] {src_path}  (read error: {e})")
                errors += 1
                continue

            num = df.select_dtypes("number")
            if num.empty:
                print(f"[SKIP] {src_path}  (no numeric columns)")
                skipped += 1
                continue

            # Identify first numeric column name
            col = num.columns[0]

            # Prepare backup destination (unique, do not overwrite)
            # Mirror relative path under backup_dir for clarity (optional)
            rel_path = os.path.relpath(src_path, start=os.getcwd())
            # Clean rel path for windows backslashes in folder name creation
            safe_rel = rel_path.replace("..", "__up__")
            backup_subdir = os.path.join(backup_dir, os.path.dirname(safe_rel))
            os.makedirs(backup_subdir, exist_ok=True)
            backup_path = os.path.join(backup_subdir, os.path.basename(src_path))

            # If backup_path exists, add suffix
            if os.path.exists(backup_path):
                base, ext = os.path.splitext(backup_path)  # ext is ".xz"
                base_csv, _ = os.path.splitext(base)       # remove .csv -> base_csv
                # restore .csv.xz correctly
                base_csv = base_csv  # already without .csv
                # Find a free suffix
                k = 1
                while True:
                    candidate = f"{base_csv}_bak{k}.csv.xz"
                    if not os.path.exists(candidate):
                        backup_path = candidate
                        break
                    k += 1

            # Build new DataFrame with rescaled values (kW -> W), float32
            df_new = df.copy()
            try:
                # multiply by 1000 and cast to float32
                scaled = (df_new[col].astype(float) * 1000.0).astype(np.float32)
                df_new[col] = scaled
            except Exception as e:
                print(f"[SKIP] {src_path}  (scaling error in column '{col}': {e})")
                errors += 1
                continue

            # Move original to backup, then write new file at original path
            try:
                if dry_run:
                    print(f"[DRY] MOVE  {src_path}  ->  {backup_path}")
                    print(f"[DRY] WRITE {src_path}  (rescaled {measurement}.{col} kW→W, float32, compression='xz')")
                    print(f"OLD {df[col].max()} - NEW {df_new[col].max()}")
                else:
                    # Move (never overwrite existing backup)
                    os.makedirs(os.path.dirname(backup_path), exist_ok=True)
                    shutil.move(src_path, backup_path)

                    # Write new file with the exact same naming & compression
                    df_new.to_csv(src_path, index=False, compression="xz")

                print(f"[OK]   {src_path}  | col='{col}'  kW→W  float32  (backup: {backup_path})")
                processed += 1
            except Exception as e:
                print(f"[FAIL] {src_path}  (I/O error: {e})")
                errors += 1
                # Attempt to restore original if move already happened and write failed
                if not dry_run and (not os.path.exists(src_path)) and os.path.exists(backup_path):
                    try:
                        shutil.move(backup_path, src_path)
                        print(f"[RESTORE] Original restored from backup for {src_path}")
                    except Exception as e2:
                        print(f"[RESTORE-FAIL] Could not restore original for {src_path}: {e2}")

    print(f"\nSummary: processed={processed}, skipped={skipped}, errors={errors}")

# --- Example usage (uncomment to run) ---
rescale_needed = ["S2", "S3", "S_total", "S_total_vec", "P1", "P2", "P3", "P_total", "Q1", "Q2", "Q3", "Q_total", "Q_total_vec"]
for meas in rescale_needed:
#rescale_kw_to_w_for_epi("P_total", root="../dataset_clean", backup_dir="./backup_original_csv_xz", dry_run=True)
    rescale_kw_to_w_for_epi(meas, root="../dataset_clean", backup_dir="./backup_original_csv_xz", dry_run=False)


[OK]   ../dataset_clean\EPI_ChipPress\S2\2018_S2.csv.xz  | col='S2'  kW→W  float32  (backup: ./backup_original_csv_xz\__up__\dataset_clean\EPI_ChipPress\S2\2018_S2.csv.xz)
[OK]   ../dataset_clean\EPI_ChipPress\S2\2019_S2.csv.xz  | col='S2'  kW→W  float32  (backup: ./backup_original_csv_xz\__up__\dataset_clean\EPI_ChipPress\S2\2019_S2.csv.xz)
[OK]   ../dataset_clean\EPI_ChipPress\S2\2020_S2.csv.xz  | col='S2'  kW→W  float32  (backup: ./backup_original_csv_xz\__up__\dataset_clean\EPI_ChipPress\S2\2020_S2.csv.xz)
[OK]   ../dataset_clean\EPI_ChipPress\S2\2021_S2.csv.xz  | col='S2'  kW→W  float32  (backup: ./backup_original_csv_xz\__up__\dataset_clean\EPI_ChipPress\S2\2021_S2.csv.xz)
[OK]   ../dataset_clean\EPI_ChipPress\S2\2022_S2.csv.xz  | col='S2'  kW→W  float32  (backup: ./backup_original_csv_xz\__up__\dataset_clean\EPI_ChipPress\S2\2022_S2.csv.xz)
[OK]   ../dataset_clean\EPI_ChipPress\S2\2023_S2.csv.xz  | col='S2'  kW→W  float32  (backup: ./backup_original_csv_xz\__up__\dataset_clean\E

In [3]:
import os
import re
import shutil
import pandas as pd
import numpy as np

def rescale_stats_kw_to_w_for_epi(
    measurement: str,
    root_clean: str = "../dataset_clean",
    root_stats: str = "../dataset_clean_validation",
    backup_dir: str = "./backup_stats_csv",
    dry_run: bool = False
):
    """
    For all machines in `root_clean` whose folder name contains 'EPI' (case-insensitive) and NOT 'PV',
    find ALL years available for the given `measurement` (e.g., 'P_total', 'P1'), locate the
    corresponding stats CSV:
        <root_stats>/<machine>/<measurement>/<year>_<measurement>_stats.csv
    and scale ALL numeric stats columns by 1000 (kW -> W). The 'file_path' column is left unchanged.

    Steps per stats file:
      1) Move the ORIGINAL stats CSV to a backup folder in the CURRENT directory (mirroring subpaths).
         - Never overwrite existing backups; add a _bakN suffix if needed.
      2) Write the UPDATED stats CSV at the original path (no compression), keeping the same columns/order.

    Parameters
    ----------
    measurement : str
        The measurement name (e.g., "P_total", "P1", "S1").
    root_clean : str
        Root directory of the raw data files (used to discover years by filename pattern).
    root_stats : str
        Root directory where the stats CSVs reside (to be updated).
    backup_dir : str
        Backup directory in the current working directory to store originals before writing updates.
    dry_run : bool
        If True, only prints actions; does not move or write any files.

    Notes
    -----
    - Only updates stats files that already exist; missing stats are skipped.
    - The numeric columns expected (if present) are:
        min, max, mean, std, q01, q05, q25, q50, q75, q95, q99
      Any subset found will be scaled.
    - Does not modify the 'file_path' column.
    """

    if not os.path.isdir(root_clean):
        print(f"[ERROR] Data root not found: {root_clean}")
        return
    if not os.path.isdir(root_stats):
        print(f"[ERROR] Stats root not found: {root_stats}")
        return

    # discover EPI (non-PV) machines
    machines = sorted(
        d for d in os.listdir(root_clean)
        if os.path.isdir(os.path.join(root_clean, d)) and ("epi" in d.lower()) and ("pv" not in d.lower())
    )
    if not machines:
        print("[INFO] No EPI machines found (excluding PV).")
        return

    os.makedirs(backup_dir, exist_ok=True)
    # Match data files to discover years present for this measurement
    pat = re.compile(rf"^(\d{{4}})_{re.escape(measurement)}\.csv\.xz$", re.IGNORECASE)

    processed = 0
    skipped = 0
    errors = 0

    for m in machines:
        data_dir = os.path.join(root_clean, m, measurement)
        if not os.path.isdir(data_dir):
            continue

        # Discover all years available for this measurement by data filename
        year_list = []
        for fname in os.listdir(data_dir):
            mo = pat.match(fname)
            if mo:
                year_list.append(int(mo.group(1)))
        if not year_list:
            continue

        # For each year, locate the corresponding stats file and update
        for year in sorted(set(year_list)):
            stats_path = os.path.join(root_stats, m, measurement, f"{year}_{measurement}_stats.csv")
            if not os.path.exists(stats_path):
                # No stats to update, skip quietly
                continue

            try:
                df = pd.read_csv(stats_path)
            except Exception as e:
                print(f"[SKIP] {stats_path}  (read error: {e})")
                errors += 1
                continue

            # Identify numeric stats columns present (scale any subset that exists)
            numeric_cols = ["min","max","mean","std","q01","q05","q25","q50","q75","q95","q99"]
            present_numeric = [c for c in numeric_cols if c in df.columns]

            if not present_numeric:
                print(f"[SKIP] {stats_path}  (no numeric stats columns found)")
                skipped += 1
                continue

            # Coerce to numeric (in case of strings), scale by 1000, keep dtype float32
            df_new = df.copy()
            old_preview = {}
            new_preview = {}

            for c in present_numeric:
                series_num = pd.to_numeric(df_new[c], errors="coerce")
                old_preview[c] = series_num.max()
                series_scaled = (series_num * 1000.0).astype(np.float32)
                df_new[c] = series_scaled
                new_preview[c] = df_new[c].max()

            # Prepare backup path (mirror substructure under backup_dir, unique name)
            rel = os.path.relpath(stats_path, start=os.getcwd())
            safe_rel = rel.replace("..", "__up__")
            backup_subdir = os.path.join(backup_dir, os.path.dirname(safe_rel))
            os.makedirs(backup_subdir, exist_ok=True)
            backup_path = os.path.join(backup_subdir, os.path.basename(stats_path))

            if os.path.exists(backup_path):
                base, ext = os.path.splitext(backup_path)  # ext == ".csv"
                k = 1
                while True:
                    candidate = f"{base}_bak{k}{ext}"
                    if not os.path.exists(candidate):
                        backup_path = candidate
                        break
                    k += 1

            try:
                if dry_run:
                    print(f"[DRY] MOVE  {stats_path}  ->  {backup_path}")
                    print(f"[DRY] WRITE {stats_path}  (scale ×1000 on {len(present_numeric)} columns: {present_numeric})")
                    # show a quick before/after max preview on a couple of columns
                    show_cols = present_numeric[:3]
                    preview = ", ".join([f"{c}: {old_preview[c]} → {new_preview[c]}" for c in show_cols])
                    print(f"[DRY] PREVIEW max: {preview}")
                else:
                    # Move original stats to backup (do not overwrite existing backup)
                    shutil.move(stats_path, backup_path)
                    # Write updated stats back to original path (no compression)
                    df_new.to_csv(stats_path, index=False)

                print(f"[OK]   {stats_path}  | scaled columns={present_numeric} (backup: {backup_path})")
                processed += 1

            except Exception as e:
                print(f"[FAIL] {stats_path}  (I/O error: {e})")
                errors += 1
                # Try to restore if move happened but write failed
                if not dry_run and (not os.path.exists(stats_path)) and os.path.exists(backup_path):
                    try:
                        shutil.move(backup_path, stats_path)
                        print(f"[RESTORE] Original restored for {stats_path}")
                    except Exception as e2:
                        print(f"[RESTORE-FAIL] Could not restore original for {stats_path}: {e2}")

    print(f"\nSummary (stats): processed={processed}, skipped={skipped}, errors={errors}")

# --- Example execution (uncomment to run) ---
rescale_needed = ["S2", "S3", "S_total", "S_total_vec", "P1", "P2", "P3", "P_total", "Q1", "Q2", "Q3", "Q_total", "Q_total_vec"]
for meas in rescale_needed:
# rescale_stats_kw_to_w_for_epi("S1", root_clean="../dataset_clean", root_stats="../dataset_clean_validation", backup_dir="./backup_stats_csv", dry_run=True)
    rescale_stats_kw_to_w_for_epi(meas, root_clean="../dataset_clean", root_stats="../dataset_clean_validation", backup_dir="./backup_stats_csv", dry_run=False)


[OK]   ../dataset_clean_validation\EPI_ChipPress\S2\2018_S2_stats.csv  | scaled columns=['min', 'max', 'mean', 'std', 'q01', 'q05', 'q25', 'q50', 'q75', 'q95', 'q99'] (backup: ./backup_stats_csv\__up__\dataset_clean_validation\EPI_ChipPress\S2\2018_S2_stats.csv)
[OK]   ../dataset_clean_validation\EPI_ChipPress\S2\2019_S2_stats.csv  | scaled columns=['min', 'max', 'mean', 'std', 'q01', 'q05', 'q25', 'q50', 'q75', 'q95', 'q99'] (backup: ./backup_stats_csv\__up__\dataset_clean_validation\EPI_ChipPress\S2\2019_S2_stats.csv)
[OK]   ../dataset_clean_validation\EPI_ChipPress\S2\2020_S2_stats.csv  | scaled columns=['min', 'max', 'mean', 'std', 'q01', 'q05', 'q25', 'q50', 'q75', 'q95', 'q99'] (backup: ./backup_stats_csv\__up__\dataset_clean_validation\EPI_ChipPress\S2\2020_S2_stats.csv)
[OK]   ../dataset_clean_validation\EPI_ChipPress\S2\2021_S2_stats.csv  | scaled columns=['min', 'max', 'mean', 'std', 'q01', 'q05', 'q25', 'q50', 'q75', 'q95', 'q99'] (backup: ./backup_stats_csv\__up__\dataset_c

### I_sys

In [None]:
import os
import re
import shutil
import pandas as pd
import numpy as np

def rescale_isys_div1000_for_epi_tec(
    measurement: str = "I_sys",
    root: str = "../dataset_clean",
    backup_dir: str = "./backup_original_csv_xz_isys",
    dry_run: bool = False
):
    """
    For all machines in `root` whose folder name contains 'EPI' or 'TEC' (case-insensitive)
    and does NOT contain 'PV' (e.g., excludes '*_PV*'), find ALL years available for the given
    `measurement` (default: 'I_sys'), divide the first numeric column by 1000 (e.g., 40000→40),
    cast to float32, and write back to the SAME PATH (same filename, same compression '.csv.xz').

    Safety:
      - Before writing, MOVE the original file into a backup folder in the CURRENT directory.
      - Never overwrite backups: if a backup path already exists, append a numeric suffix.
      - Column names and file naming remain EXACTLY the same.
      - If `dry_run=True`, only prints what it would do; does not move or write any files.
    """
    if not os.path.isdir(root):
        print(f"[ERROR] Root not found: {root}")
        return

    # discover EPI/TEC (non-PV) machines
    machines = sorted(
        d for d in os.listdir(root)
        if os.path.isdir(os.path.join(root, d))
        and (("epi" in d.lower()) or ("tec" in d.lower()))
        and ("pv" not in d.lower())
    )
    if not machines:
        print("[INFO] No EPI/TEC machines found (excluding PV).")
        return

    os.makedirs(backup_dir, exist_ok=True)
    pat = re.compile(rf"^(\d{{4}})_{re.escape(measurement)}\.csv\.xz$", re.IGNORECASE)

    processed = 0
    skipped = 0
    errors = 0

    for m in machines:
        m_dir = os.path.join(root, m, measurement)
        if not os.path.isdir(m_dir):
            continue

        for fname in os.listdir(m_dir):
            if not pat.match(fname):
                continue

            src_path = os.path.join(m_dir, fname)

            # Read CSV
            try:
                df = pd.read_csv(src_path)
            except Exception as e:
                print(f"[SKIP] {src_path}  (read error: {e})")
                errors += 1
                continue

            num = df.select_dtypes("number")
            if num.empty:
                print(f"[SKIP] {src_path}  (no numeric columns)")
                skipped += 1
                continue

            # First numeric column name
            col = num.columns[0]

            # Prepare backup destination (unique; do not overwrite)
            rel_path = os.path.relpath(src_path, start=os.getcwd())
            safe_rel = rel_path.replace("..", "__up__")
            backup_subdir = os.path.join(backup_dir, os.path.dirname(safe_rel))
            os.makedirs(backup_subdir, exist_ok=True)
            backup_path = os.path.join(backup_subdir, os.path.basename(src_path))

            if os.path.exists(backup_path):
                base, ext = os.path.splitext(backup_path)   # ext: ".xz"
                base_csv, _ = os.path.splitext(base)        # drop ".csv"
                k = 1
                while True:
                    candidate = f"{base_csv}_bak{k}.csv.xz"
                    if not os.path.exists(candidate):
                        backup_path = candidate
                        break
                    k += 1

            # Build new DataFrame with rescaled values (÷1000), float32
            df_new = df.copy()
            try:
                scaled = (df_new[col].astype(float) * 1000.0).astype(np.float32)
                df_new[col] = scaled
            except Exception as e:
                print(f"[SKIP] {src_path}  (scaling error in column '{col}': {e})")
                errors += 1
                continue

            # Move original to backup, then write new file at original path
            try:
                if dry_run:
                    print(f"[DRY] MOVE  {src_path}  ->  {backup_path}")
                    print(f"[DRY] WRITE {src_path}  (rescaled {measurement}.{col} ÷1000, float32, compression='xz')")
                    # quick preview
                    try:
                        old_max = float(pd.to_numeric(df[col], errors="coerce").max())
                        new_max = float(df_new[col].max())
                        print(f"[DRY] PREVIEW max: {old_max} → {new_max}")
                    except Exception:
                        pass
                else:
                    shutil.move(src_path, backup_path)
                    df_new.to_csv(src_path, index=False, compression="xz")

                print(f"[OK]   {src_path}  | col='{col}'  ÷1000  float32  (backup: {backup_path})")
                processed += 1
            except Exception as e:
                print(f"[FAIL] {src_path}  (I/O error: {e})")
                errors += 1
                # Try to restore if move happened but write failed
                if not dry_run and (not os.path.exists(src_path)) and os.path.exists(backup_path):
                    try:
                        shutil.move(backup_path, src_path)
                        print(f"[RESTORE] Original restored for {src_path}")
                    except Exception as e2:
                        print(f"[RESTORE-FAIL] Could not restore original for {src_path}: {e2}")

    print(f"\nSummary: processed={processed}, skipped={skipped}, errors={errors}")

# --- Example usage (uncomment to run) ---
# rescale_isys_div1000_for_epi_tec(measurement="I_sys", root="../dataset_clean", backup_dir="./backup_original_csv_xz_isys", dry_run=True)
rescale_isys_div1000_for_epi_tec(measurement="I_sys", root="../dataset_clean", backup_dir="./backup_original_csv_xz_isys", dry_run=True)


[DRY] MOVE  ../dataset_clean\EPI_ChipPress\I_sys\2018_I_sys.csv.xz  ->  ./backup_original_csv_xz_isys\__up__\dataset_clean\EPI_ChipPress\I_sys\2018_I_sys_bak1.csv.xz
[DRY] WRITE ../dataset_clean\EPI_ChipPress\I_sys\2018_I_sys.csv.xz  (rescaled I_sys.I_sys ÷1000, float32, compression='xz')
[DRY] PREVIEW max: 41.785 → 41785.0
[OK]   ../dataset_clean\EPI_ChipPress\I_sys\2018_I_sys.csv.xz  | col='I_sys'  ÷1000  float32  (backup: ./backup_original_csv_xz_isys\__up__\dataset_clean\EPI_ChipPress\I_sys\2018_I_sys_bak1.csv.xz)
[DRY] MOVE  ../dataset_clean\EPI_ChipPress\I_sys\2019_I_sys.csv.xz  ->  ./backup_original_csv_xz_isys\__up__\dataset_clean\EPI_ChipPress\I_sys\2019_I_sys_bak1.csv.xz
[DRY] WRITE ../dataset_clean\EPI_ChipPress\I_sys\2019_I_sys.csv.xz  (rescaled I_sys.I_sys ÷1000, float32, compression='xz')
[DRY] PREVIEW max: 41.258 → 41258.0
[OK]   ../dataset_clean\EPI_ChipPress\I_sys\2019_I_sys.csv.xz  | col='I_sys'  ÷1000  float32  (backup: ./backup_original_csv_xz_isys\__up__\dataset_cl

In [5]:
import os
import re
import shutil
import pandas as pd
import numpy as np

def rescale_stats_isys_div1000(
    measurement: str = "I_sys",
    root_clean: str = "../dataset_clean",
    root_stats: str = "../dataset_clean_validation",
    backup_dir: str = "./backup_stats_isys_csv",
    dry_run: bool = False
):
    """
    For all machines in `root_clean` whose folder name contains 'EPI' or 'TEC' (case-insensitive)
    and does NOT contain 'PV', find ALL years available for `measurement` (default: 'I_sys'),
    then locate corresponding stats CSVs in `root_stats`:
        <root_stats>/<machine>/<measurement>/<year>_<measurement>_stats.csv
    and scale ALL numeric stats columns by ÷1000 (e.g., 40000 → 40). 'file_path' is left unchanged.

    Safety:
      1) Move original stats CSV to a backup folder in the CURRENT directory (mirroring subpaths).
         - Never overwrite backups; if a name exists, append _bakN.
      2) Write UPDATED stats back to the original path (no compression), preserving columns/order.

    Only updates existing stats files. Missing stats are skipped.
    Numeric columns (if present): min,max,mean,std,q01,q05,q25,q50,q75,q95,q99
    """
    if not os.path.isdir(root_clean):
        print(f"[ERROR] Data root not found: {root_clean}")
        return
    if not os.path.isdir(root_stats):
        print(f"[ERROR] Stats root not found: {root_stats}")
        return

    # discover EPI/TEC (non-PV) machines
    machines = sorted(
        d for d in os.listdir(root_clean)
        if os.path.isdir(os.path.join(root_clean, d))
        and (("epi" in d.lower()) or ("tec" in d.lower()))
        and ("pv" not in d.lower())
    )
    if not machines:
        print("[INFO] No EPI/TEC machines found (excluding PV).")
        return

    os.makedirs(backup_dir, exist_ok=True)
    pat = re.compile(rf"^(\d{{4}})_{re.escape(measurement)}\.csv\.xz$", re.IGNORECASE)

    processed = skipped = errors = 0

    for m in machines:
        data_dir = os.path.join(root_clean, m, measurement)
        if not os.path.isdir(data_dir):
            continue

        # discover years by data filenames
        years = []
        for fname in os.listdir(data_dir):
            mo = pat.match(fname)
            if mo:
                years.append(int(mo.group(1)))
        if not years:
            continue

        for year in sorted(set(years)):
            stats_path = os.path.join(root_stats, m, measurement, f"{year}_{measurement}_stats.csv")
            if not os.path.exists(stats_path):
                continue

            try:
                df = pd.read_csv(stats_path)
            except Exception as e:
                print(f"[SKIP] {stats_path}  (read error: {e})")
                errors += 1
                continue

            numeric_cols = ["min","max","mean","std","q01","q05","q25","q50","q75","q95","q99"]
            present = [c for c in numeric_cols if c in df.columns]
            if not present:
                print(f"[SKIP] {stats_path}  (no numeric stats columns)")
                skipped += 1
                continue

            df_new = df.copy()
            old_preview, new_preview = {}, {}
            try:
                for c in present:
                    ser = pd.to_numeric(df_new[c], errors="coerce")
                    old_preview[c] = ser.max()
                    df_new[c] = (ser / 1000.0).astype(np.float32)
                    new_preview[c] = df_new[c].max()
            except Exception as e:
                print(f"[SKIP] {stats_path}  (scaling error: {e})")
                errors += 1
                continue

            # prepare unique backup path
            rel = os.path.relpath(stats_path, start=os.getcwd()).replace("..", "__up__")
            backup_subdir = os.path.join(backup_dir, os.path.dirname(rel))
            os.makedirs(backup_subdir, exist_ok=True)
            backup_path = os.path.join(backup_subdir, os.path.basename(stats_path))
            if os.path.exists(backup_path):
                base, ext = os.path.splitext(backup_path)
                k = 1
                while True:
                    cand = f"{base}_bak{k}{ext}"
                    if not os.path.exists(cand):
                        backup_path = cand
                        break
                    k += 1

            try:
                if dry_run:
                    print(f"[DRY] MOVE  {stats_path}  ->  {backup_path}")
                    print(f"[DRY] WRITE {stats_path}  (÷1000 on {len(present)} columns: {present})")
                    show = present[:3]
                    preview = ", ".join([f"{c}: {old_preview[c]} → {new_preview[c]}" for c in show])
                    print(f"[DRY] PREVIEW max: {preview}")
                else:
                    shutil.move(stats_path, backup_path)
                    df_new.to_csv(stats_path, index=False)

                print(f"[OK]   {stats_path}  | scaled cols={present} (backup: {backup_path})")
                processed += 1
            except Exception as e:
                print(f"[FAIL] {stats_path}  (I/O error: {e})")
                errors += 1
                if not dry_run and (not os.path.exists(stats_path)) and os.path.exists(backup_path):
                    try:
                        shutil.move(backup_path, stats_path)
                        print(f"[RESTORE] Original restored for {stats_path}")
                    except Exception as e2:
                        print(f"[RESTORE-FAIL] Could not restore original for {stats_path}: {e2}")

    print(f"\nSummary (I_sys stats): processed={processed}, skipped={skipped}, errors={errors}")

# --- Example execution (uncomment to run) ---
# rescale_stats_isys_div1000(measurement="I_sys", root_clean="../dataset_clean", root_stats="../dataset_clean_validation", backup_dir="./backup_stats_isys_csv", dry_run=True)
rescale_stats_isys_div1000(measurement="I_sys", root_clean="../dataset_clean", root_stats="../dataset_clean_validation", backup_dir="./backup_stats_isys_csv", dry_run=False)


[OK]   ../dataset_clean_validation\EPI_ChipPress\I_sys\2018_I_sys_stats.csv  | scaled cols=['min', 'max', 'mean', 'std', 'q01', 'q05', 'q25', 'q50', 'q75', 'q95', 'q99'] (backup: ./backup_stats_isys_csv\__up__\dataset_clean_validation\EPI_ChipPress\I_sys\2018_I_sys_stats.csv)
[OK]   ../dataset_clean_validation\EPI_ChipPress\I_sys\2019_I_sys_stats.csv  | scaled cols=['min', 'max', 'mean', 'std', 'q01', 'q05', 'q25', 'q50', 'q75', 'q95', 'q99'] (backup: ./backup_stats_isys_csv\__up__\dataset_clean_validation\EPI_ChipPress\I_sys\2019_I_sys_stats.csv)
[OK]   ../dataset_clean_validation\EPI_ChipPress\I_sys\2020_I_sys_stats.csv  | scaled cols=['min', 'max', 'mean', 'std', 'q01', 'q05', 'q25', 'q50', 'q75', 'q95', 'q99'] (backup: ./backup_stats_isys_csv\__up__\dataset_clean_validation\EPI_ChipPress\I_sys\2020_I_sys_stats.csv)
[OK]   ../dataset_clean_validation\EPI_ChipPress\I_sys\2021_I_sys_stats.csv  | scaled cols=['min', 'max', 'mean', 'std', 'q01', 'q05', 'q25', 'q50', 'q75', 'q95', 'q99']