In [1]:
"""
We systematically scanned all per-file statistics to detect series whose core and quantile metrics are entirely zero or NaN. 
These cases were registered in a canonical list (machine, measurement, year) and the corresponding originals were replaced
by explicit *_EMPTY placeholder files (single timestamp–value row), with originals safely backed up.
If a measurement folder contained only placeholders, it was renamed to _EMPTY for immediate visibility.
Every action (backup, rewrite, rename, skip) was logged for auditability. This ensures downstream pipelines remain stable,
comparisons are fair, and data quality issues are transparent and reproducible.
"""

'\nWe systematically scanned all per-file statistics to detect series whose core and quantile metrics are entirely zero or NaN. \nThese cases were registered in a canonical list (machine, measurement, year) and the corresponding originals were replaced\nby explicit *_EMPTY placeholder files (single timestamp–value row), with originals safely backed up.\nIf a measurement folder contained only placeholders, it was renamed to _EMPTY for immediate visibility.\nEvery action (backup, rewrite, rename, skip) was logged for auditability. This ensures downstream pipelines remain stable,\ncomparisons are fair, and data quality issues are transparent and reproducible.\n'

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import re

class ZeroNanChecker:
    COLS = ["file_path","min","max","mean","std","q01","q05","q25","q50","q75","q95","q99"]

    def scan(self, root="validation_results", test=None):
        files = sorted(Path(root).rglob("*_stats.csv"))
        if test: files = files[:int(test)]
        rows = []
        for f in files:
            try:
                df = pd.read_csv(f)
            except Exception as e:
                rows.append(self.empty_row(str(f), e))
                continue

            for c in self.COLS:
                if c not in df: df[c] = np.nan
            num_cols = [c for c in self.COLS if c != "file_path"]
            df[num_cols] = df[num_cols].apply(pd.to_numeric, errors="coerce")

            for _, r in df.iterrows():
                rows.append(self.eval_row(f, r))

        out = pd.DataFrame(rows)
        self.save(out)
        display(out)
        return out

    def eval_row(self, f, r):
        z = lambda x: (pd.isna(x) or x == 0)

        all_stats = [r.get(c) for c in self.COLS if c != "file_path"]
        core_stats = [r.get("min"), r.get("max"), r.get("mean"), r.get("std")]
        quantiles = [r.get("q01"), r.get("q05"), r.get("q25"), r.get("q50"), r.get("q75"), r.get("q95"), r.get("q99")]
        mn, mx = r.get("min"), r.get("max")

        all_flag = all(z(v) for v in all_stats)
        core_flag = all(z(v) for v in core_stats)
        mm_flag = z(mn) and z(mx)
        q_flag = all(z(v) for v in quantiles)

        if all_flag: label = "all_stats_zero_or_nan"
        elif core_flag: label = "core_stats_zero_or_nan"
        elif mm_flag: label = "min_max_zero_or_nan"
        elif q_flag: label = "quantiles_zero_or_nan"
        else: label = "none"

        name = self.make_name(f)

        return {
            "name": name,
            "all_ZoN": bool(all_flag),
            "core_ZoN": bool(core_flag),
            "mm_ZoN": bool(mm_flag),
            "q_ZoN": bool(q_flag),
            "min": mn, "max": mx, "mean": r.get("mean"), "std": r.get("std"),
            "q01": r.get("q01"), "q05": r.get("q05"), "q25": r.get("q25"),
            "q50": r.get("q50"), "q75": r.get("q75"), "q95": r.get("q95"), "q99": r.get("q99"),
            "flag_label": label,
            "file": str(f)
        }

    def make_name(self, fpath):
        parts = fpath.parts
        if len(parts) >= 3:
            measurement = parts[-2]
            m = re.search(r"(\d{4})", parts[-1])
            if m:
                return f"{measurement}_{m.group(1)}"
        return parts[-1]

    def empty_row(self, f, error):
        return {
            "name": self.make_name(Path(f)),
            "all_ZoN": np.nan, "core_ZoN": np.nan, "mm_ZoN": np.nan, "q_ZoN": np.nan,
            **{c: np.nan for c in self.COLS if c != "file_path"},
            "flag_label": "error",
            "file": f,
            "error": error
        }

    def save(self, df, path="all_zero_or_nan_report.csv"):
        order = [
            "name", "all_ZoN", "core_ZoN", "mm_ZoN", "q_ZoN",
            "min","max","mean","std","q01","q05","q25","q50","q75","q95","q99",
            "flag_label","file"
        ]
        cols = [c for c in order if c in df.columns]
        df[cols].to_csv(path, index=False)

    def plot_row(self, results_df: pd.DataFrame, idx: int, data_root="dataset_clean"):
        stats_path = Path(results_df.loc[idx, "file"])
        if not stats_path.exists():
            raise FileNotFoundError(f"Stats file not found: {stats_path}")

        data_path = None
        try:
            sdf = pd.read_csv(stats_path, usecols=["file_path"])
            if len(sdf) > 0 and pd.notna(sdf.loc[0, "file_path"]):
                data_path = Path(str(sdf.loc[0, "file_path"]))
        except Exception:
            pass

        if data_path is None:
            rel = stats_path.as_posix()
            rel = re.sub(r"^validation_results", str(data_root), rel)
            rel = re.sub(r"_stats\.csv$", ".csv.xz", rel)
            data_path = Path(rel)

        if not data_path.exists():
            raise FileNotFoundError(f"Data file not found (resolved): {data_path}")

        df = pd.read_csv(data_path)

        ts_cols = [c for c in df.columns if str(c).lower() in {"ts","time","timestamp","datetime","date","datetimes"}]
        if ts_cols:
            tc = ts_cols[0]
            df[tc] = pd.to_datetime(df[tc], errors="coerce", utc=True)
            df = df.set_index(tc).sort_index()

        num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
        if not num_cols:
            raise ValueError(f"No numeric columns to plot in {data_path}")

        ycol = num_cols[0]
        plt.figure()
        df[ycol].plot(title=f"{results_df.loc[idx,'name']} — {ycol}")
        plt.xlabel("time" if ts_cols else "row")
        plt.ylabel(ycol)
        plt.tight_layout()
        plt.show()

        return df, data_path

if __name__ == "__main__":
    checker = ZeroNanChecker()
    res = checker.scan(test=None)  # set to None for full run


Unnamed: 0,name,all_ZoN,core_ZoN,mm_ZoN,q_ZoN,min,max,mean,std,q01,q05,q25,q50,q75,q95,q99,flag_label,file
0,Freq_2018,False,False,False,False,49.83,50.13000,49.988551,0.021301,49.940000,49.96000,49.97000,49.99000,50.00000,50.02000,50.04000,none,validation_results\EPI_ChipPress\Freq\2018_Fre...
1,Freq_2019,False,False,False,False,49.79,50.16000,49.988537,0.020415,49.940000,49.96000,49.98000,49.99000,50.00000,50.02000,50.04000,none,validation_results\EPI_ChipPress\Freq\2019_Fre...
2,Freq_2020,False,False,False,False,49.81,50.13000,49.988533,0.019694,49.940000,49.96000,49.98000,49.99000,50.00000,50.02000,50.04000,none,validation_results\EPI_ChipPress\Freq\2020_Fre...
3,Freq_2021,False,False,False,False,49.75,50.14000,49.988628,0.020425,49.940000,49.96000,49.98000,49.99000,50.00000,50.02000,50.04000,none,validation_results\EPI_ChipPress\Freq\2021_Fre...
4,Freq_2022,False,False,False,False,49.83,50.12000,49.988549,0.020896,49.930000,49.95000,49.97000,49.99000,50.00000,50.02000,50.04000,none,validation_results\EPI_ChipPress\Freq\2022_Fre...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14645,U3_RMS_fund_2024,False,False,False,False,0.00,243.03232,237.193991,3.180266,233.038798,234.03572,235.88918,237.32402,238.58334,240.18796,241.04272,none,validation_results\TEC_MV2400R\U3_RMS_fund\202...
14646,U_line_avg_2024,False,False,False,False,0.00,419.88500,410.032508,4.702562,402.803530,404.52000,407.80743,410.28070,412.39017,415.10425,416.56128,none,validation_results\TEC_MV2400R\U_line_avg\2024...
14647,U_line_avg_f_2024,False,False,False,False,0.00,419.55536,409.808778,6.700745,402.667820,404.41630,407.62872,410.10560,412.23367,414.87683,416.27840,none,validation_results\TEC_MV2400R\U_line_avg_f\20...
14648,U_phase_avg_2024,False,False,False,False,0.00,278.76743,236.744303,2.719823,232.577380,233.56544,235.45988,236.88562,238.10399,239.67280,240.51324,none,validation_results\TEC_MV2400R\U_phase_avg\202...


### Create Registry of Files that are all empty and should be removed

In [38]:
import re
import json
from pathlib import PurePath, Path
import pandas as pd

# === CONFIG ===
REPORT_CSV = r"dataset_clean/00meta_data/01all_zero_or_nan_report.csv"
# Where your real data files live (NOT the validation_results dir).
# Example structure: <DATA_ROOT>/<machine>/<measurement>/<year>_<measurement>.csv.xz
DATA_ROOT = Path(r"D:\energy_dataset")  # <-- change this

REGISTRY_CSV   = Path("dataset_clean/00meta_data/01removed_files_registry.csv")

# === LOAD & FILTER ===
val_df = pd.read_csv(REPORT_CSV)
all_ZoN_df = val_df[val_df["all_ZoN"] == True].copy()

# === HELPERS ===
year_tail_pat = re.compile(r"^(?P<meas>.+)_(?P<year>\d{4})$")

def parse_name_field(s: str):
    """
    'U12_h31_2018' -> (measurement='U12_h31', year='2018')
    Works even if measurement contains underscores, as year must be 4 digits at end.
    """
    m = year_tail_pat.match(str(s))
    if m:
        return m.group("meas"), m.group("year")
    return None, None

def parse_from_validation_path(p: str):
    """
    Example: 'validation_results\\EPI_ChipPress\\U12_h31\\2019_U12_h31_stats.csv'
    -> machine='EPI_ChipPress', measurement='U12_h31'
    We do NOT trust the filename for the year here (prefer 'name' column’s year).
    """
    parts = PurePath(p).parts
    # Find 'validation_results' and take the next two components as machine/measurement if present
    try:
        vr_idx = [i for i, part in enumerate(parts) if part.lower() == "validation_results"][0]
        machine = parts[vr_idx + 1] if len(parts) > vr_idx + 1 else None
        measurement = parts[vr_idx + 2] if len(parts) > vr_idx + 2 else None
    except (IndexError, ValueError):
        machine, measurement = None, None
    return machine, measurement

def expected_data_path(data_root: Path, machine: str, measurement: str, year: str) -> Path | None:
    """
    Build the expected data file path under your real dataset root.
    """
    if not all([data_root, machine, measurement, year]):
        return None
    # Adjust this if your naming differs (e.g., lowercase, hyphens, etc.)
    return data_root / machine / measurement / f"{year}_{measurement}.csv.xz"

# === BUILD REGISTRY ===
rows = []
for _, r in all_ZoN_df.iterrows():
    meas_from_name, year = parse_name_field(r.get("name", ""))
    machine_from_path, meas_from_path = parse_from_validation_path(r.get("file", ""))

    # Prefer path-derived machine; prefer name-derived measurement (more reliable with year parsing)
    machine = machine_from_path
    measurement = meas_from_name or meas_from_path

    # Fall back if name didn’t parse (rare)
    if (measurement is None) and (meas_from_path is not None):
        # try to remove a leading year + underscore pattern if present
        m2 = re.match(r"^\d{4}_(.+)$", meas_from_path)
        measurement = m2.group(1) if m2 else meas_from_path

    data_path = expected_data_path(DATA_ROOT, machine, measurement, year)

    rows.append({
        "machine": machine,
        "measurement": measurement,
        "year": year,
        "reason": "all_zero_or_nan",                      # stable canonical label
        "flag_label": r.get("flag_label", ""),            # keep original label for traceability
        "validation_report_file": r.get("file", ""),      # where the decision came from
        "expected_data_file": str(data_path) if data_path else None,
        "all_ZoN": True,
        # optional stats for auditability (useful in docs)
        "min": r.get("min", None),
        "max": r.get("max", None),
        "mean": r.get("mean", None),
        "std": r.get("std", None),
        "q01": r.get("q01", None),
        "q05": r.get("q05", None),
        "q25": r.get("q25", None),
        "q50": r.get("q50", None),
        "q75": r.get("q75", None),
        "q95": r.get("q95", None),
        "q99": r.get("q99", None),
    })

registry = pd.DataFrame(rows)

# Normalize / sanity checks
registry["machine"] = registry["machine"].astype("string")
registry["measurement"] = registry["measurement"].astype("string")
registry["year"] = registry["year"].astype("string")

# Drop obvious malformed rows (no machine or measurement or year)
registry = registry.dropna(subset=["machine", "measurement", "year"]).reset_index(drop=True)

# De-duplicate (idempotent reruns)
registry = registry.drop_duplicates(subset=["machine", "measurement", "year"]).reset_index(drop=True)

# === SAVE REGISTRY ===
registry.to_csv(REGISTRY_CSV, index=False)


print(f"Registry written:\n- {REGISTRY_CSV}\n")
print(f"{len(registry)} entries marked as all-zero-or-nan.")


Registry written:
- dataset_clean\00meta_data\01removed_files_registry.csv

1352 entries marked as all-zero-or-nan.


### Delete and Replace empty files with Short and REMANED Files

In [53]:
from pathlib import Path
import shutil
import pandas as pd


DATA_ROOT     = Path("dataset_clean")  # relative to current working dir
REGISTRY_CSV  = Path("dataset_clean/00meta_data/01removed_files_registry.csv")
OUTPUT_LOG    = Path("dataset_clean/00meta_data/empty_file_rewrite_log.csv")

TEST_MODE         = False     # process ONLY first eligible file; folder rename is DRY-RUN
BACKUP_ORIGINALS  = True     # True -> move original to backup; False -> hard delete
BACKUP_ROOT       = None     # None -> defaults to DATA_ROOT/../backup_empty_originals

# If your dataset always uses one extension, set to ".csv.xz" or ".csv".
# If None, the script will try both in this order: [".csv.xz", ".csv"].
PREFERRED_EXT     = None

# ====== SCRIPT =========
reg = pd.read_csv(REGISTRY_CSV)
required_cols = {"machine", "measurement", "year", "expected_data_file"}
missing = required_cols - set(reg.columns)
if missing:
    raise ValueError(f"Registry missing required columns: {missing}")

actions = []
processed_any = False

if BACKUP_ORIGINALS and BACKUP_ROOT is None:
    BACKUP_ROOT = (DATA_ROOT.parent / "backup_empty_originals").resolve()

# helper: return a viable path under DATA_ROOT, rebuilding from machine/measurement/year
def build_expected_path(machine: str, measurement: str, year: str) -> tuple[Path, bool]:
    """
    Rebuild the expected file path under DATA_ROOT regardless of what the registry contains.
    Returns (path, is_xz) where is_xz indicates whether to write compressed replacement.
    Tries PREFERRED_EXT first if provided, else tries .csv.xz then .csv.
    """
    base = (DATA_ROOT / machine / measurement)
    if PREFERRED_EXT:
        p = base / f"{year}_{measurement}{PREFERRED_EXT}"
        return p, (PREFERRED_EXT == ".csv.xz")
    # try .csv.xz then .csv
    p1 = base / f"{year}_{measurement}.csv.xz"
    if p1.exists():
        return p1, True
    p2 = base / f"{year}_{measurement}.csv"
    return p2, False

# Iterate rows
for _, row in reg.iterrows():
    machine     = str(row["machine"])
    measurement = str(row["measurement"])
    year        = str(row["year"])

    timestamp_col = "WsDateTime"
    value_col     = measurement

    # Always REBUILD the path under DATA_ROOT (ignoring absolute paths in registry)
    expected_path, write_xz = build_expected_path(machine, measurement, year)

    # Idempotency: skip if name already *_EMPTY
    name = expected_path.name
    already_empty_name = name.endswith("_EMPTY.csv") or name.endswith("_EMPTY.csv.xz")
    if already_empty_name:
        actions.append({
            "action": "skip_already_empty_naming",
            "machine": machine, "measurement": measurement, "year": year,
            "path": str(expected_path)
        })
        continue

    # If not found, also try the "other" extension (when PREFERRED_EXT is None)
    if not expected_path.exists() and PREFERRED_EXT is None:
        # flip ext
        alt_xz = not write_xz
        alt = expected_path.with_suffix(".csv" if alt_xz is False else ".xz")
        # ensure correct pair (.csv.xz requires two suffixes)
        if str(alt).endswith(".xz"):
            alt = expected_path.with_name(expected_path.stem + ".csv.xz")
        else:
            alt = expected_path.with_name(expected_path.stem + ".csv")
        if alt.exists():
            expected_path, write_xz = alt, alt.suffix.endswith("xz")

    if not expected_path.exists():
        actions.append({
            "action": "missing_original",
            "machine": machine, "measurement": measurement, "year": year,
            "path": str(expected_path)
        })
        continue

    # Compute *_EMPTY filename
    if expected_path.name.endswith(".csv.xz"):
        target_path = expected_path.with_name(expected_path.name[:-7] + "_EMPTY.csv.xz")
    elif expected_path.name.endswith(".csv"):
        target_path = expected_path.with_name(expected_path.name[:-4] + "_EMPTY.csv")
    else:
        # fallback: append _EMPTY before final suffix
        target_path = expected_path.with_name(expected_path.stem + "_EMPTY" + "".join(expected_path.suffixes))

    # === TEST MODE: only the first eligible file; folder rename is DRY-RUN
    if TEST_MODE and not processed_any:
        # backup or delete
        if BACKUP_ORIGINALS:
            backup_dst = (BACKUP_ROOT / machine / measurement / expected_path.name).resolve()
            backup_dst.parent.mkdir(parents=True, exist_ok=True)
            shutil.move(str(expected_path), str(backup_dst))
            actions.append({
                "action": "backup_original_dry_run",
                "machine": machine, "measurement": measurement, "year": year,
                "src": str(expected_path), "dst": str(backup_dst)
            })
        else:
            expected_path.unlink()
            actions.append({
                "action": "delete_original_dry_run",
                "machine": machine, "measurement": measurement, "year": year,
                "path": str(expected_path)
            })

        # write tiny *_EMPTY file (two cols, one row)
        df = pd.DataFrame({timestamp_col: [f"{year}-01-01 00:00:00"], value_col: [0.0]})
        target_path.parent.mkdir(parents=True, exist_ok=True)
        if write_xz or target_path.suffix.endswith("xz"):
            df.to_csv(target_path, index=False, compression="xz")
        else:
            df.to_csv(target_path, index=False)
        actions.append({
            "action": "write_empty_file_dry_run",
            "machine": machine, "measurement": measurement, "year": year,
            "path": str(target_path)
        })

        # folder rename DRY-RUN
        meas_dir = target_path.parent
        csv_like = list(meas_dir.glob("*.csv")) + list(meas_dir.glob("*.csv.xz"))
        if csv_like and all(p.name.endswith("_EMPTY.csv") or p.name.endswith("_EMPTY.csv.xz") for p in csv_like):
            if not meas_dir.name.endswith("_EMPTY"):
                actions.append({
                    "action": "rename_dir_dry_run",
                    "src": str(meas_dir),
                    "dst": str(meas_dir.with_name(meas_dir.name + "_EMPTY"))
                })

        processed_any = True
        break

    # === REAL RUN (TEST_MODE=False): process all files
    if not TEST_MODE:
        if BACKUP_ORIGINALS:
            backup_dst = (BACKUP_ROOT / machine / measurement / expected_path.name).resolve()
            backup_dst.parent.mkdir(parents=True, exist_ok=True)
            shutil.move(str(expected_path), str(backup_dst))
            actions.append({
                "action": "backup_original",
                "machine": machine, "measurement": measurement, "year": year,
                "src": str(expected_path), "dst": str(backup_dst)
            })
        else:
            expected_path.unlink()
            actions.append({
                "action": "delete_original",
                "machine": machine, "measurement": measurement, "year": year,
                "path": str(expected_path)
            })

        df = pd.DataFrame({timestamp_col: [f"{year}-01-01 00:00:00"], value_col: [0.0]})
        target_path.parent.mkdir(parents=True, exist_ok=True)
        if write_xz or target_path.suffix.endswith("xz"):
            df.to_csv(target_path, index=False, compression="xz")
        else:
            df.to_csv(target_path, index=False)
        actions.append({
            "action": "write_empty_file",
            "machine": machine, "measurement": measurement, "year": year,
            "path": str(target_path)
        })

# After processing: rename measurement folders if only *_EMPTY remain (real run only)
if not TEST_MODE:
    meas_dirs = set()
    for _, row in reg.iterrows():
        md = (DATA_ROOT / str(row["machine"]) / str(row["measurement"])).resolve()
        meas_dirs.add(md)
    for md in sorted(meas_dirs):
        if not md.is_dir():
            continue
        csv_like = list(md.glob("*.csv")) + list(md.glob("*.csv.xz"))
        if not csv_like:
            continue
        if all(p.name.endswith("_EMPTY.csv") or p.name.endswith("_EMPTY.csv.xz") for p in csv_like):
            if not md.name.endswith("_EMPTY"):
                new_dir = md.with_name(md.name + "_EMPTY")
                md.rename(new_dir)
                actions.append({"action": "rename_dir", "src": str(md), "dst": str(new_dir)})

# Log + preview
log_df = pd.DataFrame(actions)
OUTPUT_LOG.parent.mkdir(parents=True, exist_ok=True)
log_df.to_csv(OUTPUT_LOG, index=False)

display(log_df.head(20))
print(f"Total log rows: {len(log_df)}")
if TEST_MODE:
    print("TEST_MODE=True → processed only the first eligible file and dry-ran the folder rename.")
else:
    print("Real run completed: files replaced and eligible measurement folders renamed.")


Unnamed: 0,action,machine,measurement,year,path,src,dst
0,missing_original,EPI_ChipPress,U12_h31,2018,dataset_clean\EPI_ChipPress\U12_h31\2018_U12_h...,,
1,backup_original,EPI_ChipPress,U12_h31,2019,,dataset_clean\EPI_ChipPress\U12_h31\2019_U12_h...,D:\EnergyDataset\backup_empty_originals\EPI_Ch...
2,write_empty_file,EPI_ChipPress,U12_h31,2019,dataset_clean\EPI_ChipPress\U12_h31\2019_U12_h...,,
3,backup_original,EPI_ChipPress,U1_h21,2019,,dataset_clean\EPI_ChipPress\U1_h21\2019_U1_h21...,D:\EnergyDataset\backup_empty_originals\EPI_Ch...
4,write_empty_file,EPI_ChipPress,U1_h21,2019,dataset_clean\EPI_ChipPress\U1_h21\2019_U1_h21...,,
5,backup_original,EPI_ChipPress,U1_h21,2022,,dataset_clean\EPI_ChipPress\U1_h21\2022_U1_h21...,D:\EnergyDataset\backup_empty_originals\EPI_Ch...
6,write_empty_file,EPI_ChipPress,U1_h21,2022,dataset_clean\EPI_ChipPress\U1_h21\2022_U1_h21...,,
7,backup_original,EPI_ChipPress,U1_h21,2023,,dataset_clean\EPI_ChipPress\U1_h21\2023_U1_h21...,D:\EnergyDataset\backup_empty_originals\EPI_Ch...
8,write_empty_file,EPI_ChipPress,U1_h21,2023,dataset_clean\EPI_ChipPress\U1_h21\2023_U1_h21...,,
9,backup_original,EPI_ChipPress,U1_h23,2023,,dataset_clean\EPI_ChipPress\U1_h23\2023_U1_h23...,D:\EnergyDataset\backup_empty_originals\EPI_Ch...


Total log rows: 2868
Real run completed: files replaced and eligible measurement folders renamed.


In [58]:
df = pd.read_csv(r"dataset_clean\00meta_data\empty_file_rewrite_log.csv")
df[df["action"]=="backup_original"]

Unnamed: 0,action,machine,measurement,year,path,src,dst
1,backup_original,EPI_ChipPress,U12_h31,2019.0,,dataset_clean\EPI_ChipPress\U12_h31\2019_U12_h...,D:\EnergyDataset\backup_empty_originals\EPI_Ch...
3,backup_original,EPI_ChipPress,U1_h21,2019.0,,dataset_clean\EPI_ChipPress\U1_h21\2019_U1_h21...,D:\EnergyDataset\backup_empty_originals\EPI_Ch...
5,backup_original,EPI_ChipPress,U1_h21,2022.0,,dataset_clean\EPI_ChipPress\U1_h21\2022_U1_h21...,D:\EnergyDataset\backup_empty_originals\EPI_Ch...
7,backup_original,EPI_ChipPress,U1_h21,2023.0,,dataset_clean\EPI_ChipPress\U1_h21\2023_U1_h21...,D:\EnergyDataset\backup_empty_originals\EPI_Ch...
9,backup_original,EPI_ChipPress,U1_h23,2023.0,,dataset_clean\EPI_ChipPress\U1_h23\2023_U1_h23...,D:\EnergyDataset\backup_empty_originals\EPI_Ch...
...,...,...,...,...,...,...,...
2693,backup_original,TEC_MV2400R,U1_DC,2024.0,,dataset_clean\TEC_MV2400R\U1_DC\2024_U1_DC.csv.xz,D:\EnergyDataset\backup_empty_originals\TEC_MV...
2695,backup_original,TEC_MV2400R,U23_DC,2024.0,,dataset_clean\TEC_MV2400R\U23_DC\2024_U23_DC.c...,D:\EnergyDataset\backup_empty_originals\TEC_MV...
2697,backup_original,TEC_MV2400R,U2_DC,2024.0,,dataset_clean\TEC_MV2400R\U2_DC\2024_U2_DC.csv.xz,D:\EnergyDataset\backup_empty_originals\TEC_MV...
2699,backup_original,TEC_MV2400R,U31_DC,2024.0,,dataset_clean\TEC_MV2400R\U31_DC\2024_U31_DC.c...,D:\EnergyDataset\backup_empty_originals\TEC_MV...


In [61]:
df2 = pd.read_csv(r"validation_results\EPI_ChipPress\Freq\2018_Freq_missing.csv")
with pd.option_context("display.max_columns", None):
    display(df2)

row = df2.iloc[0]

n_total   = row["n_total"]
n_nans    = row["n_nans"]
n_zeros   = row["n_zeros"]

pct_missing = (n_nans / n_total) * 100 if n_total else 0

summary = f"""
📊 **Summary for {row['file_path']}**

- Total values: {n_total:,}
- Missing (NaN): {n_nans:,} ({pct_missing:.2f}%)
- Constant zeros: {n_zeros:,}

🔹 Missing structure:
- Missing at start: {row['n_nans_start']:,}
- Missing at end:   {row['n_nans_end']:,}
- Missing in middle: {row['n_nans_middle']:,}

🔹 Gap statistics:
- Number of gaps: {row['n_missing_gaps']:,}
- Total missing duration: {row['missing_gap_total_sec']:,} sec

🔹 Gap size categories:
- 1 step: {row['nan_1_step_count']} gaps, {row['nan_1_step_steps']:,} steps ({row['nan_1_step_pct']:.2f}%)
- 5s–30s: {row['nan_5s_30s_count']} gaps, {row['nan_5s_30s_steps']:,} steps ({row['nan_5s_30s_pct']:.2f}%)
- 30s–1m: {row['nan_30s_1m_count']} gaps, {row['nan_30s_1m_steps']:,} steps ({row['nan_30s_1m_pct']:.2f}%)
- 1m–15m: {row['nan_1m_15m_count']} gaps, {row['nan_1m_15m_steps']:,} steps ({row['nan_1m_15m_pct']:.2f}%)
- 15m–1h: {row['nan_15m_1h_count']} gaps, {row['nan_15m_1h_steps']:,} steps ({row['nan_15m_1h_pct']:.2f}%)
- 1h–24h: {row['nan_1h_24h_count']} gaps, {row['nan_1h_24h_steps']:,} steps ({row['nan_1h_24h_pct']:.2f}%)
- 1d–7d:  {row['nan_1d_7d_count']} gaps, {row['nan_1d_7d_steps']:,} steps ({row['nan_1d_7d_pct']:.2f}%)
- 1w–1mo: {row['nan_1w_1mo_count']} gaps, {row['nan_1w_1mo_steps']:,} steps ({row['nan_1w_1mo_pct']:.2f}%)
- >1mo:   {row['nan_1mo_inf_count']} gaps, {row['nan_1mo_inf_steps']:,} steps ({row['nan_1mo_inf_pct']:.2f}%)
"""

print(summary)


Unnamed: 0,file_path,n_total,n_nans,n_zeros,n_nans_start,n_nans_end,n_nans_middle,n_missing_gaps,missing_gap_total_sec,nan_1_step_count,nan_1_step_steps,nan_1_step_pct,nan_5s_30s_count,nan_5s_30s_steps,nan_5s_30s_pct,nan_30s_1m_count,nan_30s_1m_steps,nan_30s_1m_pct,nan_1m_15m_count,nan_1m_15m_steps,nan_1m_15m_pct,nan_15m_1h_count,nan_15m_1h_steps,nan_15m_1h_pct,nan_1h_24h_count,nan_1h_24h_steps,nan_1h_24h_pct,nan_1d_7d_count,nan_1d_7d_steps,nan_1d_7d_pct,nan_1w_1mo_count,nan_1w_1mo_steps,nan_1w_1mo_pct,nan_1mo_inf_count,nan_1mo_inf_steps,nan_1mo_inf_pct
0,dataset_clean\EPI_ChipPress\Freq\2018_Freq.csv.xz,6307200,666965,0,0,719,666246,320844,3334820,320689,1603445,48.08,79,1090,0.03,18,820,0.02,49,7665,0.23,3,6595,0.2,4,18020,0.54,1,198470,5.95,1,1498715,44.94,0,0,0.0



📊 **Summary for dataset_clean\EPI_ChipPress\Freq\2018_Freq.csv.xz**

- Total values: 6,307,200
- Missing (NaN): 666,965 (10.57%)
- Constant zeros: 0

🔹 Missing structure:
- Missing at start: 0
- Missing at end:   719
- Missing in middle: 666,246

🔹 Gap statistics:
- Number of gaps: 320,844
- Total missing duration: 3,334,820 sec

🔹 Gap size categories:
- 1 step: 320689 gaps, 1,603,445 steps (48.08%)
- 5s–30s: 79 gaps, 1,090 steps (0.03%)
- 30s–1m: 18 gaps, 820 steps (0.02%)
- 1m–15m: 49 gaps, 7,665 steps (0.23%)
- 15m–1h: 3 gaps, 6,595 steps (0.20%)
- 1h–24h: 4 gaps, 18,020 steps (0.54%)
- 1d–7d:  1 gaps, 198,470 steps (5.95%)
- 1w–1mo: 1 gaps, 1,498,715 steps (44.94%)
- >1mo:   0 gaps, 0 steps (0.00%)

