In [1]:
from pathlib import Path
import pandas as pd

In [2]:
from pathlib import Path
import pandas as pd
import numpy as np

def _is_hess_blob_column(df, col):
    """Return True if ``col`` is a ``*.hess`` column holding blob-like (or no) data."""
    # Column labels are not guaranteed to be strings (ints, tuples, ...), so
    # guard before calling a str method on them.
    if not (isinstance(col, str) and col.endswith(".hess")):
        return False

    # dropna() recognises every pandas missing-value marker (None, NaN, pd.NA,
    # NaT), not only None / Python-float NaN.
    present = df[col].dropna()
    if present.empty:
        # Nothing to sanity-check against: the name alone decides.
        return True
    return isinstance(present.iloc[0], (bytes, bytearray, str))


def run_cleanup(save_dir):
    """Delete residual .parquet files in ``save_dir`` and strip ``.hess`` columns.

    The function works in two passes over ``save_dir/*.parquet``:

    1. Every file whose name has fewer suffixes (per ``Path.suffixes``) than
       the file with the most suffixes is considered residual and deleted.
    2. Each remaining file is loaded, and any column whose name ends in
       ``.hess`` and whose first non-null value is ``bytes``, ``bytearray`` or
       ``str`` (or that has no non-null value at all) is dropped. The file is
       rewritten only when at least one column was dropped.

    Failures to delete, read or write an individual file are reported with a
    ``[WARN]`` message and do not stop the remaining files from being handled.

    Args:
        save_dir: Directory (``str`` or ``Path``) to scan for ``*.parquet``.

    Returns:
        None.
    """
    print("[INFO]: Cleanup initiated...")

    parquet_files = list(Path(save_dir).glob("*.parquet"))
    if not parquet_files:
        print("No parquet files found.")
        return

    # Prefer Path.suffixes for robust "depth"
    depths = {f: len(f.suffixes) for f in parquet_files}
    max_depth = max(depths.values())

    # Remove residual Parquet files (shallower names)
    kept = []
    for f, d in depths.items():
        if d < max_depth:
            print(f"[INFO]: Removing residual parquet file {f}")
            try:
                f.unlink()
            except Exception as e:
                print(f"[WARN]: Could not remove {f}: {e}")
        else:
            kept.append(f)

    # Also clean the kept Parquet(s): drop any columns that are actual .hess blobs
    for f in kept:
        try:
            df = pd.read_parquet(f)
        except Exception as e:
            print(f"[WARN]: Could not read {f}: {e}")
            continue

        hess_cols = [col for col in df.columns if _is_hess_blob_column(df, col)]

        if not hess_cols:
            print(f"[INFO]: No .hess columns in {f.name}")
            continue

        print(f"[INFO]: Dropping .hess columns from {f.name}: {hess_cols}")
        df = df.drop(columns=hess_cols)

        # Write to a sibling temp file and rename it over the original, so a
        # failed write cannot leave the only copy of the data truncated.
        # The ".tmp" suffix keeps the temp file out of the "*.parquet" glob.
        tmp_path = f.with_name(f.name + ".tmp")
        try:
            df.to_parquet(tmp_path)
            tmp_path.replace(f)
        except Exception as e:
            print(f"[WARN]: Failed writing cleaned Parquet {f}: {e}")
            # Best effort: do not leave a partial temp file behind.
            try:
                if tmp_path.exists():
                    tmp_path.unlink()
            except OSError as cleanup_err:
                print(f"[WARN]: Could not remove {tmp_path}: {cleanup_err}")

    print("[INFO]: Cleanup done!")

In [5]:
# Load the test results, then keep only the columns whose name does not contain "vibs".
df = pd.read_parquet("test_res/test_data.parquet")
novibs = list(filter(lambda name: "vibs" not in name, df.columns))
df = df[novibs]

In [None]:
# run_cleanup() requires the directory to clean; calling it without an argument
# raises TypeError. "test_res" is the directory this notebook reads its Parquet
# data from -- NOTE(review): confirm this is the intended target before running,
# since the call deletes files.
run_cleanup("test_res")