# 01 — Prepare PV & Load (authoritative clean)
This notebook:
1) Robustly loads `data/interim/pv_5homes_10min.csv` and `data/interim/load_5homes_10min.csv`
2) Detects timestamp column and numeric columns
3) Aligns on **time only**: keeps the timestamps present in both files and leaves the columns untouched
4) Normalizes columns to `h1..hN`
5) Detects/repairs unit mismatch (PV in kWh/10min vs Load in kW → multiply PV by 6; applied only when the median positive Load is 5–7.5× the median positive PV)
6) Writes cleaned artifacts to `data/processed/` and a quick meta report to `reports/`


In [9]:
from pathlib import Path
import pandas as pd
import warnings

# silence pandas' noisy datetime warnings
warnings.filterwarnings("ignore", category=UserWarning, message=".*infer_datetime_format.*")

# ---- locate project root no matter where we run ----
# Walk from the working directory up through every ancestor and take the
# first directory that contains data/interim. The nearest match wins, so a
# run from the project root or one/two levels below resolves exactly as before.
CWD = Path.cwd()
ROOT = next((c for c in [CWD, *CWD.parents] if (c / "data" / "interim").exists()), None)
if ROOT is None:
    raise FileNotFoundError("Could not find 'data/interim' from here.")

# Input and output locations, all derived from ROOT.
DATA_INTERIM  = ROOT / "data" / "interim"
DATA_PROCESSED= ROOT / "data" / "processed"
REPORTS       = ROOT / "reports"
# Output folders may not exist yet on a fresh checkout.
for p in [DATA_PROCESSED, REPORTS]:
    p.mkdir(parents=True, exist_ok=True)

PV_FILE   = DATA_INTERIM / "pv_5homes_10min.csv"
LOAD_FILE = DATA_INTERIM / "load_5homes_10min.csv"
# Nominal sampling interval; only recorded in the meta report, not enforced here.
FREQ      = "10min"

print("ROOT =", ROOT)
print("PV_FILE  =", PV_FILE)
print("LOAD_FILE=", LOAD_FILE)


ROOT = e:\VPP
PV_FILE  = e:\VPP\data\interim\pv_5homes_10min.csv
LOAD_FILE= e:\VPP\data\interim\load_5homes_10min.csv


In [10]:
def load_numeric_timeindexed_csv(path: Path) -> pd.DataFrame:
    """
    Robust CSV loader:
      - auto-detect separator
      - auto-detect a datetime column (first non-numeric column that is
        >=90% parseable) → set as index; rows whose timestamp could not
        be parsed are dropped
      - coerce remaining columns to numeric, retrying each text column with
        decimal commas ("1,5" → 1.5) when that parses more values

    Parameters
    ----------
    path : Path
        CSV file to read.

    Returns
    -------
    pd.DataFrame
        Numeric-only DataFrame indexed by datetime, sorted by index.

    Raises
    ------
    FileNotFoundError
        If ``path`` does not exist.
    ValueError
        If no datetime-like column is detected, or no numeric columns
        remain after parsing.
    """
    if not path.exists():
        raise FileNotFoundError(f"Missing: {path}")
    df_raw = pd.read_csv(path, sep=None, engine="python")

    # detect datetime-like column
    dt_col = None
    for c in df_raw.columns:
        # Numbers always "parse" as datetimes (epoch nanoseconds), so a leading
        # id/value column would otherwise be mistaken for the timestamp.
        if pd.api.types.is_numeric_dtype(df_raw[c]):
            continue
        parsed = pd.to_datetime(df_raw[c], errors="coerce")
        if parsed.notna().mean() >= 0.9:
            dt_col = c
            df_raw[c] = parsed
            break
    if dt_col is None:
        raise ValueError(f"{path.name}: no datetime-like column detected.")

    # Up to 10% of timestamps may have failed to parse; those rows cannot be
    # aligned on time, so drop them instead of keeping a NaT index label.
    df = df_raw.dropna(subset=[dt_col]).set_index(dt_col).sort_index()

    def _to_number(col: pd.Series) -> pd.Series:
        """Coerce one column to numeric, preferring decimal-comma parsing when it recovers more values."""
        plain = pd.to_numeric(col, errors="coerce")
        if pd.api.types.is_numeric_dtype(col):
            return plain
        # errors="coerce" turns "1,5" into NaN rather than failing, so the
        # decimal-comma retry has to be decided per column by comparing how
        # many values each variant manages to parse.
        swapped = pd.to_numeric(
            col.astype(str).str.replace(",", ".", regex=False), errors="coerce"
        )
        return swapped if swapped.notna().sum() > plain.notna().sum() else plain

    df_num = df.apply(_to_number).select_dtypes("number")
    if df_num.empty:
        raise ValueError(f"{path.name}: no numeric columns after parsing.")
    return df_num


In [11]:
# Read both interim files through the shared loader (PV first, then Load).
pv_raw, load_raw = (load_numeric_timeindexed_csv(src) for src in (PV_FILE, LOAD_FILE))

# Align on TIME ONLY: keep the timestamps both frames share and leave the
# columns exactly as loaded.
common_idx = pv_raw.index.intersection(load_raw.index)
if len(common_idx) == 0:
    raise ValueError("No overlapping timestamps between PV and Load.")
pv, ld = (frame.loc[common_idx].sort_index() for frame in (pv_raw, load_raw))

# normalize to h1..hN
def to_home_cols(cols, prefix):
    """
    Build a rename map from ``<prefix><number>`` column labels to ``h<number>``.

    Parameters
    ----------
    cols : iterable
        Column labels to inspect; each is compared via its ``str()`` form.
    prefix : str
        Leading text that marks a home column, e.g. ``"pv_"``.

    Returns
    -------
    dict
        Maps each original label whose remainder after ``prefix`` is a
        base-10 integer to ``"h<int>"`` (leading zeros are dropped, so
        ``"pv_02"`` maps to ``"h2"``). Labels that do not match are omitted.
    """
    mp = {}
    for c in cols:
        s = str(c)
        if not s.startswith(prefix):
            continue
        num = s[len(prefix):]
        # isdecimal(), not isdigit(): isdigit() also accepts characters such
        # as superscript "²" that int() cannot convert and would raise on.
        if num.isdecimal():
            mp[c] = f"h{int(num)}"
    return mp

# Map source-specific labels (pv_<k> / load_<k>) onto the shared h<k> form.
pv, ld = (
    pv.rename(columns=to_home_cols(pv.columns, "pv_")),
    ld.rename(columns=to_home_cols(ld.columns, "load_")),
)

# No label in common after renaming: pair the columns by position instead,
# truncating both frames to the narrower width.
if len(pv.columns.intersection(ld.columns)) == 0:
    n = min(len(pv.columns), len(ld.columns))
    pv, ld = pv.iloc[:, :n].copy(), ld.iloc[:, :n].copy()
    pv.columns = [f"h{k}" for k in range(1, n + 1)]
    ld.columns = [f"h{k}" for k in range(1, n + 1)]

# Keep only the homes present in both frames.
homes = list(pv.columns.intersection(ld.columns))
pv, ld = pv[homes], ld[homes]

pv.head(), ld.head(), homes


(                               h1      h2      h3      h4      h5
 Datetime (UTC)                                                   
 2018-08-23 00:00:00+00:00  1.4553  0.8685  0.2131  1.3281  0.6848
 2018-08-23 00:10:00+00:00  1.2215  0.7258  0.2074  1.0968  0.5842
 2018-08-23 00:20:00+00:00  0.7083  0.4063  0.1484  0.6183  0.3586
 2018-08-23 00:30:00+00:00  0.2100  0.1199  0.0822  0.2169  0.1486
 2018-08-23 00:40:00+00:00  0.2501  0.0524  0.0092  0.1928  0.0619,
                                h1      h2      h3      h4      h5
 Datetime (UTC)                                                   
 2018-08-23 00:00:00+00:00  4.9506  4.8633  3.3314  3.0321  6.3395
 2018-08-23 00:10:00+00:00  4.9671  7.6854  3.3686  2.9578  8.8531
 2018-08-23 00:20:00+00:00  4.9438  2.8980  3.4492  2.9553  6.6892
 2018-08-23 00:30:00+00:00  4.9768  6.7631  3.2522  3.0799  6.4424
 2018-08-23 00:40:00+00:00  4.8692  4.7304  3.1914  3.2387  6.2083,
 ['h1', 'h2', 'h3', 'h4', 'h5'])

In [12]:
def pos_median(df: pd.DataFrame) -> float:
    """
    Return the median of the per-column medians of the strictly positive values.

    Values that are zero or negative are masked out before each column's
    median is taken. The result is NaN when no column has a positive value.
    """
    positives_only = df.where(df > 0)
    column_medians = positives_only.median(numeric_only=True)
    return float(column_medians.median())

pv_med = pos_median(pv)
ld_med = pos_median(ld)

apply_times6 = False
# Only compare the two medians when both exist and are strictly positive.
if not (pd.isna(pv_med) or pd.isna(ld_med) or pv_med <= 0 or ld_med <= 0):
    ratio = ld_med / pv_med
    # A Load/PV ratio near 6 is read as PV being energy per 10-min slot
    # (kWh/slot) while Load is power (kW); six slots per hour gives the factor.
    # NOTE(review): this relies on typical PV and Load magnitudes being
    # comparable, so confirm the 5.0-7.5 window against the data source.
    apply_times6 = 5.0 <= ratio <= 7.5
    if apply_times6:
        pv = pv * 6.0  # convert PV to kW

print({"pv_med": pv_med, "load_med": ld_med, "x6_applied": apply_times6})


{'pv_med': 2.58225, 'load_med': 3.2796, 'x6_applied': False}


In [13]:
# Per-home PV minus Load; cells where either side is missing become 0.0.
surplus = pv.sub(ld).fillna(0.0)

# Output locations for the cleaned frames and the text meta report.
pv_out, ld_out, sup_out = (
    DATA_PROCESSED / fname
    for fname in ("pv_kW_10min.csv", "load_kW_10min.csv", "surplus_kW_10min.csv")
)
meta_out = REPORTS / "prepare_meta.txt"

for frame, target in ((pv, pv_out), (ld, ld_out), (surplus, sup_out)):
    frame.to_csv(target)

# Short summary of what was produced and whether PV was rescaled.
meta_lines = [
    "01_prepare_pv_load — meta\n",
    f"rows={len(pv)} homes={len(homes)} freq={FREQ}\n",
    f"unit_fix_x6_applied={apply_times6}\n",
    f"timestamp_range={pv.index.min()} .. {pv.index.max()}\n",
    f"homes={homes}\n",
]
meta_out.write_text("".join(meta_lines), encoding="utf-8")

print("Wrote:")
for written in (pv_out, ld_out, sup_out, meta_out):
    print("  ", written.as_posix())


Wrote:
   e:/VPP/data/processed/pv_kW_10min.csv
   e:/VPP/data/processed/load_kW_10min.csv
   e:/VPP/data/processed/surplus_kW_10min.csv
   e:/VPP/reports/prepare_meta.txt
