In [1]:
import pandas as pd
import numpy as np

# Build 10 days of hourly timestamps (2025-01-01 00:00 through 2025-01-10 23:00).
# Lowercase "h" is the hourly alias; the uppercase "H" spelling is deprecated
# in recent pandas releases and emits a warning on this call.
date_rng = pd.date_range(start="2025-01-01", end="2025-01-10 23:00", freq="h")

# Synthetic power consumption: a constant 500 baseline, a sine wave of
# amplitude 200 with a 24-hour period, and Gaussian noise (std 30).
# The legacy global seed is kept so the generated values stay reproducible.
np.random.seed(42)
power_values = 500 + 200 * np.sin(2 * np.pi * date_rng.hour / 24) + np.random.normal(0, 30, len(date_rng))

# Combine timestamps and values into a two-column DataFrame.
df_sample = pd.DataFrame({
    "timestamp": date_rng,
    "power": power_values
})

# Save to CSV in the current working directory (index column omitted).
sample_path = "sample_power_data.csv"
df_sample.to_csv(sample_path, index=False)

print(f"✅ Sample dataset saved as {sample_path}")
df_sample.head()


✅ Sample dataset saved as sample_power_data.csv


  date_rng = pd.date_range(start="2025-01-01", end="2025-01-10 23:00", freq="H")


Unnamed: 0,timestamp,power
0,2025-01-01 00:00:00,514.901425
1,2025-01-01 01:00:00,547.61588
2,2025-01-01 02:00:00,619.430656
3,2025-01-01 03:00:00,687.112252
4,2025-01-01 04:00:00,666.18048


In [2]:
# NOTE(review): `run_pipeline` is not defined or imported in any cell shown
# before this one, and the recorded output of this cell is a NameError for it.
# It must be defined/imported first — TODO confirm which module provides it.
import json  # required by json.dumps below; no earlier cell imports it

# Run the pipeline on the synthetic CSV written by the first cell.
metrics = run_pipeline(
    data_path="sample_power_data.csv",
    ts_col="timestamp",
    y_col="power",
    resample="H",
    add_calendar=True,
    use_rf=True,
    outdir="outputs/test_run"
)

# Pretty-print the returned metrics as JSON.
print(json.dumps(metrics, indent=2))


NameError: name 'run_pipeline' is not defined

In [2]:
import h5py

# Location of the HDF5 archive that the following cells inspect.
file_path = r"A:\AI in Engg\Project\ukdale.h5"

# Open the file read-only and show the names found at its root.
with h5py.File(file_path, mode="r") as h5_file:
    top_level_keys = list(h5_file.keys())
    print("Top-level keys:", top_level_keys)


Top-level keys: ['building1', 'building2', 'building3', 'building4', 'building5']


In [3]:
# Show the names stored directly under the "building1" group.
with h5py.File(file_path, mode="r") as h5_file:
    building1_members = list(h5_file["building1"].keys())
    print("Datasets in building1:", building1_members)


Datasets in building1: ['elec']


In [10]:
import h5py

# Path to the HDF5 archive (same file as inspected in the earlier cells).
file_path = r"A:\AI in Engg\Project\ukdale.h5"

# Read-only open; print the names found at the root of the file.
with h5py.File(file_path, mode="r") as h5_file:
    root_names = list(h5_file.keys())
    print("Top-level keys:", root_names)


Top-level keys: ['building1', 'building2', 'building3', 'building4', 'building5']


In [11]:
# Walk one level further down: building1, then building1/elec.
with h5py.File(file_path, mode="r") as h5_file:
    building1 = h5_file["building1"]
    print("Datasets in building1:", list(building1.keys()))
    print("Datasets in building1/elec:", list(building1["elec"].keys()))


Datasets in building1: ['elec']
Datasets in building1/elec: ['meter1', 'meter10', 'meter11', 'meter12', 'meter13', 'meter14', 'meter15', 'meter16', 'meter17', 'meter18', 'meter19', 'meter2', 'meter20', 'meter21', 'meter22', 'meter23', 'meter24', 'meter25', 'meter26', 'meter27', 'meter28', 'meter29', 'meter3', 'meter30', 'meter31', 'meter32', 'meter33', 'meter34', 'meter35', 'meter36', 'meter37', 'meter38', 'meter39', 'meter4', 'meter40', 'meter41', 'meter42', 'meter43', 'meter44', 'meter45', 'meter46', 'meter47', 'meter48', 'meter49', 'meter5', 'meter50', 'meter51', 'meter52', 'meter53', 'meter54', 'meter6', 'meter7', 'meter8', 'meter9']


In [12]:
import h5py

file_path = r"A:\AI in Engg\Project\ukdale.h5"

# Collect every meter under building1/elec whose HDF5 attributes carry
# site_meter == 1, printing each meter's full attribute dict along the way.
# NOTE(review): the attributes printed for meter1 contain no "site_meter" key,
# so this list may well stay empty for this file — confirm where that flag lives.
mains_candidates = []
with h5py.File(file_path, mode="r") as h5_file:
    elec = h5_file["building1"]["elec"]
    for meter_name, meter_group in elec.items():  # e.g., 'meter1', 'meter2', ...
        meter_attrs = dict(meter_group.attrs)
        # A missing attribute is treated as 0 (not a site meter).
        if int(meter_attrs.get("site_meter", 0)) == 1:
            mains_candidates.append(meter_name)
        print(f"{meter_name:>7}  attrs={meter_attrs}")

print("\nMains-like meters (site_meter=1):", mains_candidates)


 meter1  attrs={'CLASS': np.bytes_(b'GROUP'), 'TITLE': np.bytes_(b''), 'VERSION': np.bytes_(b'1.0'), 'data_columns': np.bytes_(b'(lp1\n.'), 'encoding': np.bytes_(b'N.'), 'index_cols': np.bytes_(b"(lp1\n(I0\nS'index'\np2\ntp3\na."), 'info': np.bytes_(b"(dp1\nI1\n(dp2\nS'type'\np3\nS'MultiIndex'\np4\nsS'names'\np5\n(lp6\nS'physical_quantity'\np7\nag3\nassS'index'\np8\n(dp9\nS'tz'\np10\ncpytz\n_p\np11\n(S'Europe/London'\np12\nI-60\nI0\nS'LMT'\np13\ntRp14\nss."), 'levels': np.int64(1), 'metadata': np.bytes_(b'(lp1\n.'), 'nan_rep': np.bytes_(b'nan'), 'non_index_axes': np.bytes_(b"(lp1\n(I1\n(lp2\n(S'power'\np3\nS'apparent'\np4\ntp5\natp6\na."), 'pandas_type': np.bytes_(b'frame_table'), 'pandas_version': np.bytes_(b'0.15.2'), 'table_type': np.bytes_(b'appendable_frame'), 'values_cols': np.bytes_(b"(lp1\nS'values_block_0'\np2\na.")}
meter10  attrs={'CLASS': np.bytes_(b'GROUP'), 'TITLE': np.bytes_(b''), 'VERSION': np.bytes_(b'1.0'), 'data_columns': np.bytes_(b'(lp1\n.'), 'encoding': np.bytes_(

In [13]:
import pandas as pd

file_path = r"A:\AI in Engg\Project\ukdale.h5"

with pd.HDFStore(file_path, mode="r") as store:
    # Keep only the per-meter tables of building 1.
    keys = [k for k in store.keys() if k.startswith("/building1/elec/meter")]
    print("Found meter tables:", len(keys))
    print(keys[:15], " ...")
    if keys:
        # Peek at the first table to see its column structure. Only the first
        # few rows are read: the columns and index tz are the same as for the
        # full table, and loading a whole meter just to inspect it is wasteful.
        sample = store.select(keys[0], stop=5)
        print("Sample columns:", sample.columns)
        print("Index tz info:", getattr(sample.index, "tz", None))
    else:
        # Nothing matched the prefix; say so instead of failing on keys[0].
        print("No meter tables found under /building1/elec")


Found meter tables: 54
['/building1/elec/meter1', '/building1/elec/meter10', '/building1/elec/meter11', '/building1/elec/meter12', '/building1/elec/meter13', '/building1/elec/meter14', '/building1/elec/meter15', '/building1/elec/meter16', '/building1/elec/meter17', '/building1/elec/meter18', '/building1/elec/meter19', '/building1/elec/meter2', '/building1/elec/meter20', '/building1/elec/meter21', '/building1/elec/meter22']  ...
Sample columns: MultiIndex([('power', 'apparent')],
           names=['physical_quantity', 'type'])
Index tz info: Europe/London


In [14]:
import pandas as pd

def read_meter_series(h5_path: str, meter_key: str) -> pd.Series:
    """Load one meter table from an HDF5 file and return a single power series.

    Args:
        h5_path: Path of the HDF5 file to read.
        meter_key: Key of the table inside the file.

    Returns:
        The ("power", "active") column when the table has one, otherwise the
        ("power", "apparent") column, with missing values dropped and rows
        sorted by index.
    """
    meter_frame = pd.read_hdf(h5_path, meter_key)
    # Active power is preferred; apparent power is the fallback.
    has_active = ("power", "active") in meter_frame.columns
    column = ("power", "active") if has_active else ("power", "apparent")
    return meter_frame[column].dropna().sort_index()


In [15]:
import warnings

h5 = r"A:\AI in Engg\Project\ukdale.h5"

mains_key, stats = None, []

# The store is only needed to list the meter keys; close it before the reads
# below, which open the file again through read_meter_series.
with pd.HDFStore(h5, "r") as store:
    keys = [k for k in store.keys() if k.startswith("/building1/elec/meter")]

# For each meter: average within 1-minute bins, then average those bins.
for k in keys:
    try:
        s = read_meter_series(h5, k)
        m = float(s.resample("1min").mean().mean())
    except Exception as e:
        # Best effort: a meter that cannot be read is reported and skipped.
        warnings.warn(f"Skip {k}: {e}")
        continue
    if pd.isna(m):
        # A meter with no samples yields NaN, and NaN values make the sort
        # order below unreliable, so leave such meters out of the ranking.
        warnings.warn(f"Skip {k}: no usable samples (mean is NaN)")
        continue
    stats.append((k, m))

if not stats:
    # Fail with a clear message instead of an IndexError on stats[0].
    raise RuntimeError("No meter produced a usable mean power; cannot choose mains_key")

# NOTE(review): picking the meter with the highest mean power is a heuristic;
# the meters may cover different time spans — confirm the chosen one is the
# site mains.
stats.sort(key=lambda x: x[1], reverse=True)
print("Top candidates:", stats[:5])
mains_key = stats[0][0]
print("Chosen mains_key:", mains_key)


Top candidates: [('/building1/elec/meter22', 557.4097290039062), ('/building1/elec/meter41', 409.7539978027344), ('/building1/elec/meter1', 388.5457763671875), ('/building1/elec/meter54', 342.5327453613281), ('/building1/elec/meter39', 112.47698211669922)]
Chosen mains_key: /building1/elec/meter22


In [16]:
# Load the series of the meter chosen as mains.
s = read_meter_series(h5, mains_key)

# Average into 1-minute bins, then express the index in UTC and drop the
# timezone so the CSV holds plain (naive) timestamps.
s1 = s.resample("1min").mean().tz_convert("UTC").tz_localize(None)

# Two-column frame: the former index first, the values second.
df = s1.reset_index().set_axis(["timestamp", "power"], axis=1)

csv_path = r"A:\AI in Engg\Project\ukdale_house1.csv"
df.to_csv(csv_path, index=False)

print("✅ Saved CSV:", csv_path)
print(df.head())


✅ Saved CSV: A:\AI in Engg\Project\ukdale_house1.csv
            timestamp        power
0 2013-02-14 20:57:00   337.375000
1 2013-02-14 20:58:00  1876.599976
2 2013-02-14 20:59:00  1974.800049
3 2013-02-14 21:00:00  1976.400024
4 2013-02-14 21:01:00  1949.699951


In [17]:
# NOTE(review): `run_pipeline` is not defined or imported in any cell shown in
# this notebook; the recorded output of this cell is a NameError for it.
# Baseline run on the exported house-1 CSV.
pipeline_kwargs = dict(
    data_path=r"A:\AI in Engg\Project\ukdale_house1.csv",
    ts_col="timestamp",
    y_col="power",
    resample="H",
    add_calendar=True,
    use_rf=True,
    outdir="outputs/ukdale_baseline",
)
metrics = run_pipeline(**pipeline_kwargs)

# Pretty-print the returned metrics as JSON.
import json
print(json.dumps(metrics, indent=2))


NameError: name 'run_pipeline' is not defined