In [3]:
import pandas as pd
import os
import warnings

# Silence FutureWarning so the pipeline log stays readable.
# NOTE: this also hides deprecation notices raised by the steps further down.
warnings.filterwarnings('ignore', category=FutureWarning)

# --- 1. FILE PATHS ---
# Both paths are relative to the directory the notebook/script is run from.
input_parquet_file = '../data/electricity/processed/demanddata_combined.parquet'
final_parquet_file_path = '../data/electricity/processed/demand_model_ready.parquet'

print("--- Starting Final Data Preparation for Electricity Demand ---")

# --- 2. LOAD DATA ---
print("1. Loading combined data...")
try:
    df = pd.read_parquet(input_parquet_file)
except FileNotFoundError:
    print(f"\nERROR: Input file not found at {input_parquet_file}. Please check path and run 01/02 notebooks.")
    # Stop here with a non-zero status. The bare `exit()` helper is injected by
    # the `site` module for interactive use and reports success (status 0)
    # when called without an argument, which would mask the failure.
    raise SystemExit(1)

# --- 3. CORE CLEANING & TYPE FIXES ---

# A. Normalise column labels: lower-case them, then swap spaces for underscores.
lowercased_labels = df.columns.str.lower()
df.columns = lowercased_labels.str.replace(' ', '_')
print(f"  - Columns renamed to snake_case. Initial shape: {df.shape}")

# B. Type corrections.
#    - 'settlement_period' must be an integer because it feeds the
#      minutes-offset arithmetic used to build the datetime index.
#    - 'nd' (the target) becomes numeric; values that cannot be parsed turn
#      into NaN.
df = df.assign(
    settlement_period=df['settlement_period'].astype(int),
    nd=pd.to_numeric(df['nd'], errors='coerce'),
)
print("  - 'settlement_period' and 'nd' coerced to numeric types.")

# --- 4. TIME-SERIES FEATURE ENGINEERING ---

# C. Build the datetime index.
# Period 1 maps to 00:00 of the settlement date; each later period adds 30 minutes.
# NOTE(review): the resulting index is timezone-naive. If the source numbers
# periods in local time with clock-change days (fewer/more than 48 periods),
# those days could produce gaps or duplicate timestamps - confirm upstream.
half_hours_elapsed = df['settlement_period'] - 1
df['settlement_date'] = pd.to_datetime(df['settlement_date'])
df['time_offset'] = pd.to_timedelta(half_hours_elapsed * 30, unit='m')
df['datetime'] = df['settlement_date'] + df['time_offset']

# Promote the timestamp to the index, order chronologically, and discard the
# helper columns that were only needed to compute it.
df = (
    df.set_index('datetime')
    .sort_index()
    .drop(columns=['settlement_date', 'settlement_period', 'time_offset'])
)
print("  - Datetime Index created and set.")


# D. Derive calendar features from the datetime index.
timestamps = df.index
df = df.assign(
    year=timestamps.year,
    month=timestamps.month,
    hour=timestamps.hour,
    day_of_week=timestamps.dayofweek,  # Monday=0 ... Sunday=6
)
# Saturday (5) and Sunday (6) are flagged as 1, every other day as 0.
df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)
# Keep the weekday name as text; it is one-hot encoded in a later step.
df['day_name'] = timestamps.day_name()

print("  - Extracted year, month, hour, day_of_week, and is_weekend.")

# --- 5. DATA QUALITY & ENCODING ---

# E. Missing Value Imputation (time-weighted interpolation)
# Covers NaNs present in the input as well as those introduced by
# pd.to_numeric(errors='coerce') on the target column.
cols_to_impute = df.columns[df.isnull().any()].tolist()  # every column with a NaN

if cols_to_impute:
    # Time-weighted interpolation is only defined for numeric data. Calling
    # interpolate() on object-dtype columns is deprecated in pandas (the
    # FutureWarning is silenced at the top of this script) and is slated to
    # raise in a later release, so restrict it to the numeric columns.
    numeric_cols_to_impute = (
        df[cols_to_impute].select_dtypes(include='number').columns.tolist()
    )
    if numeric_cols_to_impute:
        df[numeric_cols_to_impute] = df[numeric_cols_to_impute].interpolate(
            method='time', axis=0
        )
    # Fill whatever is left (leading/trailing gaps, non-numeric columns) with
    # the nearest observed value.
    # NOTE(review): bfill copies later observations backwards in time; if this
    # data feeds a forecasting model, confirm that is acceptable.
    df[cols_to_impute] = df[cols_to_impute].ffill().bfill()
    print(f"  - Applied time-series interpolation to {len(cols_to_impute)} columns.")
else:
    print("  - No missing values found for imputation.")


# F. Categorical Encoding (One-Hot Encoding)
# Expand 'day_name' into indicator columns prefixed with 'day'; the first
# level is dropped so the indicators are not perfectly collinear. The numeric
# 'day_of_week' column is then removed because the indicators carry the same
# information.
df = pd.get_dummies(
    df,
    columns=['day_name'],
    prefix='day',
    drop_first=True,
).drop(columns=['day_of_week'])
print("  - Applied One-Hot Encoding to 'day_name'.")

print(f"\nFinal number of columns: {df.shape[1]}")

# --- 6. FINAL SAVE ---
print("\n2. Saving final model-ready file to Parquet...")

# Create the destination folder if it is not there yet.
output_directory = os.path.dirname(final_parquet_file_path)
os.makedirs(output_directory, exist_ok=True)

# Persist the frame; index=True keeps the datetime index in the file.
df.to_parquet(final_parquet_file_path, index=True)

print("\n--- Final Prep Complete! ---")
print(f"Model-ready data saved to: {final_parquet_file_path}")
print(f"Final DataFrame shape: {df.shape}")

--- Starting Final Data Preparation for Electricity Demand ---
1. Loading combined data...
  - Columns renamed to snake_case. Initial shape: (434590, 22)
  - 'settlement_period' and 'nd' coerced to numeric types.
  - Datetime Index created and set.
  - Extracted year, month, hour, day_of_week, and is_weekend.
  - Applied time-series interpolation to 15 columns.
  - Applied One-Hot Encoding to 'day_name'.

Final number of columns: 30

2. Saving final model-ready file to Parquet...

--- Final Prep Complete! ---
Model-ready data saved to: ../data/electricity/processed/demand_model_ready.parquet
Final DataFrame shape: (434590, 30)
