In [None]:
# ======================================
# Cell 1: Install Required Packages
# ======================================
# Install all required libraries for preprocessing
!pip install pandas numpy openpyxl

In [None]:
# ======================================
# Cell 2: Import Required Libraries
# ======================================
# Import necessary libraries for data handling and processing
from pathlib import Path

import numpy as np
import pandas as pd

In [None]:
# ======================================
# Cell 3: Load and Clean Load Data
# ======================================
# Load load dataset from GitHub repo clone or local path
load_df = pd.read_csv("../data/raw/2d_Agg_Load_Summary-17-APR-25.csv")

# Strip surrounding whitespace once so both parsing attempts see the same text.
sced_stamps = load_df['SCED Time Stamp'].str.strip()

# Parse the timestamp column: try the format with seconds first and, if the
# column does not match it (ValueError), fall back to minute resolution.
try:
    load_df['Datetime'] = pd.to_datetime(sced_stamps, format='%m/%d/%Y %H:%M:%S')
except ValueError:
    load_df['Datetime'] = pd.to_datetime(sced_stamps, format='%m/%d/%Y %H:%M')

# Floor datetime to the hour level.
# Lowercase 'h' is used because the uppercase 'H' alias is deprecated in
# pandas >= 2.2 and emits a FutureWarning there.
load_df['Datetime_hour'] = load_df['Datetime'].dt.floor('h')

In [None]:
# ======================================
# Cell 4: Feature Engineering
# ======================================
# Sort data chronologically on the full-resolution timestamp.
# Sorting on the floored 'Datetime_hour' with the default (non-stable) sort
# leaves the relative order of rows that share an hour unspecified, which
# would corrupt the order-dependent diff/rolling features below. A stable
# sort keeps the file order for rows with identical timestamps.
load_df = load_df.sort_values('Datetime', kind='mergesort')

total_load = load_df['AGG LOAD SUMMARY']
total_gen = load_df['SUM TELEM GEN MW']

# Create load ramp: change in total load versus the previous row.
# NOTE(review): this is hour-over-hour only if the data has exactly one row
# per hour — confirm the sampling interval of the source file.
load_df['load_ramp'] = total_load.diff()

# Create ratio and delta features relative to total generation.
# Divisions by zero yield +/-inf here; no clean-up happens in this cell.
load_df['load_gen_ratio'] = total_load / total_gen
load_df['load_minus_gen'] = total_load - total_gen
load_df['tie_flow_pct'] = load_df['SUM TELEM DCTIE MW'] / total_load

# Extract temporal features from the hour-floored timestamp
load_df['hour'] = load_df['Datetime_hour'].dt.hour
load_df['day_of_week'] = load_df['Datetime_hour'].dt.dayofweek
# Peak flag is 1 for hours 16 through 20 inclusive, else 0
load_df['is_peak_hour'] = ((load_df['hour'] >= 16) & (load_df['hour'] <= 20)).astype(int)

# Rolling averages for historical context.
# Windows are counted in rows (3 or 6 consecutive rows), not in clock time;
# the first window-1 rows of each feature are NaN.
load_df['3hr_load_ma'] = total_load.rolling(window=3).mean()
load_df['6hr_load_ma'] = total_load.rolling(window=6).mean()
load_df['3hr_gen_ma'] = total_gen.rolling(window=3).mean()

In [None]:
# ======================================
# Cell 5: Clean Final DataFrame
# ======================================
# Keep only relevant columns (excluding price or external fields)
columns_to_keep = [
    'Datetime_hour', 'AGG LOAD SUMMARY', 'SUM TELEM GEN MW', 'SUM TELEM DCTIE MW',
    'load_ramp', 'load_gen_ratio', 'load_minus_gen', 'tie_flow_pct',
    'hour', 'day_of_week', 'is_peak_hour',
    '3hr_load_ma', '6hr_load_ma', '3hr_gen_ma'
]

# Select the columns first, then replace infinite values and drop rows with
# missing data. Cleaning after selection means a NaN in an unrelated raw
# column can no longer discard an otherwise valid row. The result is a new
# frame, so `load_df` is left untouched and this cell can be re-run safely.
final_df = (
    load_df[columns_to_keep]
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)

In [None]:
# ======================================
# Cell 6: Export Processed Data
# ======================================
# Display head of final data to confirm structure
print(final_df.head())

# Save the processed data (no price column) for modeling.
# The output directory is created first so the export does not fail with an
# OSError when ../data/processed does not exist yet (e.g. on a fresh clone).
processed_path = Path("../data/processed/processed_totalenergies_load_data.csv")
processed_path.parent.mkdir(parents=True, exist_ok=True)
final_df.to_csv(processed_path, index=False)