# UK Historic Electricity Demand - Final Data Preparation
**Author:** Abdul Salam Aldabik

In [None]:
# Import libraries
import pandas as pd
import numpy as np
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Announce that the imports succeeded and stamp the run with the current local time.
print("‚úÖ Libraries loaded")
print(f"üìÖ Processing date: {datetime.now():%Y-%m-%d %H:%M}")

## 1. Load Combined Dataset

In [None]:
# Read the combined demand file (produced by data_pull.ipynb) and report its size.
print("üìÇ Loading electricity demand data...")
source_csv = '../Data/neso_historic_demand_combined.csv'
df = pd.read_csv(source_csv)

n_rows, n_cols = df.shape
# deep=True also counts the contents of object (string) columns.
memory_mb = df.memory_usage(deep=True).sum() / 1024**2

print(f"\n‚úÖ Dataset loaded")
print(f"   Rows: {n_rows:,}")
print(f"   Columns: {n_cols}")
print(f"   Memory: {memory_mb:.2f} MB")

## 2. Clean Column Names

In [None]:
# Normalise headers in two steps: lower-case first, then turn spaces into underscores.
lowered_names = df.columns.str.lower()
df.columns = lowered_names.str.replace(' ', '_')

print(f"‚úÖ Column names standardized")
print(f"   Columns: {list(df.columns)}")

## 3. Remove Redundant Columns

In [None]:
# Remove columns that carry no usable signal for modelling:
#   * `redundant`   - nd / tsd, which this pipeline treats as duplicating
#                     england_wales_demand (NOTE(review): not verified here).
#   * `near_empty`  - interconnector/transfer columns assumed to be >90% missing
#                     (NOTE(review): the missing share is not checked in this cell).
#   * constant cols - columns with at most one distinct non-null value.
# settlement_date and settlement_period are never dropped: the time series
# ordering depends on them.

redundant = ['nd', 'tsd']
near_empty = ['nsl_flow', 'eleclink_flow', 'viking_flow', 'greenlink_flow', 'scottish_transfer']
protected = {'settlement_date', 'settlement_period'}

# Constant columns (single distinct value, NaN ignored), excluding the protected ones.
constant_cols = [col for col in df.columns
                 if col not in protected
                 and df[col].nunique(dropna=True) <= 1]

# Keep only candidates that actually exist in the frame, so the reported count
# matches what is really removed (the hard-coded names may be absent from a
# given extract).
drop_candidates = set(redundant + near_empty + constant_cols)
cols_to_drop = [col for col in df.columns if col in drop_candidates]
df = df.drop(columns=cols_to_drop)

print(f"‚úÖ Dropped {len(cols_to_drop)} redundant/empty columns")
print(f"   Remaining: {len(df.columns)} columns")

## 4. Handle Missing Values

In [None]:
# Handle missing settlement_date (critical for time series - cannot be fabricated).
if 'settlement_date' in df.columns:
    # Parse BEFORE dropping: errors='coerce' turns unparseable strings into NaT,
    # so blanks and malformed dates are counted and removed together. Dropping
    # first would leave NaT rows behind in the "clean" data.
    df['settlement_date'] = pd.to_datetime(df['settlement_date'], errors='coerce')

    missing_dates = df['settlement_date'].isna().sum()
    if missing_dates > 0:
        print(f"‚ö†Ô∏è  Dropping {missing_dates:,} rows with missing settlement_date")
        df = df.dropna(subset=['settlement_date'])

    # Sort chronologically so the forward/backward fill below follows time order.
    df = df.sort_values(['settlement_date', 'settlement_period'], ignore_index=True)
    print(f"‚úÖ Data sorted chronologically")

# Fill gaps in numeric columns: forward fill carries the last observation
# forward, then backward fill covers any gap at the very start of the series.
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
for col in numeric_cols:
    missing_before = df[col].isnull().sum()
    if missing_before > 0:
        df[col] = df[col].ffill().bfill()
        print(f"   {col}: filled {missing_before:,} missing values")

print(f"‚úÖ Missing values handled")

## 5. Handle Outliers

In [None]:
# Winsorise numeric columns: values below the 0.5th percentile or above the
# 99.5th percentile are clipped to those bounds (rows are kept, only the
# extreme values are pulled in).
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns

# Temporal/index-like columns must keep their full range, so they are skipped.
uncapped = {'settlement_period', 'year', 'month', 'day'}
capped_cols = [col for col in numeric_cols if col not in uncapped]

for col in capped_cols:
    lower = df[col].quantile(0.005)
    upper = df[col].quantile(0.995)
    df[col] = df[col].clip(lower=lower, upper=upper)

# Report the number of columns actually capped, not every numeric column.
print(f"‚úÖ Outliers capped for {len(capped_cols)} numeric columns")

## 6. Feature Engineering - Temporal Features

In [None]:
# Derive calendar and time-of-day features from settlement_date / settlement_period.
if 'settlement_date' in df.columns:
    df['settlement_date'] = pd.to_datetime(df['settlement_date'])
    stamp = df['settlement_date'].dt

    # Calendar components, added in a fixed order so the column layout is stable.
    calendar_parts = {
        'year': stamp.year,
        'quarter': stamp.quarter,
        'month': stamp.month,
        'day': stamp.day,
        'day_of_year': stamp.dayofyear,
        'week_of_year': stamp.isocalendar().week,
    }
    for feature_name, feature_values in calendar_parts.items():
        df[feature_name] = feature_values

    # Weekday index (0=Monday ... 6=Sunday) and a 0/1 weekend flag (Sat/Sun).
    df['day_of_week'] = stamp.dayofweek
    df['is_weekend'] = df['day_of_week'].ge(5).astype(int)

    # Fractional hour of day: period 1 -> 0.0, period 2 -> 0.5, and so on
    # (assumes half-hourly periods numbered from 1).
    df['hour'] = df['settlement_period'].sub(1).mul(0.5)

    temporal_features = ['year', 'quarter', 'month', 'day', 'day_of_week',
                         'day_of_year', 'week_of_year', 'is_weekend', 'hour']

    print(f"‚úÖ Created {len(temporal_features)} temporal features")
    for feat in ('year', 'month', 'hour'):
        feat_min, feat_max = df[feat].min(), df[feat].max()
        print(f"   {feat}: {feat_min} to {feat_max}")

## 7. Remove Highly Correlated Features

In [None]:
# Prune one column from every pair whose absolute correlation exceeds 0.95.
# Temporal columns are exempt from removal.
temporal_cols = ['settlement_date', 'settlement_period', 'year', 'quarter', 'month',
                 'day', 'day_of_week', 'day_of_year', 'week_of_year', 'is_weekend', 'hour']

numeric_df = df.select_dtypes(include=['float64', 'int64'])
corr_matrix = numeric_df.corr().abs()

# Keep only the strictly-upper triangle so each pair is examined once and the
# diagonal (self-correlation of 1.0) is ignored; masked cells become NaN.
above_diagonal = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(above_diagonal)

# Per column: does it correlate above the threshold with any earlier column?
exceeds_threshold = (upper > 0.95).any()
to_drop_corr = [column for column in upper.columns
                if exceeds_threshold[column] and column not in temporal_cols]

df = df.drop(columns=to_drop_corr)
print(f"‚úÖ Dropped {len(to_drop_corr)} highly correlated columns")
print(f"   Final columns: {len(df.columns)}")

## 8. Final Validation & Save

In [None]:
# Print a closing report on the prepared frame, then write it to disk.
banner = "=" * 70
print("\n" + banner)
print("FINAL DATASET SUMMARY")
print(banner)

row_count, column_count = df.shape
print(f"\nShape: {row_count:,} rows √ó {column_count} columns")

# Date coverage check (only when the date column is present).
if 'settlement_date' in df.columns:
    dates = df['settlement_date']
    print(f"\nDate range: {dates.min()} to {dates.max()}")
    print(f"Unique dates: {dates.nunique():,}")
    print(f"Missing dates: {dates.isna().sum()}")

# Count every remaining null cell across the whole frame.
missing_total = df.isna().sum().sum()
print(f"\nTotal missing values: {missing_total}")

if missing_total != 0:
    print(f"‚ö†Ô∏è  Still have {missing_total} missing values")
else:
    print("‚úÖ Dataset ready for modeling!")

# Group the surviving columns for the report: temporal ones, and those whose
# name mentions demand or generation.
temporal_features = [col for col in df.columns if col in temporal_cols]
demand_features = [col for col in df.columns
                   if any(keyword in col.lower() for keyword in ('demand', 'generation'))]

print(f"\nTemporal features ({len(temporal_features)}): {temporal_features}")
print(f"Demand/generation features ({len(demand_features)}): {demand_features}")

# Persist the cleaned dataset without the row index.
output_path = '../Data/cleaned_and_augmented_electricity_data.csv'
df.to_csv(output_path, index=False)

print(f"\n{banner}")
print(f"‚úÖ SAVED: {output_path}")
print(f"{banner}")

## Summary

**Data Preparation Complete:**
- ✅ Loaded the combined dataset covering all years (2001-2025)
- ✅ Cleaned column names
- ✅ Removed redundant/empty columns
- ✅ Handled missing values with forward fill (then backward fill for leading gaps)
- ✅ Capped outliers at the 0.5th and 99.5th percentiles
- ✅ Created 9 temporal features
- ✅ Removed highly correlated (multicollinear) features
- ✅ Saved clean dataset

**Next Steps:**
- Run model training notebooks (07_complete_model_training.ipynb)
- Deploy models via Streamlit (streamlit_app.py)