# Data Preperation for modelling in Pytorch

In [1]:
import pandas as pd
import numpy as np
import json
import pickle
from pathlib import Path
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split

In [2]:
# Load data
df = pd.read_parquet("../../data/processed/data_combined.parquet")

In [3]:
y = df["actual_load_mw"].to_frame()
X = df.drop(columns=["actual_load_mw"])

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.33, random_state=12345)


X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=12345)

## Intermediate Outlier check
Checking for outliers is important to help choose the most appropriate scaler.

In [4]:
def iqr_outlier_stats(X):
    stats = {}
    for col in X.columns:
        q1 = X[col].quantile(0.25)
        q3 = X[col].quantile(0.75)
        iqr = q3 - q1
        if iqr == 0:
            stats[col] = 0.0
            continue
        mask = (X[col] < q1 - 1.5 * iqr) | (X[col] > q3 + 1.5 * iqr)
        stats[col] = mask.mean()
    return pd.Series(stats).sort_values(ascending=False)

outlier_frac = iqr_outlier_stats(X_train)
outlier_frac


tp            0.107470
ssrd          0.047656
wind_speed    0.025069
t2m           0.000000
dtype: float64

In [5]:
outlier_frac = iqr_outlier_stats(y_train)
outlier_frac

actual_load_mw    0.0
dtype: float64

Seems like the total precipitation has a 10% outlier rate. This would suggest using a robust scaler.

## Scale Features

In [6]:
X_scaler = RobustScaler()
y_scaler = RobustScaler()

# Fit scalers on training data only
X_train_scaled = X_scaler.fit_transform(X_train)
y_train_scaled = y_scaler.fit_transform(y_train).squeeze()

# Transform validation and test sets using fitted scalers
X_val_scaled = X_scaler.transform(X_val)
X_test_scaled = X_scaler.transform(X_test)

y_val_scaled = y_scaler.transform(y_val).squeeze()
y_test_scaled = y_scaler.transform(y_test).squeeze()

print(f"Scaled shapes - Train: {X_train_scaled.shape}, Val: {X_val_scaled.shape}, Test: {X_test_scaled.shape}")

Scaled shapes - Train: (164828, 4), Val: (40592, 4), Test: (40592, 4)


## Windowing

Create sequences for time series forecasting


In [7]:
def create_sequences(X, y, window_size=4*24*7, forecast_horizon=4*24):
    """
    Create sequences for time series forecasting.
    
    Args:
        X: Feature array (n_samples, n_features)
        y: Target array (n_samples,)
        window_size: Number of time steps to look back
        forecast_horizon: Number of time steps to forecast ahead
        
    Returns:
        X_seq: Array of shape (n_sequences, window_size, n_features)
        y_seq: Array of shape (n_sequences, forecast_horizon)
    """
    n_samples = len(X)
    
    X_seq = []
    y_seq = []
    
    for i in range(n_samples - window_size - forecast_horizon + 1):
        X_seq.append(X[i:i+window_size])
        y_seq.append(y[i+window_size:i+window_size+forecast_horizon])
    
    return np.array(X_seq), np.array(y_seq)

# Configuration
WINDOW_SIZE = 4*24*7  # Look back the last 7 days
FORECAST_HORIZON = 4*24  # Forecast 96 steps ahead

# Create sequences for each split
X_train_seq, y_train_seq = create_sequences(X_train_scaled, y_train_scaled, WINDOW_SIZE, FORECAST_HORIZON)
X_val_seq, y_val_seq = create_sequences(X_val_scaled, y_val_scaled, WINDOW_SIZE, FORECAST_HORIZON)
X_test_seq, y_test_seq = create_sequences(X_test_scaled, y_test_scaled, WINDOW_SIZE, FORECAST_HORIZON)

print(f"Train sequences: {X_train_seq.shape} -> {y_train_seq.shape}")
print(f"Val sequences:   {X_val_seq.shape} -> {y_val_seq.shape}")
print(f"Test sequences:  {X_test_seq.shape} -> {y_test_seq.shape}")


Train sequences: (164061, 672, 4) -> (164061, 96)
Val sequences:   (39825, 672, 4) -> (39825, 96)
Test sequences:  (39825, 672, 4) -> (39825, 96)


## Save Data and Artifacts


In [8]:
# Create output directory
output_dir = Path("../../data/processed/ml_data")
output_dir.mkdir(parents=True, exist_ok=True)
scalers_dir = output_dir / "scalers"
scalers_dir.mkdir(exist_ok=True)

# Save numpy arrays
np.save(output_dir / "X_train.npy", X_train_seq)
np.save(output_dir / "y_train.npy", y_train_seq)
np.save(output_dir / "X_val.npy", X_val_seq)
np.save(output_dir / "y_val.npy", y_val_seq)
np.save(output_dir / "X_test.npy", X_test_seq)
np.save(output_dir / "y_test.npy", y_test_seq)

# Save scalers
with open(scalers_dir / "X_scaler.pkl", "wb") as f:
    pickle.dump(X_scaler, f)
with open(scalers_dir / "y_scaler.pkl", "wb") as f:
    pickle.dump(y_scaler, f)

# Save metadata
meta = {
    "window_size": WINDOW_SIZE,
    "forecast_horizon": FORECAST_HORIZON,
    "n_features": X_train_scaled.shape[1],
    "feature_names": list(X.columns),
    "train_samples": len(X_train_seq),
    "val_samples": len(X_val_seq),
    "test_samples": len(X_test_seq),
    "train_date_range": {
        "start": str(X_train.index.min()),
        "end": str(X_train.index.max())
    },
    "val_date_range": {
        "start": str(X_val.index.min()),
        "end": str(X_val.index.max())
    },
    "test_date_range": {
        "start": str(X_test.index.min()),
        "end": str(X_test.index.max())
    },
    "scaler_type": "RobustScaler",
    "split_ratio": {
        "train": 0.7,
        "val": 0.15,
        "test": 0.15
    }
}

with open(output_dir / "meta.json", "w") as f:
    json.dump(meta, f, indent=2)

print(f"Saved data to {output_dir}")


Saved data to ..\..\data\processed\ml_data
