## Data Preprocessing & Modelling Pipeline

In [90]:
import pandas as pd
import numpy as np
import os
from sklearn.ensemble import IsolationForest
import joblib
from utils.find_root import find_project_root

In [91]:
# Relative location (from the project root) where fitted models are written
MODEL_OUTPUT_DIR = "outputs/modelling/models/"

# Switch the working directory to the root returned by the project helper,
# so that every relative path used below resolves from the same place
project_root = find_project_root()
os.chdir(project_root)

# Create the model output directory if it is not there yet
os.makedirs(MODEL_OUTPUT_DIR, exist_ok=True)

#### Step 0 - Dataset Loading and Splitting

In [92]:
# Step 0.1 Load dataset
# Read the merged CSV with 'date' parsed as the index, then reindex it onto an
# hourly frequency so every hour in the covered range has a row.
DATASET_PATH = "data/processed/historical_merged/historical_IFS_merged_201702_to_202504.csv"
df = (
    pd.read_csv(DATASET_PATH, parse_dates=['date'], index_col='date')
    .asfreq('h')
)
df

Unnamed: 0_level_0,temperature_2m,surface_pressure,precipitation,wind_speed_10m
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-02-01 00:00:00,9.6,1008.2,0.0,14.6
2017-02-01 01:00:00,9.6,1007.4,0.0,14.6
2017-02-01 02:00:00,9.9,1006.8,0.6,15.1
2017-02-01 03:00:00,10.0,1006.5,0.3,15.0
2017-02-01 04:00:00,10.2,1006.2,0.3,15.5
...,...,...,...,...
2025-04-30 19:00:00,25.1,1016.6,0.0,10.1
2025-04-30 20:00:00,23.0,1016.8,0.0,11.9
2025-04-30 21:00:00,20.7,1017.3,0.0,11.9
2025-04-30 22:00:00,19.2,1017.5,0.0,9.0


In [93]:
# Step 0.2: Chronological split boundaries (date strings, both ends included by .loc)
train_start, train_end = "2017-02-01", "2025-01-31"
val_start, val_end = "2025-02-01", "2025-03-31"
test_start, test_end = "2025-04-01", "2025-04-30"

# Step 0.3: Slice the hourly frame once per (start, end) pair
df_train, df_val, df_test = (
    df.loc[first_day:last_day]
    for first_day, last_day in (
        (train_start, train_end),
        (val_start, val_end),
        (test_start, test_end),
    )
)

# Report the size of each split in rows and in approximate days (24 rows per day)
print(f"Training set: {len(df_train)} rows (~{len(df_train)/24:.0f} days).")
print(f"Validation set: {len(df_val)} rows (~{len(df_val)/24:.0f}days).")
print(f"Test set: {len(df_test)} rows(~{len(df_test)/24:.0f}days).") # Optional

Training set: 70128 rows (~2922 days).
Validation set: 1416 rows (~59days).
Test set: 720 rows(~30days).


#### Step 1 – Outlier Masking with Preliminary Isolation Forest
This step prepares four key features and applies rolling transformations
tailored to each one's statistical shape. A preliminary Isolation Forest model is then
used to flag and exclude anomalous timestamps in the training set for LSTM-AE sequence training.

In [94]:
# Step 1.1: Keep only the four raw weather features (precautionary).
# The copy detaches df_train from df so that new columns can be added safely.
features = [
    'temperature_2m',
    'surface_pressure',
    'wind_speed_10m',
    'precipitation',
]
df_train = df_train.loc[:, features].copy()

In [95]:
# Parameters for normalisation; every window size is counted in hourly rows.

# 60-day rolling statistics
window_60d = 60 * 24       # rolling window size (1440 rows)
min_periods_60d = 30 * 24  # minimum observations (720) before a statistic is produced

# 12-hour rolling z-score
window_12h = 12                    # rolling window size
min_periods_12h = window_12h // 2  # minimum observations (6) before a statistic is produced

# Small constant added to denominators to avoid division by zero
# in the rolling scaling of precipitation and wind
eps = 1e-6

In [96]:
# 1.2 Rolling z-score for temperature and surface pressure.
# Each value is centred on the rolling mean and divided by the rolling standard
# deviation of the same window; rows before min_periods_60d observations stay NaN.
for col in ('temperature_2m', 'surface_pressure'):
    series = df_train[col]
    stats_60d = series.rolling(window=window_60d, min_periods=min_periods_60d)
    centred = series - stats_60d.mean()
    df_train[f'{col}_z'] = centred / stats_60d.std()

In [97]:
# Step 1.3: Scale wind speed by its 60-day rolling median and IQR (NO SMOOTHING)

wind = df_train['wind_speed_10m']

# One rolling view supplies all three statistics
wind_rolling_60d = wind.rolling(window=window_60d, min_periods=min_periods_60d)
q25 = wind_rolling_60d.quantile(0.25)   # rolling Q1 (25th percentile)
q75 = wind_rolling_60d.quantile(0.75)   # rolling Q3 (75th percentile)
median = wind_rolling_60d.median()

# Interquartile range (Q3 - Q1)
iqr = q75 - q25

# Robust standardisation; eps keeps the denominator non-zero
df_train['wind_r'] = (wind - median) / (iqr + eps)

In [98]:
# 1.4: log1p-transform precipitation, then take a 12-hour rolling z-score

# log1p transform, stored as its own column
precip_log = np.log1p(df_train['precipitation'])
df_train['precipitation_log1p'] = precip_log

# 12-hour rolling statistics of the log-transformed series
rolling_log = precip_log.rolling(window=window_12h, min_periods=min_periods_12h)

# z-score; eps keeps the denominator non-zero when the window has zero spread
precip_centred = precip_log - rolling_log.mean()
df_train['precip_z_12h'] = precip_centred / (rolling_log.std() + eps)

In [99]:
# 1.5: Keep the four transformed columns and drop every row where any of them is missing
transformed_features = [
    'temperature_2m_z',
    'surface_pressure_z',
    'wind_r',
    'precip_z_12h',
]
df_train_transformed = df_train.loc[:, transformed_features].dropna(how='any')

print(f"Rows after transformation: {len(df_train_transformed)}")

Rows after transformation: 69409


In [100]:
# Preview the first five rows that survived the NaN drop
df_train_transformed.head(5)

Unnamed: 0_level_0,temperature_2m_z,surface_pressure_z,wind_r,precip_z_12h
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-03-02 23:00:00,-0.368721,-0.129419,-0.217284,0.0
2017-03-03 00:00:00,-0.397814,-0.169181,-0.237624,0.0
2017-03-03 01:00:00,-0.25048,-0.216915,-0.20297,0.0
2017-03-03 02:00:00,-0.191445,-0.272623,-0.237624,0.0
2017-03-03 03:00:00,-0.250199,-0.376226,-0.415842,0.0


In [101]:
# Step 1.6: Fit the preliminary (masking) Isolation Forest on the transformed features.
# random_state is fixed so the fitted forest is reproducible.
preliminary_if = IsolationForest(
    n_estimators=100,
    contamination=0.01,
    random_state=42,
)
preliminary_if.fit(df_train_transformed)

In [102]:
# Persist the fitted preliminary model inside the model output directory
prelim_model_path = os.path.join(MODEL_OUTPUT_DIR, "if_prelim.joblib")
joblib.dump(preliminary_if, prelim_model_path)

['outputs/modelling/models/if_prelim.joblib']

In [103]:
# Step 1.7: Attach IF scores and is_anomaly flags to the training data.
# Only rows present in df_train_transformed are scored; all other rows keep a NaN
# 'if_score' (e.g. rows inside the rolling warm-up period).
scores = preliminary_if.decision_function(df_train_transformed)
df_train.loc[df_train_transformed.index, 'if_score'] = scores

# Flag rows whose score is below the 3rd percentile of the scored rows.
# Unscored rows compare as False, and fillna(False) makes that default explicit,
# which preserves clean downstream sequence logic.
anomaly_threshold = np.percentile(scores, 3)
df_train['is_anomaly'] = df_train['if_score'].lt(anomaly_threshold).fillna(False)

In [104]:
# Step 1.8: Sanity-check the masking IF model's anomaly flags
# First line: number of rows flagged as anomalous.
print(f"Anomalies flagged: {df_train['is_anomaly'].sum()}")
# Second line: flagged share of ALL training rows. The denominator also counts rows
# that never received a score, so the figure sits slightly below the 3% threshold.
print(f'Proportion of anomalies to total: {df_train["is_anomaly"].sum() / len(df_train):.2%}') # should be roughly 3%

Anomalies flagged: 2083
Proportion of anomalies to total: 2.97%


#### Step 2 – Feature Transformation and Normalisation
**Goal**: Prepare each feature using transformations that match its statistical distribution and the requirements of the downstream models.
Isolation Forest expects short-term, scale-aware values, while LSTM-AE requires stable sequences with consistent internal structure.

In [105]:
# Input columns per model.

# Isolation Forest: short-term normalised inputs (z-score or IQR-based)
if_features = [
    'temperature_2m_z',
    'surface_pressure_z',
    'wind_r',
    'precip_z_12h',
]

# LSTM-AE: raw features with consistent structure; robust per-sequence scaling is applied later
lstm_features = [
    'temperature_2m',
    'surface_pressure',
    'wind_speed_10m',
    'precipitation',
]

In [106]:
# 2.2 Reuse Z-Scores for Temperature and Pressure (from Step 1)
# temperature_2m_z and surface_pressure_z were already computed in Step 1.2 for the
# masking Isolation Forest. The final IF model reuses them, which keeps both models
# consistent and avoids recomputation.

# Fail early if either reused column is missing
for z_col in ('temperature_2m_z', 'surface_pressure_z'):
    assert z_col in df_train.columns

In [107]:
# 2.3 Wind Speed: 3-Hour Smoothing + 60-Day IQR Scaling
# Replaces the unsmoothed 'wind_r' from Step 1.3 with a smoothed, IQR-scaled version.

# Apply 3-hour smoothing to reduce gust noise. The result is held in its own variable
# so that 'wind_r' never temporarily contains unscaled values.
wind_smoothed = df_train['wind_speed_10m'].rolling(window=3, min_periods=1).mean()

# Compute the 60-day rolling median and IQR of the smoothed series, using the same
# window parameters as the other 60-day statistics
wind_smoothed_60d = wind_smoothed.rolling(window=window_60d, min_periods=min_periods_60d)
wind_median = wind_smoothed_60d.median()
wind_iqr = wind_smoothed_60d.quantile(0.75) - wind_smoothed_60d.quantile(0.25)

# IQR-scale the smoothed wind speed. eps keeps the denominator non-zero (as in
# Step 1.3), so a window with zero IQR cannot produce inf/NaN values.
df_train['wind_r'] = (wind_smoothed - wind_median) / (wind_iqr + eps)

In [108]:
# 2.4 Reuse Precipitation Transformation (from Step 1.4)
# precip_z_12h (12-hour rolling z-score on log1p(precipitation)) was already
# computed in Step 1.4, so it is only checked here, not recomputed.

# Fail early if the precipitation feature is missing
assert 'precip_z_12h' in df_train

In [109]:
# Verify that every Isolation Forest input column exists; the first missing one is reported
available_cols = set(df_train.columns)
for col in if_features:
    assert col in available_cols, f"Missing IF input: {col}"

#### Step 3 – Time-Based Feature Engineering
**Goal**: Add cyclic temporal context to help LSTM-AE learn seasonal and daily rhythms.
We encode hour-of-day and month-of-year using sine and cosine pairs
to preserve continuity across wraparound points (e.g. 23:00 → 00:00).

In [110]:
# 3.1 Encode Hour-of-Day Cyclically
# Map the hour (0-23) onto an angle of the unit circle and store its sine and cosine
df_train['hour'] = df_train.index.hour
hour_angle = 2 * np.pi * df_train['hour'] / 24
df_train['hour_sin'] = np.sin(hour_angle)
df_train['hour_cos'] = np.cos(hour_angle)

In [111]:
# 3.2 Encode Month-of-Year Cyclically
# Map the month (1-12) onto an angle of the unit circle and store its sine and cosine
df_train['month'] = df_train.index.month
month_angle = 2 * np.pi * df_train['month'] / 12
df_train['month_sin'] = np.sin(month_angle)
df_train['month_cos'] = np.cos(month_angle)

In [112]:
# Cyclical time columns for the LSTM-AE input only (do not add to IF!)
# Expands to: hour_sin, hour_cos, month_sin, month_cos
lstm_time_features = [
    f'{unit}_{wave}'
    for unit in ('hour', 'month')
    for wave in ('sin', 'cos')
]

#### Step 4 – LSTM-AE Sequence Construction
**Goal**: Extract clean, fixed-length sequences from the training set to train the LSTM Autoencoder.
Each sequence must contain hourly-aligned data over a 30-day window (720 hours) that is almost entirely anomaly-free (Step 4.4 tolerates up to 7 flagged timestamps per window), optionally including cyclical time features.

In [113]:
# 4.1 Filter Out Anomalous Timestamps - this prevents corrupted sequences from skewing LSTM training

# Keep only the rows that are not flagged as anomalies; copy so later edits leave df_train untouched
not_flagged = df_train['is_anomaly'].eq(0)
df_train_clean = df_train.loc[not_flagged].copy()

In [114]:
# 4.2 Ensure Hourly Continuity

# Reindex onto an hourly frequency so timestamps stay aligned.
# Hours removed in 4.1 come back as rows of missing values.
df_train_clean = df_train_clean.asfreq(freq='h')

In [115]:
# 4.3 Define Sequence Window and Stride

sequence_length = 30 * 24   # 30 days of hourly rows (720)
sequence_stride = 1         # one-hour step between windows; a stride of 24 would give
                            # faster training and less redundancy, but far fewer sequences

In [119]:
# Step 4.4 (revised): Generate sliding window sequences with relaxed anomaly masking

# We'll allow up to 1% anomalies (max 7 out of 720 timestamps) in each 30-day window.
# This avoids overly strict rejection of otherwise stable sequences.
max_allowed_anomalies = 7

# Columns fed to the LSTM-AE: the raw weather features followed by the cyclical time
# features. Defined here so the cell runs on a fresh kernel (no other cell defines it).
# NOTE(review): the recorded tensor has 8 features, matching these 4 + 4 columns —
# confirm the column order against the run that produced the saved outputs.
lstm_input_cols_all = lstm_features + lstm_time_features

# Debug counters to track why windows are rejected or accepted
debug_counts = {
    'total': 0,            # Total windows attempted
    'has_nans': 0,         # Dropped due to missing values
    'has_anomalies': 0,    # Dropped due to too many anomalies
    'added': 0             # Successfully added
}

# Per-row flags are computed once instead of once per window. Prefix sums then give
# the number of flagged rows in any window as a simple difference of two entries.
# The NaN check covers every column of df_train, not only the LSTM inputs
# (e.g. NaNs from early rolling stats).
row_has_nan = df_train.isna().any(axis=1).to_numpy()
row_is_anomaly = df_train['is_anomaly'].to_numpy(dtype=bool)
nan_prefix = np.concatenate(([0], np.cumsum(row_has_nan)))
anomaly_prefix = np.concatenate(([0], np.cumsum(row_is_anomaly)))

# Model input values for the whole training set; each window is a slice of this array
lstm_values = df_train[lstm_input_cols_all].to_numpy()

sequences = []

# Slide a sequence_length-hour window across the training set in steps of sequence_stride
for start in range(0, len(df_train) - sequence_length + 1, sequence_stride):
    stop = start + sequence_length
    debug_counts['total'] += 1

    # Reject the window if any of its rows contains a NaN
    if nan_prefix[stop] - nan_prefix[start] > 0:
        debug_counts['has_nans'] += 1
        continue

    # Reject if more than max_allowed_anomalies timestamps are flagged as anomalies
    if anomaly_prefix[stop] - anomaly_prefix[start] > max_allowed_anomalies:
        debug_counts['has_anomalies'] += 1
        continue

    # Passed both filters: store the window's model input values
    sequences.append(lstm_values[start:stop])
    debug_counts['added'] += 1

# Output how many sequences passed each filter stage
print(f"Window stats: {debug_counts}")

Window stats: {'total': 69409, 'has_nans': 719, 'has_anomalies': 57829, 'added': 10861}


In [120]:
# Step 4.5: Stack the accepted windows into one model-ready tensor
# Resulting layout: (n_sequences, 720, n_features)
X_train_lstm = np.stack(sequences, axis=0)
print(f"LSTM training sequences: {X_train_lstm.shape}")

LSTM training sequences: (10861, 720, 8)


#### Important notes on tuning the sequence parameters

We began with a stride of 24 hours and strict anomaly masking, which gave only **4 valid sequences**. Reducing the stride to 1 hour increased this to **95 sequences** by allowing more overlap. However, most windows were still rejected because even one anomalous timestamp caused the sequence to be dropped.

To address this, we relaxed the masking rule by allowing up to **7 anomalies per 30-day window** (about 1%). This increased the final training set to **10,861 sequences**, or **15.6% of all possible windows**.

This approach is supported by recent studies which recommend tolerating small amounts of noise in sequence-based anomaly detection to ensure enough coverage for stable model training (Bâra et al., 2024; Trinh, 2022; Darban, 2024; Kulkarni, 2024).