## Data Preprocessing & Modelling Pipeline

In [234]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.ensemble import IsolationForest
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, RepeatVector, TimeDistributed, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import joblib
from utils.find_root import find_project_root

In [229]:
# Retrieve the project root dynamically and make it the working directory,
# so every relative path below ("data/...", "outputs/...") resolves from it.
project_root = find_project_root()
os.chdir(project_root)

# Output directories: trained model artefacts and model-ready input data.
MODEL_OUTPUT_DIR = "outputs/modelling/models/"
MODEL_INPUT_OUTPUT_DIR = "data/processed/model_input"
# Backward-compatible alias: later cells still reference the original,
# misspelled name, so it stays bound to the same path.
MODEL_INPIT_OUTPUT_DIR = MODEL_INPUT_OUTPUT_DIR

# Ensure output directories exist (exist_ok makes this cell safe to re-run).
for output_dir in (MODEL_OUTPUT_DIR, MODEL_INPUT_OUTPUT_DIR):
    os.makedirs(output_dir, exist_ok=True)

#### Step 0 - Dataset Loading and Splitting

In [193]:
# Step 0.1: Load dataset
# The 'date' column is parsed as datetimes and used as the index; asfreq('h')
# then conforms that index to an hourly frequency (an hour missing from the
# file shows up as an all-NaN row).
DATASET_PATH = "data/processed/historical_merged/historical_IFS_merged_201702_to_202504.csv"
df = pd.read_csv(DATASET_PATH, index_col='date', parse_dates=['date']).asfreq('h')
df

Unnamed: 0_level_0,temperature_2m,surface_pressure,precipitation,wind_speed_10m
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-02-01 00:00:00,9.6,1008.2,0.0,14.6
2017-02-01 01:00:00,9.6,1007.4,0.0,14.6
2017-02-01 02:00:00,9.9,1006.8,0.6,15.1
2017-02-01 03:00:00,10.0,1006.5,0.3,15.0
2017-02-01 04:00:00,10.2,1006.2,0.3,15.5
...,...,...,...,...
2025-04-30 19:00:00,25.1,1016.6,0.0,10.1
2025-04-30 20:00:00,23.0,1016.8,0.0,11.9
2025-04-30 21:00:00,20.7,1017.3,0.0,11.9
2025-04-30 22:00:00,19.2,1017.5,0.0,9.0


In [194]:
# Step 0.2: Define chronological splits (validation starts the day after training ends)
train_start, train_end = "2017-02-01", "2025-01-31"
val_start, val_end = "2025-02-01", "2025-04-30"

# Step 0.3: Subset dataframe
# Label slicing on the datetime index is inclusive at both ends; a date-only
# end label keeps every hour of that day.
df_train = df.loc[train_start:train_end]
df_val = df.loc[val_start:val_end]

# Report split sizes in rows and in approximate days (24 hourly rows per day).
print(f"Training set: {len(df_train)} rows (~{len(df_train)/24:.0f} days).")
print(f"Validation set: {len(df_val)} rows (~{len(df_val)/24:.0f} days).")

Training set: 70128 rows (~2922 days).
Validation set: 2136 rows (~89days).


#### Step 1 – Feature Transformation & Normalisation
**Goal**: Transform raw weather variables into scale-stable, model-ready features using rolling statistics. This includes z-scores, IQR scaling, smoothing, and log transforms. These steps are applied before training the Isolation Forest and LSTM-AE models to ensure consistency across training, validation, and inference.

In [198]:
# Step 1.1: Select features (precautionary)
# Both splits are reduced to the four weather variables and copied, so the
# columns added in later steps are written to independent frames rather than
# to slices of `df`.
features = ['temperature_2m', 'surface_pressure', 'wind_speed_10m', 'precipitation']
df_train, df_val = (split[features].copy() for split in (df_train, df_val))

In [199]:
# Step 1.2: Set parameters for normalisation
# All window sizes are counted in rows, i.e. hours on the hourly index.

# Long window: 60 days, usable once 30 days of observations are available.
window_60d = 60 * 24
min_periods_60d = 30 * 24

# Short window for the 12-hour rolling z-score, usable from 6 observations.
window_12h = 12
min_periods_12h = 6

# Small constant added to rolling scale denominators to avoid division by zero.
eps = 1e-6

In [200]:
# Step 1.3: Temperature and Surface Pressure -> 60-day rolling z-scores
# Each split is standardised against its own trailing window, so the first
# (min_periods_60d - 1) rows of every split come out as NaN.
# NOTE(review): the validation split therefore spends its first 719 hours on
# warm-up; confirm this is intended rather than warming up on the tail of the
# training data.
for col in ['temperature_2m', 'surface_pressure']:
    for split in (df_train, df_val):
        trailing = split[col].rolling(window=window_60d, min_periods=min_periods_60d)
        # eps keeps the ratio finite when the trailing std is zero
        split[f'{col}_z'] = (split[col] - trailing.mean()) / (trailing.std() + eps)

In [201]:
# Step 1.4: Apply smoothing to wind speed (NO Z-SCORE)
# 3-hour trailing mean; with min_periods=1 the first rows are averaged over
# fewer than three values instead of becoming NaN.
for split in (df_train, df_val):
    split['wind_r'] = split['wind_speed_10m'].rolling(window=3, min_periods=1).mean()

In [202]:
# Step 1.5: Apply IQR scaling to smoothed Wind Speed
# The smoothed series is rebuilt from the raw 'wind_speed_10m' column (same
# 3-hour trailing mean as Step 1.4) instead of being read back from 'wind_r'.
# 'wind_r' is this cell's output, so reading it as input would rescale the
# already-scaled values every time the cell is re-run.
for df_ in [df_train, df_val]:
    wind_smoothed = df_['wind_speed_10m'].rolling(window=3, min_periods=1).mean()

    # Robust centre and spread over the trailing 60-day window.
    trailing = wind_smoothed.rolling(window=window_60d, min_periods=min_periods_60d)
    med = trailing.median()
    iqr = trailing.quantile(0.75) - trailing.quantile(0.25)

    # eps keeps the ratio finite when the trailing IQR is zero.
    df_['wind_r'] = (wind_smoothed - med) / (iqr + eps)

In [203]:
# Step 1.6: Transform Precipitation with log1p, i.e. log(1 + x)
# The log transformation reduces skew; a value of 0.0 maps to exactly 0.
for split in (df_train, df_val):
    split['precip_log'] = np.log1p(split['precipitation'])


In [204]:
# Step 1.7: Apply 12-hour z-score to logged Precipitation
# A short trailing window is used so that bursts stand out against the
# preceding hours; each split is standardised against its own window.
for split in (df_train, df_val):
    burst_window = split['precip_log'].rolling(window=window_12h, min_periods=min_periods_12h)
    # eps keeps the ratio finite when the trailing std is zero
    split['precip_z_12h'] = (split['precip_log'] - burst_window.mean()) / (burst_window.std() + eps)

#### Step 2 – Time-Based Feature Engineering
**Goal**: Add cyclic temporal context to help LSTM-AE learn seasonal and daily rhythms.
We encode hour-of-day and month-of-year using sine and cosine pairs
to preserve continuity across wraparound points (e.g. 23:00 → 00:00).

In [205]:
# Step 2.1: Encode Hour-of-Day Cyclically
# The hour (0-23) is taken from the datetime index and mapped onto the unit
# circle, giving one sine and one cosine column per split.
for split in (df_train, df_val):
    split['hour'] = split.index.hour
    hour_angle = 2 * np.pi * split['hour'] / 24
    split['hour_sin'] = np.sin(hour_angle)
    split['hour_cos'] = np.cos(hour_angle)

In [206]:
# Step 2.2: Encode Month-of-Year Cyclically
# The month (1-12) is taken from the datetime index and mapped onto the unit
# circle, giving one sine and one cosine column per split.
for split in (df_train, df_val):
    split['month'] = split.index.month
    month_angle = 2 * np.pi * split['month'] / 12
    split['month_sin'] = np.sin(month_angle)
    split['month_cos'] = np.cos(month_angle)

In [207]:
# Step 2.3: Define LSTM input features
# Raw (untransformed) weather variables, followed by the cyclic time encodings.
# NOTE(review): the weather columns are on their original scales (pressure is
# around 1000, the time encodings lie in [-1, 1]); confirm that feeding them
# unscaled into the autoencoder is intended.
lstm_features = [
    'temperature_2m',
    'surface_pressure',
    'wind_speed_10m',
    'precipitation',
]
lstm_time_features = [
    'hour_sin',
    'hour_cos',
    'month_sin',
    'month_cos',
]

#### Step 3 – Isolation Forest Training and Anomaly Scoring
**Goal**: Train a single Isolation Forest model using the transformed features in if_features.
Use the model to compute anomaly scores for both the training and validation sets.
Later, apply different thresholds to these scores for masking and inference.

In [208]:
# Step 3.1: Define IF model features
# The transformed columns created in Step 1: rolling z-scores for temperature
# and pressure, IQR-scaled smoothed wind, and the 12-hour precipitation z-score.
if_features = [
    'temperature_2m_z',
    'surface_pressure_z',
    'wind_r',
    'precip_z_12h',
]

In [209]:
# Step 3.2: Drop rows with NaNs in the input features for model training
# Only the IF feature columns are considered; a row is removed if any of them is NaN.
X_train_if = df_train.loc[:, if_features].dropna(axis=0, how='any')

In [210]:
# Step 3.3: Train Isolation Forest model
# Hyperparameters are collected in one place; random_state fixes the seed so
# repeated fits on the same data build the same forest.
if_params = dict(n_estimators=100, contamination=0.01, random_state=42)
if_model = IsolationForest(**if_params)
if_model.fit(X_train_if)

In [211]:
# Step 3.4: Save model
# Persist the Isolation Forest fitted in Step 3.3 (`if_model`) into the model
# output directory so it can be reloaded without refitting.
joblib.dump(if_model, os.path.join(MODEL_OUTPUT_DIR, "if_model.joblib"))

['outputs/modelling/models/if_model.joblib']

In [212]:
# Step 3.5: Apply the trained IF model to compute anomaly scores
# Scores: higher = more normal, lower = more anomalous

# Only rows with a complete set of IF features can be scored.
df_train_transformed = df_train.loc[:, if_features].dropna()
df_val_transformed = df_val.loc[:, if_features].dropna()

# Score both sets with the same fitted model
scores_train, scores_val = (
    if_model.decision_function(scored_rows)
    for scored_rows in (df_train_transformed, df_val_transformed)
)

# Write the scores back by index label; rows that could not be scored keep NaN
df_train.loc[df_train_transformed.index, 'if_score'] = scores_train
df_val.loc[df_val_transformed.index, 'if_score'] = scores_val

#### Step 4 – Anomaly Masking via Percentile Thresholds
**Goal**: Convert continuous IF anomaly scores into binary anomaly flags using different percentile thresholds:

- A strict threshold (e.g. 1%) for masking LSTM training data

- A more relaxed threshold (e.g. 3%) for validation sequences and inference consistency

These labels are only used for sequence filtering, not for model training.

In [213]:
# Step 4.1: Define percentile thresholds for anomaly masking
# Each value is a percentile of the IF score distribution; points scoring
# below that percentile are flagged in Step 4.2.
#   training:   1 -> the 1% most anomalous points
#   validation: 3 -> the 3% most anomalous points
mask_threshold_train, mask_threshold_val = 1, 3

In [214]:
# Step 4.2: Apply Thresholds and Assign Flags
# Each split is thresholded against the percentile of its own score
# distribution. Rows without a score (NaN) end up flagged False.

# Training set masking (strict)
threshold_train = np.percentile(scores_train, mask_threshold_train)
df_train['is_anomaly'] = (df_train['if_score'] < threshold_train).fillna(False)

# Validation set masking (relaxed)
threshold_val = np.percentile(scores_val, mask_threshold_val)
df_val['is_anomaly'] = (df_val['if_score'] < threshold_val).fillna(False)


In [215]:
# Step 4.3: Sanity check: How many anomalies were flagged in training and validation?

# Absolute counts of flagged vs. unflagged rows
train_anomaly_counts = df_train['is_anomaly'].value_counts()
val_anomaly_counts = df_val['is_anomaly'].value_counts()

for heading, counts in (
    ("Training Set Anomaly Counts:", train_anomaly_counts),
    ("\nValidation Set Anomaly Counts:", val_anomaly_counts),
):
    print(heading)
    print(counts)

# Proportions: the mean of a boolean column is the share of True values,
# taken over all rows of the split (including rows that were never scored)
train_anomaly_ratio = df_train['is_anomaly'].mean()
val_anomaly_ratio = df_val['is_anomaly'].mean()

print(f"\nProportion of anomalies in training set: {train_anomaly_ratio:.4f} ({train_anomaly_ratio*100:.2f}%)")
print(f"Proportion of anomalies in validation set: {val_anomaly_ratio:.4f} ({val_anomaly_ratio*100:.2f}%)")


Training Set Anomaly Counts:
is_anomaly
False    69433
True       695
Name: count, dtype: int64

Validation Set Anomaly Counts:
is_anomaly
False    2093
True       43
Name: count, dtype: int64

Proportion of anomalies in training set: 0.0099 (0.99%)
Proportion of anomalies in validation set: 0.0201 (2.01%)


These anomaly flags will be used to filter LSTM sequences in Steps 5 and 6.
They are not used directly in training, only for selecting “healthy” sequence windows.
This strategy is consistent with semi-supervised pseudo-labelling in the literature:

- Darban (2024)
- Trinh (2022)
- Antwarg (2021)

#### Step 5 – LSTM-AE Sequence Construction (Training)
**Goal**: Extract 30-day (720-hour) sequences from the training set for the LSTM Autoencoder.
Sequences must contain no NaNs and no more than 14 anomalous points (~2% of the sequence) to preserve training stability while retaining data coverage.

In [220]:
# Step 5.1: Define Parameters and Initialise

# Sequence parameters (all counted in hourly rows)
sequence_length = 720        # 30 days of hourly data
sequence_stride = 1          # consecutive windows start 1 hour apart
max_allowed_anomalies = 14   # ~2% of a 720-row window

# Features for LSTM-AE: weather variables first, then the time encodings
lstm_input_cols_all = [*lstm_features, *lstm_time_features]

# Per-outcome window counters (all starting at zero) and the sequence store
debug_counts = dict.fromkeys(('total', 'has_nans', 'has_anomalies', 'added'), 0)
train_sequences = []

In [221]:
# Step 5.2: Construct Sequences from df_train

# Start from a clean slate so that re-running this cell does not append the
# same windows a second time or keep counting on top of a previous run.
debug_counts = {'total': 0, 'has_nans': 0, 'has_anomalies': 0, 'added': 0}
train_sequences = []

# Per-row facts are computed once and turned into prefix sums, so each window
# check below is a single subtraction instead of a scan over the whole window.
#   train_nan_prefix[i]     = rows among the first i that contain a NaN in any column
#   train_anomaly_prefix[i] = rows among the first i that are flagged is_anomaly
train_values = df_train[lstm_input_cols_all].to_numpy()
train_nan_prefix = np.concatenate(([0], df_train.isna().any(axis=1).to_numpy().cumsum()))
train_anomaly_prefix = np.concatenate(([0], df_train['is_anomaly'].astype(int).to_numpy().cumsum()))

# Slide window across full training set
for start in range(0, len(df_train) - sequence_length + 1, sequence_stride):
    stop = start + sequence_length
    debug_counts['total'] += 1

    # Skip if any row of the window has a NaN in any column (e.g. due to rolling stats)
    if train_nan_prefix[stop] - train_nan_prefix[start] > 0:
        debug_counts['has_nans'] += 1
        continue

    # Skip if anomaly count exceeds threshold
    if train_anomaly_prefix[stop] - train_anomaly_prefix[start] > max_allowed_anomalies:
        debug_counts['has_anomalies'] += 1
        continue

    # If passed all checks, keep sequence. The slice is a view into
    # train_values, so no per-window copy is made; np.stack copies once later.
    train_sequences.append(train_values[start:stop])
    debug_counts['added'] += 1

# Report stats
print("Training sequence construction complete.")
print(f"Window stats: {debug_counts}")

Training sequence construction complete.
Window stats: {'total': 69409, 'has_nans': 719, 'has_anomalies': 9453, 'added': 59237}


**Anomaly Threshold Experiment Summary (Training Set)**

We tested how many training sequences could be created with different limits on how many anomalies are allowed per 720-hour (30-day) window.

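The 719 windows rejected for NaNs are expected: the 60-day rolling statistics need at least 720 observations, so the first 719 hours of the training set have no transformed values, and every window that starts within those hours contains at least one NaN.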

With a maximum of 14 anomalies per sequence (around 2%), we kept over 59,000 training windows — about 86% of the 68,690 NaN-free windows.

Reducing the limit to 7 anomalies (around 1%) led to 44,000 valid sequences — a 25% drop. This shows how stricter filtering reduces training data volume.

Studies by Darban (2024), Kulkarni et al. (2024), and Trinh (2022) support allowing 1–3% anomaly presence in training. It improves generalisation and avoids underfitting by keeping slight noise and variation. Based on this, we chose 14 anomalies as a balanced threshold for robust LSTM-AE training.

In [223]:
# Step 5.3: Stack into model-ready tensor
# The accepted windows are joined along a new leading axis.
# Format: (n_sequences, 720, n_features)
X_train_lstm = np.stack(train_sequences, axis=0)
print(f"LSTM training sequences: {X_train_lstm.shape}")

LSTM training sequences: (59237, 720, 8)


#### Step 6 – LSTM-AE Sequence Construction (Validation)
**Goal**: Extract 720-hour sequences from the validation set using the same feature columns, stride, and per-window anomaly limit as in training, with anomalies flagged by the relaxed (3%) validation mask from Step 4.
This set is used for threshold tuning, early stopping, and SHAP analysis.

In [224]:
# Step 6.1: Define Parameters and Initialise
# Per-outcome window counters (all starting at zero) and an empty store for
# the validation sequences.
val_debug = dict.fromkeys(('total', 'has_nans', 'has_anomalies', 'added'), 0)
val_sequences = []

In [225]:
# Step 6.2: Slide window across validation set

# Start from a clean slate so that re-running this cell does not append the
# same windows a second time or keep counting on top of a previous run.
val_sequences = []
val_debug = {'total': 0, 'has_nans': 0, 'has_anomalies': 0, 'added': 0}

# Per-row facts are computed once and turned into prefix sums, so each window
# check below is a single subtraction instead of a scan over the whole window.
#   val_nan_prefix[i]     = rows among the first i that contain a NaN in any column
#   val_anomaly_prefix[i] = rows among the first i that are flagged is_anomaly
val_values = df_val[lstm_input_cols_all].to_numpy()
val_nan_prefix = np.concatenate(([0], df_val.isna().any(axis=1).to_numpy().cumsum()))
val_anomaly_prefix = np.concatenate(([0], df_val['is_anomaly'].astype(int).to_numpy().cumsum()))

for start in range(0, len(df_val) - sequence_length + 1, sequence_stride):
    stop = start + sequence_length
    val_debug['total'] += 1

    # Skip if any row of the window has a NaN in any column
    if val_nan_prefix[stop] - val_nan_prefix[start] > 0:
        val_debug['has_nans'] += 1
        continue

    # Skip if anomaly count exceeds the same per-window limit used for training
    if val_anomaly_prefix[stop] - val_anomaly_prefix[start] > max_allowed_anomalies:
        val_debug['has_anomalies'] += 1
        continue

    # The slice is a view into val_values; np.stack copies once later.
    val_sequences.append(val_values[start:stop])
    val_debug['added'] += 1

print("Validation sequence construction complete.")
print(f"Validation window stats: {val_debug}")


Validation sequence construction complete.
Validation window stats: {'total': 1417, 'has_nans': 719, 'has_anomalies': 636, 'added': 62}


In [226]:
# Step 6.3: Stack the accepted validation windows along a new leading axis
# Final shape: (n_val_sequences, 720, n_features)
X_val_lstm = np.stack(val_sequences, axis=0)
print(f"LSTM validation sequences: {X_val_lstm.shape}")

LSTM validation sequences: (62, 720, 8)


#### Step 7 – Save Processed Sequences and Transformed Data
**Goal**: Persist key outputs including preprocessed training and validation dataframes, and their corresponding LSTM-AE tensors.
This allows reuse for model training, threshold tuning, and interpretability tasks without re-running all preprocessing steps.

In [230]:
# Save transformed DataFrames (optional but useful for SHAP/debugging)
# Written as CSV, index included, into the model-input directory.
for csv_name, frame in (
    ('df_train_preprocessed.csv', df_train),
    ('df_val_preprocessed.csv', df_val),
):
    frame.to_csv(os.path.join(MODEL_INPIT_OUTPUT_DIR, csv_name))

# Save LSTM-AE ready sequences as .npy arrays in the same directory
for npy_name, tensor in (
    ('X_train_lstm.npy', X_train_lstm),
    ('X_val_lstm.npy', X_val_lstm),
):
    np.save(os.path.join(MODEL_INPIT_OUTPUT_DIR, npy_name), tensor)

#### Step 8 – Define the LSTM Autoencoder
**Goal**: Define and compile an LSTM-based autoencoder that will be trained on the 720-hour sequences (X_train_lstm) to learn normal weather patterns.
During evaluation, sequences that produce high reconstruction error will be considered anomalous.

In [233]:
# Step 8.1: Define the LSTM-AE Architecture

# Sequence length and feature count are read off the training tensor,
# laid out as (n_sequences, timesteps, n_features), e.g. (…, 720, 8).
timesteps, n_features = X_train_lstm.shape[1:]

# Model input: one full sequence
input_layer = Input(shape=(timesteps, n_features))

# Encoder: 64 -> 32 units; the second LSTM emits only its last output
encoded = input_layer
for units, keep_sequence in ((64, True), (32, False)):
    encoded = LSTM(units, activation='tanh', return_sequences=keep_sequence)(encoded)

# Bottleneck: repeat the encoding once per timestep to feed the decoder
bottleneck = RepeatVector(timesteps)(encoded)

# Decoder: mirror of the encoder (32 -> 64), then a Dense layer applied at
# every timestep to map back to n_features values
decoded = bottleneck
for units in (32, 64):
    decoded = LSTM(units, activation='tanh', return_sequences=True)(decoded)
decoded = TimeDistributed(Dense(n_features))(decoded)

# Assemble and compile with MAE reconstruction loss
autoencoder = Model(inputs=input_layer, outputs=decoded)
autoencoder.compile(optimizer=Adam(learning_rate=0.001), loss='mae')

# Print summary
autoencoder.summary()

#### Step 9 – Train the LSTM Autoencoder with Early Stopping
**Goal**: Train the LSTM-AE to reconstruct normal sequences using MAE loss.
Early stopping prevents overfitting by halting training when validation loss no longer improves.

**Training Configuration Summary**

We trained the LSTM Autoencoder using the following settings:

Loss = 'mae' (Mean Absolute Error):
MAE is used to measure how closely the model can reconstruct each value in a sequence. It is more robust to outliers than MSE and works well for threshold-based anomaly detection (Trinh, 2022; Darban, 2024).

Batch size = 32:
This is a typical batch size for time-series data. It allows the model to learn temporal dependencies while maintaining training efficiency (Kulkarni et al., 2024).

Epochs = 100 with early stopping (patience = 5):
Early stopping halts training when the validation loss stops improving. This prevents overfitting and saves resources (Bâra et al., 2024).

ModelCheckpoint:
We saved the best-performing model (based on validation loss) to a .h5 file for reuse. This avoids retraining and supports downstream analysis (Antwarg et al., 2021).

Shuffle = True:
Shuffling training sequences prevents the model from overfitting to local trends and improves generalisation (Darban, 2024).

In [235]:
# Define callbacks
# Early stopping: halt after 5 epochs without val_loss improvement and roll
# the model back to the best weights seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Checkpoint: keep only the best model (lowest val_loss) on disk. The path is
# built from MODEL_OUTPUT_DIR so it stays in step with the other saved models.
model_checkpoint = ModelCheckpoint(
    filepath=os.path.join(MODEL_OUTPUT_DIR, 'lstm_ae_best.h5'),
    monitor='val_loss',
    save_best_only=True,
    verbose=1
)

# Train the model
# Autoencoder: the input sequences are also the reconstruction targets.
history = autoencoder.fit(
    X_train_lstm, X_train_lstm,
    epochs=100,
    batch_size=32,
    validation_data=(X_val_lstm, X_val_lstm),
    callbacks=[early_stopping, model_checkpoint],
    shuffle=True,
    verbose=2
)

# Plot training history (one point per completed epoch)
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(history.history['loss'], label='Train Loss')
ax.plot(history.history['val_loss'], label='Validation Loss')
ax.set_xlabel("Epoch")
ax.set_ylabel("MAE Loss")
ax.set_title("LSTM-AE Training Loss")
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

Epoch 1/100


KeyboardInterrupt: 