## Isolation Forest + LSTM-AE Training Scaffold for COMP1884

### ✅ Notebook: Model Training Pipeline (IF + LSTM-AE)
### Author: Jeremy / Group 6

### 📦 STEP 0: Setup & Imports

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
import joblib
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, RepeatVector, TimeDistributed, Dense
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import Input

In [29]:
# 📂 Set paths
os.chdir('C:/Users/jerry/Dropbox/JEREMIAH/MY_STUDIES/DATA_SCIENCE/University_of_Greenwich/Year2/COMP1884 - Group Project/CODE/COMP1884-Group6-Codebase-Complete')
RAW_DATA_PATH = "data/processed/historical_merged/historical_IFS_merged_201702_to_202504.csv"
MODEL_OUTPUT_DIR = "outputs/modelling/models/"

# ✅ Ensure output directories exist
os.makedirs(MODEL_OUTPUT_DIR, exist_ok=True)

### 🧹 STEP 1: Load and Inspect Data

In [5]:
print("Loading historical data...")
df = pd.read_csv(RAW_DATA_PATH, parse_dates=['date'])
df.set_index('date', inplace=True)
df = df[['temperature_2m', 'surface_pressure', 'precipitation', 'wind_speed_10m']]

Loading historical data...


In [6]:
df.head()

Unnamed: 0_level_0,temperature_2m,surface_pressure,precipitation,wind_speed_10m
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-02-01 00:00:00,9.6,1008.2,0.0,14.6
2017-02-01 01:00:00,9.6,1007.4,0.0,14.6
2017-02-01 02:00:00,9.9,1006.8,0.6,15.1
2017-02-01 03:00:00,10.0,1006.5,0.3,15.0
2017-02-01 04:00:00,10.2,1006.2,0.3,15.5


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 72264 entries, 2017-02-01 00:00:00 to 2025-04-30 23:00:00
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   temperature_2m    72264 non-null  float64
 1   surface_pressure  72264 non-null  float64
 2   precipitation     72264 non-null  float64
 3   wind_speed_10m    72264 non-null  float64
dtypes: float64(4)
memory usage: 2.8 MB


### 📊 STEP 2: Compute Rolling Statistics (60-day)

In [8]:
ROLLING_WINDOW = 1440  # 60 days * 24 hours
print("Computing rolling stats...")
rolling_mean = df.rolling(window=ROLLING_WINDOW, min_periods=ROLLING_WINDOW).mean()
rolling_std = df.rolling(window=ROLLING_WINDOW, min_periods=ROLLING_WINDOW).std()

Computing rolling stats...


In [9]:
rolling_std

Unnamed: 0_level_0,temperature_2m,surface_pressure,precipitation,wind_speed_10m
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-02-01 00:00:00,,,,
2017-02-01 01:00:00,,,,
2017-02-01 02:00:00,,,,
2017-02-01 03:00:00,,,,
2017-02-01 04:00:00,,,,
...,...,...,...,...
2025-04-30 19:00:00,4.939712,9.505976,0.168198,5.100376
2025-04-30 20:00:00,4.949358,9.493910,0.168198,5.099213
2025-04-30 21:00:00,4.954166,9.481387,0.168198,5.097849
2025-04-30 22:00:00,4.956146,9.468470,0.168198,5.095544


In [10]:
# Robust alternative
rolling_median = df.rolling(window=ROLLING_WINDOW, min_periods=ROLLING_WINDOW).median()
rolling_iqr = df.rolling(window=ROLLING_WINDOW, min_periods=ROLLING_WINDOW).quantile(0.75) - \
               df.rolling(window=ROLLING_WINDOW, min_periods=ROLLING_WINDOW).quantile(0.25)

In [11]:
rolling_median

Unnamed: 0_level_0,temperature_2m,surface_pressure,precipitation,wind_speed_10m
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-02-01 00:00:00,,,,
2017-02-01 01:00:00,,,,
2017-02-01 02:00:00,,,,
2017-02-01 03:00:00,,,,
2017-02-01 04:00:00,,,,
...,...,...,...,...
2025-04-30 19:00:00,9.70,1016.5,0.0,9.5
2025-04-30 20:00:00,9.70,1016.5,0.0,9.5
2025-04-30 21:00:00,9.70,1016.5,0.0,9.5
2025-04-30 22:00:00,9.70,1016.5,0.0,9.5


In [12]:
# Drop NaNs from initial window burn-in
df = df.iloc[ROLLING_WINDOW:]
rolling_mean = rolling_mean.iloc[ROLLING_WINDOW:]
rolling_std = rolling_std.iloc[ROLLING_WINDOW:]
rolling_median = rolling_median.iloc[ROLLING_WINDOW:]
rolling_iqr = rolling_iqr.iloc[ROLLING_WINDOW:]
rolling_iqr = rolling_iqr.replace(0, 1e-6)

In [13]:
df

Unnamed: 0_level_0,temperature_2m,surface_pressure,precipitation,wind_speed_10m
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-04-02 00:00:00,6.0,1011.1,0.0,7.2
2017-04-02 01:00:00,6.9,1011.8,0.0,7.7
2017-04-02 02:00:00,7.2,1012.4,0.0,10.9
2017-04-02 03:00:00,7.7,1013.0,0.0,10.8
2017-04-02 04:00:00,6.9,1013.5,0.0,8.6
...,...,...,...,...
2025-04-30 19:00:00,25.1,1016.6,0.0,10.1
2025-04-30 20:00:00,23.0,1016.8,0.0,11.9
2025-04-30 21:00:00,20.7,1017.3,0.0,11.9
2025-04-30 22:00:00,19.2,1017.5,0.0,9.0


### 🧮 STEP 3: Normalisation (Z-score and robust)

In [14]:
zscore = (df - rolling_mean) / rolling_std
robust = (df - rolling_median) / rolling_iqr

In [15]:
robust

Unnamed: 0_level_0,temperature_2m,surface_pressure,precipitation,wind_speed_10m
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-04-02 00:00:00,-0.377358,-0.136646,0.0,-0.783715
2017-04-02 01:00:00,-0.207547,-0.093168,0.0,-0.727273
2017-04-02 02:00:00,-0.150943,-0.055901,0.0,-0.404040
2017-04-02 03:00:00,-0.056604,-0.018634,0.0,-0.414141
2017-04-02 04:00:00,-0.207547,0.009317,0.0,-0.636364
...,...,...,...,...
2025-04-30 19:00:00,2.264706,0.006098,0.0,0.085409
2025-04-30 20:00:00,1.955882,0.018377,0.0,0.341637
2025-04-30 21:00:00,1.617647,0.049080,0.0,0.341637
2025-04-30 22:00:00,1.412639,0.061350,0.0,-0.071174


In [16]:
# Use robust version for IF
X_if = robust.dropna()

In [17]:
X_if.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 70824 entries, 2017-04-02 00:00:00 to 2025-04-30 23:00:00
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   temperature_2m    70824 non-null  float64
 1   surface_pressure  70824 non-null  float64
 2   precipitation     70824 non-null  float64
 3   wind_speed_10m    70824 non-null  float64
dtypes: float64(4)
memory usage: 2.7 MB


### 🧪 STEP 4: Isolation Forest Training

In [23]:
print("Training Isolation Forest...")
iso_model = IsolationForest(n_estimators=100, contamination=0.01, random_state=42)
iso_model.fit(X_if)
scores_if = -iso_model.decision_function(X_if)
threshold = np.quantile(scores_if, 0.97)

# 📌 Store valid_mask aligned to full robust index
full_scores = pd.Series(index=robust.index, dtype='float64')
full_scores.loc[X_if.index] = scores_if
valid_mask = full_scores < threshold
valid_mask = valid_mask.fillna(False)
valid_mask.index.name = 'date'

Training Isolation Forest...


In [24]:
# Save model
joblib.dump(iso_model, os.path.join(MODEL_OUTPUT_DIR, "isolation_forest.joblib"))

['outputs/modelling/models/isolation_forest.joblib']

### 🧼 STEP 5: Extract Valid LSTM Sequences

In [25]:
print("Extracting clean sequences for LSTM-AE training...")
SEQUENCE_LENGTH = 30*24  # 30 days
stride = 24  # 1 day

# Filter robust-normalised data using valid_mask
valid_df = robust[valid_mask]  # Assumes valid_mask is a pandas Series
valid_df = valid_df.sort_index().asfreq('h')  # Ensure regular hourly spacing

X_sequences = []
for i in range(0, len(valid_df) - SEQUENCE_LENGTH + 1, stride):
    seq = valid_df.iloc[i:i+SEQUENCE_LENGTH]
    if seq.isnull().values.any():
        continue  # skip windows with gaps
    X_sequences.append(seq.values)

X_sequences = np.array(X_sequences)
print("Final sequence shape:", X_sequences.shape)

Extracting clean sequences for LSTM-AE training...
Final sequence shape: (84, 720, 4)



### 🧠 STEP 6: Train LSTM Autoencoder

In [26]:
print("Training LSTM Autoencoder...")
model = Sequential([
    Input(shape=(SEQUENCE_LENGTH, 4)),
    LSTM(64, activation='relu', return_sequences=False),
    RepeatVector(SEQUENCE_LENGTH),
    LSTM(64, activation='relu', return_sequences=True),
    TimeDistributed(Dense(4))
])

model.compile(optimizer='adam', loss='mae')

# Training
model.fit(
    X_sequences, X_sequences,
    epochs=50, batch_size=16,
    validation_split=0.1,
    shuffle=False,
    callbacks=[EarlyStopping(patience=5, restore_best_weights=True)]
)

# Save in updated format
model.save("outputs/modelling/models/lstm_autoencoder.keras")
print("✅ All models trained and saved.")

Training LSTM Autoencoder...
Epoch 1/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 1s/step - loss: nan - val_loss: nan
Epoch 2/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 525ms/step - loss: nan - val_loss: nan
Epoch 3/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 453ms/step - loss: nan - val_loss: nan
Epoch 4/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 453ms/step - loss: nan - val_loss: nan
Epoch 5/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 453ms/step - loss: nan - val_loss: nan
Epoch 6/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 457ms/step - loss: nan - val_loss: nan
✅ All models trained and saved.


### 📈 STEP 7: Inference and Anomaly Scoring

In [27]:
print("Running inference...")
reconstruction = model.predict(X_sequences)
recon_errors = np.nanmean(np.abs(reconstruction - X_sequences), axis=(1, 2))
recon_errors = np.nan_to_num(recon_errors, nan=0.0, posinf=1e6, neginf=1e6)
# Set threshold using 95th percentile of training errors
anomaly_threshold = np.percentile(recon_errors, 95)
sequence_flags = recon_errors > anomaly_threshold

print(f"Threshold: {anomaly_threshold:.4f}")
print(f"Anomalous sequences: {np.sum(sequence_flags)} / {len(sequence_flags)}")

# Save scores for later integration
np.save("outputs/modelling/predictions/lstm_sequence_errors.npy", recon_errors)
np.save("outputs/modelling/predictions/lstm_anomaly_flags.npy", sequence_flags)

Running inference...
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2s/step
Threshold: 0.0000
Anomalous sequences: 0 / 84


  recon_errors = np.nanmean(np.abs(reconstruction - X_sequences), axis=(1, 2))
