In [1]:
import os
import random
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# -------------------- settings --------------------
# Reproducibility: the same seed is pushed into Python, NumPy and TensorFlow.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

# File locations.
DATA_PATH = "/content/time_series_dataset.csv"   # input CSV (provided uploaded file)
OUT_PATH = "/mnt/data/predictions_with_intervals.csv"

# Windowing and chronological split.
LOOKBACK = 24            # past timesteps fed to the model to predict the next one
TEST_SPLIT = 0.2         # fraction of rows held out at the end of the series

# Training and Monte-Carlo sampling budget.
BATCH_SIZE = 32
EPOCHS = 30
MC_SAMPLES = 200         # stochastic forward passes used for MC Dropout

# -------------------- load data --------------------
# Expected columns: time, feature1, feature2, feature3, target
df = pd.read_csv(DATA_PATH)
print("columns:", df.columns.tolist())
target_col = "target"
# Every column except the timestamp and the target is a model input.
features = [col for col in df.columns.tolist() if col not in ("time", "target")]

# -------------------- create sequences --------------------
def create_sequences(data_df, feature_cols, target_col, lookback):
    """Slice a frame into sliding windows for one-step-ahead forecasting.

    Window ``i`` holds rows ``i .. i + lookback - 1`` of ``feature_cols`` and
    is paired with the ``target_col`` value at row ``i + lookback``.

    Args:
        data_df: DataFrame containing the feature and target columns.
        feature_cols: list of column names used as model inputs.
        target_col: name of the column to predict.
        lookback: number of consecutive rows per window (non-negative).

    Returns:
        ``(X, y)`` where ``X`` has shape ``(n_windows, lookback, n_features)``
        and ``y`` has shape ``(n_windows,)``. When the frame has no more than
        ``lookback`` rows, ``n_windows`` is 0 but ``X`` still keeps its three
        dimensions, so ``X.shape[1:]`` remains usable as a model input shape.

    Raises:
        ValueError: if ``lookback`` is negative.
    """
    if lookback < 0:
        raise ValueError(f"lookback must be non-negative, got {lookback}")

    # Convert once; selecting DataFrame columns inside the loop is very slow.
    feature_values = data_df[feature_cols].to_numpy()
    target_values = data_df[target_col].to_numpy()

    n_windows = max(len(data_df) - lookback, 0)
    # Row i of the index matrix is [i, i + 1, ..., i + lookback - 1].
    window_index = np.arange(n_windows)[:, None] + np.arange(lookback)[None, :]
    X = feature_values[window_index]
    y = target_values[lookback:lookback + n_windows].copy()
    return X, y

# Chronological train/test split: the last TEST_SPLIT fraction is held out, no shuffling.
n_total = len(df)
n_test = int(n_total * TEST_SPLIT)
n_train = n_total - n_test
train_df = df.iloc[:n_train].reset_index(drop=True)
# The test frame starts LOOKBACK rows early so the first test window can be built.
test_df = df.iloc[n_train - LOOKBACK:].reset_index(drop=True)

# Separate scalers for inputs and target, both fitted on the training rows only.
feat_scaler = StandardScaler().fit(train_df[features])
tgt_scaler = StandardScaler().fit(train_df[[target_col]])


def _standardize(frame):
    """Return a copy of ``frame`` with features and target standardized."""
    scaled = frame.copy()
    scaled[features] = feat_scaler.transform(frame[features])
    scaled[target_col] = tgt_scaler.transform(frame[[target_col]])
    return scaled


train_df_scaled = _standardize(train_df)
test_df_scaled = _standardize(test_df)

# Turn the scaled frames into (window, next-target) pairs.
X_train, y_train = create_sequences(train_df_scaled, features, target_col, LOOKBACK)
X_test, y_test = create_sequences(test_df_scaled, features, target_col, LOOKBACK)
print("X_train.shape, y_train.shape:", X_train.shape, y_train.shape)
print("X_test.shape, y_test.shape:", X_test.shape, y_test.shape)

# -------------------- model with dropout for MC sampling --------------------
def build_mc_lstm(input_shape, dropout_rate=0.2, lstm_units=64):
    """
    Build and compile a small LSTM regressor with two Dropout layers.

    The Dropout layers are what make Monte-Carlo dropout possible: calling
    model(x, training=True) at prediction time keeps them active, so repeated
    forward passes give different outputs.

    Args:
        input_shape: shape of one input window, passed to keras.Input.
        dropout_rate: rate used by both Dropout layers.
        lstm_units: number of units in the LSTM layer.

    Returns:
        A compiled keras.Model (Adam at 1e-3, MSE loss, MAE metric) with a
        single linear output unit.
    """
    window_in = keras.Input(shape=input_shape)

    # Encoder: the LSTM returns only its final state.
    hidden = layers.LSTM(lstm_units, return_sequences=False)(window_in)
    hidden = layers.Dropout(dropout_rate)(hidden)

    # Regression head.
    hidden = layers.Dense(32, activation="relu")(hidden)
    hidden = layers.Dropout(dropout_rate)(hidden)
    prediction = layers.Dense(1)(hidden)  # regression output

    net = keras.Model(window_in, prediction)
    net.compile(
        optimizer=keras.optimizers.Adam(learning_rate=1e-3),
        loss="mse",
        metrics=["mae"],
    )
    return net

# One training sample is a (timesteps, features) window.
input_shape = X_train.shape[1:]
model = build_mc_lstm(input_shape, dropout_rate=0.2, lstm_units=64)
model.summary()

# -------------------- training --------------------
# Stop once val_loss has not improved for 6 epochs and roll back to the best weights.
es = keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=6,
    restore_best_weights=True,
)
# shuffle=False: batches are fed in the order the windows were built.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.1,
    shuffle=False,
    callbacks=[es],
    verbose=2,
)

# -------------------- deterministic baseline prediction (for comparison) --------------------
# model.predict runs with dropout disabled, giving a single point forecast per window.
y_pred_det = model.predict(X_test, batch_size=BATCH_SIZE).squeeze()
# Undo the target standardization so errors are reported in original units.
y_pred_det_inv = tgt_scaler.inverse_transform(y_pred_det.reshape(-1, 1)).ravel()
y_test_inv = tgt_scaler.inverse_transform(y_test.reshape(-1, 1)).ravel()

rmse_det = np.sqrt(mean_squared_error(y_test_inv, y_pred_det_inv))
mae_det = mean_absolute_error(y_test_inv, y_pred_det_inv)
print(f"Deterministic test RMSE: {rmse_det:.4f}, MAE: {mae_det:.4f}")

# -------------------- Monte Carlo Dropout predictions --------------------
# Run MC_SAMPLES forward passes with dropout active; row i holds pass i for every test window.
mc_preds = np.zeros((MC_SAMPLES, X_test.shape[0]))

for i in range(MC_SAMPLES):
    # training=True keeps the Dropout layers stochastic at inference time.
    preds = model(X_test, training=True).numpy().reshape(-1)
    mc_preds[i] = preds

# Convert back to original scale. The scaler was fitted on a single column, so feed it a
# single column and restore the (MC_SAMPLES, n_test) layout afterwards, rather than
# relying on broadcasting over a matrix with a different number of "features".
mc_preds_inv = tgt_scaler.inverse_transform(mc_preds.reshape(-1, 1)).reshape(mc_preds.shape)

# Per-window statistics across the MC samples (axis 0).
mean_pred = np.mean(mc_preds_inv, axis=0)
p2_5, p10, p90, p97_5 = np.percentile(mc_preds_inv, [2.5, 10, 90, 97.5], axis=0)

# Evaluate the MC mean as a point forecast.
rmse_mc = np.sqrt(mean_squared_error(y_test_inv, mean_pred))
mae_mc = mean_absolute_error(y_test_inv, mean_pred)
print(f"MC Dropout mean RMSE: {rmse_mc:.4f}, MAE: {mae_mc:.4f}")

# -------------------- prepare output dataframe --------------------
# test_df starts LOOKBACK rows before the first predicted row, so dropping its first
# LOOKBACK timestamps aligns "time" with the prediction targets.
prediction_times = test_df["time"].iloc[LOOKBACK:].reset_index(drop=True)

out_df = pd.DataFrame({
    "time": prediction_times,
    "y_true": y_test_inv,
    "pred_mean": mean_pred,
    "pred_p2_5": p2_5,
    "pred_p10": p10,
    "pred_p90": p90,
    "pred_p97_5": p97_5
})

# Interval widths: 10th-90th percentile (80%) and 2.5th-97.5th percentile (95%).
out_df["interval_80_width"] = out_df["pred_p90"] - out_df["pred_p10"]
out_df["interval_95_width"] = out_df["pred_p97_5"] - out_df["pred_p2_5"]

# Create the output directory if needed. A bare filename has an empty dirname,
# which os.makedirs rejects, so only create it when there is one.
out_dir = os.path.dirname(OUT_PATH)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)
out_df.to_csv(OUT_PATH, index=False)
print("Saved probabilistic forecasts to:", OUT_PATH)

# -------------------- quick checks --------------------
# Coverage check: fraction of true values that fall inside each interval.
coverage_80 = np.mean((out_df["y_true"] >= out_df["pred_p10"]) & (out_df["y_true"] <= out_df["pred_p90"]))
coverage_95 = np.mean((out_df["y_true"] >= out_df["pred_p2_5"]) & (out_df["y_true"] <= out_df["pred_p97_5"]))
print(f"Empirical coverage: 80% interval -> {coverage_80:.3f}, 95% interval -> {coverage_95:.3f}")

# -------------------- done --------------------


columns: ['time', 'feature1', 'feature2', 'feature3', 'target']
X_train.shape, y_train.shape: (776, 24, 3) (776,)
X_test.shape, y_test.shape: (200, 24, 3) (200,)


Epoch 1/30
22/22 - 3s - 140ms/step - loss: 0.8859 - mae: 0.7576 - val_loss: 0.7331 - val_mae: 0.7139
Epoch 2/30
22/22 - 0s - 15ms/step - loss: 0.3741 - mae: 0.4915 - val_loss: 0.2473 - val_mae: 0.4130
Epoch 3/30
22/22 - 0s - 16ms/step - loss: 0.2518 - mae: 0.4045 - val_loss: 0.1827 - val_mae: 0.3611
Epoch 4/30
22/22 - 0s - 15ms/step - loss: 0.2040 - mae: 0.3614 - val_loss: 0.1395 - val_mae: 0.3077
Epoch 5/30
22/22 - 0s - 16ms/step - loss: 0.1741 - mae: 0.3308 - val_loss: 0.1150 - val_mae: 0.2874
Epoch 6/30
22/22 - 0s - 15ms/step - loss: 0.1826 - mae: 0.3417 - val_loss: 0.1647 - val_mae: 0.3176
Epoch 7/30
22/22 - 0s - 15ms/step - loss: 0.1224 - mae: 0.2817 - val_loss: 0.0933 - val_mae: 0.2538
Epoch 8/30
22/22 - 0s - 15ms/step - loss: 0.1333 - mae: 0.2908 - val_loss: 0.0845 - val_mae: 0.2364
Epoch 9/30
22/22 - 0s - 15ms/step - loss: 0.1270 - mae: 0.2825 - val_loss: 0.0767 - val_mae: 0.2269
Epoch 10/30
22/22 - 0s - 15ms/step - loss: 0.1274 - mae: 0.2808 - val_loss: 0.0654 - val_mae: 0.208

In [3]:
import os
import random
from datetime import datetime, timezone

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Try to import keras-tuner (different package names)
try:
    import keras_tuner as kt
except Exception:
    try:
        # Ensure keras-tuner is installed. If not, install it.
        %pip install keras-tuner --quiet
        import keras_tuner as kt  # alias
    except Exception:
        raise ImportError("Please install keras-tuner (pip install keras-tuner) before running this script.")

# -------------------- Settings --------------------
# Reproducibility: seed Python, NumPy and TensorFlow with the same value.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

# Input data and the folder that receives every artifact of this run.
DATA_PATH = "/content/time_series_dataset.csv"  # uploaded dataset path (use as URL in other tools)
OUT_DIR = "/mnt/data"
os.makedirs(OUT_DIR, exist_ok=True)

# Hyperparameter search budget.
MAX_TRIALS = 25        # tuner trials (reduce/increase depending on compute)
EXECUTION_PER_TRIAL = 1

# Training configuration shared by the tuner and the final fits.
EPOCHS = 40
BATCH_SIZE = 32
TEST_SPLIT = 0.2

# -------------------- Utility functions --------------------
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``."""
    squared_error = mean_squared_error(y_true, y_pred)
    return np.sqrt(squared_error)

def create_sequences(df, feature_cols, target_col, lookback):
    """Build sliding-window samples for one-step-ahead forecasting.

    Sample ``i`` is the block of ``lookback`` consecutive rows of
    ``feature_cols`` starting at row ``i``; its label is the ``target_col``
    value of the row immediately after that block.

    Args:
        df: DataFrame holding the feature and target columns.
        feature_cols: list of input column names.
        target_col: name of the column to predict.
        lookback: window length in rows (non-negative).

    Returns:
        Tuple ``(X, y)`` with ``X.shape == (n_windows, lookback, n_features)``
        and ``y.shape == (n_windows,)``. ``n_windows`` is 0 when ``df`` has at
        most ``lookback`` rows; ``X`` is still three-dimensional in that case.

    Raises:
        ValueError: if ``lookback`` is negative.
    """
    if lookback < 0:
        raise ValueError(f"lookback must be non-negative, got {lookback}")

    # Pull the data out of pandas once instead of re-selecting columns per window.
    feature_values = df[feature_cols].to_numpy()
    target_values = df[target_col].to_numpy()

    n_windows = max(len(df) - lookback, 0)
    # Index matrix whose row i is [i, i + 1, ..., i + lookback - 1].
    window_index = np.arange(n_windows)[:, None] + np.arange(lookback)[None, :]
    X = feature_values[window_index]
    y = target_values[lookback:lookback + n_windows].copy()
    return X, y

def save_report(text):
    """Append ``text`` followed by a newline to ``report.txt`` inside ``OUT_DIR``."""
    report_path = os.path.join(OUT_DIR, "report.txt")
    with open(report_path, mode="a", encoding="utf-8") as report:
        report.write(text + "\n")

# -------------------- Load and document dataset --------------------
df = pd.read_csv(DATA_PATH)
# datetime.utcnow() is deprecated; take an aware UTC timestamp and drop tzinfo so the
# ISO string keeps the previous format (the header appends " UTC" itself).
start_time = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
report_header = f"Attention-LSTM Pipeline Report\nStarted: {start_time} UTC\nData source (local path): {DATA_PATH}\n\n"
# Mode "w" starts a fresh report for this run; save_report() appends to it afterwards.
# The context manager closes the handle, and the encoding matches save_report().
with open(os.path.join(OUT_DIR, "report.txt"), "w", encoding="utf-8") as report_file:
    report_file.write(report_header)

print("Dataset columns:", df.columns.tolist())
save_report("Dataset columns: " + ", ".join(df.columns.tolist()))
save_report(f"Dataset shape: {df.shape}")
save_report("First 5 rows:\n" + df.head().to_string())

# Expect columns: time, feature1, feature2, feature3, target (robust to extra columns)
# If there is no timestamp column, fall back to a positional index.
if "time" not in df.columns:
    df.insert(0, "time", np.arange(len(df)))

# -------------------- Dataset characteristics --------------------
# Quick stats
desc = df.describe().to_string()
save_report("\nDataset descriptive statistics:\n" + desc)

# -------------------- Prepare train/test split (time-based) --------------------
# Everything except the timestamp and the target is a model input.
feature_cols = [c for c in df.columns if c not in ("time", "target")]
target_col = "target"

n_total = len(df)
n_test = int(n_total * TEST_SPLIT)
n_train = n_total - n_test
train_df = df.iloc[:n_train].reset_index(drop=True)
# Keep up to 100 rows of overlap as a lookback margin. Clamp at 0 so a short dataset
# cannot produce a negative start, which iloc would interpret as counting from the end.
test_df = df.iloc[max(n_train - 100, 0):].reset_index(drop=True)

save_report(f"\nTrain rows: {len(train_df)}, Test rows (with overlap): {len(test_df)}")

# -------------------- Scaling --------------------
# Both scalers are fitted on training rows only, then applied to train and test.
feat_scaler = StandardScaler().fit(train_df[feature_cols])
tgt_scaler = StandardScaler().fit(train_df[[target_col]])

train_scaled = train_df.copy()
train_scaled[feature_cols] = feat_scaler.transform(train_df[feature_cols])
train_scaled[target_col] = tgt_scaler.transform(train_df[[target_col]])

test_scaled = test_df.copy()
test_scaled[feature_cols] = feat_scaler.transform(test_df[feature_cols])
test_scaled[target_col] = tgt_scaler.transform(test_df[[target_col]])

# -------------------- Model builders --------------------
def build_attention_lstm_model(hp, input_shape):
    """
    Build and compile an Attention-LSTM regressor from Keras-Tuner hyperparameters.

    Architecture:
      - LSTM (return_sequences=True)
      - (Optional) a second LSTM (return_sequences=True) - controlled by hp
      - Dropout
      - MultiHeadAttention self-attention (query=key=value=sequence),
        added back to its input (residual) and layer-normalized
      - Pooling over time chosen by hp: average, last timestep, or flatten
      - Dense(relu) -> Dropout -> Dense(1)

    Hyperparameters registered on ``hp``: lstm_units, use_second_lstm,
    lstm_units_2 (only when the second LSTM is enabled), dropout, att_heads,
    att_key_dim, pooling, dense_units, lr.

    Args:
        hp: hyperparameter container providing Int/Float/Boolean/Choice.
        input_shape: shape of one input window, passed to keras.Input.

    Returns:
        A compiled keras.Model named "Attention_LSTM" (Adam, MSE loss, MAE
        metric). The MultiHeadAttention layer is also stored on the returned
        object as ``model.mha_layer``.
    """
    inputs = keras.Input(shape=input_shape, name="inputs")
    x = inputs

    lstm_units = hp.Int("lstm_units", min_value=16, max_value=128, step=16, default=64)
    # Always keep the full sequence: the attention layer below needs per-timestep outputs.
    return_seq = True
    x = layers.LSTM(lstm_units, return_sequences=return_seq, name="lstm_1")(x)
    if hp.Boolean("use_second_lstm", default=False):
        # "lstm_units_2" is only registered on trials that enable the second LSTM.
        lstm_units2 = hp.Int("lstm_units_2", 8, 128, step=8, default=32)
        x = layers.LSTM(lstm_units2, return_sequences=True, name="lstm_2")(x)

    # The same rate is reused by the second Dropout layer before the output.
    dropout_rate = hp.Float("dropout", 0.0, 0.5, step=0.1, default=0.2)
    x = layers.Dropout(dropout_rate, name="dropout")(x)

    # Attention params
    att_heads = hp.Int("att_heads", 1, 8, step=1, default=2)
    att_key_dim = hp.Int("att_key_dim", 8, 64, step=8, default=16)

    # MultiHeadAttention: query=key=value = x (self-attention on sequence)
    # Attention scores are not returned here; callers that want them call the layer
    # again with return_attention_scores=True.
    mha = layers.MultiHeadAttention(num_heads=att_heads, key_dim=att_key_dim, name="mha")
    # att_out must have the same shape as x for the residual Add below; with no
    # output_shape given, the layer is expected to project back to the query's feature dim.
    att_out = mha(query=x, value=x, key=x)
    # Residual connection followed by layer normalization.
    x = layers.Add(name="res_add")([x, att_out])
    x = layers.LayerNormalization(name="att_layernorm")(x)

    # Collapse the time axis; the strategy is itself a tuned hyperparameter.
    pooling = hp.Choice("pooling", ["avg", "flatten", "last"], default="avg")
    if pooling == "avg":
        x = layers.GlobalAveragePooling1D(name="gap")(x)
    elif pooling == "last":
        # Keep only the final timestep.
        # NOTE(review): a Lambda wrapping a Python lambda may not reload cleanly from a
        # saved model file - confirm before relying on the saved .h5 artifacts.
        x = layers.Lambda(lambda z: z[:, -1, :], name="last_timestep")(x)
    else:
        x = layers.Flatten(name="flatten")(x)

    dense_units = hp.Int("dense_units", 8, 128, step=8, default=32)
    x = layers.Dense(dense_units, activation="relu", name="dense")(x)
    x = layers.Dropout(dropout_rate)(x)
    outputs = layers.Dense(1, name="output")(x)

    model = keras.Model(inputs=inputs, outputs=outputs, name="Attention_LSTM")
    # Learning rate is sampled on a log scale between 1e-4 and 1e-2.
    lr = hp.Float("lr", 1e-4, 1e-2, sampling="log", default=1e-3)
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=lr), loss="mse", metrics=["mae"])
    # Expose the attention layer as a plain Python attribute for later inspection.
    model.mha_layer = mha
    return model

def build_baseline_lstm(input_shape, lstm_units=64, dropout_rate=0.2, dense_units=32, lr=1e-3):
    """Build and compile the attention-free reference model.

    Layers: LSTM (final state only) -> Dropout -> Dense(relu) -> Dense(1).

    Args:
        input_shape: shape of one input window, passed to keras.Input.
        lstm_units: number of units in the LSTM layer.
        dropout_rate: rate of the Dropout layer after the LSTM.
        dense_units: width of the hidden Dense layer.
        lr: Adam learning rate.

    Returns:
        A compiled keras.Model named "Baseline_LSTM" (MSE loss, MAE metric).
    """
    window_in = keras.Input(shape=input_shape, name="inputs")
    encoded = layers.LSTM(lstm_units, return_sequences=False, name="lstm_baseline")(window_in)
    regularized = layers.Dropout(dropout_rate)(encoded)
    hidden = layers.Dense(dense_units, activation="relu")(regularized)
    prediction = layers.Dense(1)(hidden)

    baseline = keras.Model(inputs=window_in, outputs=prediction, name="Baseline_LSTM")
    baseline.compile(
        optimizer=keras.optimizers.Adam(learning_rate=lr),
        loss="mse",
        metrics=["mae"],
    )
    return baseline

# -------------------- Keras Tuner setup --------------------
def tuner_search(train_scaled, feature_cols, target_col, max_trials=MAX_TRIALS, lookback=24):
    """Run a Keras-Tuner random search for the Attention-LSTM at a fixed lookback.

    The lookback determines the model's input shape, so it is held constant
    within one search. To scan it as well, call this function once per
    candidate value (e.g. 12/24/36/48) and compare the results.

    Args:
        train_scaled: scaled training DataFrame containing features and target.
        feature_cols: list of input column names.
        target_col: name of the target column.
        max_trials: number of hyperparameter configurations to try.
        lookback: window length used to build the training sequences.

    Returns:
        Tuple ``(best_model, best_hp, lookback)``: the best model reported by
        the tuner, its hyperparameters, and the lookback that was used.

    Side effects:
        Writes tuner state under ``OUT_DIR/kt_dir`` (overwriting any previous
        search) and saves the best model to ``OUT_DIR/attention_lstm_best.h5``.
    """
    # Build the windows once; every trial trains on the same arrays.
    X_train_full, y_train_full = create_sequences(train_scaled, feature_cols, target_col, lookback)
    print("Tuner will search with LOOKBACK =", lookback, "X_train_full.shape=", X_train_full.shape)

    def kt_model_builder(hp):
        # Bind the fixed input shape; the tuner only supplies `hp`.
        return build_attention_lstm_model(hp, input_shape=X_train_full.shape[1:])

    tuner = kt.RandomSearch(
        kt_model_builder,
        objective=kt.Objective("val_loss", direction="min"),
        max_trials=max_trials,
        executions_per_trial=EXECUTION_PER_TRIAL,
        directory=os.path.join(OUT_DIR, "kt_dir"),
        project_name="att_lstm_search",
        overwrite=True,
        seed=RANDOM_SEED,
    )

    # Each trial stops once val_loss has not improved for 6 epochs.
    stop_early = keras.callbacks.EarlyStopping(monitor="val_loss", patience=6, restore_best_weights=True)
    tuner.search(
        X_train_full,
        y_train_full,
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        validation_split=0.1,
        callbacks=[stop_early],
        verbose=2,
    )
    best_hp = tuner.get_best_hyperparameters(num_trials=1)[0]
    best_model = tuner.get_best_models(num_models=1)[0]
    # Persist the tuner's best model.
    best_model.save(os.path.join(OUT_DIR, "attention_lstm_best.h5"))
    return best_model, best_hp, lookback

# -------------------- Run tuner (Attention-LSTM) --------------------
save_report("\n\n=== Hyperparameter tuning for Attention-LSTM ===")
print("Starting hyperparameter search (this may take some time)...")
best_model, best_hp, chosen_lookback = tuner_search(
    train_scaled,
    feature_cols,
    target_col,
    max_trials=MAX_TRIALS,
)

# Record what the search settled on.
save_report(f"Chosen LOOKBACK (used for tuner run): {chosen_lookback}")
save_report("Best hyperparameters found:")
for hp_name in best_hp.values:
    save_report(f"  {hp_name}: {best_hp.get(hp_name)}")

# -------------------- Prepare final train/test sequences using chosen_lookback --------------------
LOOKBACK = chosen_lookback
# Same chronological split as before; the test frame starts LOOKBACK rows early so the
# first test window can be built.
train_df_final = df.iloc[:n_train].reset_index(drop=True)
test_df_final = df.iloc[n_train - LOOKBACK:].reset_index(drop=True)


def _scale_with_fitted_scalers(frame):
    """Return a copy of ``frame`` transformed by the already-fitted scalers."""
    scaled = frame.copy()
    scaled[feature_cols] = feat_scaler.transform(frame[feature_cols])
    scaled[target_col] = tgt_scaler.transform(frame[[target_col]])
    return scaled


train_scaled_final = _scale_with_fitted_scalers(train_df_final)
test_scaled_final = _scale_with_fitted_scalers(test_df_final)

X_train, y_train = create_sequences(train_scaled_final, feature_cols, target_col, LOOKBACK)
X_test, y_test = create_sequences(test_scaled_final, feature_cols, target_col, LOOKBACK)

print("Final shapes -> X_train:", X_train.shape, "X_test:", X_test.shape)

# -------------------- Rebuild best Attention-LSTM architecture with best_hp and re-train on full train set --------------------
def build_model_from_hp(hp, input_shape):
    """Return a fresh compiled Attention-LSTM built from ``hp`` for ``input_shape``."""
    return build_attention_lstm_model(hp, input_shape)

# Fresh (untrained) Attention-LSTM with the best hyperparameters found by the tuner.
att_model = build_model_from_hp(best_hp, input_shape=X_train.shape[1:])
att_model.summary()

# One early-stopping callback list, passed to both fits below.
callbacks = [
    keras.callbacks.EarlyStopping(
        monitor="val_loss",
        patience=6,
        restore_best_weights=True,
    )
]
history_att = att_model.fit(
    X_train,
    y_train,
    validation_split=0.1,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=callbacks,
    verbose=2,
)

# Save model
att_model.save(os.path.join(OUT_DIR, "attention_lstm_trained.h5"))

# -------------------- Baseline LSTM training --------------------
# For a fair comparison the baseline borrows the tuned LSTM width, dropout rate and
# learning rate, falling back to fixed defaults when a value was not tuned.
baseline_units = int(best_hp.get("lstm_units")) if "lstm_units" in best_hp.values else 64
baseline_dropout = float(best_hp.get("dropout")) if "dropout" in best_hp.values else 0.2
baseline_lr = float(best_hp.get("lr")) if "lr" in best_hp.values else 1e-3
baseline_model = build_baseline_lstm(
    input_shape=X_train.shape[1:],
    lstm_units=baseline_units,
    dropout_rate=baseline_dropout,
    dense_units=32,
    lr=baseline_lr,
)
baseline_model.summary()
history_base = baseline_model.fit(
    X_train,
    y_train,
    validation_split=0.1,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=callbacks,
    verbose=2,
)
baseline_model.save(os.path.join(OUT_DIR, "baseline_lstm.h5"))

# -------------------- Predictions, inverse-scaling and evaluation --------------------
y_pred_att = att_model.predict(X_test, batch_size=BATCH_SIZE).squeeze()
y_pred_base = baseline_model.predict(X_test, batch_size=BATCH_SIZE).squeeze()


def _to_original_units(scaled_values):
    """Undo the target standardization and return a flat array."""
    return tgt_scaler.inverse_transform(scaled_values.reshape(-1, 1)).ravel()


# Metrics are reported in the target's original units.
y_pred_att_inv = _to_original_units(y_pred_att)
y_pred_base_inv = _to_original_units(y_pred_base)
y_test_inv = _to_original_units(y_test)

# Same three scores for both models, computed on the same held-out windows.
metrics = {
    "attention_rmse": rmse(y_test_inv, y_pred_att_inv),
    "attention_mae": mean_absolute_error(y_test_inv, y_pred_att_inv),
    "attention_r2": r2_score(y_test_inv, y_pred_att_inv),
    "baseline_rmse": rmse(y_test_inv, y_pred_base_inv),
    "baseline_mae": mean_absolute_error(y_test_inv, y_pred_base_inv),
    "baseline_r2": r2_score(y_test_inv, y_pred_base_inv),
}

print("Evaluation metrics:", metrics)
# The report gets the scores as a small CSV-style table.
save_report("\n\n=== Final Evaluation Metrics (on held-out test set) ===")
save_report("Metric,Attention-LSTM,Baseline-LSTM")
save_report(f"RMSE,{metrics['attention_rmse']:.6f},{metrics['baseline_rmse']:.6f}")
save_report(f"MAE,{metrics['attention_mae']:.6f},{metrics['baseline_mae']:.6f}")
save_report(f"R2,{metrics['attention_r2']:.6f},{metrics['baseline_r2']:.6f}")

# One predictions file per model. The first LOOKBACK timestamps of the test frame are
# only ever inputs, so they are dropped to line "time" up with the targets.
times = test_df_final["time"].iloc[LOOKBACK:].reset_index(drop=True)
out_att = pd.DataFrame({"time": times, "y_true": y_test_inv, "pred_attention": y_pred_att_inv})
out_base = pd.DataFrame({"time": times, "y_true": y_test_inv, "pred_baseline": y_pred_base_inv})
out_att.to_csv(os.path.join(OUT_DIR, "predictions_attention.csv"), index=False)
out_base.to_csv(os.path.join(OUT_DIR, "predictions_baseline.csv"), index=False)
save_report(f"\nSaved predictions_attention.csv and predictions_baseline.csv to {OUT_DIR}")

# -------------------- Visualize attention weights --------------------
# The trained model does not output attention scores. To get them, run the inputs through
# a sub-model that stops at the last LSTM, then call the model's own MultiHeadAttention
# layer on that sequence with return_attention_scores=True.
layer_names = [lyr.name for lyr in att_model.layers]
print("Model layers:", layer_names)

# The sequence feeding the attention block comes from the second LSTM when it was enabled.
seq_layer = "lstm_2" if best_hp.get("use_second_lstm") else "lstm_1"

# Extractor model: inputs -> LSTM outputs (sequence)
extractor = keras.Model(inputs=att_model.input, outputs=att_model.get_layer(seq_layer).output)

# First layer that is a MultiHeadAttention instance or is named "mha".
mha_layer = next(
    (
        lyr
        for lyr in att_model.layers
        if isinstance(lyr, layers.MultiHeadAttention) or lyr.name == "mha"
    ),
    None,
)
if mha_layer is None:
    raise RuntimeError("MultiHeadAttention layer not found in trained model.")

# Visualize the first few test windows (at most 6).
num_examples = min(6, X_test.shape[0])
example_idx = list(range(num_examples))
X_examples = X_test[example_idx]

# Sequence outputs, shape (num_examples, lookback, units).
seq_outputs = extractor.predict(X_examples, batch_size=BATCH_SIZE)
# Self-attention on those outputs; scores have shape (batch, num_heads, query_len, key_len).
att_output, att_scores = mha_layer(
    query=seq_outputs,
    value=seq_outputs,
    key=seq_outputs,
    return_attention_scores=True,
)
# Average over heads -> (batch, query_len, key_len).
att_scores_avg = np.mean(att_scores, axis=1)

# One heatmap per example; both axes are labelled with offsets relative to the last input step.
sns.set()
relative_steps = np.arange(-LOOKBACK+1,1)
for i, head_avg_weights in enumerate(att_scores_avg):
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        head_avg_weights,
        xticklabels=relative_steps,
        yticklabels=relative_steps,
        cmap="viridis",
        ax=ax,
    )
    ax.set_title(f"Attention weights (avg heads) example idx {i}")
    ax.set_xlabel("Key timestep (relative index)")
    ax.set_ylabel("Query timestep (relative index)")
    img_path = os.path.join(OUT_DIR, f"att_heatmap_{i}.png")
    fig.savefig(img_path, bbox_inches="tight")
    plt.close(fig)
    save_report(f"Saved attention heatmap: {img_path}")

# -------------------- Analyze attention patterns (text) --------------------
# Static reading guide for the heatmaps, written to the report as a single entry.
analysis_text = (
    "\n\n=== Attention analysis ===\n"
    f"Displayed {num_examples} attention heatmaps (avg across {mha_layer.num_heads} heads).\n"
    "Rows = query positions (timesteps where model attends from); Columns = key positions (historical timesteps model attends to).\n"
    "Values are attention weights averaged across heads. Values close to 1 indicate high focus on that historical position for the given query.\n"
    "Interpretation guidance:\n"
    " - Diagonal dominance indicates the model mostly attends close-by timesteps.\n"
    " - Off-diagonal peaks indicate the model references older timesteps (seasonality) or abrupt shifts.\n"
    " - Compare heatmaps to dataset features/time ranges to see if attention aligns with seasonal lags or abrupt jump points.\n"
)
save_report(analysis_text)

# -------------------- Final outputs summary --------------------
# datetime.utcnow() is deprecated; take an aware UTC timestamp and drop tzinfo so the
# ISO string keeps the previous format (the message appends " UTC" itself).
end_time = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
summary = f"\n\nPipeline finished at {end_time} UTC.\nOutputs saved to {OUT_DIR}:\n - predictions_attention.csv\n - predictions_baseline.csv\n - attention_lstm_trained.h5\n - baseline_lstm.h5\n - att_heatmap_*.png\n - report.txt\n"
print(summary)
save_report(summary)


Trial 25 Complete [00h 00m 09s]
val_loss: 0.07886311411857605

Best val_loss So Far: 0.049884259700775146
Total elapsed time: 00h 04m 54s


  saveable.load_own_variables(weights_store.get(inner_path))


Final shapes -> X_train: (776, 24, 3) X_test: (200, 24, 3)


Epoch 1/40
22/22 - 4s - 199ms/step - loss: 0.7870 - mae: 0.6660 - val_loss: 0.5623 - val_mae: 0.6348
Epoch 2/40
22/22 - 0s - 13ms/step - loss: 0.3479 - mae: 0.4644 - val_loss: 0.1698 - val_mae: 0.3575
Epoch 3/40
22/22 - 0s - 12ms/step - loss: 0.2349 - mae: 0.3840 - val_loss: 0.0854 - val_mae: 0.2424
Epoch 4/40
22/22 - 0s - 12ms/step - loss: 0.1953 - mae: 0.3507 - val_loss: 0.0955 - val_mae: 0.2316
Epoch 5/40
22/22 - 0s - 12ms/step - loss: 0.2010 - mae: 0.3482 - val_loss: 0.0662 - val_mae: 0.2001
Epoch 6/40
22/22 - 0s - 13ms/step - loss: 0.1744 - mae: 0.3268 - val_loss: 0.0950 - val_mae: 0.2517
Epoch 7/40
22/22 - 0s - 12ms/step - loss: 0.1676 - mae: 0.3203 - val_loss: 0.1223 - val_mae: 0.2835
Epoch 8/40
22/22 - 0s - 12ms/step - loss: 0.1513 - mae: 0.3062 - val_loss: 0.0785 - val_mae: 0.2281
Epoch 9/40
22/22 - 0s - 12ms/step - loss: 0.1480 - mae: 0.3038 - val_loss: 0.1067 - val_mae: 0.2677
Epoch 10/40
22/22 - 0s - 13ms/step - loss: 0.1361 - mae: 0.2936 - val_loss: 0.1913 - val_mae: 0.358



Epoch 1/40
22/22 - 2s - 74ms/step - loss: 0.3287 - mae: 0.4438 - val_loss: 0.1509 - val_mae: 0.3161
Epoch 2/40
22/22 - 0s - 9ms/step - loss: 0.1133 - mae: 0.2691 - val_loss: 0.0899 - val_mae: 0.2408
Epoch 3/40
22/22 - 0s - 13ms/step - loss: 0.0872 - mae: 0.2364 - val_loss: 0.0588 - val_mae: 0.1972
Epoch 4/40
22/22 - 0s - 9ms/step - loss: 0.0759 - mae: 0.2232 - val_loss: 0.0575 - val_mae: 0.1896
Epoch 5/40
22/22 - 0s - 9ms/step - loss: 0.0732 - mae: 0.2180 - val_loss: 0.0575 - val_mae: 0.1844
Epoch 6/40
22/22 - 0s - 13ms/step - loss: 0.0743 - mae: 0.2190 - val_loss: 0.0585 - val_mae: 0.1903
Epoch 7/40
22/22 - 0s - 9ms/step - loss: 0.0717 - mae: 0.2166 - val_loss: 0.0597 - val_mae: 0.1951
Epoch 8/40
22/22 - 0s - 9ms/step - loss: 0.0713 - mae: 0.2152 - val_loss: 0.0579 - val_mae: 0.1918
Epoch 9/40
22/22 - 0s - 8ms/step - loss: 0.0678 - mae: 0.2114 - val_loss: 0.0621 - val_mae: 0.2025
Epoch 10/40
22/22 - 0s - 9ms/step - loss: 0.0664 - mae: 0.2076 - val_loss: 0.0594 - val_mae: 0.1954




[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
Evaluation metrics: {'attention_rmse': np.float64(2.8022608678015724), 'attention_mae': 2.3888521617375584, 'attention_r2': 0.1399973478788371, 'baseline_rmse': np.float64(1.7144665868351214), 'baseline_mae': 1.42129190401299, 'baseline_r2': 0.678085367766507}
Model layers: ['inputs', 'lstm_1', 'lstm_2', 'dropout', 'mha', 'res_add', 'att_layernorm', 'last_timestep', 'dense', 'dropout_3', 'output']




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 155ms/step


Pipeline finished at 2025-11-24T08:51:27.215089 UTC.
Outputs saved to /mnt/data:
 - predictions_attention.csv
 - predictions_baseline.csv
 - attention_lstm_trained.h5
 - baseline_lstm.h5
 - att_heatmap_*.png
 - report.txt



  end_time = datetime.utcnow().isoformat()
