<a href="https://colab.research.google.com/github/Leslyndizeye/Time-Series-Forecasting/blob/main/air_quality_forecasting_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### IMPORT LIBRARIES

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
import time
from datetime import datetime

# TensorFlow imports
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import (LSTM, Dense, Dropout, Bidirectional, GRU,
                                   Conv1D, MaxPooling1D, Flatten, Input,
                                   Concatenate, Attention, MultiHeadAttention,
                                   LayerNormalization, GlobalAveragePooling1D)
from tensorflow.keras.optimizers import Adam, RMSprop, Nadam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.regularizers import l1_l2

# Scikit-learn imports
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import TimeSeriesSplit

# Configuration
plt.style.use('default')
warnings.filterwarnings('ignore')
np.random.seed(42)
tf.random.set_seed(42)

DATA LOADING & ENHANCED PREPROCESSING

In [3]:
from google.colab import drive
drive.mount('/content/drive')

# Load datasets
train = pd.read_csv('/content/drive/My Drive/air_quality/data/train.csv')
test = pd.read_csv('/content/drive/My Drive/air_quality/data/test.csv')

print(" Data Loading Complete:")
print(f"Training Data Shape: {train.shape}")
print(f"Test Data Shape: {test.shape}")

# Enhanced feature engineering with more advanced features
def create_advanced_features_v2(df, is_training=True):
    """Enhanced feature engineering with more sophisticated features"""
    df = df.copy()
    df['datetime'] = pd.to_datetime(df['datetime'])

    # Extended time features
    df['hour'] = df['datetime'].dt.hour
    df['day_of_week'] = df['datetime'].dt.dayofweek
    df['month'] = df['datetime'].dt.month
    df['quarter'] = df['datetime'].dt.quarter
    df['day_of_year'] = df['datetime'].dt.dayofyear
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
    df['is_night'] = ((df['hour'] >= 22) | (df['hour'] <= 6)).astype(int)
    df['is_rush_hour'] = ((df['hour'] >= 7) & (df['hour'] <= 9)) | ((df['hour'] >= 16) & (df['hour'] <= 18))
    df['is_rush_hour'] = df['is_rush_hour'].astype(int)

    # Advanced cyclical encoding
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
    df['day_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
    df['day_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)

    # Advanced weather interactions
    df['temp_dew_diff'] = df['TEMP'] - df['DEWP']
    df['wind_pressure'] = df['Iws'] * df['PRES']
    df['temp_humidity_index'] = df['TEMP'] * df['DEWP'] / 100
    df['wind_temp_ratio'] = df['Iws'] / (df['TEMP'] + 1e-6)
    df['pressure_normalized'] = (df['PRES'] - df['PRES'].mean()) / df['PRES'].std()

    # Polynomial features
    df['temp_squared'] = df['TEMP'] ** 2
    df['dew_squared'] = df['DEWP'] ** 2
    df['wind_squared'] = df['Iws'] ** 2

    # Only create PM2.5 based features for training data
    if is_training and 'pm2.5' in df.columns:
        # Advanced rolling statistics
        windows = [3, 6, 12, 24, 48]
        for window in windows:
            df[f'pm2.5_roll_mean_{window}'] = df['pm2.5'].rolling(window=window, min_periods=1).mean()
            df[f'pm2.5_roll_std_{window}'] = df['pm2.5'].rolling(window=window, min_periods=1).std()
            df[f'pm2.5_roll_min_{window}'] = df['pm2.5'].rolling(window=window, min_periods=1).min()
            df[f'pm2.5_roll_max_{window}'] = df['pm2.5'].rolling(window=window, min_periods=1).max()
            df[f'pm2.5_roll_range_{window}'] = df[f'pm2.5_roll_max_{window}'] - df[f'pm2.5_roll_min_{window}']

        # Extended lag features
        lags = [1, 2, 3, 6, 12, 24, 48]
        for lag in lags:
            df[f'pm2.5_lag_{lag}'] = df['pm2.5'].shift(lag)

        # Trend features
        df['pm2.5_trend_1h'] = df['pm2.5'] - df['pm2.5'].shift(1)
        df['pm2.5_trend_3h'] = df['pm2.5'] - df['pm2.5'].shift(3)
        df['pm2.5_trend_6h'] = df['pm2.5'] - df['pm2.5'].shift(6)
        df['pm2.5_trend_12h'] = df['pm2.5'] - df['pm2.5'].shift(12)

        # Momentum features
        df['pm2.5_momentum_3h'] = df['pm2.5_trend_1h'].rolling(window=3).mean()
        df['pm2.5_momentum_6h'] = df['pm2.5_trend_1h'].rolling(window=6).mean()

    # Weather lag features for both train and test
    weather_cols = ['TEMP', 'DEWP', 'PRES', 'Iws', 'Is', 'Ir']
    for col in weather_cols:
        if col in df.columns:
            for lag in [1, 3, 6, 12]:
                df[f'{col}_lag_{lag}'] = df[col].shift(lag)

    # Interaction features between weather variables
    df['temp_dew_pressure'] = df['TEMP'] * df['DEWP'] * df['PRES']
    df['wind_is_interaction'] = df['Iws'] * df['Is']
    df['wind_ir_interaction'] = df['Iws'] * df['Ir']

    return df

# Apply enhanced feature engineering
train_filled = train.ffill().bfill().fillna(train.mean(numeric_only=True))
test_filled = test.ffill().bfill().fillna(test.mean(numeric_only=True))

print(" Creating advanced features v2...")
train_enhanced = create_advanced_features_v2(train_filled, is_training=True)
test_enhanced = create_advanced_features_v2(test_filled, is_training=False)

# Handle NaN values
train_enhanced = train_enhanced.ffill().bfill().fillna(0)
test_enhanced = test_enhanced.ffill().bfill().fillna(0)

print(f" Enhanced features - Train: {train_enhanced.shape}, Test: {test_enhanced.shape}")

# Prepare features and target
X_train = train_enhanced.drop(['pm2.5', 'No', 'datetime'], axis=1, errors='ignore')
y_train = train_enhanced['pm2.5']
X_test = test_enhanced.drop(['No', 'datetime'], axis=1, errors='ignore')

# Ensure both have the same columns
common_cols = list(set(X_train.columns) & set(X_test.columns))
X_train = X_train[common_cols]
X_test = X_test[common_cols]

print(f" Common features: {len(common_cols)}")

# Feature selection with more features
selector = SelectKBest(score_func=f_regression, k=min(40, len(common_cols)))
X_train_selected = selector.fit_transform(X_train, y_train)
selected_mask = selector.get_support()
selected_features = X_train.columns[selected_mask]

X_train = X_train[selected_features]
X_test = X_test[selected_features]

print(f" Selected {len(selected_features)} best features")

# Scale features using RobustScaler
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


Mounted at /content/drive
 Data Loading Complete:
Training Data Shape: (30676, 12)
Test Data Shape: (13148, 11)
 Creating advanced features v2...
 Enhanced features - Train: (30676, 99), Test: (13148, 60)
 Common features: 58
 Selected 40 best features


### MODEL ARCHITECTURES

In [4]:
# Create sequences for LSTM
def create_sequences(X, y, time_steps=48):
    Xs, ys = [], []
    for i in range(len(X) - time_steps):
        Xs.append(X[i:(i + time_steps)])
        ys.append(y[i + time_steps])
    return np.array(Xs), np.array(ys)

TIME_STEPS = 48  # Longer sequences for better temporal patterns
X_seq, y_seq = create_sequences(X_train_scaled, y_train.values, TIME_STEPS)

# Split data
split_idx = int(0.8 * len(X_seq))
X_train_seq, X_val_seq = X_seq[:split_idx], X_seq[split_idx:]
y_train_seq, y_val_seq = y_seq[:split_idx], y_seq[split_idx:]

print(f" Sequential data - Train: {X_train_seq.shape}, Val: {X_val_seq.shape}")

# 1. CNN-LSTM Hybrid Model
def create_cnn_lstm_model(input_shape):
    """CNN-LSTM hybrid architecture"""
    model = Sequential([
        # CNN layers for feature extraction
        Conv1D(filters=64, kernel_size=3, activation='relu',
               input_shape=input_shape, padding='same'),
        MaxPooling1D(pool_size=2),
        Conv1D(filters=128, kernel_size=3, activation='relu', padding='same'),
        MaxPooling1D(pool_size=2),

        # LSTM layers for sequence processing
        Bidirectional(LSTM(64, return_sequences=True)),
        Dropout(0.3),
        Bidirectional(LSTM(32)),
        Dropout(0.3),

        # Dense layers
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(32, activation='relu'),
        Dense(1)
    ])

    model.compile(optimizer=Adam(learning_rate=0.0005),
                  loss='mse',
                  metrics=['mae'])
    return model

# 2. Transformer-inspired Architecture
def create_transformer_model(input_shape):
    """Transformer-inspired architecture for time series"""
    inputs = Input(shape=input_shape)

    # Positional encoding (simplified)
    x = LayerNormalization()(inputs)

    # Multi-head self-attention
    attention_output = MultiHeadAttention(num_heads=4, key_dim=input_shape[1])(x, x)
    x = tf.keras.layers.Add()([x, attention_output])
    x = LayerNormalization()(x)

    # Feed-forward network
    ff_output = Dense(64, activation='relu')(x)
    ff_output = Dense(input_shape[1])(ff_output)
    x = tf.keras.layers.Add()([x, ff_output])
    x = LayerNormalization()(x)

    # Global pooling and output
    x = GlobalAveragePooling1D()(x)
    x = Dense(32, activation='relu')(x)
    x = Dropout(0.2)(x)
    outputs = Dense(1)(x)

    model = Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer=Adam(learning_rate=0.0005),
                  loss='mse',
                  metrics=['mae'])
    return model

# 3. GRU with Attention
def create_gru_attention_model(input_shape):
    """GRU with attention mechanism"""
    inputs = Input(shape=input_shape)

    # GRU layers
    x = Bidirectional(GRU(64, return_sequences=True))(inputs)
    x = Dropout(0.3)(x)
    x = Bidirectional(GRU(32, return_sequences=True))(x)

    # Attention mechanism
    attention = Attention()([x, x])
    x = tf.keras.layers.Concatenate()([x, attention])
    x = GlobalAveragePooling1D()(x)

    # Output layers
    x = Dense(64, activation='relu')(x)
    x = Dropout(0.2)(x)
    x = Dense(32, activation='relu')(x)
    outputs = Dense(1)(x)

    model = Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer=Adam(learning_rate=0.0005),
                  loss='mse',
                  metrics=['mae'])
    return model

# 4. Enhanced Bidirectional LSTM (baseline for comparison)
def create_enhanced_lstm_model(input_shape):
    """Enhanced LSTM architecture"""
    model = Sequential([
        Bidirectional(LSTM(128, return_sequences=True,
                          kernel_regularizer=l1_l2(l1=1e-5, l2=1e-4),
                          input_shape=input_shape)),
        Dropout(0.3),
        Bidirectional(LSTM(64, return_sequences=True,
                          kernel_regularizer=l1_l2(l1=1e-5, l2=1e-4))),
        Dropout(0.3),
        Bidirectional(LSTM(32)),
        Dropout(0.2),
        Dense(64, activation='relu', kernel_regularizer=l1_l2(l1=1e-5, l2=1e-4)),
        Dropout(0.2),
        Dense(32, activation='relu'),
        Dense(1)
    ])

    model.compile(optimizer=Adam(learning_rate=0.0005),
                  loss='mse',
                  metrics=['mae'])
    return model

# Create all models
models = {
    'CNN-LSTM': create_cnn_lstm_model((X_train_seq.shape[1], X_train_seq.shape[2])),
    'Transformer': create_transformer_model((X_train_seq.shape[1], X_train_seq.shape[2])),
    'GRU-Attention': create_gru_attention_model((X_train_seq.shape[1], X_train_seq.shape[2])),
    'Enhanced-LSTM': create_enhanced_lstm_model((X_train_seq.shape[1], X_train_seq.shape[2]))
}

 Sequential data - Train: (24502, 48, 40), Val: (6126, 48, 40)


 MODEL TRAINING & ENSEMBLE

In [5]:
model_predictions = {}
model_performance = {}
trained_models = {}

# Callbacks
callbacks = [
    EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6),
    ModelCheckpoint('/content/best_model.h5', monitor='val_loss', save_best_only=True)
]

# Train each model
for name, model in models.items():
    print(f"\n Training {name} model...")
    start_time = time.time()

    history = model.fit(
        X_train_seq, y_train_seq,
        validation_data=(X_val_seq, y_val_seq),
        epochs=20,
        batch_size=64,
        callbacks=callbacks,
        verbose=1
    )

    # Evaluate
    val_pred = model.predict(X_val_seq, verbose=0)
    val_rmse = np.sqrt(mean_squared_error(y_val_seq, val_pred))
    val_mae = mean_absolute_error(y_val_seq, val_pred)

    training_time = time.time() - start_time

    model_performance[name] = {
        'rmse': val_rmse,
        'mae': val_mae,
        'time': training_time
    }

    trained_models[name] = model
    model_predictions[name] = val_pred

    print(f" {name} - RMSE: {val_rmse:.2f}, MAE: {val_mae:.2f}, Time: {training_time:.1f}s")

# Display performance comparison
print("\n MODEL PERFORMANCE COMPARISON:")
performance_df = pd.DataFrame(model_performance).T
print(performance_df.sort_values('rmse'))


 Training CNN-LSTM model...
Epoch 1/40
[1m382/383[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 65ms/step - loss: 12422.4883 - mae: 78.9233



[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 74ms/step - loss: 12406.5244 - mae: 78.8720 - val_loss: 8423.0898 - val_mae: 63.1609 - learning_rate: 5.0000e-04
Epoch 2/40
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step - loss: 4955.7271 - mae: 49.3647



[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 74ms/step - loss: 4954.5083 - mae: 49.3573 - val_loss: 5856.2446 - val_mae: 51.6159 - learning_rate: 5.0000e-04
Epoch 3/40
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step - loss: 3913.7556 - mae: 42.3369



[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 76ms/step - loss: 3913.1609 - mae: 42.3335 - val_loss: 5700.0288 - val_mae: 48.4007 - learning_rate: 5.0000e-04
Epoch 4/40
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step - loss: 3351.4199 - mae: 39.0266



[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 73ms/step - loss: 3350.9895 - mae: 39.0243 - val_loss: 5516.6758 - val_mae: 47.9236 - learning_rate: 5.0000e-04
Epoch 5/40
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 73ms/step - loss: 2935.0027 - mae: 36.4711 - val_loss: 5617.8784 - val_mae: 49.9121 - learning_rate: 5.0000e-04
Epoch 6/40
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 73ms/step - loss: 2639.1487 - mae: 34.8478 - val_loss: 5630.6841 - val_mae: 49.3024 - learning_rate: 5.0000e-04
Epoch 7/40
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 73ms/step - loss: 2446.8313 - mae: 33.4321 - val_loss: 5718.6216 - val_mae: 50.0917 - learning_rate: 5.0000e-04
Epoch 8/40
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 73ms/step - loss: 2217.5967 - mae: 31.9911 - val_loss: 5602.4956 - val_mae: 49.3787 - learning_rate: 5.0000e-04
Epoch 9/40
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m



[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 70ms/step - loss: 1745.0286 - mae: 28.5455 - val_loss: 5413.1777 - val_mae: 48.2280 - learning_rate: 2.5000e-04
Epoch 11/40
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 70ms/step - loss: 1643.4750 - mae: 27.8585 - val_loss: 5475.1699 - val_mae: 48.9841 - learning_rate: 2.5000e-04
Epoch 12/40
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 75ms/step - loss: 1593.7812 - mae: 27.0891 - val_loss: 5669.4795 - val_mae: 49.7336 - learning_rate: 2.5000e-04
Epoch 13/40
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 70ms/step - loss: 1498.4912 - mae: 26.4464 - val_loss: 5541.4834 - val_mae: 48.9130 - learning_rate: 2.5000e-04
Epoch 14/40
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 68ms/step - loss: 1406.3370 - mae: 25.8179 - val_loss: 5577.5278 - val_mae: 49.1341 - learning_rate: 2.5000e-04
Epoch 15/40
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m



[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 157ms/step - loss: 3288.5425 - mae: 38.7926 - val_loss: 5312.2847 - val_mae: 48.4452 - learning_rate: 5.0000e-04
Epoch 7/40
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 155ms/step - loss: 3089.5129 - mae: 37.3938 - val_loss: 5675.5918 - val_mae: 48.6585 - learning_rate: 5.0000e-04
Epoch 8/40
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 148ms/step - loss: 2974.2092 - mae: 36.5208



[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 161ms/step - loss: 2974.0786 - mae: 36.5202 - val_loss: 5231.9746 - val_mae: 47.4516 - learning_rate: 5.0000e-04
Epoch 9/40
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 155ms/step - loss: 2824.5845 - mae: 35.5273 - val_loss: 5337.0151 - val_mae: 47.3150 - learning_rate: 5.0000e-04
Epoch 10/40
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 147ms/step - loss: 2804.5728 - mae: 35.3106



[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 161ms/step - loss: 2804.4832 - mae: 35.3100 - val_loss: 5167.7896 - val_mae: 46.2713 - learning_rate: 5.0000e-04
Epoch 11/40
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 147ms/step - loss: 2640.3362 - mae: 34.4620



[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 161ms/step - loss: 2640.2329 - mae: 34.4613 - val_loss: 5164.2383 - val_mae: 46.4017 - learning_rate: 5.0000e-04
Epoch 12/40
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 155ms/step - loss: 2537.9465 - mae: 33.7830 - val_loss: 5301.7139 - val_mae: 46.2964 - learning_rate: 5.0000e-04
Epoch 13/40
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 155ms/step - loss: 2449.6648 - mae: 33.0812 - val_loss: 5211.3599 - val_mae: 45.4903 - learning_rate: 5.0000e-04
Epoch 14/40
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 161ms/step - loss: 2301.2874 - mae: 32.1203 - val_loss: 5263.5850 - val_mae: 45.9777 - learning_rate: 5.0000e-04
Epoch 15/40
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 156ms/step - loss: 2236.5735 - mae: 31.7276 - val_loss: 5172.9844 - val_mae: 45.5823 - learning_rate: 5.0000e-04
Epoch 16/40
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━

ENSEMBLE METHODS

In [7]:
# 1. Simple Average Ensemble
val_predictions = np.array(list(model_predictions.values()))
average_ensemble_pred = np.mean(val_predictions, axis=0)
ensemble_rmse = np.sqrt(mean_squared_error(y_val_seq, average_ensemble_pred))
ensemble_mae = mean_absolute_error(y_val_seq, average_ensemble_pred)

print(f" Average Ensemble - RMSE: {ensemble_rmse:.2f}, MAE: {ensemble_mae:.2f}")

# 2. Weighted Average Ensemble (weight by performance)
weights = 1.0 / np.array([perf['rmse'] for perf in model_performance.values()])
weights /= weights.sum()
weighted_ensemble_pred = np.average(val_predictions, axis=0, weights=weights)
weighted_rmse = np.sqrt(mean_squared_error(y_val_seq, weighted_ensemble_pred))
weighted_mae = mean_absolute_error(y_val_seq, weighted_ensemble_pred)

print(f" Weighted Ensemble - RMSE: {weighted_rmse:.2f}, MAE: {weighted_mae:.2f}")

# 3. Stacking Ensemble with Meta-Learner
# Use model predictions as features for meta-learner
stacking_features = np.column_stack(list(model_predictions.values()))

# Train meta-learner (Gradient Boosting)
meta_learner = GradientBoostingRegressor(n_estimators=100, random_state=42)
meta_learner.fit(stacking_features, y_val_seq)

# Predict with meta-learner
stacking_pred = meta_learner.predict(stacking_features)
stacking_rmse = np.sqrt(mean_squared_error(y_val_seq, stacking_pred))
stacking_mae = mean_absolute_error(y_val_seq, stacking_pred)

print(f" Stacking Ensemble - RMSE: {stacking_rmse:.2f}, MAE: {stacking_mae:.2f}")

# Choose best ensemble method
ensemble_results = {
    'Average': ensemble_rmse,
    'Weighted': weighted_rmse,
    'Stacking': stacking_rmse
}

best_ensemble_method = min(ensemble_results, key=ensemble_results.get)
print(f"\n Best Ensemble Method: {best_ensemble_method} (RMSE: {ensemble_results[best_ensemble_method]:.2f})")

 Average Ensemble - RMSE: 70.99, MAE: 47.06
 Weighted Ensemble - RMSE: 70.75, MAE: 46.79
 Stacking Ensemble - RMSE: 58.31, MAE: 39.49

 Best Ensemble Method: Stacking (RMSE: 58.31)


FINAL PREDICTION & SUBMISSION

In [8]:
X_test_seq = []
current_sequence = X_train_scaled[-TIME_STEPS:].copy()

for i in range(len(X_test_scaled)):
    current_sequence = np.roll(current_sequence, -1, axis=0)
    current_sequence[-1] = X_test_scaled[i]
    X_test_seq.append(current_sequence.copy())

X_test_seq = np.array(X_test_seq)
print(f"✅ Test sequences shape: {X_test_seq.shape}")

# Generate predictions from all models
test_predictions_all = {}
for name, model in trained_models.items():
    test_predictions_all[name] = model.predict(X_test_seq, verbose=0).flatten()
    print(f" {name} predictions generated")

# Create ensemble prediction
if best_ensemble_method == 'Average':
    ensemble_test_pred = np.mean(list(test_predictions_all.values()), axis=0)
elif best_ensemble_method == 'Weighted':
    ensemble_test_pred = np.average(list(test_predictions_all.values()), axis=0, weights=weights)
else:  # Stacking
    stacking_test_features = np.column_stack(list(test_predictions_all.values()))
    ensemble_test_pred = meta_learner.predict(stacking_test_features)

# Ensure correct length
test_original = pd.read_csv('/content/drive/My Drive/air_quality/data/test.csv')
if len(ensemble_test_pred) > len(test_original):
    ensemble_test_pred = ensemble_test_pred[:len(test_original)]
elif len(ensemble_test_pred) < len(test_original):
    last_pred = ensemble_test_pred[-1] if len(ensemble_test_pred) > 0 else 0
    pad_needed = len(test_original) - len(ensemble_test_pred)
    ensemble_test_pred = np.append(ensemble_test_pred, [last_pred] * pad_needed)

# Create submission with proper format
def remove_leading_zeros(dt_str):
    """Remove leading zeros from datetime string"""
    if ' ' in str(dt_str) and ':' in str(dt_str):
        date_part, time_part = str(dt_str).split(' ')
        time_parts = time_part.split(':')
        if time_parts[0].startswith('0') and len(time_parts[0]) == 2:
            time_parts[0] = time_parts[0][1]
        return f"{date_part} {':'.join(time_parts)}"
    return str(dt_str)

formatted_dates = [remove_leading_zeros(dt) for dt in test_original['datetime']]

submission = pd.DataFrame({
    'row ID': formatted_dates,
    'pm2.5': np.clip(ensemble_test_pred, 0, None).astype(int)
})

# Save submission
save_dir = '/content/drive/MyDrive/Kaggle_competition_ML/air_quality_forcasting'
os.makedirs(save_dir, exist_ok=True)
submission_file = os.path.join(save_dir, 'submission_ensemble.csv')
submission.to_csv(submission_file, index=False)

print(f" Ensemble submission saved: {submission_file}")

# Verify submission
print("\n Submission verification:")
print(f"Shape: {submission.shape}")
print("First 5 rows:")
for i in range(5):
    print(f"  {submission['row ID'].iloc[i]}, {submission['pm2.5'].iloc[i]}")

✅ Test sequences shape: (13148, 48, 40)
✅ CNN-LSTM predictions generated
✅ Transformer predictions generated
✅ GRU-Attention predictions generated
✅ Enhanced-LSTM predictions generated
✅ Ensemble submission saved: /content/drive/MyDrive/Kaggle_competition_ML/air_quality_forcasting/submission_ensemble.csv

🔍 Submission verification:
Shape: (13148, 2)
First 5 rows:
  2013-07-02 4:00:00, 30
  2013-07-02 5:00:00, 30
  2013-07-02 6:00:00, 30
  2013-07-02 7:00:00, 30
  2013-07-02 8:00:00, 30


 SECTION 7: COMPREHENSIVE ANALYSIS

In [10]:
print(" FINAL RESULTS SUMMARY:")
print(f"Best Individual Model: {performance_df['rmse'].idxmin()} (RMSE: {performance_df['rmse'].min():.2f})")
print(f"Best Ensemble Method: {best_ensemble_method} (RMSE: {ensemble_results[best_ensemble_method]:.2f})")

print("\n MODEL PERFORMANCE RANKING:")
performance_df = performance_df.sort_values('rmse')
print(performance_df)

print("\n ENSEMBLE PERFORMANCE:")
for method, rmse in ensemble_results.items():
    improvement = ((performance_df['rmse'].iloc[0] - rmse) / performance_df['rmse'].iloc[0]) * 100
    print(f"  {method}: RMSE {rmse:.2f} ({improvement:+.1f}% vs best individual)")

 FINAL RESULTS SUMMARY:
Best Individual Model: GRU-Attention (RMSE: 71.86)
Best Ensemble Method: Stacking (RMSE: 58.31)

 MODEL PERFORMANCE RANKING:
                    rmse        mae         time
GRU-Attention  71.862637  46.401725  1505.983170
CNN-LSTM       73.574297  48.228032   716.165731
Enhanced-LSTM  74.753373  48.676166  2536.789336
Transformer    86.490473  59.422206   606.348080

 ENSEMBLE PERFORMANCE:
  Average: RMSE 70.99 (+1.2% vs best individual)
  Weighted: RMSE 70.75 (+1.5% vs best individual)
  Stacking: RMSE 58.31 (+18.9% vs best individual)
