<a href="https://colab.research.google.com/github/Gunnalakshmi/Calculator/blob/main/Final_Project_ANNSIH_(Short_term_forecast_of_gaseous_air_pollutants_(ground_level_O3_and_NO2)_using_satellite_and_reanalysis_data).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, GRU, Dense, Dropout, MultiHeadAttention, Add, LayerNormalization, GlobalAveragePooling1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

In [None]:
# Input CSVs, one per site; the file name is used as the dictionary key throughout.
site_files = [f"site_{site_no}_train_data.csv" for site_no in (1, 2, 3)]

# Per-site containers filled in by the cells below. Frames, input scalers and
# input windows are keyed by file name; target scalers/windows are keyed by
# (file, pollutant) and `results` by (file, pollutant, model name).
dfs, train_dfs, test_dfs = {}, {}, {}
scaler_xs, scaler_ys = {}, {}
X_train_seqs, y_train_seqs = {}, {}
X_test_seqs, y_test_seqs = {}, {}
results = {}

# Number of consecutive rows in one model input window.
time_steps = 72

In [None]:
# Read every site's CSV into `dfs` (keyed by file name) and report its shape.
for file in site_files:
    dfs[file] = pd.read_csv(file)
    print(file, "initial shape:", dfs[file].shape)

site_1_train_data.csv initial shape: (25081, 16)
site_2_train_data.csv initial shape: (25969, 16)
site_3_train_data.csv initial shape: (21913, 16)


In [None]:
# Satellite columns that are removed from every site frame.
drop_cols = ['NO2_satellite', 'HCHO_satellite', 'ratio_satellite']
for file, df in dfs.items():
    # Drop only the columns that are actually present, in one in-place call,
    # so a site without one of them does not raise.
    present = [col for col in drop_cols if col in df.columns]
    df.drop(columns=present, inplace=True)
    print(file, "after dropping satellite cols:", df.shape)

site_1_train_data.csv after dropping satellite cols: (25081, 13)
site_2_train_data.csv after dropping satellite cols: (25969, 13)
site_3_train_data.csv after dropping satellite cols: (21913, 13)


In [None]:
# Assemble a timestamp from the year/month/day/hour columns, then store each
# site's rows in chronological order with a fresh 0..n-1 index.
for file in list(dfs):
    frame = dfs[file]
    frame['datetime'] = pd.to_datetime(frame[['year', 'month', 'day', 'hour']])
    ordered = frame.sort_values('datetime').reset_index(drop=True)
    dfs[file] = ordered
    print(file, "after datetime sort:", ordered.shape)

site_1_train_data.csv after datetime sort: (25081, 14)
site_2_train_data.csv after datetime sort: (25969, 14)
site_3_train_data.csv after datetime sort: (21913, 14)


In [None]:
# Fill gaps by linear interpolation (applied in both directions), then drop
# any row that still has a missing value. Both steps modify the stored frame
# in place, so `dfs` needs no reassignment.
for file in dfs:
    site_frame = dfs[file]
    site_frame.interpolate(method='linear', limit_direction='both', inplace=True)
    site_frame.dropna(inplace=True)
    print(file, "after interpolation & dropna:", site_frame.shape)

site_1_train_data.csv after interpolation & dropna: (25081, 14)
site_2_train_data.csv after interpolation & dropna: (25969, 14)
site_3_train_data.csv after interpolation & dropna: (21913, 14)


In [None]:
# Add sine/cosine encodings of hour (period 24) and month (period 12), plus the
# row-to-row change of each target.
for file, df in dfs.items():
    for col, period in (('hour', 24), ('month', 12)):
        angle = 2 * np.pi * df[col] / period
        df[f'{col}_sin'] = np.sin(angle)
        df[f'{col}_cos'] = np.cos(angle)

    for target in ('O3_target', 'NO2_target'):
        # Difference to the previous row; the first row has no predecessor.
        df[target.replace('_target', '_diff')] = df[target].diff()

    # Every NaN left in the frame (including the first-row diffs) becomes 0.
    df.fillna(0, inplace=True)

In [None]:
# Add lagged copies (1..N_LAGS rows back) of both targets as extra columns.
#
# All lag columns are built first and attached with one concat. Inserting them
# one by one (`df[name] = ...` 144 times) fragments the frame and makes pandas
# emit a PerformanceWarning on the insertions.
N_LAGS = 72

for file, df in dfs.items():
    lag_frame = pd.DataFrame(
        {
            f'{pollutant}_lag_{lag}': df[pollutant].shift(lag)
            for pollutant in ['O3_target', 'NO2_target']
            for lag in range(1, N_LAGS + 1)
        },
        index=df.index,
    )

    # If this cell is run again, discard the previous lag columns first so the
    # concat below cannot produce duplicate column names.
    base = df.drop(columns=lag_frame.columns, errors='ignore')

    # Rows without a full lag history contain NaN and are removed.
    dfs[file] = pd.concat([base, lag_frame], axis=1).dropna()

  df[f'{pollutant}_lag_{lag}'] = df[pollutant].shift(lag)
  df[f'{pollutant}_lag_{lag}'] = df[pollutant].shift(lag)
  df[f'{pollutant}_lag_{lag}'] = df[pollutant].shift(lag)
  df[f'{pollutant}_lag_{lag}'] = df[pollutant].shift(lag)
  df[f'{pollutant}_lag_{lag}'] = df[pollutant].shift(lag)
  df[f'{pollutant}_lag_{lag}'] = df[pollutant].shift(lag)
  df[f'{pollutant}_lag_{lag}'] = df[pollutant].shift(lag)
  df[f'{pollutant}_lag_{lag}'] = df[pollutant].shift(lag)
  df[f'{pollutant}_lag_{lag}'] = df[pollutant].shift(lag)
  df[f'{pollutant}_lag_{lag}'] = df[pollutant].shift(lag)
  df[f'{pollutant}_lag_{lag}'] = df[pollutant].shift(lag)
  df[f'{pollutant}_lag_{lag}'] = df[pollutant].shift(lag)
  df[f'{pollutant}_lag_{lag}'] = df[pollutant].shift(lag)
  df[f'{pollutant}_lag_{lag}'] = df[pollutant].shift(lag)
  df[f'{pollutant}_lag_{lag}'] = df[pollutant].shift(lag)
  df[f'{pollutant}_lag_{lag}'] = df[pollutant].shift(lag)
  df[f'{pollutant}_lag_{lag}'] = df[pollutant].shift(lag)
  df[f'{pollut

In [None]:
time_steps = 72


def _sliding_windows(values, width):
    """Stack the runs of `width` consecutive rows starting at 0 .. len(values)-width-1."""
    return np.array([values[start:start + width] for start in range(len(values) - width)])


def _rows_after_windows(values, width):
    """Collect, for each window start above, the row at index start + width."""
    return np.array([values[start + width] for start in range(len(values) - width)])


for file, df in dfs.items():
    # Fixed inputs plus every lag column found in the frame.
    lag_features = [col for col in df.columns if '_lag_' in col]
    input_features = [
        'O3_forecast', 'NO2_forecast', 'T_forecast', 'q_forecast',
        'u_forecast', 'v_forecast', 'w_forecast',
        'hour_sin', 'hour_cos', 'month_sin', 'month_cos',
        'O3_diff', 'NO2_diff',
    ] + lag_features
    target_features = ['O3_target', 'NO2_target']

    # Unshuffled split: the first 75% of rows train, the last 25% test.
    train_df, test_df = train_test_split(df, test_size=0.25, shuffle=False)
    train_dfs[file], test_dfs[file] = train_df, test_df

    # One input scaler per site, fitted on the training rows only.
    scaler_x = StandardScaler()
    X_train = scaler_x.fit_transform(train_df[input_features])
    X_test = scaler_x.transform(test_df[input_features])
    scaler_xs[file] = scaler_x

    # Input windows of `time_steps` rows each.
    X_train_seq = _sliding_windows(X_train, time_steps)
    X_test_seq = _sliding_windows(X_test, time_steps)
    X_train_seqs[file] = X_train_seq
    X_test_seqs[file] = X_test_seq

    # One target scaler per pollutant; each window's label is the scaled
    # target of the row that follows the window.
    for pollutant in target_features:
        scaler_y = StandardScaler()
        y_train_full = scaler_y.fit_transform(train_df[[pollutant]])
        y_test_full = scaler_y.transform(test_df[[pollutant]])

        y_train_seq = _rows_after_windows(y_train_full, time_steps)
        y_test_seq = _rows_after_windows(y_test_full, time_steps)

        scaler_ys[(file, pollutant)] = scaler_y
        y_train_seqs[(file, pollutant)] = y_train_seq
        y_test_seqs[(file, pollutant)] = y_test_seq

    print(file,
          "Train seq shape:", X_train_seq.shape,
          "Test seq shape:", X_test_seq.shape)


site_1_train_data.csv Train seq shape: (18684, 72, 157) Test seq shape: (6181, 72, 157)
site_2_train_data.csv Train seq shape: (19350, 72, 157) Test seq shape: (6403, 72, 157)
site_3_train_data.csv Train seq shape: (16308, 72, 157) Test seq shape: (5389, 72, 157)


In [None]:
import os
import joblib

# Write every fitted scaler to saved_scalers/ with joblib.
os.makedirs("saved_scalers", exist_ok=True)

# Input scalers: one file per site.
for file, x_scaler in scaler_xs.items():
    site = file.replace("_train_data.csv", "")
    x_path = f"saved_scalers/{site}_X_scaler.pkl"
    joblib.dump(x_scaler, x_path)
    print("Saved X-scaler:", x_path)

# Target scalers: one file per (site, pollutant) key.
for key, y_scaler in scaler_ys.items():
    if not isinstance(key, tuple):
        # Only (file, pollutant) tuple keys are written.
        continue
    file, pollutant = key
    site = file.replace("_train_data.csv", "")
    poll_short = pollutant.replace("_target", "")
    y_path = f"saved_scalers/{site}_{poll_short}_Y_scaler.pkl"
    joblib.dump(y_scaler, y_path)
    print(f"Saved Y-scaler for {site} - {poll_short}: {y_path}")


Saved X-scaler: saved_scalers/site_1_X_scaler.pkl
Saved X-scaler: saved_scalers/site_2_X_scaler.pkl
Saved X-scaler: saved_scalers/site_3_X_scaler.pkl
Saved Y-scaler for site_1 - O3: saved_scalers/site_1_O3_Y_scaler.pkl
Saved Y-scaler for site_1 - NO2: saved_scalers/site_1_NO2_Y_scaler.pkl
Saved Y-scaler for site_2 - O3: saved_scalers/site_2_O3_Y_scaler.pkl
Saved Y-scaler for site_2 - NO2: saved_scalers/site_2_NO2_Y_scaler.pkl
Saved Y-scaler for site_3 - O3: saved_scalers/site_3_O3_Y_scaler.pkl
Saved Y-scaler for site_3 - NO2: saved_scalers/site_3_NO2_Y_scaler.pkl


In [None]:
import shutil

# Pack the saved_scalers directory into saved_scalers_zip.zip.
scalers_archive = shutil.make_archive(
    base_name="saved_scalers_zip",
    format="zip",
    root_dir="saved_scalers",
)
print("Zipped → saved_scalers_zip.zip")


Zipped → saved_scalers_zip.zip


In [None]:
# Train an LSTM, a GRU and a small attention model per (site, pollutant),
# combine their test predictions into a weighted ensemble and save the models.
#
# Validation data is the chronologically last VAL_FRACTION of the *training*
# windows. Passing the test windows as validation_data would let
# EarlyStopping(restore_best_weights=True) and ReduceLROnPlateau pick weights
# and learning rates on the very data the final metrics are computed on,
# which biases those metrics upwards.
VAL_FRACTION = 0.15

# Suffix used in the saved file name for each model label.
MODEL_FILE_SUFFIX = {"LSTM": "LSTM", "GRU": "GRU", "Transformer": "TRANS"}


def _make_callbacks():
    """Return new callback instances so no callback object is shared between fits."""
    return [
        EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
        ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-5),
    ]


def _finish_model(inputs, x):
    """Append Dense(32, relu) -> Dropout(0.2) -> Dense(1) and compile with Adam(1e-3) / MSE."""
    x = Dense(32, activation='relu')(x)
    x = Dropout(0.2)(x)
    outputs = Dense(1)(x)
    model = Model(inputs, outputs)
    model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
    return model


def _build_recurrent(layer_cls, input_shape):
    """Two stacked recurrent layers (`layer_cls` is LSTM or GRU; 128 then 64 units) with dropout."""
    inputs = Input(shape=input_shape)
    x = layer_cls(128, return_sequences=True)(inputs)
    x = Dropout(0.2)(x)
    x = layer_cls(64)(x)
    x = Dropout(0.2)(x)
    return _finish_model(inputs, x)


def _build_transformer(input_shape):
    """Self-attention block + feed-forward block (each with residual and LayerNorm), then average pooling."""
    inputs = Input(shape=input_shape)
    attn = MultiHeadAttention(num_heads=4, key_dim=64)(inputs, inputs)
    attn = Dropout(0.2)(attn)
    x = Add()([inputs, attn])
    x = LayerNormalization(epsilon=1e-6)(x)
    ffn = Dense(128, activation='relu')(x)
    ffn = Dropout(0.2)(ffn)
    # Project back to the feature width so the residual addition is shape-compatible.
    ffn = Dense(input_shape[-1])(ffn)
    x = Add()([x, ffn])
    x = LayerNormalization(epsilon=1e-6)(x)
    x = GlobalAveragePooling1D()(x)
    return _finish_model(inputs, x)


for file in site_files:
    X_train_seq = X_train_seqs[file]
    X_test_seq = X_test_seqs[file]
    input_shape = X_train_seq.shape[1:]

    # Chronological hold-out: the last windows of the training period.
    # Windows next to the cut share rows with the fit windows, so this is a
    # pragmatic split; what matters is that the test windows stay untouched.
    n_val = max(1, int(len(X_train_seq) * VAL_FRACTION))
    X_fit, X_val = X_train_seq[:-n_val], X_train_seq[-n_val:]

    site_name = file.replace("_train_data.csv", "")   # e.g. "site_1"
    save_dir = os.path.join("saved_models", site_name)
    os.makedirs(save_dir, exist_ok=True)

    for pollutant in ['O3_target', 'NO2_target']:
        print(f"\n\n==================== {file} - {pollutant} ====================")

        # Reuse the target scaler and label arrays produced together with the
        # input windows instead of fitting a second, identical scaler here.
        scaler_y = scaler_ys[(file, pollutant)]
        y_train_seq = y_train_seqs[(file, pollutant)]
        y_test_seq = y_test_seqs[(file, pollutant)]
        y_fit, y_val = y_train_seq[:-n_val], y_train_seq[-n_val:]

        builders = {
            "LSTM": lambda: _build_recurrent(LSTM, input_shape),
            "GRU": lambda: _build_recurrent(GRU, input_shape),
            "Transformer": lambda: _build_transformer(input_shape),
        }

        models, preds = {}, {}
        for label, build in builders.items():
            model = build()
            print(f"Training {label}...")
            model.fit(
                X_fit, y_fit,
                validation_data=(X_val, y_val),
                epochs=100,
                batch_size=64,
                callbacks=_make_callbacks(),
                verbose=2,
            )
            models[label] = model
            preds[label] = model.predict(X_test_seq)

        # Fixed-weight ensemble in scaled space, converted back to original units.
        y_pred_ensemble_scaled = 0.4 * preds["GRU"] + 0.4 * preds["LSTM"] + 0.2 * preds["Transformer"]

        y_test_inv = scaler_y.inverse_transform(y_test_seq.reshape(-1, 1)).reshape(-1)
        y_pred_inv = scaler_y.inverse_transform(y_pred_ensemble_scaled.reshape(-1, 1)).reshape(-1)

        results[(file, pollutant, 'Ensemble')] = {"y_true": y_test_inv, "y_pred": y_pred_inv}

        poll_short = pollutant.replace("_target", "")     # e.g. "O3" or "NO2"
        saved_paths = []
        for label, model in models.items():
            path = os.path.join(save_dir, f"{site_name}_{poll_short}_{MODEL_FILE_SUFFIX[label]}.keras")
            model.save(path)
            saved_paths.append(path)

        print(f"\n✔ Saved: {saved_paths[0]}")
        for path in saved_paths[1:]:
            print(f"✔ Saved: {path}")



Training LSTM...
Epoch 1/100
292/292 - 10s - 35ms/step - loss: 0.2924 - val_loss: 0.0994 - learning_rate: 1.0000e-03
Epoch 2/100
292/292 - 4s - 12ms/step - loss: 0.1712 - val_loss: 0.0666 - learning_rate: 1.0000e-03
Epoch 3/100
292/292 - 4s - 15ms/step - loss: 0.1438 - val_loss: 0.0514 - learning_rate: 1.0000e-03
Epoch 4/100
292/292 - 4s - 12ms/step - loss: 0.1303 - val_loss: 0.0502 - learning_rate: 1.0000e-03
Epoch 5/100
292/292 - 3s - 12ms/step - loss: 0.1206 - val_loss: 0.0448 - learning_rate: 1.0000e-03
Epoch 6/100
292/292 - 4s - 14ms/step - loss: 0.1200 - val_loss: 0.0444 - learning_rate: 1.0000e-03
Epoch 7/100
292/292 - 4s - 12ms/step - loss: 0.1151 - val_loss: 0.0433 - learning_rate: 1.0000e-03
Epoch 8/100
292/292 - 4s - 12ms/step - loss: 0.1112 - val_loss: 0.0404 - learning_rate: 1.0000e-03
Epoch 9/100
292/292 - 4s - 13ms/step - loss: 0.1064 - val_loss: 0.0523 - learning_rate: 1.0000e-03
Epoch 10/100
292/292 - 4s - 14ms/step - loss: 0.1043 - val_loss: 0.0396 - learning_rate: 

In [None]:
def _index_of_agreement(y_true, y_pred):
    """Willmott's index of agreement: d = 1 - sum((P-O)^2) / sum((|P-Obar| + |O-Obar|)^2).

    Both absolute deviations in the denominator are measured from the observed
    mean Obar. The earlier version used |P-O| for the first term, which is not
    that index. Returns NaN when the denominator is zero.

    NOTE(review): the column is labelled "RIA". If the refined index of
    agreement (Willmott et al., 2012) was intended, that is a different formula
    based on absolute errors - confirm which one the report should use.
    """
    obs_mean = y_true.mean()
    numerator = np.sum((y_pred - y_true) ** 2)
    denominator = np.sum((np.abs(y_pred - obs_mean) + np.abs(y_true - obs_mean)) ** 2)
    return 1 - (numerator / denominator) if denominator != 0 else np.nan


# One metrics row per (site, pollutant, model) entry stored in `results`.
summary_rows = []

for (file, pollutant, model_name), data in results.items():
    y_true_inv = data["y_true"]
    y_pred_inv = data["y_pred"]
    site = file.split("_train_data.csv")[0]

    summary_rows.append({
        "Site": site,
        "Model": model_name,
        "Target": pollutant,
        "RMSE": np.sqrt(mean_squared_error(y_true_inv, y_pred_inv)),
        "MAE": mean_absolute_error(y_true_inv, y_pred_inv),
        "R2": r2_score(y_true_inv, y_pred_inv),
        "RIA": _index_of_agreement(y_true_inv, y_pred_inv),
    })

summary_df = pd.DataFrame(summary_rows)
print("\n PERFORMANCE SUMMARY (All Sites & Models) ")
print(summary_df.to_string(index=False))

# Mean of each metric over the targets, per site and model.
avg_summary = summary_df.groupby(["Site", "Model"])[["RMSE","MAE","R2","RIA"]].mean().reset_index()
print("\nAVERAGE PERFORMANCE PER SITE")
print(avg_summary.to_string(index=False))

# Row with the highest averaged R2 within each site.
best_per_site = avg_summary.loc[avg_summary.groupby("Site")["R2"].idxmax()]
print("\n BEST MODEL PER SITE (By R²) ")
print(best_per_site.to_string(index=False))

# Model with the highest R2 after averaging over sites.
overall_best = avg_summary.groupby("Model")[["R2","RIA"]].mean().reset_index().sort_values(by="R2", ascending=False).head(1)
print("\n OVERALL BEST MODEL")
print(overall_best.to_string(index=False))


 PERFORMANCE SUMMARY (All Sites & Models) 
  Site    Model     Target      RMSE      MAE       R2      RIA
site_1 Ensemble  O3_target  6.436671 3.474949 0.935279 0.954354
site_1 Ensemble NO2_target  9.712048 5.852114 0.857821 0.916936
site_2 Ensemble  O3_target  8.110899 5.317921 0.906114 0.937858
site_2 Ensemble NO2_target  8.161874 5.637029 0.851812 0.914250
site_3 Ensemble  O3_target 11.565543 7.867597 0.901306 0.934143
site_3 Ensemble NO2_target  9.632655 6.706702 0.880852 0.925939

AVERAGE PERFORMANCE PER SITE
  Site    Model      RMSE      MAE       R2      RIA
site_1 Ensemble  8.074359 4.663531 0.896550 0.935645
site_2 Ensemble  8.136387 5.477475 0.878963 0.926054
site_3 Ensemble 10.599099 7.287150 0.891079 0.930041

 BEST MODEL PER SITE (By R²) 
  Site    Model      RMSE      MAE       R2      RIA
site_1 Ensemble  8.074359 4.663531 0.896550 0.935645
site_2 Ensemble  8.136387 5.477475 0.878963 0.926054
site_3 Ensemble 10.599099 7.287150 0.891079 0.930041

 OVERALL BEST MODEL
  

In [None]:
import shutil

# Pack the saved_models directory into saved_models.zip; the returned archive
# path is the cell's displayed value.
shutil.make_archive(base_name="saved_models", format="zip", root_dir="saved_models")

'/content/saved_models.zip'

In [None]:
from google.colab import files
# Colab-only: pass saved_models.zip to google.colab's files.download helper.
files.download("saved_models.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# RMSE / MAE / R2 for every (site, pollutant, model) entry stored in `results`,
# with the "_train_data.csv" and "_target" suffixes stripped from the labels.
all_results_list = []

for (file, pollutant, model_type), data in results.items():
    y_true = data["y_true"]
    y_pred = data["y_pred"]

    all_results_list.append({
        "Site": file.replace("_train_data.csv", ""),
        "Pollutant": pollutant.replace("_target", ""),
        "Model": model_type,
        "RMSE": np.sqrt(mean_squared_error(y_true, y_pred)),
        "MAE": mean_absolute_error(y_true, y_pred),
        "R2": r2_score(y_true, y_pred),
    })

summary_df = pd.DataFrame(all_results_list)

# A later cell loads "model_metrics_summary.csv", but no other visible cell
# writes that file, so a fresh Restart & Run All would fail there. This table
# has the columns that cell expects, so it is persisted here.
summary_df.to_csv("model_metrics_summary.csv", index=False)

summary_df


Unnamed: 0,Site,Pollutant,Model,RMSE,MAE,R2
0,site_1,O3,Ensemble,6.436671,3.474949,0.935279
1,site_1,NO2,Ensemble,9.712048,5.852114,0.857821
2,site_2,O3,Ensemble,8.110899,5.317921,0.906114
3,site_2,NO2,Ensemble,8.161874,5.637029,0.851812
4,site_3,O3,Ensemble,11.565543,7.867597,0.901306
5,site_3,NO2,Ensemble,9.632655,6.706702,0.880852


In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from tensorflow.keras.models import load_model
import joblib

# Reload every saved model, score it on the test windows in original units and
# add the fixed-weight ensemble; all rows go to FINAL_MODEL_METRICS.csv.
final_results = []

for file in site_files:
    site_name = file.replace("_train_data.csv", "")
    model_dir = f"saved_models/{site_name}"
    X_test_seq = X_test_seqs[file]

    for pollutant in ["O3_target", "NO2_target"]:
        poll_short = pollutant.replace("_target", "")
        scaler_y = scaler_ys[(file, pollutant)]

        # Ground truth converted back from scaled values.
        y_test_inv = scaler_y.inverse_transform(y_test_seqs[(file, pollutant)])

        preds = {}
        for model_name, suffix in (("LSTM", "LSTM"), ("GRU", "GRU"), ("Transformer", "TRANS")):
            path = os.path.join(model_dir, f"{site_name}_{poll_short}_{suffix}.keras")
            model = load_model(path, compile=False)
            y_pred_scaled = model.predict(X_test_seq)
            preds[model_name] = scaler_y.inverse_transform(y_pred_scaled).flatten()

        # Same 0.4 / 0.4 / 0.2 weighting as used at training time.
        preds["Ensemble"] = (
            0.4 * preds["GRU"] +
            0.4 * preds["LSTM"] +
            0.2 * preds["Transformer"]
        )

        # One metrics row per individual model plus the ensemble.
        final_results.extend(
            {
                "Site": file,
                "Pollutant": pollutant,
                "Model": model_name,
                "RMSE": np.sqrt(mean_squared_error(y_test_inv, pred)),
                "MAE": mean_absolute_error(y_test_inv, pred),
                "R2": r2_score(y_test_inv, pred),
            }
            for model_name, pred in preds.items()
        )

final_df = pd.DataFrame(final_results)
final_df.to_csv("FINAL_MODEL_METRICS.csv", index=False)
print("DONE — full metrics saved!")


[1m194/194[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step
[1m194/194[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step
[1m194/194[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step
[1m194/194[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step
[1m194/194[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step
[1m194/194[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step
[1m201/201[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step
[1m201/201[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step
[1m201/201[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step
[1m201/201[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step
[1m201/201[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step
[1m201/201[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step
[1m169/169[0m [32m━━━━

In [None]:
import pandas as pd

results_df = pd.DataFrame(final_results)

# Remove pandas' display limits (rows, columns, width, column width).
for option in ('display.max_rows', 'display.max_columns', 'display.width', 'display.max_colwidth'):
    pd.set_option(option, None)

print(results_df.to_string())

                     Site   Pollutant        Model       RMSE        MAE        R2
0   site_1_train_data.csv   O3_target         LSTM   6.454074   3.224753  0.934928
1   site_1_train_data.csv   O3_target          GRU   5.884445   3.092693  0.945908
2   site_1_train_data.csv   O3_target  Transformer  12.180492   7.178348  0.768232
3   site_1_train_data.csv   O3_target     Ensemble   6.436671   3.474949  0.935279
4   site_1_train_data.csv  NO2_target         LSTM   9.345092   5.483871  0.868362
5   site_1_train_data.csv  NO2_target          GRU   9.161602   5.263898  0.873481
6   site_1_train_data.csv  NO2_target  Transformer  16.463231  11.229938  0.591451
7   site_1_train_data.csv  NO2_target     Ensemble   9.712048   5.852114  0.857821
8   site_2_train_data.csv   O3_target         LSTM   7.696978   5.054512  0.915452
9   site_2_train_data.csv   O3_target          GRU   7.569438   4.739593  0.918231
10  site_2_train_data.csv   O3_target  Transformer  16.691892  11.644270  0.602377
11  

*GRAPH FOR RESEARCH PAPER*

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Plot-ready copy of the metrics with the CSV suffix removed from the site name.
df = results_df.assign(Site=results_df["Site"].str.replace("_train_data.csv", ""))

sns.set_style("whitegrid")

# Bar colours: one per error metric (RMSE, MAE) and a single one for the R² chart.
colors, color_r2 = ["#005f73", "#0a9396"], "#94d2bd"

def safe_value(val):
    """Return 0 when `val` is missing according to ``pd.isna``; otherwise return `val` unchanged."""
    if pd.isna(val):
        return 0
    return val


def plot_rmse_mae(site_name, save_dir="Saved_Graphs"):
    """Draw a grouped RMSE/MAE bar chart for one site, save it as a PNG and show it.

    Reads the module-level ``df`` (filtered on ``Site == site_name``),
    ``colors`` and ``safe_value``.

    Args:
        site_name: Value of the ``Site`` column to plot; also used as the
            figure title and as the file-name prefix.
        save_dir: Directory that receives ``<site_name>_RMSE_MAE.png``; it is
            created if missing. Pass ``None`` to display without saving.
    """
    site_df = df[df["Site"] == site_name].copy()

    # Missing metrics become 0; the label loop below skips zero-height bars.
    site_df["RMSE"] = site_df["RMSE"].apply(safe_value)
    site_df["MAE"]  = site_df["MAE"].apply(safe_value)

    # Long format: one row per (model, metric) value so `hue` can split the bars.
    melt_df = site_df.melt(
        id_vars=["Model"],
        value_vars=["RMSE", "MAE"],
        var_name="Metric",
        value_name="Value"
    )

    # NOTE(review): if `df` holds several rows per model for a site (e.g. one
    # per pollutant), barplot draws their mean - confirm that is intended.
    fig = plt.figure(figsize=(10, 6))
    ax = sns.barplot(
        data=melt_df,
        x="Model",
        y="Value",
        hue="Metric",
        palette=colors,
        errorbar=None,
        width=0.5     # narrower than the default bar width
    )

    # Value label above every non-zero bar.
    for p in ax.patches:
        val = p.get_height()
        if val != 0:
            ax.annotate(
                f"{val:.2f}",
                (p.get_x() + p.get_width() / 2, val),
                ha="center",
                va="bottom",
                fontsize=11,
                fontweight="bold"
            )

    plt.title(f"{site_name}", fontsize=18, fontweight="bold")
    plt.xlabel("Model", fontsize=14, fontweight="bold")
    plt.ylabel("Error Value", fontsize=14, fontweight="bold")

    plt.xticks(fontsize=12, fontweight="bold")
    plt.yticks(fontsize=12, fontweight="bold")

    plt.legend(title="Metric", title_fontsize=12, fontsize=12)
    plt.tight_layout()

    # Save before plt.show(): the figure may no longer be available afterwards.
    # The recorded cell output and the archive step that zips "Saved_Graphs"
    # both rely on these files existing.
    if save_dir is not None:
        os.makedirs(save_dir, exist_ok=True)
        out_path = os.path.join(save_dir, f"{site_name}_RMSE_MAE.png")
        fig.savefig(out_path, dpi=300, bbox_inches="tight")
        print(f"Saved: {out_path}")

    plt.show()



def plot_r2(site_name, save_dir="Saved_Graphs"):
    """Draw an R² bar chart for one site, save it as a PNG and show it.

    Reads the module-level ``df`` (filtered on ``Site == site_name``),
    ``color_r2`` and ``safe_value``.

    Args:
        site_name: Value of the ``Site`` column to plot; also used as the
            figure title and as the file-name prefix.
        save_dir: Directory that receives ``<site_name>_R2.png``; it is
            created if missing. Pass ``None`` to display without saving.
    """
    site_df = df[df["Site"] == site_name].copy()
    # Missing scores become 0; the label loop below skips zero-height bars.
    site_df["R2"] = site_df["R2"].apply(safe_value)

    # NOTE(review): if `df` holds several rows per model for a site (e.g. one
    # per pollutant), barplot draws their mean - confirm that is intended.
    fig = plt.figure(figsize=(8, 6))
    ax = sns.barplot(
        data=site_df,
        x="Model",
        y="R2",
        color=color_r2,
        errorbar=None,
        width=0.5     # narrower than the default bar width
    )

    # Value label above every non-zero bar.
    for p in ax.patches:
        val = p.get_height()
        if val != 0:
            ax.annotate(
                f"{val:.2f}",
                (p.get_x() + p.get_width() / 2, val),
                ha="center",
                va="bottom",
                fontsize=11,
                fontweight="bold"
            )

    plt.title(f"{site_name}", fontsize=18, fontweight="bold")
    plt.xlabel("Model", fontsize=14, fontweight="bold")
    plt.ylabel("R² Score", fontsize=14, fontweight="bold")

    plt.xticks(fontsize=12, fontweight="bold")
    plt.yticks(fontsize=12, fontweight="bold")

    plt.tight_layout()

    # Save before plt.show(): the figure may no longer be available afterwards.
    # The recorded cell output and the archive step that zips "Saved_Graphs"
    # both rely on these files existing.
    if save_dir is not None:
        os.makedirs(save_dir, exist_ok=True)
        out_path = os.path.join(save_dir, f"{site_name}_R2.png")
        fig.savefig(out_path, dpi=300, bbox_inches="tight")
        print(f"Saved: {out_path}")

    plt.show()



# For each of the three sites draw the error chart first, then the R² chart.
for site in ("site_1", "site_2", "site_3"):
    plot_rmse_mae(site)
    plot_r2(site)


Saved: Saved_Graphs/site_1_RMSE_MAE.png
Saved: Saved_Graphs/site_1_R2.png
Saved: Saved_Graphs/site_2_RMSE_MAE.png
Saved: Saved_Graphs/site_2_R2.png
Saved: Saved_Graphs/site_3_RMSE_MAE.png
Saved: Saved_Graphs/site_3_R2.png


In [None]:
import shutil

# Pack the Saved_Graphs directory into Saved_Graphs.zip.
graphs_archive = shutil.make_archive(
    base_name="Saved_Graphs",
    format="zip",
    root_dir="Saved_Graphs",
)
print("ZIP file created: Saved_Graphs.zip")


ZIP file created: Saved_Graphs.zip


In [None]:
from google.colab import files
# Colab-only: pass Saved_Graphs.zip to google.colab's files.download helper.
files.download("Saved_Graphs.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import pandas as pd

# Read the saved metrics, strip the file/target suffixes from the labels and
# order the rows by site, then pollutant, then model.
df = pd.read_csv("model_metrics_summary.csv")
df = df.assign(
    Site=df['Site'].str.replace("_train_data.csv", ""),
    Pollutant=df['Pollutant'].str.replace("_target", ""),
).sort_values(by=["Site", "Pollutant", "Model"])

def print_grouped_table(df):
    """Print one fixed-width metrics table per site.

    `df` needs the columns Site, Pollutant, Model, RMSE, MAE and R2. Sites and
    pollutants appear in order of first occurrence. The site name is printed
    only on the first row of its table and the pollutant name only on the
    first row of its group; metrics are shown with three decimals.
    """
    rule_width = 70
    header = f"{'Site':<12} {'Pollutant':<12} {'Model':<15} {'RMSE':<10} {'MAE':<10} {'R2':<10}"

    for site in df['Site'].unique():
        rows_for_site = df[df['Site'] == site]

        # Banner with the upper-cased site name centred between two rules.
        print("\n" + "=" * rule_width)
        print(f"{site.upper():^70}")
        print("=" * rule_width)

        print(header)
        print("-" * rule_width)

        # Labels are blanked after they have been printed once.
        site_label = site
        for pollutant in rows_for_site['Pollutant'].unique():
            group = rows_for_site[rows_for_site['Pollutant'] == pollutant]
            pollutant_label = pollutant

            for model, rmse, mae, r2 in zip(group['Model'], group['RMSE'], group['MAE'], group['R2']):
                print(
                    f"{site_label:<12} {pollutant_label:<12} {model:<15} "
                    f"{rmse:<10.3f} {mae:<10.3f} {r2:<10.3f}"
                )
                site_label = ''
                pollutant_label = ''
        print("\n")

# Print one formatted table per site for the metrics frame `df`.
print_grouped_table(df)



                                SITE_1                                
Site         Pollutant    Model           RMSE       MAE        R2        
----------------------------------------------------------------------
site_1       NO2          Ensemble        9.712      5.852      0.858     
             O3           Ensemble        6.437      3.475      0.935     



                                SITE_2                                
Site         Pollutant    Model           RMSE       MAE        R2        
----------------------------------------------------------------------
site_2       NO2          Ensemble        8.162      5.637      0.852     
             O3           Ensemble        8.111      5.318      0.906     



                                SITE_3                                
Site         Pollutant    Model           RMSE       MAE        R2        
----------------------------------------------------------------------
site_3       NO2          Ensemble        

In [None]:
import pandas as pd
# Load the per-model metrics and strip the file/target suffixes from the labels.
df = pd.read_csv("FINAL_MODEL_METRICS.csv")
for column, suffix in (('Site', "_train_data.csv"), ('Pollutant', "_target")):
    df[column] = df[column].str.replace(suffix, "")

df.head()

Unnamed: 0,Site,Pollutant,Model,RMSE,MAE,R2
0,site_1,O3,LSTM,6.454074,3.224753,0.934928
1,site_1,O3,GRU,5.884445,3.092693,0.945908
2,site_1,O3,Transformer,12.180492,7.178348,0.768232
3,site_1,O3,Ensemble,6.436671,3.474949,0.935279
4,site_1,NO2,LSTM,9.345092,5.483871,0.868362


In [None]:
def print_grouped_table(df):
    """Print one fixed-width metrics table per site.

    `df` needs the columns Site, Pollutant, Model, RMSE, MAE and R2. Sites and
    pollutants appear in order of first occurrence. The site name is printed
    only on the first row of its table and the pollutant name only on the
    first row of its group; metrics are shown with three decimals.
    """
    rule_width = 70
    header = f"{'Site':<12} {'Pollutant':<12} {'Model':<15} {'RMSE':<10} {'MAE':<10} {'R2':<10}"

    for site in df['Site'].unique():
        rows_for_site = df[df['Site'] == site]

        # Banner with the upper-cased site name centred between two rules.
        print("\n" + "=" * rule_width)
        print(f"{site.upper():^70}")
        print("=" * rule_width)

        print(header)
        print("-" * rule_width)

        # Labels are blanked after they have been printed once.
        site_label = site
        for pollutant in rows_for_site['Pollutant'].unique():
            group = rows_for_site[rows_for_site['Pollutant'] == pollutant]
            pollutant_label = pollutant

            for model, rmse, mae, r2 in zip(group['Model'], group['RMSE'], group['MAE'], group['R2']):
                print(
                    f"{site_label:<12} {pollutant_label:<12} {model:<15} "
                    f"{rmse:<10.3f} {mae:<10.3f} {r2:<10.3f}"
                )
                site_label = ''
                pollutant_label = ''
        print("\n")

# Print one formatted table per site for the metrics frame `df`.
print_grouped_table(df)



                                SITE_1                                
Site         Pollutant    Model           RMSE       MAE        R2        
----------------------------------------------------------------------
site_1       O3           LSTM            6.454      3.225      0.935     
                          GRU             5.884      3.093      0.946     
                          Transformer     12.180     7.178      0.768     
                          Ensemble        6.437      3.475      0.935     
             NO2          LSTM            9.345      5.484      0.868     
                          GRU             9.162      5.264      0.873     
                          Transformer     16.463     11.230     0.591     
                          Ensemble        9.712      5.852      0.858     



                                SITE_2                                
Site         Pollutant    Model           RMSE       MAE        R2        
---------------------------------

In [None]:
!pip install python-docx

Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.2.0-py3-none-any.whl (252 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/253.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx
Successfully installed python-docx-1.2.0


In [None]:
from docx import Document
from docx.shared import Inches
import pandas as pd

# Export the per-model metrics to a Word document: one heading and one
# six-column table per site, each followed by a page break.
df = pd.read_csv("FINAL_MODEL_METRICS.csv")
for column, suffix in (('Site', "_train_data.csv"), ('Pollutant', "_target")):
    df[column] = df[column].str.replace(suffix, "")

document = Document()
column_titles = ("Site", "Pollutant", "Model", "RMSE", "MAE", "R²")

for site in df['Site'].unique():
    document.add_heading(f"Performance Metrics — {site.upper()}", level=1)

    # Header row first; data rows are appended below it.
    table = document.add_table(rows=1, cols=6)
    for cell, title in zip(table.rows[0].cells, column_titles):
        cell.text = title

    for _, row in df[df['Site'] == site].iterrows():
        values = (
            row["Site"],
            row["Pollutant"],
            row["Model"],
            f"{row['RMSE']:.3f}",
            f"{row['MAE']:.3f}",
            f"{row['R2']:.3f}",
        )
        for cell, value in zip(table.add_row().cells, values):
            cell.text = value

    document.add_page_break()

document.save("MODEL_METRICS_TABLE.docx")
print("Saved MODEL_METRICS_TABLE.docx")


Saved MODEL_METRICS_TABLE.docx
