# Libraries

In [64]:
import numpy as np
import matplotlib.pyplot as mp
import pandas as pd
import seaborn as sns
from pyESN import ESN
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.multioutput import MultiOutputClassifier
from xgboost import XGBClassifier
import itertools
from collections import Counter
import mlflow
import pickle

# Data

In [30]:
# Read the full air-pollution workbook into a DataFrame.
excel_path = "/workspaces/Time-series-prediction-for-pollution-data/air_pollution_data.xlsx"
data = pd.read_excel(excel_path)

In [31]:
# Display the loaded frame; the rendered output elides the middle rows.
data

Unnamed: 0,city,date,aqi,co,no,no2,o3,so2,pm2_5,pm10,nh3
0,Ahmedabad,2020-11-30,5,520.71,2.38,16.28,130.18,47.68,65.96,72.13,8.36
1,Ahmedabad,2020-12-01,5,1682.28,7.71,54.84,0.73,21.70,120.95,154.53,27.36
2,Ahmedabad,2020-12-02,5,1815.80,16.54,49.35,0.17,23.84,133.47,172.63,28.12
3,Ahmedabad,2020-12-03,5,2296.45,41.57,40.10,0.00,35.76,150.37,202.15,36.48
4,Ahmedabad,2020-12-04,5,2189.64,23.92,58.95,0.02,28.13,160.79,205.80,40.53
...,...,...,...,...,...,...,...,...,...,...,...
23499,Visakhapatnam,2023-05-21,3,353.81,0.00,2.08,100.14,4.11,37.53,47.09,0.08
23500,Visakhapatnam,2023-05-22,3,380.52,0.00,3.77,82.97,5.07,32.17,43.44,1.74
23501,Visakhapatnam,2023-05-23,3,390.53,0.00,4.28,80.11,5.19,36.01,48.06,1.20
23502,Visakhapatnam,2023-05-24,3,300.41,0.00,1.36,95.84,2.21,30.17,48.89,0.00


## sorting cities

In [32]:
# Distinct city names from the dataset, ordered alphabetically.
unique_city = sorted(data["city"].unique())

In [33]:
# Mapping of city name -> that city's rows; filled in by the next cell.
city_df = dict()

In [34]:
# Store an independent copy of each city's rows under the city's name.
city_column = data["city"]
for city in city_column.unique():
    city_df[city] = data.loc[city_column == city].copy()

In [35]:
# Work on an independent copy of the Ahmedabad rows only.
is_ahmedabad = data["city"] == "Ahmedabad"
ahmedabad_df = data.loc[is_ahmedabad].copy()

# Target: the AQI column. Predictors: everything except city, aqi and date.
# NOTE(review): the displayed frame has an "Unnamed: 0" column, which is not
# dropped here and therefore stays in `x` — confirm that is intended.
y = ahmedabad_df["aqi"]
x = ahmedabad_df.drop(columns=["city", "aqi", "date"])

In [36]:
# Inspect the AQI target series taken from the Ahmedabad rows.
y

0      5
1      5
2      5
3      5
4      5
      ..
899    3
900    3
901    3
902    3
903    3
Name: aqi, Length: 904, dtype: int64

## Scaling 
We reuse the scaler that was already used in pollutant model training, so the features are scaled the same way here.


In [37]:
# Reload the scaler that was saved to disk instead of fitting a new one.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
scaler_path = "/workspaces/Time-series-prediction-for-pollution-data/models/minmaxscaling.pkl"
with open(scaler_path, "rb") as scaler_file:
    scaler = pickle.load(scaler_file)

# Transform only (no refit), matching the transformation used in the previous notebook.
x_scaled = scaler.transform(x)

# Data preparation for the model

In [38]:
# Every column of the Ahmedabad frame except the target and the identifier columns.
excluded_columns = {"aqi", "date", "city"}
features = [column for column in ahmedabad_df.columns if column not in excluded_columns]

In [39]:
# Windowing constants shared by the input and target construction below.
lookback: int = 5            # rows per input window
forecast_horizon: int = 20   # the previous notebook looked ahead 20 timestamps
steps_needed: int = 6        # number of AQI heads to predict

In [40]:
# One input row per window: the last row of each `lookback`-long window,
# for every window that still leaves `forecast_horizon` rows after it.
n_windows = len(x_scaled) - lookback - forecast_horizon
X_input = np.array([x_scaled[start + lookback - 1, :] for start in range(n_windows)])

In [41]:
# Mapping of feature name -> predicted values for the first forecast steps.
predicted_feature_steps = dict()

In [42]:
for feature in features:
    # Each feature has its own pickled ESN model stored under models/.
    # NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
    model_path = f"/workspaces/Time-series-prediction-for-pollution-data/models/{feature}_esn_model.pkl"
    with open(model_path, "rb") as model_file:
        esn = pickle.load(model_file)

    # Run the model over every input row.
    # NOTE(review): in upstream pyESN the second positional parameter of
    # predict() is `continuation`, not a step count — confirm what
    # len(X_input) is meant to do here and what shape comes back.
    feature_pred = esn.predict(X_input, len(X_input))

    # Retain only the leading `steps_needed` columns of the prediction.
    predicted_feature_steps[feature] = feature_pred[:, :steps_needed]

Final input variable (X)

In [43]:
# Final feature matrix: one row per sample, laid out step by step
# (t+1 ... t+6) with every feature's predicted value inside each step.
X = np.array([
    [
        predicted_feature_steps[feature][sample, step]
        for step in range(steps_needed)
        for feature in features
    ]
    for sample in range(len(X_input))
])  # shape: (samples, 6 × no. of features)

In [47]:
# Raw (unscaled) AQI categories as a NumPy array; the labels run 1–5 before
# they are shifted to zero-based later in the notebook.
aqi_values = y.to_numpy()

preparing target variable (Y)

In [49]:
# Target matrix: for each window, the `steps_needed` AQI values that
# immediately follow the window's last row.
n_samples = len(x_scaled) - lookback - forecast_horizon
Y = np.array(
    [
        aqi_values[start + lookback:start + lookback + steps_needed]
        for start in range(n_samples)
    ],
    dtype=int,
)  # shape: (samples, 6)

In [50]:
# Inspect the target matrix (one row of future AQI values per sample).
Y

array([[5, 5, 5, 5, 5, 5],
       [5, 5, 5, 5, 5, 5],
       [5, 5, 5, 5, 5, 5],
       ...,
       [4, 5, 2, 3, 3, 3],
       [5, 2, 3, 3, 3, 3],
       [2, 3, 3, 3, 3, 3]], shape=(879, 6))

## post train test split

In [52]:
# Chronological 80/20 split: shuffle=False keeps the row order, so the test
# set is the final 20% of the samples.
split_parts = train_test_split(X, Y, test_size=0.2, shuffle=False)
x_train, x_test, y_train, y_test = split_parts

In [57]:
# Shift the AQI categories down by one so the labels become zero-based.
# Recomputed from the untouched `Y` (not from y_train/y_test themselves) so that
# re-running this cell cannot subtract 1 a second time. This relies on the
# split above using shuffle=False: the first len(x_train) rows of Y are the
# training targets and the remainder are the test targets.
n_train = len(x_train)
y_train = (Y[:n_train] - 1).astype(int)
y_test = (Y[n_train:] - 1).astype(int)

In [62]:
# Print the label values and their counts, first for train and then for test.
for label_matrix in (y_train, y_test):
    print(np.unique(label_matrix, return_counts=True))


(array([0, 1, 2, 3, 4]), array([ 132,  612,  282,  876, 2316]))
(array([0, 1, 2, 3, 4]), array([ 12,  66, 165, 198, 615]))


## Handle class imbalance

In [65]:
# Label frequencies for the first forecast step (t+1) of the training targets.
first_step_labels = y_train[:, 0]
class_counts = Counter(first_step_labels)
print("Class counts:", class_counts)

# Inverse-frequency weight per class: rarer classes get larger weights.
total = sum(class_counts.values())
class_weights = {}
for cls, count in class_counts.items():
    class_weights[cls] = total / count
print("Class weights:", class_weights)

# One weight per training row, looked up from that row's t+1 label.
sample_weights = np.array([class_weights[label] for label in first_step_labels])

Class counts: Counter({np.int64(4): 386, np.int64(3): 146, np.int64(1): 102, np.int64(2): 47, np.int64(0): 22})
Class weights: {np.int64(4): 1.8212435233160622, np.int64(3): 4.815068493150685, np.int64(1): 6.892156862745098, np.int64(0): 31.954545454545453, np.int64(2): 14.957446808510639}


# Model

mlflow setup

In [54]:
# Point MLflow at the project's local run store, then select the experiment.
# set_experiment stays the last expression so the cell still displays it.
tracking_uri = "/workspaces/Time-series-prediction-for-pollution-data/mlruns"
mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment("AQI_Prediction_XGBoost")

<Experiment: artifact_location='/workspaces/Time-series-prediction-for-pollution-data/mlruns/273469853540485859', creation_time=1752752938314, experiment_id='273469853540485859', last_update_time=1752752938314, lifecycle_stage='active', name='AQI_Prediction_XGBoost', tags={}>

Defining hyperparameters

In [66]:
# Hyperparameter grid: every combination of these values is trained below.
n_estimators_list = [300, 200]
max_depth_list = [4, 6, 10]
learning_rate_list = [0.05, 0.1]

In [67]:
# Grid search: one MLflow run per (n_estimators, max_depth, learning_rate)
# combination. Each run trains a multi-output XGBoost classifier, logs
# per-step accuracy / macro-F1 / macro-recall on the test split, and stores
# the fitted model as an artifact.
for n_estimators, max_depth, learning_rate in itertools.product(n_estimators_list, max_depth_list, learning_rate_list):

    with mlflow.start_run(run_name=f"xgb_est{n_estimators}_depth{max_depth}_lr{learning_rate}"):

        # Log the hyperparameters first so that a run which fails during
        # training is still identifiable in the tracking store.
        mlflow.log_params({
            "n_estimators": n_estimators,
            "max_depth": max_depth,
            "learning_rate": learning_rate,
        })

        # Train the model: MultiOutputClassifier fits one clone of the base
        # classifier per target column (one per forecast step).
        base_model = XGBClassifier(n_estimators=n_estimators,
                                  max_depth=max_depth,
                                  learning_rate=learning_rate,
                                  objective="multi:softprob",
                                  num_class=5  # 5 AQI categories
                                  )
        model = MultiOutputClassifier(base_model)
        # NOTE(review): sample_weights were derived from the t+1 labels only,
        # yet the same weights are applied to every forecast-step head —
        # confirm this is acceptable for the t+2..t+6 heads.
        model.fit(x_train, y_train, sample_weight=sample_weights)

        # Predict
        y_pred = model.predict(x_test)

        # Evaluate and log metrics for each forecast step. The number of
        # steps is taken from the evaluated targets themselves.
        for step in range(y_test.shape[1]):
            acc = accuracy_score(y_test[:, step], y_pred[:, step])
            mlflow.log_metric(f"accuracy_t{step+1}", acc)
            report = classification_report(y_test[:, step], y_pred[:, step], output_dict=True, zero_division=0)
            mlflow.log_metric(f"f1_macro_t{step+1}", report["macro avg"]["f1-score"])
            mlflow.log_metric(f"recall_macro_t{step+1}", report["macro avg"]["recall"])

        # Save model
        mlflow.sklearn.log_model(model, artifact_path="xgboostmodel")

        print(f"Logged run with est={n_estimators}, depth={max_depth}, lr={learning_rate}")



Logged run with est=300, depth=4, lr=0.05




Logged run with est=300, depth=4, lr=0.1




Logged run with est=300, depth=6, lr=0.05




Logged run with est=300, depth=6, lr=0.1




Logged run with est=300, depth=10, lr=0.05




Logged run with est=300, depth=10, lr=0.1




Logged run with est=200, depth=4, lr=0.05




Logged run with est=200, depth=4, lr=0.1




Logged run with est=200, depth=6, lr=0.05




Logged run with est=200, depth=6, lr=0.1




Logged run with est=200, depth=10, lr=0.05




Logged run with est=200, depth=10, lr=0.1


In [70]:
# Pull every run of the experiment into a DataFrame and save it as CSV.
experiment = mlflow.get_experiment_by_name("AQI_Prediction_XGBoost")
experiment_id = experiment.experiment_id
df_mlflow = mlflow.search_runs(experiment_ids=[experiment_id])

summary_path = "/workspaces/Time-series-prediction-for-pollution-data/logs/mlflow_summary_baseline.csv"
df_mlflow.to_csv(summary_path, index=False)

In [73]:
# Rank the runs by macro-F1 at step t+2 (highest first) and keep the top one.
ranked_runs = df_mlflow.sort_values("metrics.f1_macro_t2", ascending=False)
best_run = ranked_runs.iloc[0]

# Persist the winning run as a single-column CSV.
best_run.to_frame().to_csv("/workspaces/Time-series-prediction-for-pollution-data/logs/best_run_baseline.csv")