In [1]:
# Import libraries
import pandas as pd

# Load the dataset
data = pd.read_csv("city_day.csv")

# Rename columns for easier handling
data.columns = [
    "City", "Date", "PM2_5", "PM10", "NO", "NO2", "NOx", "NH3", "CO",
    "SO2", "O3", "Benzene", "Toluene", "Xylene", "AQI", "AQI_Bucket"
]

# Preview data
print("Data loaded successfully")
print(data.head())


Data loaded successfully
        City        Date  PM2_5  PM10     NO    NO2    NOx  NH3     CO    SO2  \
0  Ahmedabad  2015-01-01    NaN   NaN   0.92  18.22  17.15  NaN   0.92  27.64   
1  Ahmedabad  2015-01-02    NaN   NaN   0.97  15.69  16.46  NaN   0.97  24.55   
2  Ahmedabad  2015-01-03    NaN   NaN  17.40  19.30  29.70  NaN  17.40  29.07   
3  Ahmedabad  2015-01-04    NaN   NaN   1.70  18.48  17.97  NaN   1.70  18.59   
4  Ahmedabad  2015-01-05    NaN   NaN  22.10  21.42  37.76  NaN  22.10  39.33   

       O3  Benzene  Toluene  Xylene  AQI AQI_Bucket  
0  133.36     0.00     0.02    0.00  NaN        NaN  
1   34.06     3.68     5.50    3.77  NaN        NaN  
2   30.70     6.80    16.40    2.25  NaN        NaN  
3   36.08     4.43    10.14    1.00  NaN        NaN  
4   39.31     7.01    18.89    2.78  NaN        NaN  


In [2]:
# Count missing values in each column
missing_values = data.isnull().sum()
print("Missing values before handling:\n", missing_values)

# List of numeric pollutant columns
numeric_cols = [
    "PM2_5", "PM10", "NO", "NO2", "NOx", "NH3", "CO",
    "SO2", "O3", "Benzene", "Toluene", "Xylene", "AQI"
]

# Fill numeric missing values with median
data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].median())

# Fill categorical missing values in AQI_Bucket with mode
data['AQI_Bucket'] = data['AQI_Bucket'].fillna(data['AQI_Bucket'].mode()[0])

# Verify that there are no missing values now
print("Missing values after handling:\n", data.isnull().sum())


Missing values before handling:
 City              0
Date              0
PM2_5          4598
PM10          11140
NO             3582
NO2            3585
NOx            4185
NH3           10328
CO             2059
SO2            3854
O3             4022
Benzene        5623
Toluene        8041
Xylene        18109
AQI            4681
AQI_Bucket     4681
dtype: int64
Missing values after handling:
 City          0
Date          0
PM2_5         0
PM10          0
NO            0
NO2           0
NOx           0
NH3           0
CO            0
SO2           0
O3            0
Benzene       0
Toluene       0
Xylene        0
AQI           0
AQI_Bucket    0
dtype: int64


In [3]:
from sklearn.preprocessing import LabelEncoder
import joblib

# Initialize label encoders
le_city = LabelEncoder()
le_aqi_bucket = LabelEncoder()

# Fit and transform City and AQI_Bucket columns
data['City'] = le_city.fit_transform(data['City'])
data['AQI_Bucket'] = le_aqi_bucket.fit_transform(data['AQI_Bucket'])

# Save the label encoders for deployment
joblib.dump(le_city, "le_city.pkl")
joblib.dump(le_aqi_bucket, "le_aqi_bucket.pkl")

print("Categorical variables encoded and label encoders saved.")


Categorical variables encoded and label encoders saved.


In [4]:
# Convert 'Date' to datetime datatype
data['Date'] = pd.to_datetime(data['Date'])

# Create pollutant ratio features to capture relative levels
data['PM_ratio'] = data['PM2_5'] / (data['PM10'] + 1e-6)  # avoid division by zero
data['NO_ratio'] = data['NOx'] / (data['NO2'] + 1e-6)

# Extract day of week from the 'Date' column (0=Monday, ..., 6=Sunday)
data['Day_of_week'] = data['Date'].dt.dayofweek

# Calculate 3-day rolling averages for selected pollutants
pollutants = [
    'PM2_5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3',
    'CO', 'SO2', 'O3', 'Benzene', 'Toluene', 'Xylene'
]

for pollutant in pollutants:
    data[f'{pollutant}_3d_avg'] = data.groupby('City')[pollutant].transform(
        lambda x: x.rolling(window=3, min_periods=1).mean()
    )

print("Feature engineering completed.")


Feature engineering completed.


In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import joblib

# Exclude columns not used as features
X = data.drop(columns=['Date', 'AQI_Bucket', 'AQI'])
y = data['AQI_Bucket']

# Identify numeric columns for scaling
numeric_cols = X.select_dtypes(include=['float64', 'int64']).columns

# Initialize and fit scaler
scaler = StandardScaler()
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])

# Save scaler and numeric columns for deployment
joblib.dump(scaler, "feature_scaler.pkl")
joblib.dump(list(numeric_cols), "numeric_columns.pkl")

# Split into train and test sets (use stratify to maintain class distribution)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.35, random_state=42, stratify=y
)

print("Feature preparation, scaling, and train-test split complete.")
print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")



Feature preparation, scaling, and train-test split complete.
Train shape: (19195, 28), Test shape: (10336, 28)


In [6]:
# Import models
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.neural_network import MLPClassifier

# Initialize individual classifiers
rf_model = RandomForestClassifier(n_estimators=200, random_state=42)
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
lgb_model = LGBMClassifier(random_state=42)
cat_model = CatBoostClassifier(verbose=0, random_state=42)
mlp_model = MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=400, random_state=42)

# Combine models into a Voting Classifier (soft voting)
voting_model = VotingClassifier(
    estimators=[
        ('rf', rf_model),
        ('xgb', xgb_model),
        ('lgb', lgb_model),
        ('cat', cat_model),
        ('mlp', mlp_model)
    ],
    voting='soft'
)

# Train the ensemble on training data
voting_model.fit(X_train, y_train)

print("Voting ensemble model trained successfully.")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000706 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6664
[LightGBM] [Info] Number of data points in the train set: 19195, number of used features: 28
[LightGBM] [Info] Start training from score -3.091616
[LightGBM] [Info] Start training from score -0.782060
[LightGBM] [Info] Start training from score -2.362429
[LightGBM] [Info] Start training from score -1.278488
[LightGBM] [Info] Start training from score -3.093912
[LightGBM] [Info] Start training from score -2.536598
Voting ensemble model trained successfully.


In [7]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Predict on test data
y_pred = voting_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on test set: {accuracy:.4f}")

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Print confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy on test set: 0.8437
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.70      0.75       469
           1       0.89      0.88      0.89      4729
           2       0.75      0.68      0.71       973
           3       0.83      0.87      0.85      2879
           4       0.83      0.81      0.82       468
           5       0.76      0.82      0.79       818

    accuracy                           0.84     10336
   macro avg       0.81      0.79      0.80     10336
weighted avg       0.84      0.84      0.84     10336

Confusion Matrix:
[[ 328   13    0  128    0    0]
 [  10 4162  130  392   14   21]
 [   0  196  658    1    3  115]
 [  63  292    6 2517    0    1]
 [   0    5    5    0  381   77]
 [   0    7   78    0   59  674]]


In [8]:
import joblib

# Save the trained model
joblib.dump(voting_model, "stacking_ensemble.pkl")

# Save the scaler used for feature scaling
joblib.dump(scaler, "feature_scaler.pkl")

# Save the list of numeric feature columns
joblib.dump(list(numeric_cols), "numeric_columns.pkl")

# Save label encoders for City and AQI_Bucket
joblib.dump(le_city, "le_city.pkl")
joblib.dump(le_aqi_bucket, "le_aqi_bucket.pkl")

print("Model, scaler, numeric columns, and label encoders saved successfully.")


Model, scaler, numeric columns, and label encoders saved successfully.
