### Imports

In [2]:
import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv('processed_data/AQI_data_imputed.csv')

In [4]:
df.isna().sum()

City          0
Date          0
PM2.5         0
PM10          0
NO            0
NO2           0
NOx           0
NH3           0
CO            0
SO2           0
O3            0
Benzene       0
Toluene       0
Xylene        0
AQI           0
AQI_Bucket    0
dtype: int64

### ML Model - XGBoost

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import numpy as np

In [6]:
df.head()

Unnamed: 0,City,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket
0,Ahmedabad,2015-01-29,83.13,96.18,6.93,28.71,33.72,16.31,6.93,49.52,59.76,0.02,0.0,3.14,209.0,Poor
1,Ahmedabad,2015-01-30,79.84,96.18,13.85,28.68,41.08,16.31,13.85,48.49,97.07,0.04,0.0,4.81,328.0,Very Poor
2,Ahmedabad,2015-01-31,94.52,96.18,24.39,32.66,52.61,16.31,24.39,67.39,111.33,0.24,0.01,7.67,514.0,Severe
3,Ahmedabad,2015-02-01,135.99,96.18,43.48,42.08,84.57,16.31,43.48,75.23,102.7,0.4,0.04,25.87,782.0,Severe
4,Ahmedabad,2015-02-02,178.33,96.18,54.56,35.31,72.8,16.31,54.56,55.04,107.38,0.46,0.06,35.61,914.0,Severe


In [7]:
# Create a LabelEncoder
le = LabelEncoder()
# Columns to encode
columns_to_encode = ['City', 'AQI_Bucket']
# Encode specified columns
for column in columns_to_encode:
    df[column] = le.fit_transform(df[column])

# Split dataset into features (X) and target variable (y)
X = df.drop(columns=['AQI_Bucket','Date','AQI', 'City']) 
y = df['AQI_Bucket']

# Apply a log transformation to your features (X)
X_log_transformed = np.log1p(X)

# Split the log-transformed data into a training set and a testing set
X_train, X_test, y_train, y_test = train_test_split(X_log_transformed, y, 
                                                    test_size=0.2, 
                                                    random_state=42)

In [8]:
X_test.head()
# XGBoost seems to deal with NaN values better than me imputing them in

Unnamed: 0,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene
7125,3.769768,4.576565,1.680828,2.7985,2.845491,2.851284,0.708036,1.781709,3.613347,0.565314,1.111858,0.883768
20041,4.53657,5.129425,3.201933,3.921973,4.305011,3.074543,0.770108,2.532903,3.417727,1.915451,1.520607,0.883768
3256,4.068343,4.430698,2.332144,2.399712,3.008155,2.560323,0.48858,2.024193,3.473518,1.121678,1.163151,1.098612
18021,3.830596,4.576565,1.868721,1.987874,1.166271,2.851284,1.018847,1.94591,3.981362,1.166271,0.631272,0.883768
5140,3.287655,4.154655,2.283402,3.369018,3.439777,2.595255,0.770108,2.142416,3.529591,0.667829,3.565581,0.883768


In [9]:
# XGBoost model
from xgboost import XGBClassifier

# Create an XGBoost classification model
model = XGBClassifier() 

# Fit the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)
# Prediction on training data
y_pred_train = model.predict(X_train)

# Calculate accuracy of training set
accuracy_train = accuracy_score(y_train, y_pred_train)
print(f"Accuracy for training set: {accuracy_train}")
print(classification_report(y_train, y_pred_train))
print("----------------------------------------------------------")
# Evaluate the model's performance using accuracy and classification report
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy for testing set: {accuracy}")
print(classification_report(y_test, y_pred))

Accuracy for training set: 0.9584004024144869
              precision    recall  f1-score   support

           0       0.98      0.96      0.97      1059
           1       0.95      0.95      0.95      7084
           2       0.97      0.92      0.95      2226
           3       0.94      0.97      0.96      6569
           4       1.00      1.00      1.00      1067
           5       0.99      0.99      0.99      1875

    accuracy                           0.96     19880
   macro avg       0.97      0.96      0.97     19880
weighted avg       0.96      0.96      0.96     19880

----------------------------------------------------------
Accuracy for testing set: 0.8088531187122736
              precision    recall  f1-score   support

           0       0.85      0.68      0.76       282
           1       0.82      0.84      0.83      1745
           2       0.67      0.66      0.66       555
           3       0.84      0.87      0.86      1655
           4       0.84      0.80   

In [10]:
# Access hyperparameters of the XGBoost model
learning_rate = model.get_params()['learning_rate']
max_depth = model.get_params()['max_depth']
n_estimators = model.get_params()['n_estimators']

# Print the current hyperparameter values
print(f"Current Learning Rate: {learning_rate}")
print(f"Current Max Depth: {max_depth}")
print(f"Current Max Depth: {n_estimators}")

# Print values will be None if default values left in the model

Current Learning Rate: None
Current Max Depth: None
Current Max Depth: None


In [11]:
# With Hyperparameter tuning and feature selection

# Use certain columns
X = X[['PM2.5','PM10','NO','NO2','CO','NOx','SO2']]
# Apply a log transformation to your features (X)
X_log_transformed = np.log1p(X)
# Split the log-transformed data into a training set and a testing set
X_train, X_test, y_train, y_test = train_test_split(X_log_transformed, y, test_size=0.2, random_state=42)

from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# the XGBoost model
model = XGBClassifier()

# Define the hyperparameter search space
param_dist = {
    'learning_rate': [0.01, 0.1, 0.2, 0.5],
    'max_depth': range(2, 5) # 10
    # Add more hyperparameters here
    # max_node
}

# Define the number of random iterations
n_iter = 5  

# Perform the random search
random_search = RandomizedSearchCV(
    model,
    param_distributions=param_dist,
    n_iter=n_iter,
    scoring='accuracy',  # Use the appropriate scoring metric
    cv=5,  # Number of cross-validation folds
    #n_jobs=-1  # Use all available CPU cores for parallel processing
)

# Fit the random search on your training data
random_search.fit(X_train, y_train)

# Get the best estimator (model) with the best hyperparameters
best_model = random_search.best_estimator_

# Print the best hyperparameters and corresponding performance
print("Best Hyperparameters: ", random_search.best_params_)
print("Best Accuracy: ", random_search.best_score_)

# Make predictions on the training and test sets
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)


# Calculate accuracy of training set
accuracy_train = accuracy_score(y_train, y_train_pred)
print(f"Accuracy for training set: {accuracy_train}")
print(classification_report(y_train, y_pred_train))
print("----------------------------------------------------------")
# Calculate accuracy of testing set
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Accuracy for testing set: {test_accuracy}")
print(classification_report(y_test, y_pred))

Best Hyperparameters:  {'max_depth': 3, 'learning_rate': 0.5}
Best Accuracy:  0.7831991951710261
Accuracy for training set: 0.8372233400402415
              precision    recall  f1-score   support

           0       0.98      0.96      0.97      1059
           1       0.95      0.95      0.95      7084
           2       0.97      0.92      0.95      2226
           3       0.94      0.97      0.96      6569
           4       1.00      1.00      1.00      1067
           5       0.99      0.99      0.99      1875

    accuracy                           0.96     19880
   macro avg       0.97      0.96      0.97     19880
weighted avg       0.96      0.96      0.96     19880

----------------------------------------------------------
Accuracy for testing set: 0.7806841046277666
              precision    recall  f1-score   support

           0       0.85      0.68      0.76       282
           1       0.82      0.84      0.83      1745
           2       0.67      0.66      0.66    

In [12]:
# Access hyperparameters of the XGBoost model
learning_rate = best_model.get_params()['learning_rate']
max_depth = best_model.get_params()['max_depth']
n_estimators = best_model.get_params()['n_estimators']

# Print the current hyperparameter values
print(f"Current Learning Rate: {learning_rate}")
print(f"Current Max Depth: {max_depth}")
print(f"Current Max Depth: {n_estimators}")

Current Learning Rate: 0.5
Current Max Depth: 3
Current Max Depth: None


In [13]:
# With Hyperparameter tuning 


# Split dataset into features (X) and target variable (y)
X = df.drop(columns=['AQI_Bucket','Date','AQI', 'City']) 
y = df['AQI_Bucket']

# Apply a log transformation to your features (X)
X_log_transformed = np.log1p(X)
# Split the log-transformed data into a training set and a testing set
X_train, X_test, y_train, y_test = train_test_split(X_log_transformed, y, test_size=0.2, random_state=42)

from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# the XGBoost model
model = XGBClassifier()

# Define the hyperparameter search space
param_dist = {
    'learning_rate': [0.01, 0.1, 0.2, 0.5],
    'max_depth': range(2, 5) # 10
    # Add more hyperparameters here
    # max_node
}

# Define the number of random iterations
n_iter = 5  

# Perform the random search
random_search = RandomizedSearchCV(
    model,
    param_distributions=param_dist,
    n_iter=n_iter,
    scoring='accuracy',  # Use the appropriate scoring metric
    cv=5,  # Number of cross-validation folds
    #n_jobs=-1  # Use all available CPU cores for parallel processing
)

# Fit the random search on your training data
random_search.fit(X_train, y_train)

# Get the best estimator (model) with the best hyperparameters
best_model = random_search.best_estimator_

# Print the best hyperparameters and corresponding performance
print("Best Hyperparameters: ", random_search.best_params_)
print("Best Accuracy: ", random_search.best_score_)

# Make predictions on the training and test sets
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)


# Calculate accuracy of training set
accuracy_train = accuracy_score(y_train, y_train_pred)
print(f"Accuracy for training set: {accuracy_train}")
print(classification_report(y_train, y_pred_train))
print("----------------------------------------------------------")
# Calculate accuracy of testing set
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Accuracy for testing set: {test_accuracy}")
print(classification_report(y_test, y_pred))

Best Hyperparameters:  {'max_depth': 4, 'learning_rate': 0.5}
Best Accuracy:  0.8014084507042254
Accuracy for training set: 0.9131287726358149
              precision    recall  f1-score   support

           0       0.98      0.96      0.97      1059
           1       0.95      0.95      0.95      7084
           2       0.97      0.92      0.95      2226
           3       0.94      0.97      0.96      6569
           4       1.00      1.00      1.00      1067
           5       0.99      0.99      0.99      1875

    accuracy                           0.96     19880
   macro avg       0.97      0.96      0.97     19880
weighted avg       0.96      0.96      0.96     19880

----------------------------------------------------------
Accuracy for testing set: 0.8024144869215292
              precision    recall  f1-score   support

           0       0.85      0.68      0.76       282
           1       0.82      0.84      0.83      1745
           2       0.67      0.66      0.66    