### Imports

In [8]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer

from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

In [9]:
# Load the AQI CSV into `df`, the frame every later cell works from.
# The path is relative, so it is resolved against the kernel's working directory.
df = pd.read_csv('processed_data/AQI_data_imputed.csv')

### ML Model - Random Forest

In [10]:
from sklearn.preprocessing import LabelEncoder

# Categorical columns whose values are replaced in `df` by integer codes.
columns_to_encode = ['City', 'AQI_Bucket']

# Fit one encoder per column. Re-fitting a single shared encoder overwrites its
# classes_ on every iteration, which loses the code -> label mapping of every
# column except the last one. Keeping them separately allows, for example,
# encoders['AQI_Bucket'].inverse_transform(y_pred) to recover bucket names.
encoders = {}
for column in columns_to_encode:
    encoders[column] = LabelEncoder()
    df[column] = encoders[column].fit_transform(df[column])

# Backward compatibility: `le` used to end up fitted on the last encoded column.
le = encoders[columns_to_encode[-1]]

# Columns that must not be used as model inputs: the target itself, plus the
# Date, AQI and City columns that the original feature selection excluded.
non_feature_columns = ['AQI_Bucket', 'Date', 'AQI', 'City']

# The displayed frame has an 'Unnamed: 0' column, the name pandas assigns to a
# header-less index column read back from a CSV. It is a row counter rather
# than a measurement, and the rows are ordered by city and date, so leaving it
# in X would leak that ordering into the model.
leftover_index_columns = [c for c in df.columns if str(c).startswith('Unnamed:')]

# Split the dataset into features (X) and the target variable (y).
X = df.drop(columns=non_feature_columns + leftover_index_columns)  # Features
y = df['AQI_Bucket']  # Target variable

# Apply log(1 + x) to every feature column.
X_log_transformed = np.log1p(X)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_log_transformed, y, test_size=0.2, random_state=42
)


In [17]:
# Show the frame as this cell's output; pandas abbreviates a long frame to its
# first and last rows.
df

Unnamed: 0,City,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket
0,0,2015-01-29,83.13,96.18,6.93,28.71,33.72,16.31,6.93,49.52,59.76,0.02,0.000,3.14,209.0,2
1,0,2015-01-30,79.84,96.18,13.85,28.68,41.08,16.31,13.85,48.49,97.07,0.04,0.000,4.81,328.0,5
2,0,2015-01-31,94.52,96.18,24.39,32.66,52.61,16.31,24.39,67.39,111.33,0.24,0.010,7.67,514.0,4
3,0,2015-02-01,135.99,96.18,43.48,42.08,84.57,16.31,43.48,75.23,102.70,0.40,0.040,25.87,782.0,4
4,0,2015-02-02,178.33,96.18,54.56,35.31,72.80,16.31,54.56,55.04,107.38,0.46,0.060,35.61,914.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24845,25,2020-06-27,15.02,50.94,7.68,25.06,19.54,12.47,0.47,8.55,23.30,2.24,12.070,0.73,41.0,0
24846,25,2020-06-28,24.38,74.09,3.42,26.06,16.53,11.99,0.52,12.72,30.14,0.74,2.210,0.38,70.0,3
24847,25,2020-06-29,22.91,65.73,3.45,29.53,18.33,10.71,0.48,8.42,30.96,0.01,0.010,0.00,68.0,3
24848,25,2020-06-30,16.64,49.97,4.05,29.26,18.80,10.03,0.52,9.84,28.30,0.00,0.000,0.00,54.0,3


In [13]:
from sklearn.ensemble import RandomForestClassifier

# Baseline model: a random forest with default hyperparameters, seeded so the
# fitted trees are the same on every run.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Predict on both splits so training and held-out performance can be compared.
y_train_pred = clf.predict(X_train)
y_pred = clf.predict(X_test)

accuracy_train = accuracy_score(y_train, y_train_pred)
accuracy_test = accuracy_score(y_test, y_pred)

report_train = classification_report(y_train, y_train_pred)
report_test = classification_report(y_test, y_pred)

# Each report starts on its own line: it is a fixed-width table, and printing
# it right after the label pushes the header row out of column alignment.
print(f"Train set Accuracy: {accuracy_train}")
print(f"Train set report:\n{report_train}")
print("------------------------------------")
print(f"Test set Accuracy: {accuracy_test}")
print(f"Test set report:\n{report_test}")

Train set Accuracy:0.9987927565392354
Train set report:              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1059
           1       1.00      1.00      1.00      7084
           2       1.00      1.00      1.00      2226
           3       1.00      1.00      1.00      6569
           4       1.00      1.00      1.00      1067
           5       1.00      1.00      1.00      1875

    accuracy                           1.00     19880
   macro avg       1.00      1.00      1.00     19880
weighted avg       1.00      1.00      1.00     19880

------------------------------------
Test set Accuracy:0.8072434607645875
Test set report:              precision    recall  f1-score   support

           0       0.84      0.63      0.72       282
           1       0.82      0.86      0.84      1745
           2       0.69      0.64      0.66       555
           3       0.84      0.86      0.85      1655
           4       0.85      0.77      0.

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report

# Base estimator whose hyperparameters are tuned below.
clf = RandomForestClassifier(random_state=42)

# Candidate values per hyperparameter (3 * 3 * 4 * 5 = 180 combinations).
param_dist = {
    'n_estimators': [10, 50, 100],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 8, 12],
    #'max_leaf_nodes': [None, 10, 20]
}

# Number of combinations sampled from the grid above; raise it for a wider search.
n_iter = 5  # Adjust this as needed

# Randomized search with 5-fold cross-validation on the training split.
# random_state fixes which candidates are sampled; without it every run tries
# a different set of 5 combinations and the reported "best" model changes.
random_search = RandomizedSearchCV(
    clf,
    param_distributions=param_dist,
    n_iter=n_iter,
    scoring='accuracy',  # Use the appropriate scoring metric
    cv=5,  # Number of cross-validation folds
    n_jobs=-1,  # Use all available CPU cores for parallel processing
    random_state=42,
)

# Fit the random search on the training data
random_search.fit(X_train, y_train)

# Best candidate found by the search
best_model = random_search.best_estimator_

# Predict on both splits so training and held-out performance can be compared.
y_pred = best_model.predict(X_test)
y_train_pred = best_model.predict(X_train)

# Accuracy for the training and test sets
accuracy_train = accuracy_score(y_train, y_train_pred)
accuracy_test = accuracy_score(y_test, y_pred)

# Per-class precision / recall / F1 for the training and test sets
report_train = classification_report(y_train, y_train_pred)
report_test = classification_report(y_test, y_pred)

# Print the results. "Best Accuracy" is best_score_, i.e. the mean
# cross-validated accuracy of the winning candidate, not a test-set figure.
print("Best Hyperparameters: ", random_search.best_params_)
print("Best Accuracy: ", random_search.best_score_)
print(f"Train set Accuracy: {accuracy_train}")
print(f"Train set report:\n{report_train}")
print("------------------------------------")
print(f"Test set Accuracy: {accuracy_test}")
print(f"Test set report:\n{report_test}")


Best Hyperparameters:  {'n_estimators': 100, 'min_samples_split': 20, 'min_samples_leaf': 2, 'max_depth': None}
Best Accuracy:  0.8049295774647888
Train set Accuracy: 0.8911468812877263
Train set report:
              precision    recall  f1-score   support

           0       0.94      0.75      0.84      1059
           1       0.89      0.92      0.90      7084
           2       0.83      0.79      0.81      2226
           3       0.90      0.93      0.91      6569
           4       0.94      0.88      0.91      1067
           5       0.87      0.87      0.87      1875

    accuracy                           0.89     19880
   macro avg       0.90      0.86      0.87     19880
weighted avg       0.89      0.89      0.89     19880

------------------------------------
Test set Accuracy: 0.8106639839034205
Test set report:
              precision    recall  f1-score   support

           0       0.87      0.61      0.71       282
           1       0.81      0.86      0.84      174

In [18]:
# Using certain columns only for the features

# Columns kept as model inputs for this experiment.
selected_features = ['PM2.5', 'PM10', 'NO', 'NO2', 'CO', 'NOx', 'SO2']

# Note: this rebinds X, so any later cell sees only the selected columns.
X = X[selected_features]
# Apply log(1 + x) to every feature column.
X_log_transformed = np.log1p(X)
# Same 80/20 split and seed as the full-feature experiment.
X_train, X_test, y_train, y_test = train_test_split(
    X_log_transformed, y, test_size=0.2, random_state=42
)

# Base estimator whose hyperparameters are tuned below.
clf = RandomForestClassifier(random_state=42)

# Candidate values per hyperparameter (3 * 3 * 4 * 5 = 180 combinations).
param_dist = {
    'n_estimators': [10, 50, 100],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 8, 12],
    #'max_leaf_nodes': [None, 10, 20]
}

# Number of combinations sampled from the grid above; raise it for a wider search.
n_iter = 5  # Adjust this as needed

# Randomized search with 5-fold cross-validation on the training split.
# random_state fixes which candidates are sampled; without it every run tries
# a different set of 5 combinations and the reported "best" model changes.
random_search = RandomizedSearchCV(
    clf,
    param_distributions=param_dist,
    n_iter=n_iter,
    scoring='accuracy',  # Use the appropriate scoring metric
    cv=5,  # Number of cross-validation folds
    n_jobs=-1,  # Use all available CPU cores for parallel processing
    random_state=42,
)

# Fit the random search on the training data
random_search.fit(X_train, y_train)

# Best candidate found by the search
best_model = random_search.best_estimator_

# Predict on both splits so training and held-out performance can be compared.
y_pred = best_model.predict(X_test)
y_train_pred = best_model.predict(X_train)

# Accuracy for the training and test sets
accuracy_train = accuracy_score(y_train, y_train_pred)
accuracy_test = accuracy_score(y_test, y_pred)

# Per-class precision / recall / F1 for the training and test sets
report_train = classification_report(y_train, y_train_pred)
report_test = classification_report(y_test, y_pred)

# Print the results. "Best Accuracy" is best_score_, i.e. the mean
# cross-validated accuracy of the winning candidate, not a test-set figure.
print("Best Hyperparameters: ", random_search.best_params_)
print("Best Accuracy: ", random_search.best_score_)
print(f"Train set Accuracy: {accuracy_train}")
print(f"Train set report:\n{report_train}")
print("------------------------------------")
print(f"Test set Accuracy: {accuracy_test}")
print(f"Test set report:\n{report_test}")


Best Hyperparameters:  {'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_depth': None}
Best Accuracy:  0.7883299798792758
Train set Accuracy: 0.9554325955734406
Train set report:
              precision    recall  f1-score   support

           0       0.98      0.88      0.93      1059
           1       0.95      0.96      0.96      7084
           2       0.95      0.90      0.93      2226
           3       0.95      0.97      0.96      6569
           4       0.97      0.96      0.97      1067
           5       0.95      0.96      0.95      1875

    accuracy                           0.96     19880
   macro avg       0.96      0.94      0.95     19880
weighted avg       0.96      0.96      0.96     19880

------------------------------------
Test set Accuracy: 0.7917505030181087
Test set report:
              precision    recall  f1-score   support

           0       0.80      0.61      0.69       282
           1       0.81      0.82      0.82      1745