### Imports

In [2]:
import pandas as pd
import numpy as np
import plotly.express as px

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split

In [3]:
# Load the already-imputed AQI dataset from the processed-data folder.
df = pd.read_csv(filepath_or_buffer='processed_data/AQI_data_imputed.csv')

In [4]:
# Per-column count of missing values (isnull is the pandas alias of isna).
df.isnull().sum()

City          0
Date          0
PM2.5         0
PM10          0
NO            0
NO2           0
NOx           0
NH3           0
CO            0
SO2           0
O3            0
Benzene       0
Toluene       0
Xylene        0
AQI           0
AQI_Bucket    0
dtype: int64

### Baseline Model

In [5]:
# Work on a copy so the prediction column added for the baseline never ends up in `df`.
# (The former bare `df.head()` was not the cell's last expression, so it displayed nothing.)
df_baseline = df.copy()

In [14]:
# Preview the baseline working copy.
# NOTE(review): the saved output of this cell already lists a 'Predicted_AQI_Bucket'
# column, which is only assigned in a later cell — the output looks stale; re-run the
# notebook top to bottom to refresh it.
df_baseline

Unnamed: 0,City,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket,Predicted_AQI_Bucket
0,Ahmedabad,2015-01-29,83.13,96.18,6.93,28.71,33.72,16.31,6.93,49.52,59.76,0.02,0.000,3.14,209.0,Poor,Poor
1,Ahmedabad,2015-01-30,79.84,96.18,13.85,28.68,41.08,16.31,13.85,48.49,97.07,0.04,0.000,4.81,328.0,Very Poor,Poor
2,Ahmedabad,2015-01-31,94.52,96.18,24.39,32.66,52.61,16.31,24.39,67.39,111.33,0.24,0.010,7.67,514.0,Severe,Poor
3,Ahmedabad,2015-02-01,135.99,96.18,43.48,42.08,84.57,16.31,43.48,75.23,102.70,0.40,0.040,25.87,782.0,Severe,Poor
4,Ahmedabad,2015-02-02,178.33,96.18,54.56,35.31,72.80,16.31,54.56,55.04,107.38,0.46,0.060,35.61,914.0,Severe,Very Poor
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24845,Visakhapatnam,2020-06-27,15.02,50.94,7.68,25.06,19.54,12.47,0.47,8.55,23.30,2.24,12.070,0.73,41.0,Good,Satisfactory
24846,Visakhapatnam,2020-06-28,24.38,74.09,3.42,26.06,16.53,11.99,0.52,12.72,30.14,0.74,2.210,0.38,70.0,Satisfactory,Satisfactory
24847,Visakhapatnam,2020-06-29,22.91,65.73,3.45,29.53,18.33,10.71,0.48,8.42,30.96,0.01,0.010,0.00,68.0,Satisfactory,Satisfactory
24848,Visakhapatnam,2020-06-30,16.64,49.97,4.05,29.26,18.80,10.03,0.52,9.84,28.30,0.00,0.000,0.00,54.0,Satisfactory,Satisfactory


In [16]:
# A simple rule-based model based on PM values
def simple_pm_rule_based_aqi_predictor(df_baseline, pm_column='PM2.5'):
    """Predict an AQI bucket for every row from its PM2.5 reading alone.

    Each value is mapped to the first bucket whose inclusive upper bound it
    does not exceed:

        <= 12  'Good'          <= 35   'Satisfactory'   <= 55  'Moderate'
        <= 115 'Poor'          <= 250  'Very Poor'      else   'Severe'

    Parameters
    ----------
    df_baseline : pandas.DataFrame
        Frame holding the readings. It is only read, never modified.
    pm_column : str, default 'PM2.5'
        Name of the numeric column to classify.

    Returns
    -------
    list
        One label per row, in row order. A missing (NaN) reading yields
        ``None`` instead of being silently labelled 'Severe'.

    Raises
    ------
    KeyError
        If ``pm_column`` is not a column of ``df_baseline``.
    """
    # Inclusive upper bounds of every bucket except the open-ended last one.
    upper_bounds = np.array([12, 35, 55, 115, 250], dtype=float)
    bucket_names = np.array(
        ['Good', 'Satisfactory', 'Moderate', 'Poor', 'Very Poor', 'Severe'],
        dtype=object,
    )

    pm_values = df_baseline[pm_column].to_numpy(dtype=float)

    # side='left' yields i with upper_bounds[i-1] < v <= upper_bounds[i], i.e. the same
    # "first bound that v does not exceed" rule as an if/elif chain of `<=` tests,
    # but evaluated for the whole column at once instead of row by row.
    bucket_idx = np.searchsorted(upper_bounds, pm_values, side='left')
    aqi_buckets = bucket_names[bucket_idx]

    # NaN is ordered after every bound and would otherwise land in 'Severe'.
    aqi_buckets[np.isnan(pm_values)] = None
    return aqi_buckets.tolist()

# Label every row of 'df_baseline' with the bucket suggested by the PM2.5 rules
predicted_buckets = simple_pm_rule_based_aqi_predictor(df_baseline)
df_baseline['Predicted_AQI_Bucket'] = predicted_buckets

# Show the actual and predicted buckets side by side
comparison_columns = ['City', 'Date', 'AQI_Bucket', 'Predicted_AQI_Bucket']
print(df_baseline[comparison_columns])


                City        Date    AQI_Bucket Predicted_AQI_Bucket
0          Ahmedabad  2015-01-29          Poor                 Poor
1          Ahmedabad  2015-01-30     Very Poor                 Poor
2          Ahmedabad  2015-01-31        Severe                 Poor
3          Ahmedabad  2015-02-01        Severe            Very Poor
4          Ahmedabad  2015-02-02        Severe            Very Poor
...              ...         ...           ...                  ...
24845  Visakhapatnam  2020-06-27          Good         Satisfactory
24846  Visakhapatnam  2020-06-28  Satisfactory         Satisfactory
24847  Visakhapatnam  2020-06-29  Satisfactory         Satisfactory
24848  Visakhapatnam  2020-06-30  Satisfactory         Satisfactory
24849  Visakhapatnam  2020-07-01          Good         Satisfactory

[24850 rows x 4 columns]


In [17]:
# Ground truth (actual AQI_Bucket) and the rule-based predictions to score
true_labels, predicted_labels = df_baseline['AQI_Bucket'], df_baseline['Predicted_AQI_Bucket']

# Share of rows whose bucket was predicted exactly
accuracy = accuracy_score(true_labels, predicted_labels)

# Precision, recall and F1, each averaged over classes with average='weighted'
precision, recall, f1 = [
    scorer(true_labels, predicted_labels, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
]

# Per-class breakdown as formatted text
classification_rep = classification_report(true_labels, predicted_labels)

# Report everything
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")
print("Classification Report:\n", classification_rep)


Accuracy: 0.5021730382293763
Precision: 0.560818947510275
Recall: 0.5021730382293763
F1-Score: 0.5092112877600941
Classification Report:
               precision    recall  f1-score   support

        Good       0.46      0.31      0.37      1341
    Moderate       0.49      0.35      0.41      8829
        Poor       0.25      0.65      0.36      2781
Satisfactory       0.69      0.59      0.64      8224
      Severe       0.83      0.34      0.48      1338
   Very Poor       0.65      0.79      0.71      2337

    accuracy                           0.50     24850
   macro avg       0.56      0.50      0.50     24850
weighted avg       0.56      0.50      0.51     24850



### Logistic Regression for Multiclass Classification

In [36]:
# Fresh copy for the logistic-regression experiment, so dropping columns from it never touches `df`.
# (The former bare `df.head()` was not the cell's last expression, so it displayed nothing.)
df_log = df.copy()

In [37]:
# Preview the copy that the logistic-regression cell works on.
df_log

Unnamed: 0,City,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket
0,Ahmedabad,2015-01-29,83.13,96.18,6.93,28.71,33.72,16.31,6.93,49.52,59.76,0.02,0.000,3.14,209.0,Poor
1,Ahmedabad,2015-01-30,79.84,96.18,13.85,28.68,41.08,16.31,13.85,48.49,97.07,0.04,0.000,4.81,328.0,Very Poor
2,Ahmedabad,2015-01-31,94.52,96.18,24.39,32.66,52.61,16.31,24.39,67.39,111.33,0.24,0.010,7.67,514.0,Severe
3,Ahmedabad,2015-02-01,135.99,96.18,43.48,42.08,84.57,16.31,43.48,75.23,102.70,0.40,0.040,25.87,782.0,Severe
4,Ahmedabad,2015-02-02,178.33,96.18,54.56,35.31,72.80,16.31,54.56,55.04,107.38,0.46,0.060,35.61,914.0,Severe
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24845,Visakhapatnam,2020-06-27,15.02,50.94,7.68,25.06,19.54,12.47,0.47,8.55,23.30,2.24,12.070,0.73,41.0,Good
24846,Visakhapatnam,2020-06-28,24.38,74.09,3.42,26.06,16.53,11.99,0.52,12.72,30.14,0.74,2.210,0.38,70.0,Satisfactory
24847,Visakhapatnam,2020-06-29,22.91,65.73,3.45,29.53,18.33,10.71,0.48,8.42,30.96,0.01,0.010,0.00,68.0,Satisfactory
24848,Visakhapatnam,2020-06-30,16.64,49.97,4.05,29.26,18.80,10.03,0.52,9.84,28.30,0.00,0.000,0.00,54.0,Satisfactory


In [38]:
# Drop the 'City', 'Date', and 'AQI' columns.
# errors='ignore' keeps this cell re-runnable: on a second run the columns are already
# gone from df_log and a plain drop would raise KeyError.
# NOTE(review): 'AQI' is presumably excluded because the bucket label is derived from
# it (target leakage) — confirm.
df_log = df_log.drop(columns=['City', 'Date', 'AQI'], errors='ignore')

# Define the features (X) and the target variable (y)
X = df_log.drop(columns=['AQI_Bucket'])  # Features: every remaining column
y = df_log['AQI_Bucket']  # Target variable

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Logistic Regression model for multi-class classification.
# `multi_class` is deliberately not passed: the argument is deprecated in recent
# scikit-learn releases, and with a non-liblinear solver on a target with more than
# two classes the multinomial formulation is what gets fitted anyway.
model = LogisticRegression(max_iter=1000, solver='newton-cg')

# Fit the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)
# Prediction on training data (to compare train vs. test performance)
y_pred_train = model.predict(X_train)

# Calculate accuracy of training set
accuracy_train = accuracy_score(y_train, y_pred_train)
print(f"Accuracy for training set: {accuracy_train}")
print(classification_report(y_train, y_pred_train))
print("----------------------------------------------------------")
# Evaluate the model's performance using accuracy and classification report
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy for testing set: {accuracy}")
print(classification_report(y_test, y_pred))

Accuracy for training set: 0.754225352112676
              precision    recall  f1-score   support

        Good       0.75      0.42      0.54      1059
    Moderate       0.76      0.81      0.79      7084
        Poor       0.66      0.55      0.60      2226
Satisfactory       0.77      0.83      0.80      6569
      Severe       0.85      0.72      0.78      1067
   Very Poor       0.72      0.74      0.73      1875

    accuracy                           0.75     19880
   macro avg       0.75      0.68      0.71     19880
weighted avg       0.75      0.75      0.75     19880

----------------------------------------------------------
Accuracy for testing set: 0.7476861167002012
              precision    recall  f1-score   support

        Good       0.73      0.38      0.50       282
    Moderate       0.75      0.82      0.78      1745
        Poor       0.63      0.51      0.56       555
Satisfactory       0.77      0.83      0.80      1655
      Severe       0.85      0.72    