<a href="https://colab.research.google.com/github/Medynal/Pollution/blob/main/pollution_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os

repo_url = "https://github.com/Medynal/Pollution.git"
folder_path = "/content/Pollution"

if not os.path.exists(folder_path):
  !git clone {repo_url} {folder_path}
print(f"Current working directory: {os.getcwd()}")


Cloning into '/content/Pollution'...
remote: Enumerating objects: 150, done.[K
remote: Counting objects: 100% (32/32), done.[K
remote: Compressing objects: 100% (27/27), done.[K
remote: Total 150 (delta 30), reused 5 (delta 5), pack-reused 118 (from 1)[K
Receiving objects: 100% (150/150), 3.71 MiB | 10.15 MiB/s, done.
Resolving deltas: 100% (58/58), done.
Current working directory: /content


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_absolute_error, r2_score, accuracy_score, classification_report, confusion_matrix
import joblib

In [3]:
cpollution_df = pd.read_csv('/content/Pollution/cleaned_pollution_dataset.csv', parse_dates=['Date']).drop(['Month name', 'Date'], axis= 1)
cpollution_df.head()

Unnamed: 0,City,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket,year,month,day
0,Ahmedabad,73.24,141.54,0.92,18.22,17.15,26.64,0.92,27.64,133.36,0.0,0.02,0.0,209.0,Poor,2015,1,1
1,Ahmedabad,73.24,141.54,0.97,15.69,16.46,26.64,0.97,24.55,34.06,3.68,5.5,3.77,209.0,Poor,2015,1,2
2,Ahmedabad,73.24,141.54,17.4,19.3,29.7,26.64,17.4,29.07,30.7,6.8,16.4,2.25,209.0,Poor,2015,1,3
3,Ahmedabad,73.24,141.54,1.7,18.48,17.97,26.64,1.7,18.59,36.08,4.43,10.14,1.0,209.0,Poor,2015,1,4
4,Ahmedabad,73.24,141.54,22.1,21.42,37.76,26.64,22.1,39.33,39.31,7.01,18.89,2.78,209.0,Poor,2015,1,5


In [4]:
#Cyclical Transformation
# Month
cpollution_df['month_sin'] = np.sin(2 * np.pi * cpollution_df['month']/12)
cpollution_df['month_cos'] = np.cos(2 * np.pi * cpollution_df['month']/12)
# Day
cpollution_df['day_sin'] = np.sin(2 * np.pi * cpollution_df['day']/7)
cpollution_df['day_cos'] = np.cos(2 * np.pi * cpollution_df['day']/7)

#encoding AQI bucket
bucket_encoder = LabelEncoder()
cpollution_df['AQI_Bucket_encoded'] = bucket_encoder.fit_transform(cpollution_df['AQI_Bucket'])

In [5]:
cpollution_df.head()

Unnamed: 0,City,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,...,AQI,AQI_Bucket,year,month,day,month_sin,month_cos,day_sin,day_cos,AQI_Bucket_encoded
0,Ahmedabad,73.24,141.54,0.92,18.22,17.15,26.64,0.92,27.64,133.36,...,209.0,Poor,2015,1,1,0.5,0.866025,0.781831,0.62349,2
1,Ahmedabad,73.24,141.54,0.97,15.69,16.46,26.64,0.97,24.55,34.06,...,209.0,Poor,2015,1,2,0.5,0.866025,0.974928,-0.222521,2
2,Ahmedabad,73.24,141.54,17.4,19.3,29.7,26.64,17.4,29.07,30.7,...,209.0,Poor,2015,1,3,0.5,0.866025,0.433884,-0.900969,2
3,Ahmedabad,73.24,141.54,1.7,18.48,17.97,26.64,1.7,18.59,36.08,...,209.0,Poor,2015,1,4,0.5,0.866025,-0.433884,-0.900969,2
4,Ahmedabad,73.24,141.54,22.1,21.42,37.76,26.64,22.1,39.33,39.31,...,209.0,Poor,2015,1,5,0.5,0.866025,-0.974928,-0.222521,2


In [24]:
pollutants = ['PM2.5','PM10','NO','NO2','NOx','NH3','CO','SO2','O3','Benzene','Toluene','Xylene']
labels = ['City','year','month_sin','month_cos', 'day_sin', 'day_cos', 'AQI'] + pollutants
X = cpollution_df[labels]
target_AQIB= cpollution_df['AQI_Bucket_encoded']

In [28]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split( X, target_AQIB, test_size=0.2, random_state=42)

In [29]:
preprocessor = ColumnTransformer(transformers=[('city', OneHotEncoder(handle_unknown='ignore'), ['City']),
        ('others', 'passthrough', ['year','month_sin','month_cos', 'day_sin', 'day_cos','AQI'] + pollutants)])

In [12]:
reg = RandomForestRegressor()
reg.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [31]:
AQIB_pipeline = Pipeline([('preprocessor', preprocessor),
    ('rfc', RandomForestClassifier(random_state=42))])

In [34]:
#classification pipeline to predict AQI_Bucket
clf_pipeline = Pipeline([('preprocessor', preprocessor),
    ('rf', RandomForestClassifier(n_estimators=200, random_state=42))])

clf_pipeline.fit(X_train, y_train)
y_pred = clf_pipeline.predict(X_test)
print('Classification accuracy:', accuracy_score(y_test, y_pred))
print('Classification report:\n', classification_report(y_test, y_pred))
print('Confusion matrix:\n', confusion_matrix(y_test, y_pred))

Classification accuracy: 0.9964448958862366
Classification report:
               precision    recall  f1-score   support

           0       1.00      0.99      0.99       296
           1       1.00      1.00      1.00      2062
           2       0.99      0.99      0.99       666
           3       1.00      1.00      1.00      2033
           4       0.99      0.97      0.98       302
           5       0.98      0.99      0.99       548

    accuracy                           1.00      5907
   macro avg       0.99      0.99      0.99      5907
weighted avg       1.00      1.00      1.00      5907

Confusion matrix:
 [[ 293    0    0    3    0    0]
 [   0 2062    0    0    0    0]
 [   0    2  662    0    0    2]
 [   0    0    0 2033    0    0]
 [   0    1    1    0  293    7]
 [   0    0    3    0    2  543]]


In [33]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'rfc__n_estimators': [100, 200, 500],
    'rfc__max_depth': [None, 10, 20, 30],
    'rfc__min_samples_split': [2, 5, 10]}

AQIB_grid = GridSearchCV(
    estimator=AQIB_pipeline,
    param_grid= param_grid,
    cv=3,
    scoring="neg_mean_squared_error",
    n_jobs=-1,
    verbose=2)

AQIB_grid.fit(X_train, y_train)
print(f"Best parameters: {AQIB_grid.best_params_}")
print(f"Best score: {AQIB_grid.best_score_}")

Fitting 3 folds for each of 36 candidates, totalling 108 fits
Best parameters: {'rfc__max_depth': 30, 'rfc__min_samples_split': 5, 'rfc__n_estimators': 200}
Best score: -0.02065704690139539


In [37]:
best_model = AQIB_grid.best_estimator_
y_pred = best_model.predict(X_test)
print('Classification accuracy:', accuracy_score(y_test, y_pred))
print('Classification report:\n', classification_report(y_test, y_pred))
print('Confusion matrix:\n', confusion_matrix(y_test, y_pred))

Classification accuracy: 0.9964448958862366
Classification report:
               precision    recall  f1-score   support

           0       1.00      0.99      0.99       296
           1       1.00      1.00      1.00      2062
           2       0.99      0.99      0.99       666
           3       1.00      1.00      1.00      2033
           4       1.00      0.97      0.98       302
           5       0.99      0.99      0.99       548

    accuracy                           1.00      5907
   macro avg       1.00      0.99      0.99      5907
weighted avg       1.00      1.00      1.00      5907

Confusion matrix:
 [[ 293    0    0    3    0    0]
 [   0 2062    0    0    0    0]
 [   0    2  662    0    0    2]
 [   0    0    0 2033    0    0]
 [   0    1    1    0  294    6]
 [   0    2    3    0    1  542]]


In [38]:
os.makedirs('models', exist_ok=True)
joblib.dump(best_model, 'models/aqi_classifier.pkl')
joblib.dump(bucket_encoder, 'models/bucket_encoder.pkl')
print('Models saved in models/ folder')

Models saved in models/ folder
