<a href="https://colab.research.google.com/github/Medynal/Pollution/blob/main/pollution_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os

repo_url = "https://github.com/Medynal/Pollution.git"
folder_path = "/content/Pollution"

if not os.path.exists(folder_path):
  !git clone {repo_url} {folder_path}
print(f"Current working directory: {os.getcwd()}")


Current working directory: /content


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_absolute_error, r2_score, accuracy_score, classification_report, confusion_matrix
import joblib

In [3]:
!pip show scikit-learn

Name: scikit-learn
Version: 1.6.1
Summary: A set of python modules for machine learning and data mining
Home-page: https://scikit-learn.org
Author: 
Author-email: 
License: BSD 3-Clause License

 Copyright (c) 2007-2024 The scikit-learn developers.
 All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:

 * Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.

 * Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

 * Neither the name of the copyright holder nor the names of its
   contributors may be used to endorse or promote products derived from
   this software without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYR

In [4]:
cpollution_df = pd.read_csv('/content/Pollution/IPNBS/cleaned_pollution_dataset.csv', parse_dates=['Date']).drop(['Month name', 'Date'], axis= 1)
cpollution_df.head()

Unnamed: 0,City,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket,year,month,day
0,Ahmedabad,73.24,141.54,0.92,18.22,17.15,26.64,0.92,27.64,133.36,0.0,0.02,0.0,209.0,Poor,2015,1,1
1,Ahmedabad,73.24,141.54,0.97,15.69,16.46,26.64,0.97,24.55,34.06,3.68,5.5,3.77,209.0,Poor,2015,1,2
2,Ahmedabad,73.24,141.54,17.4,19.3,29.7,26.64,17.4,29.07,30.7,6.8,16.4,2.25,209.0,Poor,2015,1,3
3,Ahmedabad,73.24,141.54,1.7,18.48,17.97,26.64,1.7,18.59,36.08,4.43,10.14,1.0,209.0,Poor,2015,1,4
4,Ahmedabad,73.24,141.54,22.1,21.42,37.76,26.64,22.1,39.33,39.31,7.01,18.89,2.78,209.0,Poor,2015,1,5


In [5]:
#Cyclical Transformation
# Month
cpollution_df['month_sin'] = np.sin(2 * np.pi * cpollution_df['month']/12)
cpollution_df['month_cos'] = np.cos(2 * np.pi * cpollution_df['month']/12)
# Day
cpollution_df['day_sin'] = np.sin(2 * np.pi * cpollution_df['day']/7)
cpollution_df['day_cos'] = np.cos(2 * np.pi * cpollution_df['day']/7)

#encoding AQI bucket
bucket_encoder = LabelEncoder()
cpollution_df['AQI_Bucket_encoded'] = bucket_encoder.fit_transform(cpollution_df['AQI_Bucket'])

In [6]:
cpollution_df.head()

Unnamed: 0,City,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,...,AQI,AQI_Bucket,year,month,day,month_sin,month_cos,day_sin,day_cos,AQI_Bucket_encoded
0,Ahmedabad,73.24,141.54,0.92,18.22,17.15,26.64,0.92,27.64,133.36,...,209.0,Poor,2015,1,1,0.5,0.866025,0.781831,0.62349,2
1,Ahmedabad,73.24,141.54,0.97,15.69,16.46,26.64,0.97,24.55,34.06,...,209.0,Poor,2015,1,2,0.5,0.866025,0.974928,-0.222521,2
2,Ahmedabad,73.24,141.54,17.4,19.3,29.7,26.64,17.4,29.07,30.7,...,209.0,Poor,2015,1,3,0.5,0.866025,0.433884,-0.900969,2
3,Ahmedabad,73.24,141.54,1.7,18.48,17.97,26.64,1.7,18.59,36.08,...,209.0,Poor,2015,1,4,0.5,0.866025,-0.433884,-0.900969,2
4,Ahmedabad,73.24,141.54,22.1,21.42,37.76,26.64,22.1,39.33,39.31,...,209.0,Poor,2015,1,5,0.5,0.866025,-0.974928,-0.222521,2


In [7]:
pollutants = ['PM2.5','PM10','NO','NO2','NOx','NH3','CO','SO2','O3','Benzene','Toluene','Xylene']
labels = ['City','year','month_sin','month_cos', 'day_sin', 'day_cos', 'AQI'] + pollutants
X = cpollution_df[labels]
target_AQIB= cpollution_df['AQI_Bucket_encoded']
X_= cpollution_df[labels].drop('AQI', axis=1)
target_AQI= cpollution_df['AQI']

In [8]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split( X, target_AQIB, test_size=0.2, random_state=42)
Xb_train, Xb_test, yb_train, yb_test = train_test_split( X, target_AQI, test_size=0.2, random_state=42)

In [9]:
preprocessor = ColumnTransformer(transformers=[('city', OneHotEncoder(handle_unknown='ignore'), ['City']),
        ('others', 'passthrough', ['year','month_sin','month_cos', 'day_sin', 'day_cos','AQI'] + pollutants)])

#classification pipeline to predict AQI_Bucket
AQIB_model = Pipeline([('preprocessor', preprocessor),
    ('rf', RandomForestClassifier(n_estimators=200, random_state=42))])

AQIB_model.fit(X_train, y_train)
y_pred = AQIB_model.predict(X_test)
print('Classification accuracy:', accuracy_score(y_test, y_pred))
print('Classification report:\n', classification_report(y_test, y_pred))
print('Confusion matrix:\n', confusion_matrix(y_test, y_pred))

Classification accuracy: 0.9964448958862366
Classification report:
               precision    recall  f1-score   support

           0       1.00      0.99      0.99       296
           1       1.00      1.00      1.00      2062
           2       0.99      0.99      0.99       666
           3       1.00      1.00      1.00      2033
           4       0.99      0.97      0.98       302
           5       0.98      0.99      0.99       548

    accuracy                           1.00      5907
   macro avg       0.99      0.99      0.99      5907
weighted avg       1.00      1.00      1.00      5907

Confusion matrix:
 [[ 293    0    0    3    0    0]
 [   0 2062    0    0    0    0]
 [   0    2  662    0    0    2]
 [   0    0    0 2033    0    0]
 [   0    1    1    0  293    7]
 [   0    0    3    0    2  543]]


In [None]:
preprocessor1 = ColumnTransformer(transformers=[('city', OneHotEncoder(handle_unknown='ignore'), ['City']),
        ('others', 'passthrough', ['year','month_sin','month_cos', 'day_sin', 'day_cos'] + pollutants)])
AQI_pipeline = Pipeline([('preprocessor1', preprocessor),('rfr', RandomForestRegressor(random_state=42))])

from sklearn.model_selection import GridSearchCV
param_grid = {
    'rfr__n_estimators': [100, 200, 500],
    'rfr__max_depth': [None, 10, 20, 30],
    'rfr__min_samples_split': [2, 5, 10]}

AQI_grid = GridSearchCV(
    estimator=AQI_pipeline,
    param_grid= param_grid,
    cv=3,
    scoring= 'neg_root_mean_squared_error',
    n_jobs=-1,
    verbose=2)

AQI_grid.fit(Xb_train, yb_train)
print(f"Best parameters: {AQI_grid.best_params_}")
print(f"Best score: {AQI_grid.best_score_}")


Fitting 3 folds for each of 36 candidates, totalling 108 fits


In [None]:
best_AQI_model = AQI_grid.best_estimator_

In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np

y_pred = AQI_grid.predict(Xb_test)
rmse = np.sqrt(mean_squared_error(yb_test, y_pred))
rmse


In [None]:
os.makedirs('models', exist_ok=True)
joblib.dump(AQIB_model, 'models/aqib_classifier.pkl')
joblib.dump(best_AQI_model, 'models/aqi_regressor.pkl')
joblib.dump(bucket_encoder, 'models/bucket_encoder.pkl')
print('Models saved in models/ folder')