<a href="https://colab.research.google.com/github/Medynal/Pollution/blob/main/pollution_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os

repo_url = "https://github.com/Medynal/Pollution.git"
folder_path = "/content/Pollution"

if not os.path.exists(folder_path):
  !git clone {repo_url} {folder_path}
print(f"Current working directory: {os.getcwd()}")


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_absolute_error, r2_score, accuracy_score, classification_report, confusion_matrix
import joblib

In [None]:
!pip show scikit-learn

In [None]:
# Load the cleaned dataset from the cloned repository checkout.
# 'Month name' and 'Date' are discarded immediately, so 'Date' is read as-is
# rather than being parsed into datetimes that would never be used.
dataset_path = os.path.join(folder_path, 'IPNBS', 'cleaned_pollution_dataset.csv')
cpollution_df = pd.read_csv(dataset_path).drop(columns=['Month name', 'Date'])
cpollution_df.head()

In [None]:
# Cyclical transformation: map each periodic column onto the unit circle so the
# last value of a cycle sits next to the first one.
# NOTE(review): a period of 7 assumes 'day' is a day-of-week index; confirm,
# since a day-of-month column would need a different period.
month_angle = 2 * np.pi * cpollution_df['month'] / 12
day_angle = 2 * np.pi * cpollution_df['day'] / 7

cpollution_df['month_sin'] = np.sin(month_angle)
cpollution_df['month_cos'] = np.cos(month_angle)
cpollution_df['day_sin'] = np.sin(day_angle)
cpollution_df['day_cos'] = np.cos(day_angle)

# Encode the AQI bucket labels as integers; the fitted encoder is kept so the
# integer codes can be mapped back to bucket names.
bucket_encoder = LabelEncoder().fit(cpollution_df['AQI_Bucket'])
cpollution_df['AQI_Bucket_encoded'] = bucket_encoder.transform(cpollution_df['AQI_Bucket'])

In [None]:
# Preview the frame again, now including the cyclical and encoded-bucket columns.
cpollution_df.head()

In [None]:
# Pollutant concentration columns used as model inputs.
pollutants = ['PM2.5','PM10','NO','NO2','NOx','NH3','CO','SO2','O3','Benzene','Toluene','Xylene']
# Input columns: city, year, cyclical date encodings, AQI, then the pollutants.
labels = ['City','year','month_sin','month_cos', 'day_sin', 'day_cos', 'AQI'] + pollutants

# Feature matrix including AQI, with the encoded AQI bucket as the classification target.
X = cpollution_df[labels]
target_AQIB = cpollution_df['AQI_Bucket_encoded']

# The same features without AQI, with AQI itself as the regression target.
X_ = X.drop(columns='AQI')
target_AQI = cpollution_df['AQI']

In [None]:
# Train-test split: hold out 20% of the rows. Both calls use the same X and the
# same seed, so the classification and regression splits contain the same rows.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, target_AQIB, **split_options)
# NOTE(review): X still contains the 'AQI' column, so Xb_train/Xb_test carry the
# regression target as a feature column; a regressor fitted on them must drop it.
Xb_train, Xb_test, yb_train, yb_test = train_test_split(X, target_AQI, **split_options)

In [None]:
# Columns forwarded to the classifier unchanged; only 'City' is one-hot encoded.
classifier_passthrough = ['year','month_sin','month_cos', 'day_sin', 'day_cos','AQI'] + pollutants
preprocessor = ColumnTransformer(
    transformers=[
        ('city', OneHotEncoder(handle_unknown='ignore'), ['City']),
        ('others', 'passthrough', classifier_passthrough),
    ]
)

# Classification pipeline to predict AQI_Bucket: preprocessing followed by a
# seeded random forest.
AQIB_model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('rf', RandomForestClassifier(n_estimators=200, random_state=42)),
    ]
)
AQIB_model.fit(X_train, y_train)

# Score the fitted classifier on the held-out rows.
y_pred = AQIB_model.predict(X_test)
print('Classification accuracy:', accuracy_score(y_test, y_pred))
print('Classification report:\n', classification_report(y_test, y_pred))
print('Confusion matrix:\n', confusion_matrix(y_test, y_pred))

In [None]:
from sklearn.model_selection import GridSearchCV

# Regression preprocessing: one-hot encode 'City' and pass the other features
# through. 'AQI' is deliberately not listed because it is the regression target.
# ColumnTransformer drops unlisted columns by default, so the 'AQI' column
# present in Xb_train/Xb_test never reaches the regressor.
preprocessor1 = ColumnTransformer(
    transformers=[
        ('city', OneHotEncoder(handle_unknown='ignore'), ['City']),
        ('others', 'passthrough', ['year','month_sin','month_cos', 'day_sin', 'day_cos'] + pollutants),
    ]
)

# The pipeline must use preprocessor1: the classifier's `preprocessor` passes
# 'AQI' through, which would train the regressor on its own target (leakage).
AQI_pipeline = Pipeline([
    ('preprocessor1', preprocessor1),
    ('rfr', RandomForestRegressor(random_state=42)),
])

# Hyper-parameter grid for the random forest step ('rfr__' targets that step).
param_grid = {
    'rfr__n_estimators': [100, 200, 500],
    'rfr__max_depth': [None, 10, 20, 30],
    'rfr__min_samples_split': [2, 5, 10]}

# Exhaustive 3-fold search scored by negative RMSE (higher, i.e. closer to
# zero, is better), using all available cores.
AQI_grid = GridSearchCV(
    estimator=AQI_pipeline,
    param_grid=param_grid,
    cv=3,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    verbose=2)

AQI_grid.fit(Xb_train, yb_train)
print(f"Best parameters: {AQI_grid.best_params_}")
print(f"Best score: {AQI_grid.best_score_}")


Fitting 3 folds for each of 36 candidates, totalling 108 fits


In [None]:
# Keep a handle on the best pipeline found by the grid search; it is the object
# written to disk when the models are saved.
best_AQI_model = AQI_grid.best_estimator_

In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Evaluate the tuned regressor on the held-out rows: predict, take the mean
# squared error, then its square root to express the error in AQI units.
# Note that this rebinds `y_pred`, which previously held the classifier output.
y_pred = AQI_grid.predict(Xb_test)
mse = mean_squared_error(yb_test, y_pred)
rmse = np.sqrt(mse)
rmse


In [None]:
# Persist the fitted classifier, the tuned regressor and the bucket label
# encoder under models/ (relative to the current working directory).
artifacts = {
    'models/aqib_classifier.pkl': AQIB_model,
    'models/aqi_regressor.pkl': best_AQI_model,
    'models/bucket_encoder.pkl': bucket_encoder,
}

os.makedirs('models', exist_ok=True)
for artifact_path, artifact in artifacts.items():
    joblib.dump(artifact, artifact_path)
print('Models saved in models/ folder')