# Data preprocessing and Feature selection

In [1]:
# Import the necessary modules
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from ipywidgets import interact
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.tree import DecisionTreeRegressor


In [2]:
# Path to the dataset CSV. Note that this is a relative path (not an absolute one),
# so it is resolved against the directory the notebook kernel is running in.
file_path = "dataset/air_quality_data.csv"

# Read the CSV into a pandas DataFrame
df = pd.read_csv(file_path)

In [3]:
# Preview the first three rows of the raw frame
df.head(3)

Unnamed: 0,City,Datetime,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket
0,Ahmedabad,2015-01-01 01:00:00,,,1.0,40.01,36.37,,1.0,122.07,,0.0,0.0,0.0,,
1,Ahmedabad,2015-01-01 02:00:00,,,0.02,27.75,19.73,,0.02,85.9,,0.0,0.0,0.0,,
2,Ahmedabad,2015-01-01 03:00:00,,,0.08,19.32,11.08,,0.08,52.83,,0.0,0.0,0.0,,


# Data Preprocessing

In [4]:
# Work on a copy so the imputation applied to data1 below does not modify the raw frame df
data1 = df.copy()

In [5]:
# Impute missing values: each numeric column listed here is filled with its own column mean.
# NOTE(review): AQI is used as the regression target further down, and these means are
# computed on the full frame before the train/test split - confirm that imputing the
# target and using statistics from the eventual test rows is intended.
mean_filled_columns = [
    'PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3', 'CO',
    'SO2', 'O3', 'Benzene', 'Toluene', 'Xylene', 'AQI',
]
for column_name in mean_filled_columns:
    data1[column_name] = data1[column_name].fillna(data1[column_name].mean())

# The AQI bucket is a text label, so it has no mean; missing labels become 'Moderate'.
data1['AQI_Bucket'] = data1['AQI_Bucket'].fillna('Moderate')

In [6]:
# Check dtypes and non-null counts after imputation (every column should be fully populated)
data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 737406 entries, 0 to 737405
Data columns (total 16 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   City        737406 non-null  object 
 1   Datetime    737406 non-null  object 
 2   PM2.5       737406 non-null  float64
 3   PM10        737406 non-null  float64
 4   NO          737406 non-null  float64
 5   NO2         737406 non-null  float64
 6   NOx         737406 non-null  float64
 7   NH3         737406 non-null  float64
 8   CO          737406 non-null  float64
 9   SO2         737406 non-null  float64
 10  O3          737406 non-null  float64
 11  Benzene     737406 non-null  float64
 12  Toluene     737406 non-null  float64
 13  Xylene      737406 non-null  float64
 14  AQI         737406 non-null  float64
 15  AQI_Bucket  737406 non-null  object 
dtypes: float64(13), object(3)
memory usage: 90.0+ MB


In [7]:
# One-hot encode the City column: one float indicator column (0.0 / 1.0) per distinct city
city = pd.get_dummies(data1["City"], dtype=float)

In [8]:
# Display the one-hot city indicator frame
city

Unnamed: 0,Ahmedabad,Aizawl,Amaravati,Amritsar,Bengaluru,Bhopal,Brajrajnagar,Chandigarh,Chennai,Coimbatore,...,Jorapokhar,Kochi,Kolkata,Lucknow,Mumbai,Patna,Shillong,Talcher,Thiruvananthapuram,Visakhapatnam
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
737401,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
737402,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
737403,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
737404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [9]:
# Append the city indicator columns to the imputed frame, column-wise (axis=1)
data2 = pd.concat([data1, city], axis=1)

In [10]:
# Display the combined frame (imputed measurements + city indicators)
data2

Unnamed: 0,City,Datetime,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,...,Jorapokhar,Kochi,Kolkata,Lucknow,Mumbai,Patna,Shillong,Talcher,Thiruvananthapuram,Visakhapatnam
0,Ahmedabad,2015-01-01 01:00:00,67.615679,119.035184,1.00,40.01,36.37,23.6027,1.00,122.07,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Ahmedabad,2015-01-01 02:00:00,67.615679,119.035184,0.02,27.75,19.73,23.6027,0.02,85.90,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Ahmedabad,2015-01-01 03:00:00,67.615679,119.035184,0.08,19.32,11.08,23.6027,0.08,52.83,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Ahmedabad,2015-01-01 04:00:00,67.615679,119.035184,0.30,16.45,9.20,23.6027,0.30,39.53,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Ahmedabad,2015-01-01 05:00:00,67.615679,119.035184,0.12,14.90,7.85,23.6027,0.12,32.63,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
737401,Visakhapatnam,2020-06-27,15.020000,50.940000,7.68,25.06,19.54,12.4700,0.47,8.55,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
737402,Visakhapatnam,2020-06-28,24.380000,74.090000,3.42,26.06,16.53,11.9900,0.52,12.72,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
737403,Visakhapatnam,2020-06-29,22.910000,65.730000,3.45,29.53,18.33,10.7100,0.48,8.42,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
737404,Visakhapatnam,2020-06-30,16.640000,49.970000,4.05,29.26,18.80,10.0300,0.52,9.84,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [11]:
# Drop the three object-dtype columns so only numeric columns remain:
# City (already one-hot encoded above), Datetime, and AQI_Bucket.
data3 = data2.drop(columns=["City","Datetime","AQI_Bucket"])

In [12]:
# Display the all-numeric modelling frame
data3

Unnamed: 0,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,...,Jorapokhar,Kochi,Kolkata,Lucknow,Mumbai,Patna,Shillong,Talcher,Thiruvananthapuram,Visakhapatnam
0,67.615679,119.035184,1.00,40.01,36.37,23.6027,1.00,122.07,34.785994,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,67.615679,119.035184,0.02,27.75,19.73,23.6027,0.02,85.90,34.785994,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,67.615679,119.035184,0.08,19.32,11.08,23.6027,0.08,52.83,34.785994,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,67.615679,119.035184,0.30,16.45,9.20,23.6027,0.30,39.53,153.580000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,67.615679,119.035184,0.12,14.90,7.85,23.6027,0.12,32.63,34.785994,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
737401,15.020000,50.940000,7.68,25.06,19.54,12.4700,0.47,8.55,23.300000,2.240000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
737402,24.380000,74.090000,3.42,26.06,16.53,11.9900,0.52,12.72,30.140000,0.740000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
737403,22.910000,65.730000,3.45,29.53,18.33,10.7100,0.48,8.42,30.960000,0.010000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
737404,16.640000,49.970000,4.05,29.26,18.80,10.0300,0.52,9.84,28.300000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


# Feature Selection

In [13]:
# Separate the regression target (AQI) from the predictors (every other column of data3).
y = data3.loc[:, "AQI"]
X = data3.drop("AQI", axis=1)

# Train test split

In [14]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# shuffle (and therefore the split) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Modeling

# 1. Linear Regression

In [15]:
# Baseline model: linear regression with default settings, fitted on the training split.
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Evaluation

In [16]:
# Score the linear model on the held-out split with R^2, reported to 3 decimals.
y_pred = lr_model.predict(X_test)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(f"Linear Regression R2 score is {round(r2, 3)}")

Linear Regression R2 score is 0.604


# 2. Random Forest Regression

In [17]:
# Random forest whose trees are capped at depth 2; random_state pins the forest's
# randomness so repeated runs give the same model.
# NOTE(review): with max_depth=2 the recorded R2 (0.454) is below the linear
# baseline (0.604) - the depth cap looks too restrictive; confirm it is intentional.
rfr_model = RandomForestRegressor(max_depth=2, random_state=42)
rfr_model.fit(X_train, y_train)

In [18]:
# Score the random forest on the held-out split with R^2, reported to 3 decimals.
# The message names the model so it cannot be confused with the other models' scores.
y_pred = rfr_model.predict(X_test)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(f"Random Forest Regression R2 score is {round(r2, 3)}")

R2 score is 0.454


## Hyperparameter Tuning

In [19]:
from sklearn.model_selection import RandomizedSearchCV

In [20]:
# Candidate values for the randomized hyperparameter search over the random forest.
# Number of trees: 4 evenly spaced values from 100 to 600 -> [100, 266, 433, 600]
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 600, num = 4)]
# Maximum tree depth: 3 evenly spaced values from 5 to 20 -> [5, 12, 20]
max_depth = [int(x) for x in np.linspace(start = 5, stop = 20, num = 3)]
random_grid = {
    'n_estimators' : n_estimators,
    # 1.0 means "consider all features at each split". It replaces the former 'auto'
    # option, which meant the same thing for regressors but was deprecated in
    # scikit-learn 1.1 and removed in 1.3 (where it raises an error).
    'max_features': [1.0, 'sqrt'],
    'max_depth' : max_depth,
    'min_samples_split': [5, 10, 100]
}

In [21]:
# Randomized search over random_grid using 3-fold cross-validation on all CPU cores.
# n_iter is left at its default; the recorded run reports 10 candidates (30 fits).
# random_state pins which parameter combinations are sampled, so the search is
# reproducible from run to run.
rf_random = RandomizedSearchCV(
    estimator=rfr_model,
    param_distributions=random_grid,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)

In [22]:
# Runs the randomized search (3 folds x 10 candidates = 30 fits). Left disabled:
# the recorded output below shows the last attempt ended in a KeyboardInterrupt.
# rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


KeyboardInterrupt: 

In [None]:
# NOTE(review): disabled duplicate of the Decision Tree cell that follows (section 3);
# it is dead code and a candidate for removal.
# from sklearn.tree import DecisionTreeRegressor
# dtc_model = DecisionTreeRegressor()
# dtc_model.fit(X_train, y_train)

# 3. Decision Tree Regression

In [23]:
# Decision tree with default (unbounded) depth, fitted on the training split.
# random_state is set so the fitted tree - and the score reported for it - is
# reproducible; it uses the same seed as the random forest and the data split.
dtc_model = DecisionTreeRegressor(random_state=42)
dtc_model.fit(X_train, y_train)

In [24]:
# Score the decision tree on the held-out split with R^2, reported to 3 decimals
# like the other models so the scores are directly comparable.
y_pred = dtc_model.predict(X_test)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(f"Decision Tree Regression R2 score is {round(r2, 3)}")

Decision Tree Regression R2 score is 0.5209204052423074
