# Importing Important Libraries

In [1]:
# Import the necessary modules
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.metrics import r2_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.tree import DecisionTreeRegressor

In [2]:
# Load the training data. A CSV that was written together with its DataFrame
# index carries a stray "Unnamed: N" column; drop any such column so a plain
# row counter cannot end up in the feature matrix built from this frame.
train_data = (
    pd.read_csv("train_dataset.csv")
    .loc[:, lambda frame: ~frame.columns.str.startswith("Unnamed")]
)
train_data.head()

Unnamed: 0,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,...,Jorapokhar,Kochi,Kolkata,Lucknow,Mumbai,Patna,Shillong,Talcher,Thiruvananthapuram,Visakhapatnam
0,78.2,90.85,1.37,40.01,36.37,19.52,1.0,8.42,37.71,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,78.2,90.85,1.37,27.75,19.73,19.52,0.02,8.42,37.71,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,78.2,90.85,1.37,19.32,11.08,19.52,0.08,8.42,37.71,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,78.2,90.85,1.37,16.45,9.2,19.52,0.3,8.42,25.78,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,78.2,90.85,1.37,14.9,7.85,19.52,0.12,8.42,37.71,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# Display the column labels of the loaded frame.
train_data.columns

Index(['PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3', 'CO', 'SO2', 'O3',
       'Benzene', 'Toluene', 'Xylene', 'AQI', 'AQI_Bucket', 'Year', 'Month',
       'Ahmedabad', 'Aizawl', 'Amaravati', 'Amritsar', 'Bengaluru', 'Bhopal',
       'Brajrajnagar', 'Chandigarh', 'Chennai', 'Coimbatore', 'Delhi',
       'Ernakulam', 'Gurugram', 'Guwahati', 'Hyderabad', 'Jaipur',
       'Jorapokhar', 'Kochi', 'Kolkata', 'Lucknow', 'Mumbai', 'Patna',
       'Shillong', 'Talcher', 'Thiruvananthapuram', 'Visakhapatnam'],
      dtype='object')

# Feature Selection

In [4]:
# Keep every column except the two AQI label columns as model inputs.
train_dataset = train_data.drop(labels=["AQI", "AQI_Bucket"], axis=1)

In [5]:
# y holds the "AQI" and "AQI_Bucket" columns; X is the frame without them.
y = train_data.loc[:, ["AQI", "AQI_Bucket"]]
X = train_dataset

# For linear regression, the target variable is "AQI"

### Feature Selection for Linear Regression

In [6]:
# The regression target is the "AQI" column. It is taken from the original
# frame rather than from `y` itself, so re-running this cell after `y` has
# already become a Series does not raise a KeyError. X is left as defined in
# the previous cell (re-assigning it to itself did nothing).
y = train_data["AQI"]

# Train Test Split

In [7]:
# Hold out 40% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

In [8]:
# Shapes of the training features and training target.
(X_train.shape, y_train.shape)

((401274, 40), (401274,))

In [9]:
# Shapes of the held-out features and held-out target.
(X_test.shape, y_test.shape)

((267517, 40), (267517,))

# Modeling

## 1. Linear Regression

In [10]:
# Ordinary least-squares regression fitted on the training split.
lr_model = LinearRegression()
lr_model.fit(X=X_train, y=y_train)

In [11]:
# Learned parameters of the linear model: one weight per feature plus the intercept.
m, c = lr_model.coef_, lr_model.intercept_
print(f"coef : {m}")
print(f"intercept: {c}")

coef : [ 9.16168305e-01  1.12083057e-01  8.21305482e-02  2.75559016e-01
  3.55568097e-01 -2.77238629e-01  1.29535904e+01  9.53577329e-01
  2.04882793e-01 -2.55231793e-02  3.94388835e-01 -9.94759830e-14
 -1.01077105e+01 -7.39901159e-01  5.02991225e+01 -2.21657287e+01
 -2.30544028e+01 -1.01886506e+01 -3.41781660e+01  1.70773395e+00
  1.12895711e+01 -8.96177947e+00 -1.74842496e+01 -2.55125180e+01
  3.40089468e+01  3.20975901e+00  4.54323786e+01  1.27595986e+01
 -2.75688972e+01 -1.46369383e+01  7.91435058e+00 -4.28199333e+00
  2.79594512e+00  3.97119999e+01 -2.48418662e+01  5.32474504e+01
 -1.79453847e+01  1.74244201e+01 -2.59414224e+01 -2.30392794e+01]
intercept: 20445.06629024741


# Evaluation

In [12]:
# Predictions of the linear model on the held-out split.
lr_y_pred = lr_model.predict(X=X_test)

In [13]:
# R2 of the linear model on the held-out split, shown as a percentage.
# Scale first and let the format spec do the rounding: rounding to three
# decimals and then multiplying by 100 prints float noise such as
# 46.800000000000004.
lr_r2 = r2_score(y_true=y_test, y_pred=lr_y_pred)
print(f"Linear Regression R2 score is {lr_r2 * 100:.1f}")

Linear Regression R2 score is 46.800000000000004


# 2. Lasso Regression

In [14]:

# Lasso regression with scikit-learn's default settings, fitted on the
# training split. (Lasso is imported in the notebook's import cell.)
la_r_model = Lasso()
la_r_model.fit(X_train, y_train)

In [15]:
# Score the Lasso model on the held-out split and report R2 as a percentage.
# The value is scaled before formatting so the printed number carries no
# floating-point noise from multiplying an already-rounded value.
la_r_y_pred = la_r_model.predict(X_test)
la_r_r2 = r2_score(y_true=y_test, y_pred=la_r_y_pred)
print(f"Lasso Regression R2 score is {la_r_r2 * 100:.1f}")

Lasso Regression R2 score is 43.9


# 3. Ridge Regression

In [16]:

# Ridge regression with scikit-learn's default settings, fitted on the
# training split. (Ridge is imported in the notebook's import cell.)
rr_model = Ridge()
rr_model.fit(X_train, y_train)
     

In [17]:

# Score the Ridge model on the held-out split and report R2 as a percentage.
# Scaling before formatting avoids output such as 46.800000000000004, which
# comes from multiplying an already-rounded float by 100.
rr_y_pred = rr_model.predict(X_test)
rr_r2 = r2_score(y_true=y_test, y_pred=rr_y_pred)
print(f"Ridge Regression R2 score is {rr_r2 * 100:.1f}")

Ridge Regression R2 score is 46.800000000000004


# 4. Random Forest Regression

In [18]:
# Random forest limited to depth-2 trees, seeded so repeated runs build the
# same forest. (RandomForestRegressor is imported in the notebook's import cell.)
random_forest_model = RandomForestRegressor(max_depth=2, random_state=42)

In [19]:
# Fit the forest on the training split.
random_forest_model.fit(X=X_train, y=y_train)

In [20]:
# Score the random forest on the held-out split and report R2 as a percentage.
# The value is scaled before formatting so the printed number carries no
# floating-point noise from multiplying an already-rounded value.
rfr_y_pred = random_forest_model.predict(X_test)
rfr_r2 = r2_score(y_true=y_test, y_pred=rfr_y_pred)
print(f"Random Forest Regression R2 score is {rfr_r2 * 100:.1f}")

Random Forest Regression R2 score is 36.9


## Hyperparameter Tuning

In [21]:
from sklearn.model_selection import RandomizedSearchCV

In [22]:
# Candidate values for the search: six evenly spaced tree counts from 100 to
# 1200 and four evenly spaced depths from 5 to 30, truncated to plain ints.
n_estimators = np.linspace(100, 1200, num=6).astype(int).tolist()
max_depth = np.linspace(5, 30, num=4).astype(int).tolist()

In [23]:
# Candidate hyperparameters for the random-forest search.
random_grid = {
    'n_estimators': n_estimators,
    # 1.0 means "consider every feature at each split". That is what the
    # string 'auto' meant for regressors before scikit-learn removed it in
    # version 1.3, where passing 'auto' is an error.
    'max_features': [1.0, 'sqrt'],
    'max_depth': max_depth,
    # Each candidate is listed once so the search does not spend fits on
    # duplicate parameter combinations.
    'min_samples_split': [5, 10, 100],
}

# Randomised search with 3-fold cross-validation on all available cores.
# random_state fixes which parameter combinations are drawn, so repeated runs
# evaluate the same candidates.
# NOTE(review): the search object is only constructed here; nothing in this
# cell calls rf_random.fit(...), so no tuning happens until that is done.
rf_random = RandomizedSearchCV(
    estimator=random_forest_model,
    param_distributions=random_grid,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)

# 5. Decision Tree Regression

In [24]:
# Unconstrained decision tree fitted on the training split. The tree is
# seeded like the train/test split and the random forest: scikit-learn
# permutes features at each split, so without a seed the fitted tree (and
# its score) can differ from run to run.
# (DecisionTreeRegressor is imported in the notebook's import cell.)
dtr_model = DecisionTreeRegressor(random_state=42)
dtr_model.fit(X_train, y_train)

In [26]:
# Score the decision tree on the held-out split and report R2 as a percentage.
# The value is scaled before formatting so the printed number carries no
# floating-point noise from multiplying an already-rounded value.
dtr_y_pred = dtr_model.predict(X_test)
dtr_r2 = r2_score(y_true=y_test, y_pred=dtr_y_pred)
print(f"Decision Tree Regression R2 score is {dtr_r2 * 100:.1f}")

Decision Tree Regression R2 score is 52.7
