**AQI Prediction Using XGBoost Regressor**

In [None]:
# Mount Google Drive at /content/drive/ so later cells can read files stored there
# (relies on the google.colab package, i.e. this cell is meant to run inside Colab).
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
#Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

In [None]:
# Load the previously pickled dataset into `df` (presumably a pandas DataFrame,
# since it is sliced with .iloc in the next cell -- confirm against the producer notebook).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
with open('/content/drive/MyDrive/Google_colab_project/df.pkl','rb') as file:
    df= pickle.load(file)

In [None]:
# Features are every column except the last one; the last column is the target.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=100,
)

### **XGBoost Regressor Implementation without Hyperparameter Tuning**

In [None]:
# Baseline model: an XGBRegressor built with its default hyperparameters and
# fitted on the training split. The bare .fit(...) call is the last expression
# of the cell, so the fitted estimator's repr is displayed as the cell output.
import xgboost as xgb
xgb_regressor=xgb.XGBRegressor()
xgb_regressor.fit(X_train,y_train)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [None]:

# R^2 on the data the model was fitted on versus the held-out split.
# The second line scores X_test / y_test, so it is labelled "test set"
# (it was previously mislabelled "train set").
print("Coefficient of determination R^2 <-- on train set: {}".format(xgb_regressor.score(X_train, y_train)))
print("Coefficient of determination R^2 <-- on test set: {}".format(xgb_regressor.score(X_test, y_test)))

Coefficient of determination R^2 <-- on train set: 0.8686497704805118
Coefficient of determination R^2 <-- on train set: 0.7273893483591731


#### **Model Evaluation**



In [None]:
# Predictions of the untuned baseline model for the held-out test features.
xgb_prediction=xgb_regressor.predict(X_test)

In [None]:
from sklearn import metrics

# Error metrics of the untuned baseline on the test split.
# RMSE is the square root of the MSE, so the MSE is computed once and reused.
baseline_mae = metrics.mean_absolute_error(y_test, xgb_prediction)
baseline_mse = metrics.mean_squared_error(y_test, xgb_prediction)
print('MAE:', baseline_mae)
print('MSE:', baseline_mse)
print('RMSE:', np.sqrt(baseline_mse))

MAE: 32.64319192953124
MSE: 2047.4328991937127
RMSE: 45.24856792423063


Without any hyperparameter tuning, the XGBoost Regressor model has an RMSE (Root Mean Square Error) of around 45. Next, we will tune the hyperparameters to try to obtain a lower RMSE value.

### **XGBoost Regressor Implementation with Hyperparameter Tuning - RandomizedSearchCV**


#### RandomizedSearchCV-1

In [None]:
# Fresh, unfitted estimator that the first randomized search will clone and tune.
xgb_regressor_rcv1=xgb.XGBRegressor()

In [None]:
#Randomized Search CV

from sklearn.model_selection import RandomizedSearchCV
# Candidate values for the first (coarse) randomized search.

# Number of boosting rounds (trees) in the XGBoost ensemble
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1200, num = 12)]
# Learning rates (eta). Kept as floats: learning_rate is a numeric hyperparameter,
# and string values only work by accident of how parameters are serialized.
learning_rate = [0.05, 0.1, 0.2, 0.3, 0.5, 0.6]
# Maximum depth of each tree
max_depth = [int(x) for x in np.linspace(5, 30, num = 6)]
# Fraction of training rows sampled for each boosting round
subsample = [0.7, 0.6, 0.8]
# Minimum sum of instance weights required in a child node
min_child_weight = [3, 4, 5, 6, 7]

In [None]:

# Bundle the candidate lists into the parameter space sampled by the randomized search
random_grid_1 = dict(
    n_estimators=n_estimators,
    learning_rate=learning_rate,
    max_depth=max_depth,
    subsample=subsample,
    min_child_weight=min_child_weight,
)

print(random_grid_1)

{'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200], 'learning_rate': ['0.05', '0.1', '0.2', '0.3', '0.5', '0.6'], 'max_depth': [5, 10, 15, 20, 25, 30], 'subsample': [0.7, 0.6, 0.8], 'min_child_weight': [3, 4, 5, 6, 7]}


In [None]:
# Randomized hyperparameter search: 100 sampled combinations, each evaluated
# with 5-fold cross-validation and ranked by negative mean squared error.
xgb_rcv1 = RandomizedSearchCV(
    estimator=xgb_regressor_rcv1,
    param_distributions=random_grid_1,
    scoring='neg_mean_squared_error',
    n_iter=100,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=1,
)


In [None]:
# Run the first randomized search on the training split (long-running:
# n_iter x cv model fits as configured above).
xgb_rcv1.fit(X_train,y_train)

In [None]:
# Score the best estimator from the first randomized search on the test split.
xgb_rcv1_prediction = xgb_rcv1.predict(X_test)

# Winning parameter combination and its mean cross-validated score
# (negative MSE, so values closer to zero are better).
print(xgb_rcv1.best_params_)
print(xgb_rcv1.best_score_)

rcv1_mae = metrics.mean_absolute_error(y_test, xgb_rcv1_prediction)
print('MAE:', rcv1_mae)
rcv1_mse = metrics.mean_squared_error(y_test, xgb_rcv1_prediction)
print('MSE:', rcv1_mse)
print('RMSE:', np.sqrt(rcv1_mse))

{'subsample': 0.6, 'n_estimators': 600, 'min_child_weight': 7, 'max_depth': 15, 'learning_rate': '0.1'}
-1197.5530091766896
MAE: 18.987951824164284
MSE: 1216.0797739353566
RMSE: 34.87233536681128


#### RandomizedSearchCV-2

In [None]:
# Fresh, unfitted estimator for the second (narrower) randomized search.
xgb_regressor_rcv2=xgb.XGBRegressor()

In [None]:
#Randomized Search CV

from sklearn.model_selection import RandomizedSearchCV
# Candidate values for the second randomized search, using narrower ranges
# than the first search.

# Number of boosting rounds (trees) in the XGBoost ensemble
n_estimators = [int(x) for x in np.linspace(start = 400, stop = 800, num = 10)]
# Learning rates (eta). Kept as floats: learning_rate is a numeric hyperparameter,
# and string values only work by accident of how parameters are serialized.
learning_rate = [0.08, 0.1, 0.12, 0.14, 0.16]
# Maximum depth of each tree
max_depth = [int(x) for x in np.linspace(5, 20, num = 5)]
# Fraction of training rows sampled for each boosting round
subsample = [0.7, 0.6, 0.8]
# Minimum sum of instance weights required in a child node
min_child_weight = [6, 7, 8, 9]

In [None]:

# Bundle the narrowed candidate lists into the second search space
random_grid2 = dict(
    n_estimators=n_estimators,
    learning_rate=learning_rate,
    max_depth=max_depth,
    subsample=subsample,
    min_child_weight=min_child_weight,
)

print(random_grid2)

{'n_estimators': [400, 444, 488, 533, 577, 622, 666, 711, 755, 800], 'learning_rate': ['0.08', '0.1', '0.12', '0.14', '0.16'], 'max_depth': [5, 8, 12, 16, 20], 'subsample': [0.7, 0.6, 0.8], 'min_child_weight': [6, 7, 8, 9]}


In [None]:
# Second randomized search: again 100 sampled combinations with 5-fold
# cross-validation, ranked by negative mean squared error.
xgb_rcv2 = RandomizedSearchCV(
    estimator=xgb_regressor_rcv2,
    param_distributions=random_grid2,
    scoring='neg_mean_squared_error',
    n_iter=100,
    cv=5,
    verbose=2,
    random_state=100,
    n_jobs=1,
)


In [None]:
# Run the second randomized search on the training split (long-running:
# n_iter x cv model fits as configured above).
xgb_rcv2.fit(X_train,y_train)

In [None]:

# Score the best estimator from the second randomized search on the test split.
xgb_rcv2_prediction = xgb_rcv2.predict(X_test)

rcv2_mae = metrics.mean_absolute_error(y_test, xgb_rcv2_prediction)
print('MAE:', rcv2_mae)
rcv2_mse = metrics.mean_squared_error(y_test, xgb_rcv2_prediction)
print('MSE:', rcv2_mse)
print('RMSE:', np.sqrt(rcv2_mse))

In [None]:
# Winning parameter combination of the second randomized search and its mean
# cross-validated score (negative MSE, so values closer to zero are better).
print(xgb_rcv2.best_params_)
print(xgb_rcv2.best_score_)

### **XGBoost Regressor Implementation with Hyperparameter Tuning - GridSearchCV**

#### GridSearchCV-1


In [None]:
# Fresh, unfitted estimator that the grid search will clone and tune.
xgb_regressor_gcv1=xgb.XGBRegressor()

In [None]:
## Hyperparameter optimization using GridSearchCV
from sklearn.model_selection import GridSearchCV
## Hyper Parameter Optimization

# Search space for the exhaustive grid search.
#
# The former "loss": ["ls", "lad", "huber"] entry has been removed: those are
# loss names of scikit-learn's GradientBoostingRegressor, not XGBoost
# parameters. XGBoost does not act on them, so the entry only tripled the
# number of candidates without changing any fitted model.
# Learning rates are floats because learning_rate is a numeric hyperparameter.
params_grid_1 = {
    # Number of boosting rounds (trees)
    "n_estimators": [int(x) for x in np.linspace(start = 400, stop = 800, num = 12)],
    # Learning rate (eta)
    "learning_rate": [0.08, 0.1, 0.12, 0.14],
    # Maximum depth of each tree
    "max_depth": [int(x) for x in np.linspace(12, 24, num = 5)],
    # Fraction of training rows sampled for each boosting round
    "subsample": [0.7, 0.6, 0.8],
    # Minimum sum of instance weights required in a child node
    "min_child_weight": [6, 7, 8],
}

In [None]:

# Exhaustive grid search: every combination in params_grid_1 is evaluated with
# 10-fold cross-validation and ranked by negative mean squared error.
# n_jobs=-1 asks joblib to use all available CPU cores. This is the most
# expensive cell in the notebook (number of candidates x 10 fits).
xgb_gcv1=GridSearchCV( xgb_regressor_gcv1,param_grid=params_grid_1,scoring='neg_mean_squared_error',n_jobs=-1,cv=10,verbose=10)
xgb_gcv1.fit(X_train,y_train)

Fitting 10 folds for each of 6480 candidates, totalling 64800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    6.1s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:   11.9s
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:   15.0s
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   20.3s
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   24.5s
[Parallel(n_jobs=-1)]: Done  57 tasks      | elapsed:   30.9s
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:   36.7s
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed:   44.5s
[Parallel(n_jobs=-1)]: Done  94 tasks      | elapsed:   51.9s
[Parallel(n_jobs=-1)]: Done 109 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 141 tasks      | elapsed:  1

In [None]:
from sklearn import metrics

# Score the best estimator found by the grid search on the test split.
xgb_gcv_prediction_1 = xgb_gcv1.predict(X_test)

gcv1_mae = metrics.mean_absolute_error(y_test, xgb_gcv_prediction_1)
print('MAE:', gcv1_mae)
gcv1_mse = metrics.mean_squared_error(y_test, xgb_gcv_prediction_1)
print('MSE:', gcv1_mse)
print('RMSE:', np.sqrt(gcv1_mse))

In [None]:
import pickle

# Persist the fitted grid-search object to 'xgb_gridcv1.pkl' in the current
# working directory. The context manager closes the file (flushing buffered
# bytes to disk) even if pickling raises; the handle was previously left open.
with open('xgb_gridcv1.pkl', 'wb') as model_file:
    pickle.dump(xgb_gcv1, model_file)

In [None]:
import pickle

# Persist the fitted first randomized-search object to 'xgb_randomcv1.pkl' in
# the current working directory. The context manager closes the file (flushing
# buffered bytes to disk) even if pickling raises; the handle was previously
# left open.
with open('xgb_randomcv1.pkl', 'wb') as model_file:
    pickle.dump(xgb_rcv1, model_file)

In [None]:
# Winning parameter combination of the grid search and its mean
# cross-validated score (negative MSE, so values closer to zero are better).
print(xgb_gcv1.best_params_)
print(xgb_gcv1.best_score_)