# Implementing XGBoost Regressor

In [None]:
#Importing Libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score
from  sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics
from sklearn.metrics import r2_score
from datetime import datetime
import numpy as np
import time
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
# Read the raw dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="city_day.csv")
# Preview the first five rows to sanity-check the parsed columns.
df.head(n=5)

Unnamed: 0,City,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket
0,Ahmedabad,2015-01-01,,,0.92,18.22,17.15,,0.92,27.64,133.36,0.0,0.02,0.0,,
1,Ahmedabad,2015-01-02,,,0.97,15.69,16.46,,0.97,24.55,34.06,3.68,5.5,3.77,,
2,Ahmedabad,2015-01-03,,,17.4,19.3,29.7,,17.4,29.07,30.7,6.8,16.4,2.25,,
3,Ahmedabad,2015-01-04,,,1.7,18.48,17.97,,1.7,18.59,36.08,4.43,10.14,1.0,,
4,Ahmedabad,2015-01-05,,,22.1,21.42,37.76,,22.1,39.33,39.31,7.01,18.89,2.78,,


In [None]:
# Remove the columns that are not used for modelling.
# Reassigning the result (instead of inplace=True) together with
# errors="ignore" keeps this cell safe to re-run: a second execution is a
# no-op rather than a KeyError for labels that are already gone.
DROPPED_COLUMNS = ["City", "Date", "NOx", "Benzene", "Toluene", "Xylene", "AQI_Bucket"]
df = df.drop(columns=DROPPED_COLUMNS, errors="ignore")

In [None]:
# Preview the first five rows of the frame after the column drop.
df.head(n=5)

Unnamed: 0,PM2.5,PM10,NO,NO2,NH3,CO,SO2,O3,AQI
0,,,0.92,18.22,,0.92,27.64,133.36,
1,,,0.97,15.69,,0.97,24.55,34.06,
2,,,17.4,19.3,,17.4,29.07,30.7,
3,,,1.7,18.48,,1.7,18.59,36.08,
4,,,22.1,21.42,,22.1,39.33,39.31,


In [None]:
# Keep only the rows that have a value in every column, then renumber the
# index from 0 so it is contiguous again.
complete_rows = df.dropna(axis=0, how="any")
df = complete_rows.reset_index(drop=True)

In [None]:
# Display the cleaned frame (the rendered output is truncated in the middle).
df

Unnamed: 0,PM2.5,PM10,NO,NO2,NH3,CO,SO2,O3,AQI
0,31.21,38.66,7.20,1.27,25.63,0.56,4.22,2.81,52.0
1,38.39,46.68,7.19,0.91,29.16,0.57,4.46,0.18,60.0
2,43.23,50.83,7.14,1.07,28.95,0.57,4.53,0.41,62.0
3,33.82,41.03,7.09,0.36,28.41,0.48,4.63,0.30,70.0
4,27.14,35.04,5.63,2.32,23.98,0.50,4.71,13.02,54.0
...,...,...,...,...,...,...,...,...,...
14340,15.02,50.94,7.68,25.06,12.47,0.47,8.55,23.30,41.0
14341,24.38,74.09,3.42,26.06,11.99,0.52,12.72,30.14,70.0
14342,22.91,65.73,3.45,29.53,10.71,0.48,8.42,30.96,68.0
14343,16.64,49.97,4.05,29.26,10.03,0.52,9.84,28.30,54.0


In [None]:
# Separate predictors from the target: the last column is the dependent
# variable, every column before it is an independent feature.
y = df.iloc[:, -1]
X = df.iloc[:, :-1]

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
split = train_test_split(X, y, test_size=0.3, random_state=1)
X_train, X_test, y_train, y_test = split

In [None]:
# Baseline: an XGBoost regressor built with no explicit hyperparameters,
# trained on the training split.
model = XGBRegressor()
model.fit(X=X_train, y=y_train)

In [None]:
# Predict on the held-out rows (reused by the error-metric cell below) and
# report R2 for both splits to expose any train/test gap.
prediction = model.predict(X_test)
splits = {"Training": (X_train, y_train), "Testing": (X_test, y_test)}
for split_name, (features, target) in splits.items():
    print(f"R2 on {split_name} set: ", model.score(features, target))

R2 on Training set:  0.9860918111501309
R2 on Testing set:  0.9042275181597804


In [None]:
# Error metrics for the baseline model on the test split.
# The MSE is computed once and reused for the RMSE. The last label reads
# "Root Mean Sq Error" because the value printed is the square root of the
# mean squared error (the previous "Root Mean Error" label misnamed it).
baseline_mae = metrics.mean_absolute_error(y_test, prediction)
baseline_mse = metrics.mean_squared_error(y_test, prediction)
print(f"Mean Abs Error: {baseline_mae}")
print(f"Mean Sq Error: {baseline_mse}")
print(f"Root Mean Sq Error: {np.sqrt(baseline_mse)}")

Mean Abs Error: 15.859084929675419
Mean Sq Error: 769.8171156092976
Root Mean Error: 27.745578307350122


# Hyperparameter Tuning

In [None]:
# Search space for the randomized hyperparameter search. Every value is a
# list of candidates to sample from.
#
# The step size is tuned under its canonical scikit-learn-API name
# `learning_rate` rather than the alias `eta`: searching on the alias can
# leave an estimator carrying both `eta` and `learning_rate` with different
# values, which makes it ambiguous which one is applied.
#
# NOTE(review): max_depth, gamma, subsample, colsample_bytree and
# min_child_weight are documented as tree-booster parameters; candidates that
# draw booster='gblinear' presumably do not use them — confirm whether
# 'gblinear' should stay in the search.
params = {
    'n_estimators': [500],
    'min_child_weight': [4, 5],
    'gamma': [i / 10.0 for i in range(3, 6)],             # 0.3 .. 0.5
    'subsample': [i / 10.0 for i in range(6, 11)],        # 0.6 .. 1.0
    'colsample_bytree': [i / 10.0 for i in range(6, 11)], # 0.6 .. 1.0
    'max_depth': [2, 3, 4, 6, 7],
    'objective': ['reg:squarederror', 'reg:tweedie'],
    'booster': ['gbtree', 'gblinear'],
    'eval_metric': ['rmse'],
    'learning_rate': [i / 10.0 for i in range(3, 6)],     # 0.3 .. 0.5
}

In [None]:
# Fresh, unfitted estimator for the search. `n_jobs` is the scikit-learn-API
# parameter for XGBoost's thread count (it replaces the older `nthread` name).
reg = XGBRegressor(n_jobs=-1)

# Randomized search: sample `n_iter_search` candidates from `params` and score
# each with 5-fold cross-validation on negative mean squared error.
# `reg` is the estimator being searched — previously the already-fitted
# baseline `model` was passed here and `reg` was never used.
# random_state fixes the sampled candidates so a re-run explores the same
# settings.
n_iter_search = 100
random_search = RandomizedSearchCV(reg, param_distributions=params,
                                   n_iter=n_iter_search, cv=5,
                                   scoring='neg_mean_squared_error',
                                   random_state=1)

# Time the full search (all candidates x folds, plus the final refit).
start = time.time()
random_search.fit(X_train, y_train)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time.time() - start), n_iter_search))

RandomizedSearchCV took 341.54 seconds for 100 candidates parameter settings.


In [None]:
# Show the estimator that the search refit with the best-scoring candidate.
random_search.best_estimator_

# Using Tuned Values

In [None]:
# Rebuild the regressor from a hard-coded set of hyperparameters.
# NOTE(review): both `eta` and `learning_rate` are set below with different
# values (0.4 vs 0.300000012). XGBoost documents `eta` as an alias of
# `learning_rate`, so only one of them can take effect — confirm which value
# is intended and drop the other.
reg_tuned = XGBRegressor(
    # Booster and objective
    booster='gbtree',
    objective='reg:tweedie',
    eval_metric='rmse',
    tree_method='exact',
    base_score=0.5,
    # Ensemble size and step size
    n_estimators=500,
    num_parallel_tree=1,
    eta=0.4,
    learning_rate=0.300000012,
    max_delta_step=0,
    # Tree shape and split constraints
    max_depth=2,
    min_child_weight=5,
    gamma=0.4,
    interaction_constraints='',
    monotone_constraints='()',
    # Row / column sampling
    subsample=0.9,
    colsample_bytree=0.9,
    colsample_bylevel=1,
    colsample_bynode=1,
    # Regularisation and weighting
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    # Runtime / bookkeeping
    n_jobs=4,
    gpu_id=-1,
    random_state=0,
    importance_type='gain',
    validate_parameters=1,
    verbosity=None,
)

In [None]:
# Train the tuned regressor, predict on the held-out rows (reused by the
# error-metric cell below), and report R2 for both splits.
tuned_model = reg_tuned.fit(X_train, y_train)
y_pred = tuned_model.predict(X_test)
tuned_splits = {"Training": (X_train, y_train), "Testing": (X_test, y_test)}
for split_name, (features, target) in tuned_splits.items():
    print(f"R2 on {split_name} set: ", tuned_model.score(features, target))

R2 on Training set:  0.9554589072374048
R2 on Testing set:  0.8965398070384041


In [None]:
# Error metrics for the tuned model on the test split.
# The MSE is computed once and reused for the RMSE. The last label reads
# "Root Mean Sq Error" because the value printed is the square root of the
# mean squared error (the previous "Root Mean Error" label misnamed it).
tuned_mae = metrics.mean_absolute_error(y_test, y_pred)
tuned_mse = metrics.mean_squared_error(y_test, y_pred)
print(f"Mean Abs Error: {tuned_mae}")
print(f"Mean Sq Error: {tuned_mse}")
print(f"Root Mean Sq Error: {np.sqrt(tuned_mse)}")

Mean Abs Error: 16.29863623837113
Mean Sq Error: 831.6107695627247
Root Mean Error: 28.83766234566742
