**HyperParam tuning using gridSearch**

In [2]:
#9th main
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor

In [3]:
concrete = pd.read_csv("./datasets/concrete_data.csv")

concrete.head()

Unnamed: 0,cement,slag,flyash,water,superplasticizer,coarseaggregate,fineaggregate,age,csMPa
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [4]:
X = concrete.drop("csMPa", axis = 1)
Y = concrete.csMPa

In [6]:
X = X.drop(["flyash", "coarseaggregate", "fineaggregate"], axis = 1)  # done to reduce dim, and help increase speed

In [8]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2)

In [9]:
# baseline model, i.e new model acc mustn't be lower than this
baseline = GradientBoostingRegressor(max_depth=3, n_estimators=50)
baseline.fit(x_train, y_train)

GradientBoostingRegressor(n_estimators=50)

In [11]:
from sklearn.metrics import r2_score

y_pred = baseline.predict(x_test)

r2_score(y_test, y_pred)

0.8839824620612463

In [13]:
# get importance of feature to the model
important_features = pd.Series(baseline.feature_importances_, index = X.columns).sort_values(ascending = False)

important_features

age                 0.373695
cement              0.336937
water               0.106723
slag                0.092027
superplasticizer    0.090618
dtype: float64

In [14]:
# gridSearch
gbr = GradientBoostingRegressor(max_depth=3)

parameters = {"n_estimators": [1, 5, 10, 50, 100, 200, 300, 400, 500]}

gridsearch_reg = GridSearchCV(gbr, param_grid=parameters, cv = 3, n_jobs=-1)

gridsearch_reg.fit(x_train, y_train)

GridSearchCV(cv=3, estimator=GradientBoostingRegressor(), n_jobs=-1,
             param_grid={'n_estimators': [1, 5, 10, 50, 100, 200, 300, 400,
                                          500]})

In [15]:
gridsearch_reg.best_params_

{'n_estimators': 500}

In [28]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2)

In [18]:
gbr_best = GradientBoostingRegressor(max_depth=3, n_estimators=gridsearch_reg.best_params_["n_estimators"])
gbr_best.fit(x_train, y_train)

GradientBoostingRegressor(n_estimators=500)

In [21]:
y_pred = gbr_best.predict(x_test)

r2_score(y_test, y_pred)

0.908121532281322

***
**HyperParam tuning using EarlyStopping**

In [43]:
"""
warmstart = True, means reuse solutions of previous call to predictor to fit and add
more estimators of ensemble
"""
gbr = GradientBoostingRegressor(max_depth = 3, warm_start = True)

In [44]:
# early stopping code
min_val_error = float("inf")
error_increasing = 0

for n_estimators in range(1, 1000):
    gbr.n_estimators = n_estimators
    gbr.fit(x_train, y_train)
    
    y_pred = gbr.predict(x_test)
    val_error = mean_squared_error(y_test, y_pred)
    
    print("No. of estimators: ", gbr.n_estimators)
    print("Error: ", val_error)
    
    if val_error < min_val_error:  # error reducing
        min_val_error = val_error  # updates min error value
        error_increasing = 0
    else:
        error_increasing += 1
        if error_increasing == 10:   # error keeps increasing for 6 times i.e overfitting
            break
        

No. of estimators:  1
Error:  248.25361502502042
No. of estimators:  2
Error:  220.49859594097288
No. of estimators:  3
Error:  200.19143337231247
No. of estimators:  4
Error:  181.14100447646504
No. of estimators:  5
Error:  166.56320389126617
No. of estimators:  6
Error:  155.31045711839968
No. of estimators:  7
Error:  144.5611978815204
No. of estimators:  8
Error:  133.27717145265842
No. of estimators:  9
Error:  125.56838158487218
No. of estimators:  10
Error:  117.90978813630247
No. of estimators:  11
Error:  109.67586287218188
No. of estimators:  12
Error:  104.82235188043342
No. of estimators:  13
Error:  98.63437768498775
No. of estimators:  14
Error:  92.46391267192352
No. of estimators:  15
Error:  86.86636980245237
No. of estimators:  16
Error:  82.87443680378821
No. of estimators:  17
Error:  80.3824670285934
No. of estimators:  18
Error:  76.65734080884852
No. of estimators:  19
Error:  74.331669450294
No. of estimators:  20
Error:  71.05805838420815
No. of estimators:  2

In [45]:
n_estimators # best num of estimators from early stopping

441

In [46]:
# reshuffle train and test data
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2)

In [47]:
gbr_best = GradientBoostingRegressor(max_depth = 3, n_estimators = n_estimators)

gbr_best.fit(x_train, y_train)

GradientBoostingRegressor(n_estimators=441)

In [48]:
y_pred = gbr_best.predict(x_test)

r2_score(y_test, y_pred)

0.9419418381848697