# ENSEMBLE METHODS PART 2

## CONTENTS

__1.INTRODUCTION__

__2.LIBRARIES AND DATA__
       
__3.PREPROCESSING__

__4.ADABOOST__
             
__5.GRADIENT BOOSTING__

__6.RESULTS__


## 1. INTRODUCTION

In this Jupyter Notebook, we'll make use of the ensemble methods AdaBoost and Gradient Boosting.

## 2. LIBRARIES AND DATA

__Libraries__

In [1]:
import pandas as pd
import numpy as np
from numpy import arange
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
import warnings
# Notebook-wide configuration:
#  - hide FutureWarning messages so cell outputs stay readable
#  - let pandas display up to 100 columns of a wide frame
warnings.simplefilter('ignore', FutureWarning)

pd.set_option("display.max_columns", 100)

__Importing data__

In [2]:
# Read the train / test CSV files once and keep the loaded frames (data1, data2) untouched
data1, data2 = (pd.read_csv(csv_path) for csv_path in ("train_dataset.csv", "test_dataset.csv"))

# Working copies used by the rest of the notebook
train_df, test_df = data1.copy(), data2.copy()

In [3]:
# Preview the first two rows of the training frame
train_df.head(2)

Unnamed: 0,avgAnnCount,avgDeathsPerYear,TARGET_deathRate,incidenceRate,medIncome,popEst2015,povertyPercent,studyPerCap,binnedInc,MedianAge,MedianAgeMale,MedianAgeFemale,AvgHouseholdSize,PercentMarried,PctNoHS18_24,PctHS18_24,PctBachDeg18_24,PctHS25_Over,PctBachDeg25_Over,PctEmployed16_Over,PctUnemployed16_Over,PctPrivateCoverage,PctEmpPrivCoverage,PctPublicCoverage,PctPublicCoverageAlone,PctWhite,PctBlack,PctAsian,PctOtherRace,PctMarriedHouseholds,BirthRate,Geography_EAST,Geography_WEST
0,245.0,95,196.0,535.2,46870,33642,14.4,0.0,46611.3,42.0,40.4,44.3,2.31,51.1,17.2,43.0,4.7,44.1,9.6,54.6,5.4,70.2,46.0,38.5,19.5,96.374269,1.684211,0.526316,0.181287,47.89838,4.048468,0,0
1,241.0,112,182.4,408.4,36424,41459,18.8,0.0,35815.95,45.8,44.9,46.7,2.47,55.0,22.0,43.5,3.2,37.2,11.6,47.7,9.2,52.4,28.7,44.9,23.5,66.207829,0.304282,1.144679,1.113285,52.937625,4.550419,0,0


In [4]:
# Preview the first two rows of the test frame
test_df.head(2)

Unnamed: 0,avgAnnCount,avgDeathsPerYear,TARGET_deathRate,incidenceRate,medIncome,popEst2015,povertyPercent,studyPerCap,binnedInc,MedianAge,MedianAgeMale,MedianAgeFemale,AvgHouseholdSize,PercentMarried,PctNoHS18_24,PctHS18_24,PctBachDeg18_24,PctHS25_Over,PctBachDeg25_Over,PctEmployed16_Over,PctUnemployed16_Over,PctPrivateCoverage,PctEmpPrivCoverage,PctPublicCoverage,PctPublicCoverageAlone,PctWhite,PctBlack,PctAsian,PctOtherRace,PctMarriedHouseholds,BirthRate,Geography_EAST,Geography_WEST
0,104.0,46,170.4,401.9,38504,22633,21.5,0.0,38888.25,34.0,32.6,35.6,2.28,43.1,4.9,13.0,5.8,33.6,14.1,51.8,8.4,65.3,43.2,35.9,19.1,73.337436,23.915432,0.835128,0.492286,44.13086,3.697749,0,0
1,20.0,10,187.4,400.6,43883,3091,12.1,0.0,43962.7,49.5,47.7,50.5,2.21,61.0,11.9,14.8,14.2,42.3,10.0,62.2,3.1,75.6,44.3,35.3,14.8,97.90795,1.223045,0.193112,0.41841,53.957879,7.992565,0,0


## 3. PREPROCESSING

In [5]:
# Defining kfold validation: a single 5-fold splitter that is passed as `cv=` to both
# grid searches and both cross_val_score calls below. Only n_splits is set here, so
# every other KFold option is left at its library default.
cv = KFold(n_splits=5) 

In [6]:
# Split each frame into the feature matrix (every column except the target)
# and the target vector 'TARGET_deathRate'
X_train, Y_train = train_df.drop(columns='TARGET_deathRate'), train_df['TARGET_deathRate']

X_test, Y_test = test_df.drop(columns='TARGET_deathRate'), test_df['TARGET_deathRate']

In [7]:
# Sanity check: the training features should no longer contain the target column
X_train.head(2)

Unnamed: 0,avgAnnCount,avgDeathsPerYear,incidenceRate,medIncome,popEst2015,povertyPercent,studyPerCap,binnedInc,MedianAge,MedianAgeMale,MedianAgeFemale,AvgHouseholdSize,PercentMarried,PctNoHS18_24,PctHS18_24,PctBachDeg18_24,PctHS25_Over,PctBachDeg25_Over,PctEmployed16_Over,PctUnemployed16_Over,PctPrivateCoverage,PctEmpPrivCoverage,PctPublicCoverage,PctPublicCoverageAlone,PctWhite,PctBlack,PctAsian,PctOtherRace,PctMarriedHouseholds,BirthRate,Geography_EAST,Geography_WEST
0,245.0,95,535.2,46870,33642,14.4,0.0,46611.3,42.0,40.4,44.3,2.31,51.1,17.2,43.0,4.7,44.1,9.6,54.6,5.4,70.2,46.0,38.5,19.5,96.374269,1.684211,0.526316,0.181287,47.89838,4.048468,0,0
1,241.0,112,408.4,36424,41459,18.8,0.0,35815.95,45.8,44.9,46.7,2.47,55.0,22.0,43.5,3.2,37.2,11.6,47.7,9.2,52.4,28.7,44.9,23.5,66.207829,0.304282,1.144679,1.113285,52.937625,4.550419,0,0


In [8]:
# Sanity check: the test features should no longer contain the target column
X_test.head(2)

Unnamed: 0,avgAnnCount,avgDeathsPerYear,incidenceRate,medIncome,popEst2015,povertyPercent,studyPerCap,binnedInc,MedianAge,MedianAgeMale,MedianAgeFemale,AvgHouseholdSize,PercentMarried,PctNoHS18_24,PctHS18_24,PctBachDeg18_24,PctHS25_Over,PctBachDeg25_Over,PctEmployed16_Over,PctUnemployed16_Over,PctPrivateCoverage,PctEmpPrivCoverage,PctPublicCoverage,PctPublicCoverageAlone,PctWhite,PctBlack,PctAsian,PctOtherRace,PctMarriedHouseholds,BirthRate,Geography_EAST,Geography_WEST
0,104.0,46,401.9,38504,22633,21.5,0.0,38888.25,34.0,32.6,35.6,2.28,43.1,4.9,13.0,5.8,33.6,14.1,51.8,8.4,65.3,43.2,35.9,19.1,73.337436,23.915432,0.835128,0.492286,44.13086,3.697749,0,0
1,20.0,10,400.6,43883,3091,12.1,0.0,43962.7,49.5,47.7,50.5,2.21,61.0,11.9,14.8,14.2,42.3,10.0,62.2,3.1,75.6,44.3,35.3,14.8,97.90795,1.223045,0.193112,0.41841,53.957879,7.992565,0,0


## 4. ADABOOST

In [9]:
# Hyperparameter grid explored by GridSearchCV for AdaBoost:
# 3 base learners x 5 ensemble sizes x 5 learning rates x 3 loss functions = 225 candidates.
# NOTE(review): newer scikit-learn releases rename AdaBoost's `base_estimator`
# parameter to `estimator` -- confirm the key against the installed version.

grid = dict(
    base_estimator=[DecisionTreeRegressor(), LinearRegression(), KNeighborsRegressor()],
    n_estimators=list(range(5, 30, 5)),  # 5, 10, 15, 20, 25
    learning_rate=[0.001, 0.01, 0.1, 0.3, 0.5],
    loss=['linear', 'square', 'exponential'],
)

In [10]:
# Exhaustive search over `grid`, scored by negative MSE on the shared 5-fold splitter.
# n_jobs=-1 is forwarded to GridSearchCV for parallel execution; verbose=1 prints the fit count.

AdaBoost_search = GridSearchCV(
    estimator=AdaBoostRegressor(random_state=126),
    param_grid=grid,
    scoring='neg_mean_squared_error',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)

In [11]:
# Searching for the best parameters: runs the grid search on the training data only.
# The value returned by fit() is kept as CV_AdaBoost_results; later cells read
# `best_params_` from it.

CV_AdaBoost_results = AdaBoost_search.fit(X_train, Y_train)

Fitting 5 folds for each of 225 candidates, totalling 1125 fits


In [12]:
# Present the winning hyperparameters as a one-column table
# (one row per parameter name, a single descriptive column)

AdaBoost_Best_Parameters = pd.DataFrame(
    CV_AdaBoost_results.best_params_,
    index=['Best AdaBoost Regression Parameters'],
).T
AdaBoost_Best_Parameters

Unnamed: 0,Best AdaBoost Regression Parameters
base_estimator,DecisionTreeRegressor()
learning_rate,0.5
loss,linear
n_estimators,25


In [13]:
# Pull each winning hyperparameter out of the search results into its own variable

best_estimator, best_n_estimators, best_learning_rate, best_loss = (
    CV_AdaBoost_results.best_params_[param_name]
    for param_name in ('base_estimator', 'n_estimators', 'learning_rate', 'loss')
)

In [14]:
# Instantiating the best model
#
# The winning hyperparameters are unpacked directly from the AdaBoost search results
# instead of the loose `best_*` variables: `best_n_estimators` and `best_learning_rate`
# are reassigned later in the Gradient Boosting section, so re-running this cell after
# that section would otherwise silently build AdaBoost with Gradient Boosting's values.

AdaBoostModel = AdaBoostRegressor(
    random_state = 126,                  # same seed as the estimator used in the grid search
    **CV_AdaBoost_results.best_params_,  # base_estimator, n_estimators, learning_rate, loss
).fit(X_train, Y_train)


In [15]:
# Getting the R^2 (rounded to 3 decimals).
# Note: score() is called on X_train / Y_train, i.e. the same data the model was
# fitted on, so this is an in-sample R^2 and not a measure of generalisation.

AB_R2 = np.round(AdaBoostModel.score(X_train, Y_train),3)

In [16]:
# In-sample predictions and the resulting training MSE (2 decimals).
# AdaBoostPredictions is reused further down for the residual histogram.

AdaBoostPredictions = AdaBoostModel.predict(X_train)
AdaBoost_Training_MSE = np.round(
    mean_squared_error(y_true=Y_train, y_pred=AdaBoostPredictions),
    decimals=2,
)

In [17]:
# Cross-validated MSE on the training data (mean over the 5 folds, 1 decimal).
# The scorer returns negated MSE values, hence the sign flip before averaging;
# error_score='raise' surfaces any fold failure instead of recording a placeholder score.

adaboost_cv_MSE = round(
    np.mean(
        -cross_val_score(
            AdaBoostModel,
            X_train,
            Y_train,
            scoring='neg_mean_squared_error',
            cv=cv,
            error_score='raise',
        )
    ),
    1,
)


In [18]:
# Predictions on the test features and the resulting test MSE (2 decimals)

ADABOOST_predictions = AdaBoostModel.predict(X_test)
ADABOOST_Test_MSE = round(
    mean_squared_error(y_true=Y_test, y_pred=ADABOOST_predictions),
    2,
)

In [19]:
# Collect the AdaBoost metrics in display order:
# training R^2, training MSE, cross-validated MSE, test MSE

dict1 = {
    'AdaBoost': (
        np.round(AB_R2, 3),
        AdaBoost_Training_MSE,
        adaboost_cv_MSE,
        ADABOOST_Test_MSE,
    ),
}

In [20]:
# Creating a dataframe to easily present the results: a single 'AdaBoost' column
# (the key of dict1) whose four values are labelled positionally by the index below.
# The 'Training_MSE (KFold)' row holds the cross-validated MSE computed on the training data.

Evaluation_df = pd.DataFrame(dict1, index=['R^2','Training_MSE','Training_MSE (KFold)', 'Test_MSE'])

In [21]:
# Displaying the AdaBoost metrics table

Evaluation_df

Unnamed: 0,AdaBoost
R^2,1.0
Training_MSE,0.1
Training_MSE (KFold),363.1
Test_MSE,295.11


## 5. GRADIENT BOOSTING

In [22]:
# Hyperparameter grid explored by GridSearchCV for Gradient Boosting
# (this rebinds `grid`, which previously held the AdaBoost grid):
# 3 x 3 x 4 x 2 x 2 x 3 = 432 candidates.
# NOTE(review): only small learning rates (<= 0.01) are combined with at most 15
# estimators; this may leave the ensemble under-fitted -- consider widening both ranges.

grid = dict(
    min_samples_split=[3, 4, 5],
    min_samples_leaf=[4, 5, 6],
    max_depth=[3, 6, 9, None],
    max_features=['sqrt', None],
    learning_rate=[0.001, 0.01],
    n_estimators=[5, 10, 15],
)

In [23]:
# Exhaustive search over `grid`, scored by negative MSE on the shared 5-fold splitter.
# n_jobs=-1 is forwarded to GridSearchCV for parallel execution; verbose=1 prints the fit count.

GraBoost_search = GridSearchCV(
    estimator=GradientBoostingRegressor(random_state=126),
    param_grid=grid,
    scoring='neg_mean_squared_error',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Searching for the best parameters: runs the grid search on the training data only.
# The value returned by fit() is kept as CV_GraBoost_results; later cells read
# `best_params_` from it.

CV_GraBoost_results = GraBoost_search.fit(X_train, Y_train)

Fitting 5 folds for each of 432 candidates, totalling 2160 fits


In [None]:
# Present the winning hyperparameters as a one-column table
# (one row per parameter name, a single descriptive column)

GraBoost_Best_Parameters = pd.DataFrame(
    CV_GraBoost_results.best_params_,
    index=['Best Gradient Boosting Regression Parameters'],
).T
GraBoost_Best_Parameters

In [None]:
# Pull each winning hyperparameter out of the search results into its own variable.
# Caution: `best_learning_rate` and `best_n_estimators` are also assigned in the
# AdaBoost section, so this cell overwrites those earlier values.

(
    best_min_samples_split,
    best_min_samples_leaf,
    best_max_depth,
    best_max_features,
    best_learning_rate,
    best_n_estimators,
) = (
    CV_GraBoost_results.best_params_[param_name]
    for param_name in (
        'min_samples_split',
        'min_samples_leaf',
        'max_depth',
        'max_features',
        'learning_rate',
        'n_estimators',
    )
)

In [None]:
# Instantiating the best model
#
# The winning hyperparameters are unpacked directly from the Gradient Boosting search
# results instead of the loose `best_*` variables: `best_learning_rate` and
# `best_n_estimators` are names shared with the AdaBoost section, so out-of-order
# execution could otherwise feed this model AdaBoost's values.

GraBoostModel = GradientBoostingRegressor(
    random_state = 126,                  # same seed as the estimator used in the grid search
    # min_samples_split, min_samples_leaf, max_depth, max_features, learning_rate, n_estimators
    **CV_GraBoost_results.best_params_,
).fit(X_train,Y_train)

In [None]:
# Getting the R^2 (rounded to 3 decimals).
# Note: score() is called on X_train / Y_train, i.e. the same data the model was
# fitted on, so this is an in-sample R^2 and not a measure of generalisation.

GB_R2 = np.round(GraBoostModel.score(X_train,Y_train),3)

In [None]:
# In-sample predictions and the resulting training MSE (2 decimals).
# GraBoostPredictions is reused further down for the residual histogram.

GraBoostPredictions = GraBoostModel.predict(X_train)
GraBoost_Training_MSE = np.round(
    mean_squared_error(y_true=Y_train, y_pred=GraBoostPredictions),
    decimals=2,
)

In [None]:
# Cross-validated MSE on the training data (mean over the 5 folds, 1 decimal).
# The scorer returns negated MSE values, hence the sign flip before averaging;
# error_score='raise' surfaces any fold failure instead of recording a placeholder score.

graboost_cv_MSE = round(
    np.mean(
        -cross_val_score(
            GraBoostModel,
            X_train,
            Y_train,
            scoring='neg_mean_squared_error',
            cv=cv,
            error_score='raise',
        )
    ),
    1,
)

In [None]:
# Predictions on the test features and the resulting test MSE (2 decimals)

GRABOOST_predictions = GraBoostModel.predict(X_test)
GRABOOST_Test_MSE = round(
    mean_squared_error(y_true=Y_test, y_pred=GRABOOST_predictions),
    2,
)

In [None]:
# Collect the Gradient Boosting metrics in display order:
# training R^2, training MSE, cross-validated MSE, test MSE

dict2 = {
    'Gradient Boosting': (
        np.round(GB_R2, 3),
        GraBoost_Training_MSE,
        graboost_cv_MSE,
        GRABOOST_Test_MSE,
    ),
}

In [None]:
# Creating a dataframe to easily present the results: a single 'Gradient Boosting' column
# (the key of dict2) whose four values are labelled positionally by the index below.
# The 'Training_MSE (KFold)' row holds the cross-validated MSE computed on the training data.

Evaluation_df1 = pd.DataFrame(dict2, index=['R^2','Training_MSE','Training_MSE (KFold)', 'Test_MSE'])

In [None]:
# Displaying the Gradient Boosting metrics table

Evaluation_df1

## 6. RESULTS

__Aggregated Results__

In [None]:
# Side-by-side comparison: one column per model, rows aligned on the metric labels
Ensemble_Models_Part2_Results = pd.concat(
    objs=[Evaluation_df, Evaluation_df1],
    axis='columns',
)
Ensemble_Models_Part2_Results

__Residual Plots__

In [None]:
# Training-set residuals for AdaBoost, using the conventional definition
# residual = observed - predicted (a positive value means the model under-predicted).
residuals_adaboost = Y_train - AdaBoostPredictions

sns.set_style("white")

# Explicit figure/axes so the title and axis labels are attached to this plot only
fig, ax = plt.subplots(figsize=(5, 4))

ax.hist(residuals_adaboost, bins=20, color="navy")

ax.set_title("Distribution of Residuals (AdaBoost)", fontsize=14, weight='bold')
ax.set_xlabel("Residual (actual - predicted)")
ax.set_ylabel("Count")

sns.despine()

plt.show()

In [None]:
# Training-set residuals for Gradient Boosting, using the conventional definition
# residual = observed - predicted (a positive value means the model under-predicted).
residuals_graboost = Y_train - GraBoostPredictions

sns.set_style("white")

# Explicit figure/axes so the title and axis labels are attached to this plot only
fig, ax = plt.subplots(figsize=(5, 4))

ax.hist(residuals_graboost, bins=20, color="darkorchid")

ax.set_title("Distribution of Residuals (Gradient Boosting)", fontsize=14, weight='bold')
ax.set_xlabel("Residual (actual - predicted)")
ax.set_ylabel("Count")

sns.despine()

plt.show()

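AdaBoost tends to overfit the data, often resulting in a training error that approaches zero. That is what happens here: the selected model reaches a training MSE of 0.1 (R² of 1.0 on the training set), while its cross-validated MSE (363.1) and test MSE (295.11) are far higher. Despite this tendency, AdaBoost demonstrates strong performance when judged on those cross-validated and test errors, which are acceptable. Gradient Boosting, on the other hand, performs less favorably than AdaBoost and struggles to achieve satisfactory results; note that its search grid only covers small learning rates (0.001 and 0.01) with at most 15 estimators, which may limit how well it can fit the data.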

__Saving the Results__

In [None]:
# Persist the comparison table; the metric names (the frame's index) are written
# as the first CSV column under the header 'Metrics'
Ensemble_Models_Part2_Results.to_csv(
    'Ensemble_Models_Part2_Results.csv',
    index=True,
    index_label='Metrics',
)