# BOOSTING

Boosting is a method for combining a series of simple individual models to create a more powerful model. 
<br>Start by fitting an initial model (mostly a tree) to the data. 
<br>Then build a second model that focuses on accurately predicting the cases where the first model performs poorly. 
<br>The combination of these two models is expected to be better than either model alone. 
<br>Repeat the process many times. 
<br>Each successive model attempts to correct for the shortcomings of the combined ensemble of all previous models.

<br>The best possible next model, when combined with previous models, minimizes the overall prediction error.

### Objective: Build a classifier using the boosting algorithms.

Build a classifier of boosting type on **UniversalBank** data.
-  AdaBoost (Adaptive Boosting).
-  Gradient Boosting.
-  eXtreme Gradient Boosting 

## Import Libraries

In [2]:
## Importing Libraries
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import seaborn as sns

In [3]:
#!pip install seaborn

## Loading data

In [4]:
## Read "UniversalBank.csv" using pandas
unibank = pd.read_csv("/home/gowtham_satya/Hema/7305(machine learning))/lab notes/20200201_Batch79_CSE7305c_RF_Activity/UniversalBank.csv")

In [5]:
## Print the first 7 rows
unibank.head(7)

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1
5,6,37,13,29,92121,4,0.4,2,155,0,0,0,1,0
6,7,53,27,72,91711,2,1.5,2,0,0,0,0,1,0


In [6]:
unibank.describe(include ="all")

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,2500.5,45.3384,20.1046,73.7742,93152.503,2.3964,1.937938,1.881,56.4988,0.096,0.1044,0.0604,0.5968,0.294
std,1443.520003,11.463166,11.467954,46.033729,2121.852197,1.147663,1.747659,0.839869,101.713802,0.294621,0.305809,0.23825,0.490589,0.455637
min,1.0,23.0,-3.0,8.0,9307.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1250.75,35.0,10.0,39.0,91911.0,1.0,0.7,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2500.5,45.0,20.0,64.0,93437.0,2.0,1.5,2.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,3750.25,55.0,30.0,98.0,94608.0,3.0,2.5,3.0,101.0,0.0,0.0,0.0,1.0,1.0
max,5000.0,67.0,43.0,224.0,96651.0,4.0,10.0,3.0,635.0,1.0,1.0,1.0,1.0,1.0


In [7]:
## Check the datatype of each variable
unibank.dtypes

ID                      int64
Age                     int64
Experience              int64
Income                  int64
ZIP Code                int64
Family                  int64
CCAvg                 float64
Education               int64
Mortgage                int64
Personal Loan           int64
Securities Account      int64
CD Account              int64
Online                  int64
CreditCard              int64
dtype: object

## Pre-Processing

In [8]:
## Drop columns which are not significant
unibank.drop(["ID","ZIP Code"],axis=1,inplace=True)

In [9]:
## Convert Categorical Columns to Dummies
cat_cols = ["Family","Education","Personal Loan","Securities Account","CD Account","Online","CreditCard"]
unibank = pd.get_dummies(unibank,columns=cat_cols,drop_first=True,)

In [12]:
## Split the data into X and y
X = unibank.copy().drop("Personal Loan_1",axis=1)
y = unibank["Personal Loan_1"]

In [10]:
unibank.describe(include ="all")

Unnamed: 0,Age,Experience,Income,CCAvg,Mortgage,Family_2,Family_3,Family_4,Education_2,Education_3,Personal Loan_1,Securities Account_1,CD Account_1,Online_1,CreditCard_1
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,45.3384,20.1046,73.7742,1.937938,56.4988,0.2592,0.202,0.2444,0.2806,0.3002,0.096,0.1044,0.0604,0.5968,0.294
std,11.463166,11.467954,46.033729,1.747659,101.713802,0.438239,0.401532,0.429774,0.449337,0.458391,0.294621,0.305809,0.23825,0.490589,0.455637
min,23.0,-3.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,35.0,10.0,39.0,0.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,45.0,20.0,64.0,1.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,55.0,30.0,98.0,2.5,101.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0
max,67.0,43.0,224.0,10.0,635.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [13]:
## Split the data into X_train, X_test, y_train, y_test with test_size = 0.20 using sklearn
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [14]:
## Print the shape of X_train, X_test, y_train, y_test
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(4000, 14)
(1000, 14)
(4000,)
(1000,)


In [15]:
## Scale the numeric attributes
scaler = StandardScaler()
scaler.fit(X_train.iloc[:,:5])

X_train.iloc[:,:5] = scaler.transform(X_train.iloc[:,:5])
X_test.iloc[:,:5] = scaler.transform(X_test.iloc[:,:5])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


## Model Building

In [16]:
# import modules as necessary
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

#### Creating Adaboost Classifier

In [17]:
# Create adaboost-decision tree classifer object
Adaboost_model = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=2),
    n_estimators = 600,
    learning_rate = 1)

#### Train Adaboost Classifer

In [18]:
# Train model
Adaboost_model.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                         class_weight=None,
                                                         criterion='gini',
                                                         max_depth=2,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         presort='deprecated',
                          

#### Predictions on Test set

In [19]:
# Predict on Test 
y_preds = Adaboost_model.predict(X_test)

#### Accuracy

In [20]:
# Verify accuracy
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_preds))

0.984


### GridSearch Cross validation
#### Creating Adaboost Classifier

In [21]:
from sklearn.model_selection import GridSearchCV

param_grid = {'n_estimators' : [100, 150, 200],
              'learning_rate' : [0.1, 0.5, 0.9]}

Adaboost_model_clf = GridSearchCV(AdaBoostClassifier(
            DecisionTreeClassifier(max_depth=2)), param_grid, n_jobs=-1)

#### Train Adaboost Classifer

In [22]:
# Train model
Adaboost_model_clf.fit(X_train, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=AdaBoostClassifier(algorithm='SAMME.R',
                                          base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                                                class_weight=None,
                                                                                criterion='gini',
                                                                                max_depth=2,
                                                                                max_features=None,
                                                                                max_leaf_nodes=None,
                                                                                min_impurity_decrease=0.0,
                                                                                min_impurity_split=None,
                                                                                min_samples_leaf=1,
 

#### Best parameters set and model

In [23]:
# Find best model
best_ada_model = Adaboost_model_clf.best_estimator_
print (Adaboost_model_clf.best_score_, Adaboost_model_clf.best_params_) 

0.9865 {'learning_rate': 0.5, 'n_estimators': 100}


#### Predictions on Test set

In [24]:
y_pred_test = best_ada_model.predict(X_test)

#### Accuracy

In [25]:
# Verify accuracy
print(accuracy_score(y_test,y_pred_test))

0.986


### Building Gradient Boosting Classifier

In [26]:
from sklearn.ensemble import GradientBoostingClassifier

In [27]:
GBM_model = GradientBoostingClassifier(n_estimators=50,
                                       learning_rate=0.3,
                                       subsample=0.8)

#### Train Gradient Boosting Classifer

In [28]:
GBM_model.fit(X=X_train, y=y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.3, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=50,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=0.8, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

#### Predictions on Test set

In [29]:
y_pred = GBM_model.predict(X_test)

#### Accuracy

In [30]:
print(accuracy_score(y_test,y_pred))

0.985


### GridSearch Cross validation

#### Creating Gradient Boosting Classifier

In [31]:
from sklearn.model_selection import GridSearchCV

# Model in use
GBM = GradientBoostingClassifier() 
 
# Use a grid over parameters of interest
param_grid = { 
           "n_estimators" : [100,150,200,250],
           "max_depth" : [5, 10],
           "learning_rate" : [0.1,0.5,0.9]}
 
CV_GBM = GridSearchCV(estimator=GBM, param_grid=param_grid, cv= 10)

#### Train Gradient Boosting Classifier

In [32]:
CV_GBM.fit(X=X_train, y=y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=GradientBoostingClassifier(ccp_alpha=0.0,
                                                  criterion='friedman_mse',
                                                  init=None, learning_rate=0.1,
                                                  loss='deviance', max_depth=3,
                                                  max_features=None,
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=2,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=100,
                                                  n_iter_no_change=None,
        

#### Best parameters set and model

In [33]:
# Finding best model
best_gbm_model = CV_GBM.best_estimator_
print (CV_GBM.best_score_, CV_GBM.best_params_)

0.9877499999999999 {'learning_rate': 0.5, 'max_depth': 5, 'n_estimators': 100}


#### Predictions on Test set

In [34]:
y_pred_test=best_gbm_model.predict(X_test)

#### Accuracy

In [35]:
print(accuracy_score(y_test,y_pred_test))

0.986


In [36]:
!pip install xgboost
from xgboost import XGBClassifier
XGB_model = XGBClassifier()



#### Train XGBoost Classifer

In [37]:
XGB_model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              validate_parameters=False, verbosity=None)

#### Predictions on Test set

In [38]:
y_pred = XGB_model.predict(X_test)

#### Accuracy

In [39]:
print(accuracy_score(y_test,y_pred))

0.988


### GridSearch Cross validation

#### Creating XGBoost Classifier

In [40]:
XGB = XGBClassifier(n_jobs=-1)
 
# Use a grid over parameters of interest
param_grid = {
     'colsample_bytree': np.linspace(0.5, 0.9, 5),
     'n_estimators':[100, 200],
     'max_depth': [10, 15, 20, 25]
}

 
CV_XGB = GridSearchCV(estimator=XGB, param_grid=param_grid, cv= 10)

#### Train XGBoost Classifer

In [41]:
CV_XGB.fit(X = X_train, y=y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_esti...
                                     random_state=None, reg_alpha=None,
                                     reg_lambda=None, scale_pos_weight=None,
                                     subsample=None, tree_method=None,
                                     validate_parameters=False,

#### Best parameters set and model

In [42]:
# Find best model
best_xgb_model = CV_XGB.best_estimator_
print (CV_XGB.best_score_, CV_XGB.best_params_)

0.9890000000000001 {'colsample_bytree': 0.6, 'max_depth': 10, 'n_estimators': 100}


#### Predictions on Test set

In [43]:
y_pred_test=best_xgb_model.predict(X_test)

#### Accuracy

In [44]:
print(accuracy_score(y_test,y_pred_test))

0.989
