# Bagging

# Load library

In [115]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import load_wine
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA 
from sklearn.pipeline import Pipeline

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Load dataset

In [116]:
dataobj = load_wine()
X = pd.DataFrame(dataobj.data, columns=dataobj.feature_names)
y = dataobj.target

df = X
df['target'] = y
df

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0,2
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0,2
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0,2
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0,2


In [117]:
df.target.value_counts()

target
1    71
0    59
2    48
Name: count, dtype: int64

In [118]:
## selecting only 2 class
df = df[(df.target == 0) | (df.target == 1)]
X = df.iloc[:, :len(df.columns) - 1]
y = df.target

X

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,12.07,2.16,2.17,21.0,85.0,2.60,2.65,0.37,1.35,2.76,0.86,3.28,378.0
126,12.43,1.53,2.29,21.5,86.0,2.74,3.15,0.39,1.77,3.94,0.69,2.84,352.0
127,11.79,2.13,2.78,28.5,92.0,2.13,2.24,0.58,1.76,3.00,0.97,2.44,466.0
128,12.37,1.63,2.30,24.5,88.0,2.22,2.45,0.40,1.90,2.12,0.89,2.78,342.0


# Split data

In [119]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.7, random_state=1, stratify=y)
print(f'X_train.shape = {X_train.shape}')
print(f'X_test.shape = {X_test.shape}')

X_train.shape = (91, 13)
X_test.shape = (39, 13)


# Scoring

In [120]:
from sklearn.metrics import make_scorer
## accuracy
acc = make_scorer(metrics.accuracy_score)
pre = make_scorer(metrics.precision_score, zero_division=0, pos_label=1, average='weighted')
rec = make_scorer(metrics.recall_score, zero_division=0, pos_label=1, average='weighted')
f1 = make_scorer(metrics.f1_score, pos_label=1, average='weighted')
rocauc = make_scorer(metrics.roc_auc_score)

scorings = {
    'accuracy' : acc,
    'precision' : pre,
    'recall' : rec,
    'f1' : f1,
    'rocauc' : rocauc
}

# Dicision Tree

In [121]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=None,
    random_state=1
)

In [122]:
## Training score
from sklearn.model_selection import cross_validate
result = []

cv = cross_validate(
    estimator=tree,
    X=X_test,
    y=y_test,
    cv=10,
    n_jobs=-1,
    scoring=scorings
)

ACC = cv['test_accuracy']
PRE = cv['test_precision']
REC = cv['test_recall']
F1 = cv['test_f1']
ROCAUC = cv['test_rocauc']

re = {
    'model name' : 'DecisionTree (Train)',
    'accuracy' : f'{np.mean(ACC):6.3f}',
    'precision' : f'{np.mean(PRE):6.3f}',
    'recall' : f'{np.mean(REC):6.3f}',
    'f1' : f'{np.mean(F1):6.3f}',
    'roc_auc' : f'{np.mean(ROCAUC):6.3f}'
}

result.append(re)
pd.DataFrame(result).set_index('model name')

Unnamed: 0_level_0,accuracy,precision,recall,f1,roc_auc
model name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
DecisionTree (Train),0.817,0.842,0.817,0.793,0.825


In [123]:
## Testing score
tree.fit(X_train, y_train)

ACC = scorings['accuracy'](tree, X_test, y_test)
PRE = scorings['precision'](tree, X_test, y_test)
REC = scorings['recall'](tree, X_test, y_test)
F1 = scorings['f1'](tree, X_test, y_test)
ROCAUC = scorings['rocauc'](tree, X_test, y_test)

re = {
    'model name' : 'DecisionTree (Test)',
    'accuracy' : f'{np.mean(ACC):6.3f}',
    'precision' : f'{np.mean(PRE):6.3f}',
    'recall' : f'{np.mean(REC):6.3f}',
    'f1' : f'{np.mean(F1):6.3f}',
    'roc_auc' : f'{np.mean(ROCAUC):6.3f}'
}

result.append(re)
pd.DataFrame(result).set_index('model name')

Unnamed: 0_level_0,accuracy,precision,recall,f1,roc_auc
model name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
DecisionTree (Train),0.817,0.842,0.817,0.793,0.825
DecisionTree (Test),0.974,0.976,0.974,0.974,0.972


# Decision tree with AdaBoost

In [124]:
from sklearn.ensemble import AdaBoostClassifier

In [125]:
base = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=1,
    random_state=1
)

bag = AdaBoostClassifier(
    estimator=base,
    n_estimators=500,
    learning_rate=1,
    random_state=1
)

In [126]:
cv = cross_validate(
    estimator=bag,
    X=X_test,
    y=y_test,
    cv=10,
    n_jobs=-1,
    scoring=scorings
)

ACC = cv['test_accuracy']
PRE = cv['test_precision']
REC = cv['test_recall']
F1 = cv['test_f1']
ROCAUC = cv['test_rocauc']

re = {
    'model name' : 'AdaBoost (Train)',
    'accuracy' : f'{np.mean(ACC):6.3f}',
    'precision' : f'{np.mean(PRE):6.3f}',
    'recall' : f'{np.mean(REC):6.3f}',
    'f1' : f'{np.mean(F1):6.3f}',
    'roc_auc' : f'{np.mean(ROCAUC):6.3f}'
}

result.append(re)
pd.DataFrame(result).set_index('model name')

Unnamed: 0_level_0,accuracy,precision,recall,f1,roc_auc
model name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
DecisionTree (Train),0.817,0.842,0.817,0.793,0.825
DecisionTree (Test),0.974,0.976,0.974,0.974,0.972
AdaBoost (Train),0.842,0.819,0.842,0.807,0.825


In [127]:
bag.fit(X_train, y_train)

ACC = scorings['accuracy'](tree, X_test, y_test)
PRE = scorings['precision'](tree, X_test, y_test)
REC = scorings['recall'](tree, X_test, y_test)
F1 = scorings['f1'](tree, X_test, y_test)
ROCAUC = scorings['rocauc'](tree, X_test, y_test)

re = {
    'model name' : 'AdaBoost (Test)',
    'accuracy' : f'{np.mean(ACC):6.3f}',
    'precision' : f'{np.mean(PRE):6.3f}',
    'recall' : f'{np.mean(REC):6.3f}',
    'f1' : f'{np.mean(F1):6.3f}',
    'roc_auc' : f'{np.mean(ROCAUC):6.3f}'
}

result.append(re)
pd.DataFrame(result).set_index('model name')

Unnamed: 0_level_0,accuracy,precision,recall,f1,roc_auc
model name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
DecisionTree (Train),0.817,0.842,0.817,0.793,0.825
DecisionTree (Test),0.974,0.976,0.974,0.974,0.972
AdaBoost (Train),0.842,0.819,0.842,0.807,0.825
AdaBoost (Test),0.974,0.976,0.974,0.974,0.972
