# Ensemble learning

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import load_iris
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression 
from sklearn.pipeline import Pipeline

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Load dataset

In [92]:
dataobj = load_iris()
X = pd.DataFrame(dataobj.data[50:, [1,2]], columns=dataobj.feature_names[:2])
y = dataobj.target[50:]
y = np.where(y == 2, 1, 0)
#np.unique(y)

df = X
df['target'] = y

df

Unnamed: 0,sepal length (cm),sepal width (cm),target
0,3.2,4.7,0
1,3.2,4.5,0
2,3.1,4.9,0
3,2.3,4.0,0
4,2.8,4.6,0
...,...,...,...
95,3.0,5.2,1
96,2.5,5.0,1
97,3.0,5.2,1
98,3.4,5.4,1


# Split data

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.7, random_state=1, stratify=y)
print(f'X_train.shape = {X_train.shape}')
print(f'X_test.shape = {X_test.shape}')

X_train.shape = (70, 3)
X_test.shape = (30, 3)


# Making beselinemodel

In [46]:
from sklearn import metrics
from sklearn.metrics import make_scorer

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression 
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

from sklearn.model_selection import cross_validate

In [47]:
## Logistic regression
clf_lr = LogisticRegression(C=.001, penalty='l2', random_state=0)
pipe_lr = Pipeline([('scl', StandardScaler()), ('clf_lr', clf_lr)])

## Decision Tree
clf_dt = DecisionTreeClassifier(max_depth=1, criterion='entropy', random_state=0)
pipe_dt = Pipeline([('scl', StandardScaler()), ('clf_dt', clf_dt)])

## KNN
clf_knn = KNeighborsClassifier(n_neighbors=1, p=2, metric='minkowski')
pipe_knn = Pipeline([('scl', StandardScaler()), ('clf_knn', clf_knn)])

pipes = [pipe_lr, pipe_dt, pipe_knn]
names = ['LogisticRegression', 'DecisionTreeClassifier', 'KNeighborsClassifier']

In [48]:
## Make scorer
## accuracy
score_acc = make_scorer(metrics.accuracy_score)
## precision
score_pre = make_scorer(metrics.precision_score, zero_division=0, pos_label=1, average='weighted')
## recall
score_re = make_scorer(metrics.recall_score, zero_division=0, pos_label=1, average='weighted')
## f1 score
score_f1 = make_scorer(metrics.f1_score, pos_label=1, average='weighted')
## AOC ROC
score_rocaoc = make_scorer(metrics.roc_auc_score)

scorings ={
    'accuracy' : score_acc,
    'precision' : score_pre,
    'recall' : score_re,
    'f1' : score_f1,
    'rocauc' : score_rocaoc,
}

In [77]:
result = []
for pipe, name in zip(pipes, names):
    cv = cross_validate(
        estimator=pipe,
        X = X_train,
        y = y_train,
        cv=10,
        n_jobs=-1,
        scoring=scorings
    )
    
    acc = cv['test_accuracy']
    pre = cv['test_precision']
    re = cv['test_recall']
    f1 = cv['test_f1']
    aucroc = cv['test_rocauc']
    
    re = {
        'clf_name' : name,
        'ACC' : f'{acc.mean():6.3f} (+/- {acc.std():6.3f})',
        'PRE' : f'{pre.mean():6.3f} (+/- {pre.std():6.3f})',
        'REC' : f'{re.mean():6.3f} (+/- {re.std():6.3f})',
        'F1' : f'{f1.mean():6.3f} (+/- {f1.std():6.3f})',
        'AOCROC' : f'{aucroc.mean():6.3f} (+/- {aucroc.std():6.3f})',
    }
    
    result.append(re)
pd.DataFrame(result).set_index('clf_name')

Unnamed: 0_level_0,ACC,PRE,REC,F1,AOCROC
clf_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LogisticRegression,0.900 (+/- 0.112),0.934 (+/- 0.070),0.900 (+/- 0.112),0.898 (+/- 0.116),0.912 (+/- 0.098)
DecisionTreeClassifier,1.000 (+/- 0.000),1.000 (+/- 0.000),1.000 (+/- 0.000),1.000 (+/- 0.000),1.000 (+/- 0.000)
KNeighborsClassifier,1.000 (+/- 0.000),1.000 (+/- 0.000),1.000 (+/- 0.000),1.000 (+/- 0.000),1.000 (+/- 0.000)


# Ensembel (meta) model from baseline model

In [61]:
from sklearn.ensemble import VotingClassifier

estimators = [('pipe_lr', pipe_lr), ('pipe_dt', pipe_dt), ('pipe_knn', pipe_knn)]
eclf = VotingClassifier(estimators=estimators, voting='soft')

In [78]:
cv = cross_validate(
    estimator=eclf,
    X = X_train,
    y = y_train,
    cv=10,
    n_jobs=-1,
    scoring=scorings
)

acc = cv['test_accuracy']
pre = cv['test_precision']
re = cv['test_recall']
f1 = cv['test_f1']
aucroc = cv['test_rocauc']

re = {
    'clf_name' : 'Votting Classifed',
    'ACC' : f'{acc.mean():6.3f} (+/- {acc.std():6.3f})',
    'PRE' : f'{pre.mean():6.3f} (+/- {pre.std():6.3f})',
    'REC' : f'{re.mean():6.3f} (+/- {re.std():6.3f})',
    'F1' : f'{f1.mean():6.3f} (+/- {f1.std():6.3f})',
    'AOCROC' : f'{aucroc.mean():6.3f} (+/- {aucroc.std():6.3f})',
}

result.append(re)
pd.DataFrame(result).set_index('clf_name')

Unnamed: 0_level_0,ACC,PRE,REC,F1,AOCROC
clf_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LogisticRegression,0.900 (+/- 0.112),0.934 (+/- 0.070),0.900 (+/- 0.112),0.898 (+/- 0.116),0.912 (+/- 0.098)
DecisionTreeClassifier,1.000 (+/- 0.000),1.000 (+/- 0.000),1.000 (+/- 0.000),1.000 (+/- 0.000),1.000 (+/- 0.000)
KNeighborsClassifier,1.000 (+/- 0.000),1.000 (+/- 0.000),1.000 (+/- 0.000),1.000 (+/- 0.000),1.000 (+/- 0.000)
Votting Classifed,1.000 (+/- 0.000),1.000 (+/- 0.000),1.000 (+/- 0.000),1.000 (+/- 0.000),1.000 (+/- 0.000)


In [81]:
## re fitting model
eclf.fit(X_train, y_train)

# Evaluate model

In [87]:
## evaluate model
models = [pipe_lr, pipe_dt, pipe_knn, eclf]
names = ['LogisticRegression', 'DecisionTreeClassifier', 'KNeighborsClassifier', 'Ensambel']

eval_re = []

for model, name in zip(models, names):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    ## classification report
    print(f'Model : {name} \n')
    print(metrics.classification_report(y_test, y_pred))
    print('--'*50 + '\n')
    
    acc = scorings['accuracy'](model, X_test, y_test)
    pre = scorings['precision'](model, X_test, y_test)
    re = scorings['recall'](model, X_test, y_test)
    f1 = scorings['f1'](model, X_test, y_test)
    aucroc = scorings['rocauc'](model, X_test, y_test)
    
    re = {
        'clf_name' : name,
        'ACC' : acc,
        'PRE' : pre,
        'REC' : re,
        'F1' : f1,
        'AOCROC' : aucroc
    }
    
    eval_re.append(re)
    
pd.DataFrame(eval_re).set_index('clf_name')

Model : LogisticRegression 

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       1.00      1.00      1.00        15

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30

----------------------------------------------------------------------------------------------------

Model : DecisionTreeClassifier 

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       1.00      1.00      1.00        15

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30

----------------------------------------------------------------------------------------------------

Model : KNeighborsClassifier 

              precision    recall  f1-score   sup

Unnamed: 0_level_0,ACC,PRE,REC,F1,AOCROC
clf_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LogisticRegression,1.0,1.0,1.0,1.0,1.0
DecisionTreeClassifier,1.0,1.0,1.0,1.0,1.0
KNeighborsClassifier,1.0,1.0,1.0,1.0,1.0
Ensambel,1.0,1.0,1.0,1.0,1.0


In [101]:
eclf.get_params()

{'estimators': [('pipe_lr',
   Pipeline(steps=[('scl', StandardScaler()),
                   ('clf_lr', LogisticRegression(C=0.001, random_state=0))])),
  ('pipe_dt',
   Pipeline(steps=[('scl', StandardScaler()),
                   ('clf_dt',
                    DecisionTreeClassifier(criterion='entropy', max_depth=1,
                                           random_state=0))])),
  ('pipe_knn',
   Pipeline(steps=[('scl', StandardScaler()),
                   ('clf_knn', KNeighborsClassifier(n_neighbors=1))]))],
 'flatten_transform': True,
 'n_jobs': None,
 'verbose': False,
 'voting': 'soft',
 'weights': None,
 'pipe_lr': Pipeline(steps=[('scl', StandardScaler()),
                 ('clf_lr', LogisticRegression(C=0.001, random_state=0))]),
 'pipe_dt': Pipeline(steps=[('scl', StandardScaler()),
                 ('clf_dt',
                  DecisionTreeClassifier(criterion='entropy', max_depth=1,
                                         random_state=0))]),
 'pipe_knn': Pipeline(steps=[('

# Grid Search CV

In [105]:
## hyper parameter
params = {
    'pipe_dt__clf_dt__max_depth' : [1, 2 , 5, 10 , None],
    'pipe_knn__clf_knn__n_neighbors' : [1, 2, 3, 4, 5, 6 ,7],
    'pipe_lr__clf_lr__C' : [.0001, .001, .01, .1, 1, 10, 100, 1000]
}

## grid search
from sklearn.model_selection import GridSearchCV

gs = GridSearchCV(
    estimator=eclf,
    param_grid=params,
    scoring='accuracy',
    cv=10,
    n_jobs=-1
)

gs

In [106]:
## fitting model
gs.fit(X_train, y_train)

In [108]:
pd.DataFrame(gs.cv_results_).sort_values('rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_pipe_dt__clf_dt__max_depth,param_pipe_knn__clf_knn__n_neighbors,param_pipe_lr__clf_lr__C,params,split0_test_score,split1_test_score,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.013434,0.004236,0.003261,0.001663,1,1,0.0001,"{'pipe_dt__clf_dt__max_depth': 1, 'pipe_knn__c...",1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1
177,0.006665,0.002675,0.001947,0.000767,10,2,0.001,"{'pipe_dt__clf_dt__max_depth': 10, 'pipe_knn__...",1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1
178,0.008682,0.003147,0.002979,0.001344,10,2,0.01,"{'pipe_dt__clf_dt__max_depth': 10, 'pipe_knn__...",1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1
179,0.010685,0.005866,0.002221,0.001561,10,2,0.1,"{'pipe_dt__clf_dt__max_depth': 10, 'pipe_knn__...",1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1
180,0.006459,0.002691,0.003278,0.002523,10,2,1.0,"{'pipe_dt__clf_dt__max_depth': 10, 'pipe_knn__...",1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1


In [109]:
gs.best_score_

1.0