<a href="https://colab.research.google.com/github/JaehyunAhn/AI_for_Education/blob/master/VotingClassifier_ensemble_with_earlystopping_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install xgboost==1.6.1
!pip install --upgrade scikit-learn

Collecting xgboost==1.6.1
  Downloading xgboost-1.6.1-py3-none-manylinux2014_x86_64.whl (192.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m192.9/192.9 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xgboost
  Attempting uninstall: xgboost
    Found existing installation: xgboost 2.0.1
    Uninstalling xgboost-2.0.1:
      Successfully uninstalled xgboost-2.0.1
Successfully installed xgboost-1.6.1
Collecting scikit-learn
  Downloading scikit_learn-1.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m79.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.2
    Uninstalling scikit-learn-1.2.2:
      Successfully uninstalled scikit-learn-1.2.2
Successfully installed scikit-learn-1.3.2


In [180]:
import xgboost
from sklearn.metrics import f1_score

xgboost.__version__

'2.0.2'

In [181]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier


X, y = load_breast_cancer(return_X_y=True, as_frame=True)

# Split your data into training and validation sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=42)

In [182]:
gbc = GradientBoostingClassifier(n_estimators=100, validation_fraction=.2)
lgc = LogisticRegression()
scaler = StandardScaler().set_output(transform='pandas')

In [183]:
p1 = Pipeline([
    ('scaler', scaler),
    ('gbc', gbc)
])
p2 = Pipeline([
    ('scaler', scaler),
    ('lgc', lgc)
])

voting_classifier = VotingClassifier(
    estimators=[
        ('pipe1', p1),
        ('pipe2', p2)
        ],
    voting='soft'
)

voting_classifier

In [184]:
voting_classifier.fit(X_train, y_train)

In [185]:
y_pred = voting_classifier.predict(X_test)

f1_score(y_true=y_test, y_pred=y_pred)

0.9249999999999999

In [186]:
# XGBoost model을 따로 학습시키고 VotingClassifier에 추가

In [187]:
xgb = XGBClassifier(n_estimators=10000, early_stopping_rounds=10)
xgb.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)])

[0]	validation_0-logloss:0.47369	validation_1-logloss:0.48629
[1]	validation_0-logloss:0.35799	validation_1-logloss:0.38499
[2]	validation_0-logloss:0.27593	validation_1-logloss:0.32306
[3]	validation_0-logloss:0.21821	validation_1-logloss:0.26930
[4]	validation_0-logloss:0.17378	validation_1-logloss:0.22892
[5]	validation_0-logloss:0.14589	validation_1-logloss:0.20783
[6]	validation_0-logloss:0.12209	validation_1-logloss:0.19011
[7]	validation_0-logloss:0.10632	validation_1-logloss:0.17637
[8]	validation_0-logloss:0.09271	validation_1-logloss:0.16575
[9]	validation_0-logloss:0.08098	validation_1-logloss:0.15818
[10]	validation_0-logloss:0.07135	validation_1-logloss:0.15290
[11]	validation_0-logloss:0.06396	validation_1-logloss:0.14793
[12]	validation_0-logloss:0.05739	validation_1-logloss:0.13669
[13]	validation_0-logloss:0.05237	validation_1-logloss:0.13677
[14]	validation_0-logloss:0.04794	validation_1-logloss:0.12918
[15]	validation_0-logloss:0.04450	validation_1-logloss:0.12895
[1

In [188]:
y_pred = xgb.predict(X_test)

f1_score(y_true=y_test, y_pred=y_pred)

0.9688581314878894

In [189]:
voting_classifier.estimators.append(
    ['xgb', xgb]
)

In [190]:
voting_classifier

In [191]:
y_pred = voting_classifier.predict(X_test)
f1_score(y_true=y_test, y_pred=y_pred)

0.9249999999999999

In [192]:
import pickle

In [193]:
xgboost.__version__

'2.0.2'

In [194]:
pickle.dump(voting_classifier, open('./test.pkl', 'wb'))

In [195]:
p = pickle.load(open('./test.pkl', 'rb'))
p

# VotingClassifier Override

1. 접근법: VotingClassifier의 fit에 eval_set을 받을 수 있도록 wrapper를 만든다
2. 문제점: VotingClassifier는 eval_set을 parameter로 받지 못한다
    - 참고 코드: https://github.com/scikit-learn/scikit-learn/blob/main/sklearn/ensemble/_voting.py
3. 해결법: Voting Classifier를 override한다


In [196]:
import sklearn

sklearn.__version__

'1.2.2'

In [197]:
import numpy as np
import pandas as pd

In [198]:
from sklearn.preprocessing import LabelEncoder

class VotingClassifierWrapper(VotingClassifier):
  def fit(self, X, y, eval_set=None, **params):
    # Following _BaseVoting as a Super Class
    self.le_ = LabelEncoder().fit(y)
    self.classes_ = self.le_.classes_
    transformed_y = self.le_.transform(y)

    if eval_set is not None:
      eval_set = [(X, self.le_.transform(y)) for (X, y) in eval_set]

    estimators_ = {}
    # if class is xgboost, then support eval_set method
    for name, estimator in self.estimators:
      if type(estimator) == xgboost.sklearn.XGBClassifier:
        estimators_[name] = estimator.fit(X, y, eval_set=eval_set, **params)
      else:
        estimators_[name] = estimator.fit(X, transformed_y, **params)

    # fill out attributes
    self.estimators_ = [estimator for (name, estimator) in estimators_.items()]
    self.named_estimators_ = estimators_

    for name, est in self.estimators:
      if hasattr(est, 'feature_names_in_'):
        self.feature_names_in_ = est.feature_names_in_

    return self

In [199]:
voting_classifier

In [200]:
type(X_train) == pd.core.frame.DataFrame

True

In [201]:
vwrap = VotingClassifierWrapper(
    estimators=[
        ('pipe1', p1),
        ('pipe2', p2),
        ('xgb', XGBClassifier(n_estimators=500, early_stopping_rounds=10))
        ],
    voting='soft')
vwrap

In [202]:
vwrap.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)])

[0]	validation_0-logloss:0.47369	validation_1-logloss:0.48629
[1]	validation_0-logloss:0.35799	validation_1-logloss:0.38499
[2]	validation_0-logloss:0.27593	validation_1-logloss:0.32306
[3]	validation_0-logloss:0.21821	validation_1-logloss:0.26930
[4]	validation_0-logloss:0.17378	validation_1-logloss:0.22892
[5]	validation_0-logloss:0.14589	validation_1-logloss:0.20783
[6]	validation_0-logloss:0.12209	validation_1-logloss:0.19011
[7]	validation_0-logloss:0.10632	validation_1-logloss:0.17637
[8]	validation_0-logloss:0.09271	validation_1-logloss:0.16575
[9]	validation_0-logloss:0.08098	validation_1-logloss:0.15818
[10]	validation_0-logloss:0.07135	validation_1-logloss:0.15290
[11]	validation_0-logloss:0.06396	validation_1-logloss:0.14793
[12]	validation_0-logloss:0.05739	validation_1-logloss:0.13669
[13]	validation_0-logloss:0.05237	validation_1-logloss:0.13677
[14]	validation_0-logloss:0.04794	validation_1-logloss:0.12918
[15]	validation_0-logloss:0.04450	validation_1-logloss:0.12895
[1

In [203]:
pickle.dump(vwrap, open('./test.pkl', 'wb'))

In [204]:
p = pickle.load(open('./test.pkl', 'rb'))
p

In [205]:
vwrap.n_features_in_

30

In [206]:
vwrap.feature_names_in_

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

In [207]:
vwrap.predict(X_test)

array([1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1,
       0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0,

In [208]:
vwrap.predict_proba(X_test)[:10]

array([[0.09509189, 0.90490811],
       [0.99220907, 0.00779092],
       [0.98802918, 0.01197083],
       [0.01060427, 0.98939573],
       [0.0028034 , 0.9971966 ],
       [0.99514109, 0.00485891],
       [0.99374744, 0.00625255],
       [0.8541045 , 0.1458955 ],
       [0.83422239, 0.16577761],
       [0.01020755, 0.98979245]])

In [209]:
y_pred = vwrap.predict(X_test)
f1_score(y_true=y_test, y_pred=y_pred)

0.9565217391304347

In [210]:
vwrap.weights = [1, 0, 0]

y_pred = vwrap.predict(X_test)
f1_score(y_true=y_test, y_pred=y_pred)

0.9153153153153154

In [211]:
vwrap.weights = [1, 0, 3]

y_pred = vwrap.predict(X_test)
f1_score(y_true=y_test, y_pred=y_pred)

0.9578947368421052

In [212]:
vwrap.weights = [1, 1, 0]

y_pred = vwrap.predict(X_test)
f1_score(y_true=y_test, y_pred=y_pred)

0.9249999999999999

In [213]:
vwrap.weights = [1, 2, 0]

y_pred = vwrap.predict(X_test)
f1_score(y_true=y_test, y_pred=y_pred)

0.9793103448275862

In [214]:
vwrap.weights = [1, 1, 3]

y_pred = vwrap.predict(X_test)
f1_score(y_true=y_test, y_pred=y_pred)

0.9634782608695653

# Override XGBoostClassifier



In [215]:
SklObjective

typing.Union[str, typing.Callable[[numpy.ndarray, numpy.ndarray], typing.Tuple[numpy.ndarray, numpy.ndarray]], NoneType]

In [245]:
from typing import Optional, Any
from xgboost.sklearn import SklObjective, Booster, XGBModel, Metric, TrainingCallback

class FractionSupportXGBClassifier(XGBClassifier):
  def __init__(
      self,
      fraction_ratio: Optional[float] = None,
      **kwargs: Any,
      ) -> None:
      self.fraction_ratio = fraction_ratio
      if (self.fraction_ratio is not None) and \
       (kwargs.get('early_stopping_rounds') is None):
          kwargs['early_stopping_rounds'] = int(kwargs['n_estimators'] / 10) if int(kwargs['n_estimators'] / 10) > 0 else 1
      super().__init__(**kwargs)

  def fit(
      self,
      *args,
      **kwargs
  ) -> "XGBClassifier":
      if self.fraction_ratio is not None:
          X_sample = args[0]
          y_sample = args[1]

          X_train, X_valid, y_train, y_valid = train_test_split(X_sample, y_sample,
                                                                test_size=self.fraction_ratio,
                                                                random_state=self.random_state
                                                                )

          kwargs['eval_set'] = [(X_train, y_train), (X_valid, y_valid)]
      return super().fit(*args, **kwargs)

In [247]:
wrap_xgb = FractionSupportXGBClassifier(n_estimators=10000, fraction_ratio=.2, early_stopping_rounds=10)
wrap_xgb

In [248]:
wrap_xgb.fit(X_train, y_train)

[0]	validation_0-logloss:0.47133	validation_1-logloss:0.48290
[1]	validation_0-logloss:0.35365	validation_1-logloss:0.37498
[2]	validation_0-logloss:0.27213	validation_1-logloss:0.29082
[3]	validation_0-logloss:0.21312	validation_1-logloss:0.23814
[4]	validation_0-logloss:0.16877	validation_1-logloss:0.19340
[5]	validation_0-logloss:0.14053	validation_1-logloss:0.16686
[6]	validation_0-logloss:0.11608	validation_1-logloss:0.14561
[7]	validation_0-logloss:0.09984	validation_1-logloss:0.13166


Parameters: { "fraction_ratio" } are not used.



[8]	validation_0-logloss:0.08622	validation_1-logloss:0.11811
[9]	validation_0-logloss:0.07471	validation_1-logloss:0.10548
[10]	validation_0-logloss:0.06502	validation_1-logloss:0.09611
[11]	validation_0-logloss:0.05798	validation_1-logloss:0.08735
[12]	validation_0-logloss:0.05197	validation_1-logloss:0.07860
[13]	validation_0-logloss:0.04773	validation_1-logloss:0.07053
[14]	validation_0-logloss:0.04380	validation_1-logloss:0.06413
[15]	validation_0-logloss:0.04070	validation_1-logloss:0.05938
[16]	validation_0-logloss:0.03794	validation_1-logloss:0.05489
[17]	validation_0-logloss:0.03554	validation_1-logloss:0.05349
[18]	validation_0-logloss:0.03444	validation_1-logloss:0.05107
[19]	validation_0-logloss:0.03235	validation_1-logloss:0.05000
[20]	validation_0-logloss:0.03160	validation_1-logloss:0.04807
[21]	validation_0-logloss:0.03035	validation_1-logloss:0.04795
[22]	validation_0-logloss:0.02983	validation_1-logloss:0.04638
[23]	validation_0-logloss:0.02876	validation_1-logloss:0.

In [234]:
f1_score(y_true=y_test, y_pred=wrap_xgb.predict(X_test))

0.9601386481802427

In [235]:
vc2 = VotingClassifier(
    estimators=[
        ('pipe1', p1),
        ('pipe2', p2),
        ('xgb', FractionSupportXGBClassifier(n_estimators=10000, fraction_ratio=.2, early_stopping_rounds=10))
        ],
    voting='soft')
vc2

In [236]:
vc2.fit(X_train, y_train)

[0]	validation_0-logloss:0.47990	validation_1-logloss:0.44938
[1]	validation_0-logloss:0.36282	validation_1-logloss:0.33910
[2]	validation_0-logloss:0.28076	validation_1-logloss:0.25706
[3]	validation_0-logloss:0.22214	validation_1-logloss:0.20286
[4]	validation_0-logloss:0.17788	validation_1-logloss:0.15776
[5]	validation_0-logloss:0.14949	validation_1-logloss:0.13183
[6]	validation_0-logloss:0.12482	validation_1-logloss:0.11140
[7]	validation_0-logloss:0.10904	validation_1-logloss:0.09569
[8]	validation_0-logloss:0.09615	validation_1-logloss:0.07926
[9]	validation_0-logloss:0.08361	validation_1-logloss:0.07066
[10]	validation_0-logloss:0.07360	validation_1-logloss:0.06255
[11]	validation_0-logloss:0.06641	validation_1-logloss:0.05438
[12]	validation_0-logloss:0.06018	validation_1-logloss:0.04648
[13]	validation_0-logloss:0.05499	validation_1-logloss:0.04213
[14]	validation_0-logloss:0.05063	validation_1-logloss:0.03738
[15]	validation_0-logloss:0.04649	validation_1-logloss:0.03671
[1

Parameters: { "fraction_ratio" } are not used.



[43]	validation_0-logloss:0.02619	validation_1-logloss:0.01733
[44]	validation_0-logloss:0.02589	validation_1-logloss:0.01732
[45]	validation_0-logloss:0.02563	validation_1-logloss:0.01707
[46]	validation_0-logloss:0.02537	validation_1-logloss:0.01691
[47]	validation_0-logloss:0.02508	validation_1-logloss:0.01694
[48]	validation_0-logloss:0.02477	validation_1-logloss:0.01699
[49]	validation_0-logloss:0.02452	validation_1-logloss:0.01685
[50]	validation_0-logloss:0.02426	validation_1-logloss:0.01688
[51]	validation_0-logloss:0.02405	validation_1-logloss:0.01666
[52]	validation_0-logloss:0.02382	validation_1-logloss:0.01667
[53]	validation_0-logloss:0.02357	validation_1-logloss:0.01671
[54]	validation_0-logloss:0.02343	validation_1-logloss:0.01642
[55]	validation_0-logloss:0.02322	validation_1-logloss:0.01648
[56]	validation_0-logloss:0.02304	validation_1-logloss:0.01629
[57]	validation_0-logloss:0.02285	validation_1-logloss:0.01636
[58]	validation_0-logloss:0.02271	validation_1-logloss:

In [177]:
f1_score(y_true=y_test, y_pred=vc2.predict(X_test))

0.958041958041958