<a href="https://colab.research.google.com/github/JaehyunAhn/AI_for_Education/blob/master/VotingClassifier_ensemble_with_earlystopping_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install xgboost==1.6.1
!pip install --upgrade scikit-learn

Collecting xgboost==1.6.1
  Downloading xgboost-1.6.1-py3-none-manylinux2014_x86_64.whl (192.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m192.9/192.9 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xgboost
  Attempting uninstall: xgboost
    Found existing installation: xgboost 2.0.1
    Uninstalling xgboost-2.0.1:
      Successfully uninstalled xgboost-2.0.1
Successfully installed xgboost-1.6.1
Collecting scikit-learn
  Downloading scikit_learn-1.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m79.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.2
    Uninstalling scikit-learn-1.2.2:
      Successfully uninstalled scikit-learn-1.2.2
Successfully installed scikit-learn-1.3.2


In [2]:
import xgboost
from sklearn.metrics import f1_score

xgboost.__version__

'2.0.2'

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier


X, y = load_breast_cancer(return_X_y=True, as_frame=True)

# Split your data into training and validation sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=42)

In [4]:
gbc = GradientBoostingClassifier(n_estimators=100, validation_fraction=.2)
lgc = LogisticRegression()
scaler = StandardScaler().set_output(transform='pandas')

In [5]:
p1 = Pipeline([
    ('scaler', scaler),
    ('gbc', gbc)
])
p2 = Pipeline([
    ('scaler', scaler),
    ('lgc', lgc)
])

voting_classifier = VotingClassifier(
    estimators=[
        ('pipe1', p1),
        ('pipe2', p2)
        ],
    voting='soft'
)

voting_classifier

In [6]:
voting_classifier.fit(X_train, y_train)

In [7]:
y_pred = voting_classifier.predict(X_test)

f1_score(y_true=y_test, y_pred=y_pred)

0.923076923076923

In [8]:
# XGBoost model을 따로 학습시키고 VotingClassifier에 추가

In [9]:
xgb = XGBClassifier(n_estimators=10000, early_stopping_rounds=10)
xgb.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)])

[0]	validation_0-logloss:0.47369	validation_1-logloss:0.48629
[1]	validation_0-logloss:0.35799	validation_1-logloss:0.38499
[2]	validation_0-logloss:0.27593	validation_1-logloss:0.32306
[3]	validation_0-logloss:0.21821	validation_1-logloss:0.26930
[4]	validation_0-logloss:0.17378	validation_1-logloss:0.22892
[5]	validation_0-logloss:0.14589	validation_1-logloss:0.20783
[6]	validation_0-logloss:0.12209	validation_1-logloss:0.19011
[7]	validation_0-logloss:0.10632	validation_1-logloss:0.17637
[8]	validation_0-logloss:0.09271	validation_1-logloss:0.16575
[9]	validation_0-logloss:0.08098	validation_1-logloss:0.15818
[10]	validation_0-logloss:0.07135	validation_1-logloss:0.15290
[11]	validation_0-logloss:0.06396	validation_1-logloss:0.14793
[12]	validation_0-logloss:0.05739	validation_1-logloss:0.13669
[13]	validation_0-logloss:0.05237	validation_1-logloss:0.13677
[14]	validation_0-logloss:0.04794	validation_1-logloss:0.12918
[15]	validation_0-logloss:0.04450	validation_1-logloss:0.12895
[1

In [10]:
y_pred = xgb.predict(X_test)

f1_score(y_true=y_test, y_pred=y_pred)

0.9688581314878894

In [11]:
voting_classifier.estimators.append(
    ['xgb', xgb]
)

In [12]:
voting_classifier

In [13]:
y_pred = voting_classifier.predict(X_test)
f1_score(y_true=y_test, y_pred=y_pred)

0.923076923076923

In [14]:
import pickle

In [15]:
xgboost.__version__

'2.0.2'

In [16]:
pickle.dump(voting_classifier, open('./test.pkl', 'wb'))

In [17]:
p = pickle.load(open('./test.pkl', 'rb'))
p

# VotingClassifier Override

1. 접근법: VotingClassifier의 fit에 eval_set을 받을 수 있도록 wrapper를 만든다
2. 문제점: VotingClassifier는 eval_set을 parameter로 받지 못한다
    - 참고 코드: https://github.com/scikit-learn/scikit-learn/blob/main/sklearn/ensemble/_voting.py
3. 해결법: Voting Classifier를 override한다


In [18]:
import sklearn

sklearn.__version__

'1.2.2'

In [19]:
import numpy as np
import pandas as pd

In [20]:
from sklearn.preprocessing import LabelEncoder

class VotingClassifierWrapper(VotingClassifier):
  def fit(self, X, y, eval_set=None, **params):
    # Following _BaseVoting as a Super Class
    self.le_ = LabelEncoder().fit(y)
    self.classes_ = self.le_.classes_
    transformed_y = self.le_.transform(y)

    if eval_set is not None:
      eval_set = [(X, self.le_.transform(y)) for (X, y) in eval_set]

    estimators_ = {}
    # if class is xgboost, then support eval_set method
    for name, estimator in self.estimators:
      if type(estimator) == xgboost.sklearn.XGBClassifier:
        estimators_[name] = estimator.fit(X, y, eval_set=eval_set, **params)
      else:
        estimators_[name] = estimator.fit(X, transformed_y, **params)

    # fill out attributes
    self.estimators_ = [estimator for (name, estimator) in estimators_.items()]
    self.named_estimators_ = estimators_

    for name, est in self.estimators:
      if hasattr(est, 'feature_names_in_'):
        self.feature_names_in_ = est.feature_names_in_

    return self

In [21]:
voting_classifier

In [22]:
type(X_train) == pd.core.frame.DataFrame

True

In [23]:
vwrap = VotingClassifierWrapper(
    estimators=[
        ('pipe1', p1),
        ('pipe2', p2),
        ('xgb', XGBClassifier(n_estimators=500, early_stopping_rounds=10))
        ],
    voting='soft')
vwrap

In [24]:
vwrap.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)])

[0]	validation_0-logloss:0.47369	validation_1-logloss:0.48629
[1]	validation_0-logloss:0.35799	validation_1-logloss:0.38499
[2]	validation_0-logloss:0.27593	validation_1-logloss:0.32306
[3]	validation_0-logloss:0.21821	validation_1-logloss:0.26930
[4]	validation_0-logloss:0.17378	validation_1-logloss:0.22892
[5]	validation_0-logloss:0.14589	validation_1-logloss:0.20783
[6]	validation_0-logloss:0.12209	validation_1-logloss:0.19011
[7]	validation_0-logloss:0.10632	validation_1-logloss:0.17637
[8]	validation_0-logloss:0.09271	validation_1-logloss:0.16575
[9]	validation_0-logloss:0.08098	validation_1-logloss:0.15818
[10]	validation_0-logloss:0.07135	validation_1-logloss:0.15290
[11]	validation_0-logloss:0.06396	validation_1-logloss:0.14793
[12]	validation_0-logloss:0.05739	validation_1-logloss:0.13669
[13]	validation_0-logloss:0.05237	validation_1-logloss:0.13677
[14]	validation_0-logloss:0.04794	validation_1-logloss:0.12918
[15]	validation_0-logloss:0.04450	validation_1-logloss:0.12895
[1

In [25]:
pickle.dump(vwrap, open('./test.pkl', 'wb'))

In [26]:
p = pickle.load(open('./test.pkl', 'rb'))
p

In [27]:
vwrap.n_features_in_

30

In [28]:
vwrap.feature_names_in_

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

In [29]:
vwrap.predict(X_test)

array([1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1,
       0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0,

In [30]:
vwrap.predict_proba(X_test)[:10]

array([[0.09509189, 0.90490811],
       [0.99220907, 0.00779092],
       [0.98802918, 0.01197083],
       [0.01060427, 0.98939573],
       [0.0028034 , 0.9971966 ],
       [0.99514109, 0.00485891],
       [0.99374744, 0.00625255],
       [0.8541045 , 0.1458955 ],
       [0.8341843 , 0.16581571],
       [0.01020755, 0.98979245]])

In [31]:
y_pred = vwrap.predict(X_test)
f1_score(y_true=y_test, y_pred=y_pred)

0.9547038327526133

In [32]:
vwrap.weights = [1, 0, 0]

y_pred = vwrap.predict(X_test)
f1_score(y_true=y_test, y_pred=y_pred)

0.9133574007220217

In [33]:
vwrap.weights = [1, 0, 3]

y_pred = vwrap.predict(X_test)
f1_score(y_true=y_test, y_pred=y_pred)

0.9578947368421052

In [34]:
vwrap.weights = [1, 1, 0]

y_pred = vwrap.predict(X_test)
f1_score(y_true=y_test, y_pred=y_pred)

0.923076923076923

In [35]:
vwrap.weights = [1, 2, 0]

y_pred = vwrap.predict(X_test)
f1_score(y_true=y_test, y_pred=y_pred)

0.9793103448275862

In [36]:
vwrap.weights = [1, 1, 3]

y_pred = vwrap.predict(X_test)
f1_score(y_true=y_test, y_pred=y_pred)

0.9634782608695653

# Override XGBoostClassifier



In [37]:
SklObjective

typing.Union[str, typing.Callable[[numpy.ndarray, numpy.ndarray], typing.Tuple[numpy.ndarray, numpy.ndarray]], NoneType]

In [85]:
from multiprocessing import Value
from typing import Optional, Any
from xgboost.sklearn import SklObjective, Booster, XGBModel, Metric, TrainingCallback


class FractionSupportXGBClassifier(XGBClassifier):
  def __init__(
      self,
      fraction_ratio: float | None = None,
      **kwargs: Any,
      ) -> None:
      # check types
      self.fraction_ratio = self._float_checker(fraction_ratio)

      # assign early stopping rounds
      if isinstance(self.fraction_ratio, (float)) and \
       (kwargs.get('early_stopping_rounds') is None):
          kwargs['early_stopping_rounds'] = int(kwargs['n_estimators'] / 10) if int(kwargs['n_estimators'] / 10) > 0 else 1
      super().__init__(**kwargs)


  def fit(
      self,
      *args,
      **kwargs
  ) -> "XGBClassifier":
      if isinstance(self.fraction_ratio, float):
          X_sample = args[0]
          y_sample = args[1]

          X_train, X_valid, y_train, y_valid = train_test_split(X_sample, y_sample,
                                                                test_size=self.fraction_ratio,
                                                                random_state=self.random_state
                                                                )

          kwargs['eval_set'] = [(X_train, y_train), (X_valid, y_valid)]
      return super().fit(*args, **kwargs)


  @staticmethod
  def _float_checker(value: float | None = None):
      """
         Class instance를 바꾸지 않는 메소드는 staticmethod로 선언, self를 선언하지 않음.
      """
      if type(value) is None:
          return value
      if isinstance(value, (float)):
          if (value > 0) and (1 > value):
              return value
          else:
              raise ValueError(f'fraction_ratio={value} must be a floating number between 0 and 1.')
      else:
          raise TypeError(f'fraction_ratio={value} must be a floating number between 0 and 1.')


In [86]:
FractionSupportXGBClassifier(n_estimators=10000, fraction_ratio=0.7, early_stopping_rounds=10)

In [87]:
wrap_xgb = FractionSupportXGBClassifier(n_estimators=10000, fraction_ratio=.2, early_stopping_rounds=10)
wrap_xgb

In [88]:
wrap_xgb.fit(X_train, y_train)

Parameters: { "fraction_ratio" } are not used.



[0]	validation_0-logloss:0.47688	validation_1-logloss:0.46119
[1]	validation_0-logloss:0.36071	validation_1-logloss:0.34735
[2]	validation_0-logloss:0.27891	validation_1-logloss:0.26430
[3]	validation_0-logloss:0.22169	validation_1-logloss:0.20460
[4]	validation_0-logloss:0.17518	validation_1-logloss:0.16831
[5]	validation_0-logloss:0.14660	validation_1-logloss:0.14313
[6]	validation_0-logloss:0.12299	validation_1-logloss:0.11857
[7]	validation_0-logloss:0.10707	validation_1-logloss:0.10338
[8]	validation_0-logloss:0.09346	validation_1-logloss:0.08977
[9]	validation_0-logloss:0.08133	validation_1-logloss:0.07960
[10]	validation_0-logloss:0.07115	validation_1-logloss:0.07211
[11]	validation_0-logloss:0.06413	validation_1-logloss:0.06327
[12]	validation_0-logloss:0.05751	validation_1-logloss:0.05693
[13]	validation_0-logloss:0.05328	validation_1-logloss:0.04880
[14]	validation_0-logloss:0.04859	validation_1-logloss:0.04537
[15]	validation_0-logloss:0.04544	validation_1-logloss:0.04084
[1

In [89]:
f1_score(y_true=y_test, y_pred=wrap_xgb.predict(X_test))

0.9618055555555556

In [90]:
vc2 = VotingClassifier(
    estimators=[
        ('pipe1', p1),
        ('pipe2', p2),
        ('xgb', FractionSupportXGBClassifier(n_estimators=10000, fraction_ratio=.2, early_stopping_rounds=10))
        ],
    voting='soft')
vc2

In [91]:
vc2.fit(X_train, y_train)

[0]	validation_0-logloss:0.47235	validation_1-logloss:0.47894
[1]	validation_0-logloss:0.35495	validation_1-logloss:0.36987
[2]	validation_0-logloss:0.27079	validation_1-logloss:0.29608
[3]	validation_0-logloss:0.21498	validation_1-logloss:0.23087
[4]	validation_0-logloss:0.17167	validation_1-logloss:0.18205
[5]	validation_0-logloss:0.14243	validation_1-logloss:0.15944
[6]	validation_0-logloss:0.11823	validation_1-logloss:0.13719
[7]	validation_0-logloss:0.10197	validation_1-logloss:0.12334
[8]	validation_0-logloss:0.08824	validation_1-logloss:0.11020
[9]	validation_0-logloss:0.07723	validation_1-logloss:0.09562
[10]	validation_0-logloss:0.06898	validation_1-logloss:0.08059
[11]	validation_0-logloss:0.06228	validation_1-logloss:0.07051
[12]	validation_0-logloss:0.05668	validation_1-logloss:0.06014
[13]	validation_0-logloss:0.05061	validation_1-logloss:0.05925
[14]	validation_0-logloss:0.04706	validation_1-logloss:0.05137
[15]	validation_0-logloss:0.04336	validation_1-logloss:0.04898
[1

Parameters: { "fraction_ratio" } are not used.



[37]	validation_0-logloss:0.02574	validation_1-logloss:0.02769
[38]	validation_0-logloss:0.02534	validation_1-logloss:0.02766
[39]	validation_0-logloss:0.02502	validation_1-logloss:0.02741
[40]	validation_0-logloss:0.02475	validation_1-logloss:0.02702
[41]	validation_0-logloss:0.02448	validation_1-logloss:0.02665
[42]	validation_0-logloss:0.02413	validation_1-logloss:0.02663
[43]	validation_0-logloss:0.02387	validation_1-logloss:0.02640
[44]	validation_0-logloss:0.02357	validation_1-logloss:0.02639
[45]	validation_0-logloss:0.02344	validation_1-logloss:0.02564
[46]	validation_0-logloss:0.02334	validation_1-logloss:0.02484
[47]	validation_0-logloss:0.02306	validation_1-logloss:0.02486
[48]	validation_0-logloss:0.02284	validation_1-logloss:0.02454
[49]	validation_0-logloss:0.02250	validation_1-logloss:0.02473
[50]	validation_0-logloss:0.02225	validation_1-logloss:0.02476
[51]	validation_0-logloss:0.02210	validation_1-logloss:0.02430
[52]	validation_0-logloss:0.02193	validation_1-logloss:

In [92]:
f1_score(y_true=y_test, y_pred=vc2.predict(X_test))

0.9598603839441536