<a href="https://colab.research.google.com/github/JaehyunAhn/AI_for_Education/blob/master/VotingClassifier_ensemble_with_earlystopping_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install xgboost==1.6.1
!pip install --upgrade scikit-learn

Collecting xgboost==1.6.1
  Downloading xgboost-1.6.1-py3-none-manylinux2014_x86_64.whl (192.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m192.9/192.9 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xgboost
  Attempting uninstall: xgboost
    Found existing installation: xgboost 2.0.1
    Uninstalling xgboost-2.0.1:
      Successfully uninstalled xgboost-2.0.1
Successfully installed xgboost-1.6.1
Collecting scikit-learn
  Downloading scikit_learn-1.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m79.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.2
    Uninstalling scikit-learn-1.2.2:
      Successfully uninstalled scikit-learn-1.2.2
Successfully installed scikit-learn-1.3.2


In [1]:
import xgboost
from sklearn.metrics import f1_score

xgboost.__version__

'2.0.2'

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier


X, y = load_breast_cancer(return_X_y=True, as_frame=True)

# Split your data into training and validation sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=42)

In [3]:
gbc = GradientBoostingClassifier(n_estimators=100, validation_fraction=.2)
lgc = LogisticRegression()
scaler = StandardScaler().set_output(transform='pandas')

In [4]:
p1 = Pipeline([
    ('scaler', scaler),
    ('gbc', gbc)
])
p2 = Pipeline([
    ('scaler', scaler),
    ('lgc', lgc)
])

voting_classifier = VotingClassifier(
    estimators=[
        ('pipe1', p1),
        ('pipe2', p2)
        ],
    voting='soft'
)

voting_classifier

In [5]:
voting_classifier.fit(X_train, y_train)

In [6]:
y_pred = voting_classifier.predict(X_test)

f1_score(y_true=y_test, y_pred=y_pred)

0.9269162210338682

In [7]:
# XGBoost model을 따로 학습시키고 VotingClassifier에 추가

In [8]:
xgb = XGBClassifier(n_estimators=10000, early_stopping_rounds=10)
xgb.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)])

[0]	validation_0-logloss:0.47369	validation_1-logloss:0.48629
[1]	validation_0-logloss:0.35799	validation_1-logloss:0.38499
[2]	validation_0-logloss:0.27593	validation_1-logloss:0.32306
[3]	validation_0-logloss:0.21821	validation_1-logloss:0.26930
[4]	validation_0-logloss:0.17378	validation_1-logloss:0.22892
[5]	validation_0-logloss:0.14589	validation_1-logloss:0.20783
[6]	validation_0-logloss:0.12209	validation_1-logloss:0.19011
[7]	validation_0-logloss:0.10632	validation_1-logloss:0.17637
[8]	validation_0-logloss:0.09271	validation_1-logloss:0.16575
[9]	validation_0-logloss:0.08098	validation_1-logloss:0.15818
[10]	validation_0-logloss:0.07135	validation_1-logloss:0.15290
[11]	validation_0-logloss:0.06396	validation_1-logloss:0.14793
[12]	validation_0-logloss:0.05739	validation_1-logloss:0.13669
[13]	validation_0-logloss:0.05237	validation_1-logloss:0.13677
[14]	validation_0-logloss:0.04794	validation_1-logloss:0.12918
[15]	validation_0-logloss:0.04450	validation_1-logloss:0.12895
[1

In [9]:
y_pred = xgb.predict(X_test)

f1_score(y_true=y_test, y_pred=y_pred)

0.9688581314878894

In [10]:
voting_classifier.estimators.append(
    ['xgb', xgb]
)

In [11]:
voting_classifier

In [12]:
y_pred = voting_classifier.predict(X_test)
f1_score(y_true=y_test, y_pred=y_pred)

0.9269162210338682

In [13]:
import pickle

In [14]:
xgboost.__version__

'2.0.2'

In [15]:
pickle.dump(voting_classifier, open('./test.pkl', 'wb'))

In [16]:
p = pickle.load(open('./test.pkl', 'rb'))
p

# VotingClassifier Override

1. 접근법: VotingClassifier의 fit에 eval_set을 받을 수 있도록 wrapper를 만든다
2. 문제점: VotingClassifier는 eval_set을 parameter로 받지 못한다
    - 참고 코드: https://github.com/scikit-learn/scikit-learn/blob/main/sklearn/ensemble/_voting.py
3. 해결법: Voting Classifier를 override한다


In [17]:
import sklearn

sklearn.__version__

'1.2.2'

In [18]:
import numpy as np
import pandas as pd

In [19]:
from sklearn.preprocessing import LabelEncoder

class VotingClassifierWrapper(VotingClassifier):
  def fit(self, X, y, eval_set=None, **params):
    # Following _BaseVoting as a Super Class
    self.le_ = LabelEncoder().fit(y)
    self.classes_ = self.le_.classes_
    transformed_y = self.le_.transform(y)

    if eval_set is not None:
      eval_set = [(X, self.le_.transform(y)) for (X, y) in eval_set]

    estimators_ = {}
    # if class is xgboost, then support eval_set method
    for name, estimator in self.estimators:
      if type(estimator) == xgboost.sklearn.XGBClassifier:
        estimators_[name] = estimator.fit(X, y, eval_set=eval_set, **params)
      else:
        estimators_[name] = estimator.fit(X, transformed_y, **params)

    # fill out attributes
    self.estimators_ = [estimator for (name, estimator) in estimators_.items()]
    self.named_estimators_ = estimators_

    for name, est in self.estimators:
      if hasattr(est, 'feature_names_in_'):
        self.feature_names_in_ = est.feature_names_in_

    return self

In [20]:
voting_classifier

In [21]:
type(X_train) == pd.core.frame.DataFrame

True

In [23]:
vwrap = VotingClassifierWrapper(
    estimators=[
        ('pipe1', p1),
        ('pipe2', p2),
        ('xgb', XGBClassifier(n_estimators=500, early_stopping_rounds=10))
        ],
    voting='soft')
vwrap

In [24]:
vwrap.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)])

[0]	validation_0-logloss:0.47369	validation_1-logloss:0.48629
[1]	validation_0-logloss:0.35799	validation_1-logloss:0.38499
[2]	validation_0-logloss:0.27593	validation_1-logloss:0.32306
[3]	validation_0-logloss:0.21821	validation_1-logloss:0.26930
[4]	validation_0-logloss:0.17378	validation_1-logloss:0.22892
[5]	validation_0-logloss:0.14589	validation_1-logloss:0.20783
[6]	validation_0-logloss:0.12209	validation_1-logloss:0.19011
[7]	validation_0-logloss:0.10632	validation_1-logloss:0.17637
[8]	validation_0-logloss:0.09271	validation_1-logloss:0.16575
[9]	validation_0-logloss:0.08098	validation_1-logloss:0.15818
[10]	validation_0-logloss:0.07135	validation_1-logloss:0.15290
[11]	validation_0-logloss:0.06396	validation_1-logloss:0.14793
[12]	validation_0-logloss:0.05739	validation_1-logloss:0.13669
[13]	validation_0-logloss:0.05237	validation_1-logloss:0.13677
[14]	validation_0-logloss:0.04794	validation_1-logloss:0.12918
[15]	validation_0-logloss:0.04450	validation_1-logloss:0.12895
[1

In [25]:
pickle.dump(vwrap, open('./test.pkl', 'wb'))

In [26]:
p = pickle.load(open('./test.pkl', 'rb'))
p

In [27]:
vwrap.n_features_in_

30

In [28]:
vwrap.feature_names_in_

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

In [29]:
vwrap.predict(X_test)

array([1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1,
       0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0,

In [30]:
vwrap.predict_proba(X_test)[:10]

array([[0.09509189, 0.90490811],
       [0.99220907, 0.00779092],
       [0.98802918, 0.01197083],
       [0.01060427, 0.98939573],
       [0.0028034 , 0.9971966 ],
       [0.99514109, 0.00485891],
       [0.99374744, 0.00625255],
       [0.8541045 , 0.1458955 ],
       [0.83424366, 0.16575634],
       [0.01020755, 0.98979245]])

In [31]:
y_pred = vwrap.predict(X_test)
f1_score(y_true=y_test, y_pred=y_pred)

0.9565217391304347

In [32]:
vwrap.weights = [1, 0, 0]

y_pred = vwrap.predict(X_test)
f1_score(y_true=y_test, y_pred=y_pred)

0.9194991055456171

In [33]:
vwrap.weights = [1, 0, 3]

y_pred = vwrap.predict(X_test)
f1_score(y_true=y_test, y_pred=y_pred)

0.9597197898423817

In [34]:
vwrap.weights = [1, 1, 0]

y_pred = vwrap.predict(X_test)
f1_score(y_true=y_test, y_pred=y_pred)

0.9249999999999999

In [35]:
vwrap.weights = [1, 2, 0]

y_pred = vwrap.predict(X_test)
f1_score(y_true=y_test, y_pred=y_pred)

0.9793103448275862

In [36]:
vwrap.weights = [1, 1, 3]

y_pred = vwrap.predict(X_test)
f1_score(y_true=y_test, y_pred=y_pred)

0.9652777777777778

# Override XGBoostClassifier



In [70]:
SklObjective

typing.Union[str, typing.Callable[[numpy.ndarray, numpy.ndarray], typing.Tuple[numpy.ndarray, numpy.ndarray]], NoneType]

In [168]:
from typing import Optional, Any
from xgboost.sklearn import SklObjective, Booster, XGBModel, Metric, TrainingCallback

class FractionSupportXGBClassifier(XGBClassifier):
  def __init__(
      self,
      fraction_ratio: Optional[float] = None,
      **kwargs: Any,
      ) -> None:
      self.fraction_ratio = fraction_ratio
      if (self.fraction_ratio is not None) and \
       (kwargs.get('early_stopping_rounds') is None):
          kwargs['early_stopping_rounds'] = int(kwargs['n_estimators'] / 10) if int(kwargs['n_estimators'] / 10) > 0 else 1
      super().__init__(**kwargs)

  def fit(
      self,
      *args,
      **kwargs
  ) -> "XGBClassifier":
      if self.fraction_ratio is not None:
          X_sample = args[0]
          y_sample = args[1]

          X_train, X_valid, y_train, y_valid = train_test_split(X_sample, y_sample, test_size=self.fraction_ratio)

          kwargs['eval_set'] = [(X_train, y_train), (X_valid, y_valid)]
      return super().fit(*args, **kwargs)

In [170]:
wrap_xgb = FractionSupportXGBClassifier(n_estimators=10000, fraction_ratio=.2, early_stopping_rounds=100)
wrap_xgb

In [171]:
wrap_xgb.fit(X_train, y_train)

Parameters: { "fraction_ratio" } are not used.



[0]	validation_0-logloss:0.47134	validation_1-logloss:0.48286
[1]	validation_0-logloss:0.35408	validation_1-logloss:0.37328
[2]	validation_0-logloss:0.27180	validation_1-logloss:0.29212
[3]	validation_0-logloss:0.21456	validation_1-logloss:0.23252
[4]	validation_0-logloss:0.17048	validation_1-logloss:0.18671
[5]	validation_0-logloss:0.14284	validation_1-logloss:0.15786
[6]	validation_0-logloss:0.11815	validation_1-logloss:0.13747
[7]	validation_0-logloss:0.10128	validation_1-logloss:0.12605
[8]	validation_0-logloss:0.08867	validation_1-logloss:0.10853
[9]	validation_0-logloss:0.07770	validation_1-logloss:0.09380
[10]	validation_0-logloss:0.06820	validation_1-logloss:0.08366
[11]	validation_0-logloss:0.06182	validation_1-logloss:0.07230
[12]	validation_0-logloss:0.05539	validation_1-logloss:0.06521
[13]	validation_0-logloss:0.05024	validation_1-logloss:0.06070
[14]	validation_0-logloss:0.04603	validation_1-logloss:0.05537
[15]	validation_0-logloss:0.04331	validation_1-logloss:0.04918
[1

In [172]:
f1_score(y_true=y_test, y_pred=wrap_xgb.predict(X_test))

0.9618055555555556

In [175]:
vc2 = VotingClassifier(
    estimators=[
        ('pipe1', p1),
        ('pipe2', p2),
        ('xgb', FractionSupportXGBClassifier(n_estimators=10000, fraction_ratio=.2, early_stopping_rounds=10))
        ],
    voting='soft')
vc2

In [176]:
vc2.fit(X_train, y_train)

[0]	validation_0-logloss:0.47153	validation_1-logloss:0.48214
[1]	validation_0-logloss:0.36128	validation_1-logloss:0.34513
[2]	validation_0-logloss:0.28144	validation_1-logloss:0.25438
[3]	validation_0-logloss:0.22450	validation_1-logloss:0.19361
[4]	validation_0-logloss:0.17983	validation_1-logloss:0.15013
[5]	validation_0-logloss:0.15225	validation_1-logloss:0.12101
[6]	validation_0-logloss:0.12784	validation_1-logloss:0.09957
[7]	validation_0-logloss:0.11123	validation_1-logloss:0.08710
[8]	validation_0-logloss:0.09797	validation_1-logloss:0.07212
[9]	validation_0-logloss:0.08610	validation_1-logloss:0.06094
[10]	validation_0-logloss:0.07643	validation_1-logloss:0.05145
[11]	validation_0-logloss:0.06825	validation_1-logloss:0.04714
[12]	validation_0-logloss:0.06160	validation_1-logloss:0.04092
[13]	validation_0-logloss:0.05595	validation_1-logloss:0.03835
[14]	validation_0-logloss:0.05145	validation_1-logloss:0.03417
[15]	validation_0-logloss:0.04793	validation_1-logloss:0.03109
[1

Parameters: { "fraction_ratio" } are not used.



[46]	validation_0-logloss:0.02583	validation_1-logloss:0.01511
[47]	validation_0-logloss:0.02571	validation_1-logloss:0.01448
[48]	validation_0-logloss:0.02542	validation_1-logloss:0.01442
[49]	validation_0-logloss:0.02508	validation_1-logloss:0.01463
[50]	validation_0-logloss:0.02498	validation_1-logloss:0.01406
[51]	validation_0-logloss:0.02470	validation_1-logloss:0.01414
[52]	validation_0-logloss:0.02449	validation_1-logloss:0.01404
[53]	validation_0-logloss:0.02439	validation_1-logloss:0.01351
[54]	validation_0-logloss:0.02412	validation_1-logloss:0.01374
[55]	validation_0-logloss:0.02404	validation_1-logloss:0.01325
[56]	validation_0-logloss:0.02373	validation_1-logloss:0.01360
[57]	validation_0-logloss:0.02368	validation_1-logloss:0.01314
[58]	validation_0-logloss:0.02340	validation_1-logloss:0.01347
[59]	validation_0-logloss:0.02321	validation_1-logloss:0.01355
[60]	validation_0-logloss:0.02315	validation_1-logloss:0.01309
[61]	validation_0-logloss:0.02314	validation_1-logloss:

In [177]:
f1_score(y_true=y_test, y_pred=vc2.predict(X_test))

0.958041958041958