In [4]:
!pip install xgboost==1.6.1
!pip install --upgrade scikit-learn

Collecting xgboost==1.6.1
  Downloading xgboost-1.6.1-py3-none-manylinux2014_x86_64.whl (192.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m192.9/192.9 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xgboost
  Attempting uninstall: xgboost
    Found existing installation: xgboost 2.0.1
    Uninstalling xgboost-2.0.1:
      Successfully uninstalled xgboost-2.0.1
Successfully installed xgboost-1.6.1
Collecting scikit-learn
  Downloading scikit_learn-1.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m79.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.2
    Uninstalling scikit-learn-1.2.2:
      Successfully uninstalled scikit-learn-1.2.2
Successfully installed scikit-learn-1.3.2


In [1]:
import xgboost
from sklearn.metrics import f1_score

xgboost.__version__

'1.6.1'

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier


X, y = load_breast_cancer(return_X_y=True, as_frame=True)

# Split your data into training and validation sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=42)

In [3]:
gbc = GradientBoostingClassifier(n_estimators=100, validation_fraction=.2)
lgc = LogisticRegression()
scaler = StandardScaler().set_output(transform='pandas')

In [4]:
p1 = Pipeline([
    ('scaler', scaler),
    ('gbc', gbc)
])
p2 = Pipeline([
    ('scaler', scaler),
    ('lgc', lgc)
])

voting_classifier = VotingClassifier(
    estimators=[
        ('pipe1', p1),
        ('pipe2', p2)
        ],
    voting='soft'
)

voting_classifier

In [5]:
voting_classifier.fit(X_train, y_train)

In [6]:
y_pred = voting_classifier.predict(X_test)

f1_score(y_true=y_test, y_pred=y_pred)

0.9249999999999999

In [7]:
# XGBoost model을 따로 학습시키고 VotingClassifier에 추가

In [8]:
xgb = XGBClassifier(n_estimators=10000, early_stopping_rounds=10)
xgb.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)])

[0]	validation_0-logloss:0.48827	validation_1-logloss:0.50904
[1]	validation_0-logloss:0.36871	validation_1-logloss:0.40246
[2]	validation_0-logloss:0.28175	validation_1-logloss:0.32492
[3]	validation_0-logloss:0.22129	validation_1-logloss:0.27877
[4]	validation_0-logloss:0.17736	validation_1-logloss:0.23927
[5]	validation_0-logloss:0.14292	validation_1-logloss:0.21536
[6]	validation_0-logloss:0.12141	validation_1-logloss:0.20043
[7]	validation_0-logloss:0.10340	validation_1-logloss:0.18719
[8]	validation_0-logloss:0.08976	validation_1-logloss:0.16944
[9]	validation_0-logloss:0.08006	validation_1-logloss:0.16432
[10]	validation_0-logloss:0.06924	validation_1-logloss:0.15378
[11]	validation_0-logloss:0.06167	validation_1-logloss:0.14898
[12]	validation_0-logloss:0.05527	validation_1-logloss:0.14469
[13]	validation_0-logloss:0.04996	validation_1-logloss:0.14074
[14]	validation_0-logloss:0.04511	validation_1-logloss:0.13848
[15]	validation_0-logloss:0.04213	validation_1-logloss:0.13854
[1

In [9]:
y_pred = xgb.predict(X_test)

f1_score(y_true=y_test, y_pred=y_pred)

0.9636048526863086

In [10]:
voting_classifier.estimators.append(
    ['xgb', xgb]
)

In [11]:
voting_classifier

In [12]:
y_pred = voting_classifier.predict(X_test)
f1_score(y_true=y_test, y_pred=y_pred)

0.9249999999999999

In [13]:
import pickle

In [14]:
xgboost.__version__

'1.6.1'

In [17]:
pickle.dump(voting_classifier, open('./test.pkl', 'wb'))

In [18]:
p = pickle.load(open('./test.pkl', 'rb'))
p

# Metadata Request

1. 접근법: VotingClassifier의 fit에 eval_set을 받을 수 있도록 wrapper를 만든다
2. 문제점: VotingClassifier는 eval_set을 parameter로 받지 못한다
    - 참고 코드: https://github.com/scikit-learn/scikit-learn/blob/main/sklearn/ensemble/_voting.py
3. 해결법: VotingClassifier에 eval_set을 받을 수 있도록 metadata routing을 만든다


In [207]:
import sklearn

sklearn.__version__

'1.3.2'

In [135]:
import numpy as np

In [260]:
from sklearn.preprocessing import LabelEncoder

class VotingClassifierWrapper(VotingClassifier):
  def fit(self, X, y, eval_set=None, **params):
    self.le_ = LabelEncoder().fit(y)
    self.classes_ = self.le_.classes_
    transformed_y = self.le_.transform(y)

    if eval_set is not None:
      eval_set = [(X, self.le_.transform(y)) for (X, y) in eval_set]

    estimators_ = []
    # if class is xgboost, then support eval_set method
    for name, estimator in self.estimators:
      if type(estimator) == xgboost.sklearn.XGBClassifier:
        estimators_.append(estimator.fit(X, y, eval_set=eval_set, **params))
      else:
        estimators_.append(estimator.fit(X, transformed_y, **params))
    self.estimators_ = estimators_

    return self

In [261]:
voting_classifier

In [262]:
vwrap = VotingClassifierWrapper(
    estimators=[
        ('pipe1', p1),
        ('pipe2', p2),
        ('xgb', XGBClassifier(n_estimators=500, early_stopping_rounds=10))
        ],
    voting='soft')
vwrap

In [264]:
vwrap.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)])

[0]	validation_0-logloss:0.48827	validation_1-logloss:0.50904
[1]	validation_0-logloss:0.36871	validation_1-logloss:0.40246
[2]	validation_0-logloss:0.28175	validation_1-logloss:0.32492
[3]	validation_0-logloss:0.22129	validation_1-logloss:0.27877
[4]	validation_0-logloss:0.17736	validation_1-logloss:0.23927
[5]	validation_0-logloss:0.14292	validation_1-logloss:0.21536
[6]	validation_0-logloss:0.12141	validation_1-logloss:0.20043
[7]	validation_0-logloss:0.10340	validation_1-logloss:0.18719
[8]	validation_0-logloss:0.08976	validation_1-logloss:0.16944
[9]	validation_0-logloss:0.08006	validation_1-logloss:0.16432
[10]	validation_0-logloss:0.06924	validation_1-logloss:0.15378
[11]	validation_0-logloss:0.06167	validation_1-logloss:0.14898
[12]	validation_0-logloss:0.05527	validation_1-logloss:0.14469
[13]	validation_0-logloss:0.04996	validation_1-logloss:0.14074
[14]	validation_0-logloss:0.04511	validation_1-logloss:0.13848
[15]	validation_0-logloss:0.04213	validation_1-logloss:0.13854
[1

In [265]:
vwrap.predict(X_test)

array([1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1,
       0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0,

In [267]:
vwrap.predict_proba(X_test)[:10]

array([[0.08579401, 0.91420599],
       [0.99433745, 0.00566254],
       [0.98912227, 0.01087773],
       [0.01026055, 0.98973945],
       [0.00279072, 0.99720928],
       [0.99625228, 0.00374772],
       [0.99474167, 0.00525833],
       [0.85631017, 0.14368984],
       [0.85055272, 0.14944728],
       [0.00879647, 0.99120353]])

In [268]:
y_pred = vwrap.predict(X_test)
f1_score(y_true=y_test, y_pred=y_pred)

0.9547038327526133