In [1]:
from sklearn.base import clone
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

from target_encoding import TargetEncoder
from target_encoding import TargetEncoderClassifier

In [2]:
# Load the breast-cancer data as a (features, labels) pair, then hold out 20%
# of the rows for evaluation; the fixed seed keeps the split reproducible.
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [3]:
# Build target-encoded versions of the train and test feature matrices.
# Labels are supplied only for the training split; the test split is
# transformed by the same encoder object and never sees y_test.
enc = TargetEncoder()
new_X_train = enc.transform_train(X_train, y_train)
# NOTE(review): presumably reuses the encoding learned in transform_train, so
# this call must come after it -- confirm against the target_encoding docs.
new_X_test = enc.transform_test(X_test)

In [4]:
# Baseline: a random forest trained on the raw (un-encoded) features.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
# roc_auc_score needs a score per sample, so keep only column 1 of
# predict_proba (the probability of the second class in rf.classes_).
pred = rf.predict_proba(X_test)[:, 1]
auc_raw = roc_auc_score(y_test, pred)
print(f'without target encoding {auc_raw:.4f}')

without target encoding 0.9953


In [5]:
# Same forest configuration, now trained on the target-encoded features.
# clone() gives an unfitted copy carrying rf's hyperparameters, so the fitted
# baseline in `rf` is left intact instead of being overwritten in place, and
# the two runs stay directly comparable.
rf_encoded = clone(rf).fit(new_X_train, y_train)
# Column 1 of predict_proba is the score passed to roc_auc_score.
pred = rf_encoded.predict_proba(new_X_test)[:, 1]
print(f'with target encoding {roc_auc_score(y_test, pred):.4f}')

with target encoding 0.9993


In [6]:
# Alternative: fit TargetEncoderClassifier directly on the raw features and
# score its own predicted probabilities, with no separate random forest.
# A dedicated name keeps `enc` bound to the TargetEncoder created earlier
# rather than rebinding it to a different kind of object.
te_clf = TargetEncoderClassifier()
te_clf.fit(X_train, y_train)
# assumes column 1 is the positive-class probability, as with scikit-learn
# classifiers -- TODO confirm against the target_encoding docs
pred = te_clf.predict_proba(X_test)[:, 1]
print(f'target encoding classifier {roc_auc_score(y_test, pred):.4f}')

target encoding classifier 0.9974
