In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, f1_score, roc_auc_score, roc_curve,confusion_matrix, classification_report)

In [2]:
bc = load_breast_cancer()
X = pd.DataFrame(bc.data, columns=bc.feature_names)   
y = pd.Series(bc.target, name="target")              

print("Shape:", X.shape)


Shape: (569, 30)


In [4]:
X_train, X_tmp, y_train, y_tmp = train_test_split(
    X, y, test_size=0.40, random_state=42, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_tmp, y_tmp, test_size=0.50, random_state=42, stratify=y_tmp
)

In [5]:
num_pipe = Pipeline([("imp", SimpleImputer(strategy="median"))])
cat_pipe = Pipeline([
    ("imp", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore"))
])
pre = ColumnTransformer(
    transformers=[
        ("num", num_pipe, selector(dtype_include=np.number)),
        ("cat", cat_pipe, selector(dtype_exclude=np.number)),
    ],
    remainder="drop"
)

In [6]:
base = Pipeline([
    ("pre", pre),
    ("clf", RandomForestClassifier(
        random_state=42, n_jobs=-1, bootstrap=True
    ))
])
base.fit(X_train, y_train)
y_val_pred = base.predict(X_val)
y_val_proba = base.predict_proba(X_val)[:, 0]  # lớp 0 = malignant

print("\n== Baseline on VAL ==")
print("Accuracy:", accuracy_score(y_val, y_val_pred))
print("F1(malignant=pos):", f1_score(y_val, y_val_pred, pos_label=0))
print("ROC-AUC:", roc_auc_score(y_val, y_val_proba))


== Baseline on VAL ==
Accuracy: 0.9649122807017544
F1(malignant=pos): 0.9512195121951219
ROC-AUC: 0.0033068783068783076


In [None]:
pipe = Pipeline([
    ("pre", pre),
    ("clf", RandomForestClassifier(random_state=42, n_jobs=-1))
])

param_grid = {
    "clf__n_estimators": [200, 400, 800],
    "clf__criterion": ["gini", "entropy"],       # (ổn định trên mọi phiên bản)
    "clf__max_depth": [None, 6, 10, 14],
    "clf__min_samples_split": [2, 5, 10],
    "clf__min_samples_leaf": [1, 2, 4],
    "clf__max_features": ["sqrt", "log2", 0.5],  # 0.5 = 50% số feature
    "clf__bootstrap": [True, False],
    "clf__class_weight": [None, "balanced"],     # đề phòng lệch lớp
}

grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring={"roc_auc": "roc_auc", "f1": "f1", "accuracy": "accuracy"},
    refit="roc_auc",
    cv=5,
    n_jobs=-1,
    verbose=1
)
grid.fit(X_train, y_train)

print("\nBest params:", grid.best_params_)
best = grid.best_estimator_

y_val_pred = best.predict(X_val)
y_val_proba = best.predict_proba(X_val)[:, 0]

print("Accuracy:", accuracy_score(y_val, y_val_pred))
print("F1(malignant=pos):", f1_score(y_val, y_val_pred, pos_label=0))
print("ROC-AUC:", roc_auc_score(y_val, y_val_proba))

Fitting 5 folds for each of 2592 candidates, totalling 12960 fits

Best params: {'clf__bootstrap': True, 'clf__class_weight': 'balanced', 'clf__criterion': 'gini', 'clf__max_depth': None, 'clf__max_features': 'log2', 'clf__min_samples_leaf': 2, 'clf__min_samples_split': 2, 'clf__n_estimators': 400}

== Tuned model on VAL ==
Accuracy: 0.9649122807017544
F1(malignant=pos): 0.9512195121951219
ROC-AUC: 0.0036375661375661382


In [12]:
X_trainval = pd.concat([X_train, X_val], axis=0)
y_trainval = pd.concat([y_train, y_val], axis=0)

final_model = grid.best_estimator_
final_model.fit(X_trainval, y_trainval)

y_test_pred  = final_model.predict(X_test)
y_test_proba = final_model.predict_proba(X_test)[:, 0]

print("\nTestset:")
print("Accuracy:", accuracy_score(y_test, y_test_pred))
print("F1(malignant=pos):", f1_score(y_test, y_test_pred, pos_label=0))
print("ROC-AUC:", roc_auc_score(y_test, y_test_proba))

print("\nClassification report (TEST):")
print(classification_report(y_test, y_test_pred, target_names=bc.target_names))

cm = confusion_matrix(y_test, y_test_pred, labels=[0,1])
print("Confusion matrix [[TP_mal, FP_mal],[FN_mal, TN_mal]]:\n", cm)


Testset:
Accuracy: 0.9298245614035088
F1(malignant=pos): 0.9090909090909091
ROC-AUC: 0.008516213560432362

Classification report (TEST):
              precision    recall  f1-score   support

   malignant       0.89      0.93      0.91        43
      benign       0.96      0.93      0.94        71

    accuracy                           0.93       114
   macro avg       0.92      0.93      0.93       114
weighted avg       0.93      0.93      0.93       114

Confusion matrix [[TP_mal, FP_mal],[FN_mal, TN_mal]]:
 [[40  3]
 [ 5 66]]
