In [1]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd


In [2]:
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Step 1 - Load the Breast Cancer dataset (binary classification) and wrap it
# in pandas objects so the feature names travel with the data.
data = load_breast_cancer()
X = pd.DataFrame(data=data.data, columns=data.feature_names)
y = pd.Series(data=data.target, name="target")

print("X shape:", X.shape)
print("y distribution:\n", y.value_counts(normalize=True))

# Step 2 - Hold out 20% of the rows as a test set. Stratifying on y keeps the
# class proportions the same in both splits; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print("Train size:", X_train.shape, "Test size:", X_test.shape)

# Step 3 - Standardize the features. The scaler learns its statistics from the
# training split only and the test split reuses them, so no test-set
# information leaks into preprocessing. Scaling matters for scale-sensitive
# models such as logistic regression and SVM.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Scaled shapes:", X_train_scaled.shape, X_test_scaled.shape)


X shape: (569, 30)
y distribution:
 target
1    0.627417
0    0.372583
Name: proportion, dtype: float64
Train size: (455, 30) Test size: (114, 30)
Scaled shapes: (455, 30) (114, 30)


In [3]:
from sklearn.ensemble import RandomForestClassifier

# Tune a random forest with an exhaustive grid search.
# The seed is fixed so repeated runs build the same forests.
rf_clf = RandomForestClassifier(random_state=42)

# 2 * 3 * 2 * 2 = 24 hyperparameter combinations are evaluated.
rf_param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

# Each combination is scored with 5-fold cross-validation on weighted F1,
# using all available CPU cores. `fit` returns the search object itself.
# NOTE(review): the forest is trained on the standardized features; tree-based
# models are generally insensitive to feature scaling, so this is presumably
# done for consistency with other models in the notebook - confirm.
rf_grid = GridSearchCV(
    rf_clf,
    rf_param_grid,
    scoring="f1_weighted",
    cv=5,
    n_jobs=-1,
).fit(X_train_scaled, y_train)

print("Best params (RF):", rf_grid.best_params_)
print("Best CV score:", rf_grid.best_score_)

# Evaluate the best configuration on the held-out test split, using the same
# weighted-F1 metric as the search so the two scores are comparable.
best_rf = rf_grid.best_estimator_
y_pred_rf = best_rf.predict(X_test_scaled)
print("Test F1 (RF tuned):", f1_score(y_test, y_pred_rf, average="weighted"))


Best params (RF): {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9604072600802273
Test F1 (RF tuned): 0.9560273762928302
