<a href="https://colab.research.google.com/github/Napawan2005/Machine-Learning-68/blob/main/PCA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import numpy as np, pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score, balanced_accuracy_score, confusion_matrix
from scipy.stats import randint, loguniform

In [11]:
# ==== 1) Load dataset ====
# วิธี A: ถ้าใช้ kagglehub
import kagglehub
dataset_path = Path(kagglehub.dataset_download("andrewmvd/fetal-health-classification"))
df = pd.read_csv(dataset_path / "fetal_health.csv")

Using Colab cache for faster access to the 'fetal-health-classification' dataset.


In [12]:
TARGET = "fetal_health"
X = df.drop(columns=[TARGET]).select_dtypes(include=[np.number])
y = df[TARGET].astype(int)

In [13]:
# ==== 2) Train/Test split (stratify เพื่อคงสัดส่วนคลาส) ====
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [19]:
# ==== 3) Pipeline: Standardize -> PCA -> RandomForest ====
pipe = Pipeline([
    ("scaler", StandardScaler(with_mean=True, with_std=True)),  # สำคัญก่อนทำ PCA
    ("pca", PCA(random_state=42)),
    ("clf", RandomForestClassifier(
        random_state=42,
        n_jobs=-1,
        class_weight="balanced",
        n_estimators=503,
        criterion="gini",
        max_features="sqrt",
        max_leaf_nodes=49,
        min_samples_leaf=3,        # ใช้ค่าเดียว (เดิมใส่ซ้ำ)
        min_samples_split=2,       # แนะนำเพิ่มเพื่อคุมการแตกโหนด (ปรับได้ 2–50)
        bootstrap=True,
        oob_score=True,            # ใช้ได้ใน CV แต่เป็นเมตริกของ train-fold เท่านั้น

    )),
])

In [20]:
# ==== 4) CV & Scoring ====
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = {"f1_macro": "f1_macro", "bal_acc": "balanced_accuracy"}

In [21]:
# ==== 5) Hyperparameter search space ====
# - pca__n_components: เลือกจำนวนคอมโพเนนต์ 5..min(30, n_features)
# - clf params: ปรับขนาดป่า / ความลึก / จำนวนใบ ฯลฯ
n_features = X_train.shape[1]
param_dist = {
    "pca__n_components": randint(5, min(30, n_features)+1),

}

In [24]:
search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_dist,
    n_iter=25,                # ปรับได้ตามงบเวลา
    cv=cv,
    scoring=scoring,
    refit="f1_macro",         # โมเดลสุดท้ายจะเลือกตาม f1_macro
    n_jobs=-1,
    random_state=42,
    verbose=1,
    return_train_score=False,
)


In [25]:
# ==== 6) Train ====
search.fit(X_train, y_train)
print("Best params:", search.best_params_)
print("CV best f1_macro:", search.best_score_)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
Best params: {'pca__n_components': 16}
CV best f1_macro: 0.845343317600291


In [26]:
# ==== 7) Evaluate on test set ====
best_model = search.best_estimator_
y_pred = best_model.predict(X_test)
print("Test F1_macro:", f1_score(y_test, y_pred, average="macro"))
print("Test Balanced Acc.:", balanced_accuracy_score(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=4))


Test F1_macro: 0.8195587777224213
Test Balanced Acc.: 0.8154321888036408
Confusion matrix:
 [[312  19   1]
 [ 17  40   2]
 [  1   5  29]]
              precision    recall  f1-score   support

           1     0.9455    0.9398    0.9426       332
           2     0.6250    0.6780    0.6504        59
           3     0.9062    0.8286    0.8657        35

    accuracy                         0.8944       426
   macro avg     0.8256    0.8154    0.8196       426
weighted avg     0.8979    0.8944    0.8958       426



In [28]:
# ==== 8) Extra: ตรวจดู PCA ที่ถูกเลือก ====
pca_step = best_model.named_steps["pca"]
print("Chosen n_components:", pca_step.n_components_)
print("Explained variance ratio (first 10):", np.round(pca_step.explained_variance_ratio_[:10], 4))
print("Cumulative explained variance:", np.round(np.cumsum(pca_step.explained_variance_ratio_), 4))

Chosen n_components: 16
Explained variance ratio (first 10): [0.2849 0.1699 0.0885 0.072  0.059  0.0492 0.0451 0.0435 0.037  0.03  ]
Cumulative explained variance: [0.2849 0.4548 0.5433 0.6153 0.6743 0.7234 0.7685 0.812  0.8491 0.879
 0.9061 0.9294 0.9475 0.9632 0.9756 0.9842]


In [27]:
# ==== 9) Extra: OOB score ของ RF (จากตัวใน pipeline)
rf_step = best_model.named_steps["clf"]
print("OOB score (train-fold level):", getattr(rf_step, "oob_score_", None))

OOB score (train-fold level): 0.8970588235294118
