<a href="https://colab.research.google.com/github/Napawan2005/Machine-Learning-68/blob/main/PCA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [45]:
import numpy as np, pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score, balanced_accuracy_score, confusion_matrix
from scipy.stats import randint, loguniform

In [46]:
# ==== 1) Load dataset ====
# วิธี A: ถ้าใช้ kagglehub
import kagglehub
dataset_path = Path(kagglehub.dataset_download("andrewmvd/fetal-health-classification"))
df = pd.read_csv(dataset_path / "fetal_health.csv")

Using Colab cache for faster access to the 'fetal-health-classification' dataset.


In [47]:

print("ขนาดก่อนลบ:", df.shape)
df = df.drop_duplicates(keep="first")
print("ขนาดหลังลบ:", df.shape)


ขนาดก่อนลบ: (2126, 22)
ขนาดหลังลบ: (2113, 22)


In [48]:
TARGET = "fetal_health"
X = df.drop(columns=[TARGET]).select_dtypes(include=[np.number])
y = df[TARGET].astype(int)

In [49]:
# ==== 2) Train/Test split (stratify เพื่อคงสัดส่วนคลาส) ====
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [50]:

# ==== 3) Pipeline: Standardize -> PCA -> RandomForest ====
pipe = Pipeline([
    ("scaler", StandardScaler(with_mean=True, with_std=True)),  # สำคัญก่อนทำ PCA
    ("pca", PCA(random_state=42)),
    ("clf", RandomForestClassifier(
        random_state=42,
        n_jobs=-1,
        class_weight="balanced",
        n_estimators=503,
        criterion="gini",
        max_features="sqrt",
        max_leaf_nodes=49,
        min_samples_leaf=3,        # ใช้ค่าเดียว (เดิมใส่ซ้ำ)
        min_samples_split=2,       # แนะนำเพิ่มเพื่อคุมการแตกโหนด (ปรับได้ 2–50)
        bootstrap=True,
        oob_score=True,            # ใช้ได้ใน CV แต่เป็นเมตริกของ train-fold เท่านั้น

    )),
])

In [51]:
# ==== 4) CV & Scoring ====
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = {"f1_macro": "f1_macro", "bal_acc": "balanced_accuracy"}

In [52]:
# ==== 5) Hyperparameter search space ====
# - pca__n_components: เลือกจำนวนคอมโพเนนต์ 5..min(30, n_features)
# - clf params: ปรับขนาดป่า / ความลึก / จำนวนใบ ฯลฯ
n_features = X_train.shape[1]
param_dist = {
    "pca__n_components": randint(5, min(30, n_features)+1),

}

In [53]:
search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_dist,
    n_iter=X.shape[1],                # ปรับได้ตามงบเวลา
    cv=cv,
    scoring=scoring,
    refit="f1_macro",         # โมเดลสุดท้ายจะเลือกตาม f1_macro
    n_jobs=-1,
    random_state=42,
    verbose=1,
    return_train_score=False,
)


In [54]:
# ==== 6) Train ====
search.fit(X_train, y_train)
print("Best params:", search.best_params_)
print("CV best f1_macro:", search.best_score_)

Fitting 5 folds for each of 21 candidates, totalling 105 fits
Best params: {'pca__n_components': 16}
CV best f1_macro: 0.836076854350549


In [55]:
# ==== 7) Evaluate on test set ====
best_model = search.best_estimator_
y_pred = best_model.predict(X_test)
print("Test F1_macro:", f1_score(y_test, y_pred, average="macro"))
print("Test Balanced Acc.:", balanced_accuracy_score(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=4))


Test F1_macro: 0.8177307742525134
Test Balanced Acc.: 0.8070159725332139
Confusion matrix:
 [[315  12   3]
 [ 19  37   2]
 [  2   4  29]]
              precision    recall  f1-score   support

           1     0.9375    0.9545    0.9459       330
           2     0.6981    0.6379    0.6667        58
           3     0.8529    0.8286    0.8406        35

    accuracy                         0.9007       423
   macro avg     0.8295    0.8070    0.8177       423
weighted avg     0.8977    0.9007    0.8989       423



In [56]:
# ==== 8) Extra: ตรวจดู PCA ที่ถูกเลือก ====
pca_step = best_model.named_steps["pca"]
print("Chosen n_components:", pca_step.n_components_)
print("Explained variance ratio (first 10):", np.round(pca_step.explained_variance_ratio_[:10], 4))
print("Cumulative explained variance:", np.round(np.cumsum(pca_step.explained_variance_ratio_), 4))

Chosen n_components: 16
Explained variance ratio (first 10): [0.2887 0.1684 0.0891 0.0717 0.0578 0.0485 0.0467 0.0435 0.0361 0.0301]
Cumulative explained variance: [0.2887 0.4571 0.5463 0.618  0.6758 0.7244 0.7711 0.8146 0.8507 0.8809
 0.9084 0.9315 0.9489 0.964  0.9767 0.9851]


In [57]:
# ==== 9) Extra: OOB score ของ RF (จากตัวใน pipeline)
rf_step = best_model.named_steps["clf"]
print("OOB score (train-fold level):", getattr(rf_step, "oob_score_", None))

OOB score (train-fold level): 0.8988165680473372


In [58]:
oob_proba = rf_step.oob_decision_function_
valid = ~np.isnan(oob_proba).any(axis=1)  # กันกรณีบางแถวไม่มี OOB votes พอ
y_true = np.asarray(y_train)[valid]
oob_pred = rf_step.classes_[oob_proba[valid].argmax(axis=1)]

print("OOB F1_macro:", f1_score(y_true, oob_pred, average="macro"))
print("OOB Balanced Acc.:", balanced_accuracy_score(y_true, oob_pred))

OOB F1_macro: 0.8331580409518095
OOB Balanced Acc.: 0.838864209076975
