<a href="https://colab.research.google.com/github/Napawan2005/Machine-Learning-68/blob/main/feature_selection_(ANOVA).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import StratifiedKFold, train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, balanced_accuracy_score, classification_report
from scipy.stats import randint

import kagglehub  # ใช้โหลด dataset จาก Kaggle อัตโนมัติ

In [3]:

# ดาวน์โหลดและอ่านไฟล์
dataset_path = kagglehub.dataset_download("andrewmvd/fetal-health-classification")
file_path = Path(dataset_path) / "fetal_health.csv"
df = pd.read_csv(file_path)

TARGET = "fetal_health"
X = df.drop(columns=[TARGET]).select_dtypes(include=[np.number])
y = df[TARGET].astype(int)


print("ขนาดก่อนลบ:", df.shape)
data = df.drop_duplicates(keep="first")
print("ขนาดหลังลบ:", df.shape)

# แบ่ง train/test เพื่อประเมินสุดท้าย (นอกเหนือจาก CV)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


Using Colab cache for faster access to the 'fetal-health-classification' dataset.
ขนาดก่อนลบ: (2126, 22)
ขนาดหลังลบ: (2126, 22)


In [4]:
pipe = Pipeline(steps=[
    ("sel", SelectKBest(score_func=f_classif )),
    ("clf", RandomForestClassifier(
        random_state=42,
        n_jobs=-1,
        class_weight="balanced",
        n_estimators=503,
        criterion="gini",
        max_features="sqrt",
        max_leaf_nodes=49,
        min_samples_leaf=3,        # ใช้ค่าเดียว (เดิมใส่ซ้ำ)
        min_samples_split=2,       # แนะนำเพิ่มเพื่อคุมการแตกโหนด (ปรับได้ 2–50)
        bootstrap=True,
        oob_score=True,            # ใช้ได้ใน CV แต่เป็นเมตริกของ train-fold เท่านั้น

    )),
])


In [8]:
n_feats = X_train.shape[1]
param_distributions = {
    "sel__k": randint(5, min(50, n_feats)+1),     # << ค้นหา K ที่เหมาะสม
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = {"f1_macro": "f1_macro", "bal_acc": "balanced_accuracy"}

search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_distributions,
    n_iter= X.shape[1],
    cv=cv,
    scoring=scoring,
    refit="f1_macro",           # เลือกโมเดลสุดท้ายด้วย f1_macro
    n_jobs=-1,
    random_state=42,
    verbose=1,
    return_train_score=False,
)


In [9]:
search.fit(X_train, y_train)
print("Best params:", search.best_params_)
print("Best CV f1_macro:", search.best_score_)

# ประเมินบน test set (ที่กันไว้ตั้งแต่ต้น)
best_model = search.best_estimator_
y_pred = best_model.predict(X_test)
print("Test F1_macro:", f1_score(y_test, y_pred, average="macro"))
print("Test Balanced Acc.:", balanced_accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Fitting 5 folds for each of 21 candidates, totalling 105 fits
Best params: {'sel__k': 10}
Best CV f1_macro: 0.8901198852820891
Test F1_macro: 0.8566930441454191
Test Balanced Acc.: 0.8861893092952925
              precision    recall  f1-score   support

           1       0.97      0.93      0.95       332
           2       0.71      0.81      0.76        59
           3       0.82      0.91      0.86        35

    accuracy                           0.91       426
   macro avg       0.83      0.89      0.86       426
weighted avg       0.92      0.91      0.92       426



In [10]:
# ดึงตัวเลือกฟีเจอร์จากตัวคัดเลือกใน pipeline ที่ชนะ
sel = best_model.named_steps["sel"]
mask = sel.get_support()
selected_features = X_train.columns[mask].tolist()

print(f"Selected {len(selected_features)} features (k):", selected_features)


Selected 10 features (k): ['baseline value', 'accelerations', 'prolongued_decelerations', 'abnormal_short_term_variability', 'mean_value_of_short_term_variability', 'percentage_of_time_with_abnormal_long_term_variability', 'histogram_mode', 'histogram_mean', 'histogram_median', 'histogram_variance']


In [11]:
# --- OOB metrics (อ่านจาก RF ที่อยู่ใน pipeline ที่ fit แล้ว) ---
rf = best_model.named_steps["clf"]
print("OOB micro-accuracy (clf.oob_score_):", rf.oob_score_)

oob_proba = rf.oob_decision_function_
valid = ~np.isnan(oob_proba).any(axis=1)  # กันกรณีบางแถวไม่มี OOB votes พอ
y_true = np.asarray(y_train)[valid]
oob_pred = rf.classes_[oob_proba[valid].argmax(axis=1)]

print("OOB F1_macro:", f1_score(y_true, oob_pred, average="macro"))
print("OOB Balanced Acc.:", balanced_accuracy_score(y_true, oob_pred))

OOB micro-accuracy (clf.oob_score_): 0.9317647058823529
OOB F1_macro: 0.8909376601577121
OOB Balanced Acc.: 0.918110646911327
