In [1]:
# ==== diet_model_pipeline_full_final2.py ====
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline as SkPipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
import joblib

# -------------------------------------------------
# 1) หาไฟล์ CSV อัตโนมัติ (ปรับ ROOT_DIRS ให้ตรงเครื่อง)
# -------------------------------------------------
ROOT_DIRS = [
    Path.cwd(),                                  # โฟลเดอร์ที่รันสคริปต์
    Path(r"C:\Users\ACER\OneDrive\Desktop\s3"),  # ตัวอย่างตำแหน่งที่คุณใช้งาน
]
PATTERNS = ["*data*set*is*.csv", "*dataset*is*.csv", "*.csv"]

def find_csv(roots, patterns):
    for root in roots:
        if not root.exists():
            continue
        for pat in patterns:
            for p in root.glob(pat):
                if p.is_file():
                    return p.resolve()
    return None

csv_path = find_csv(ROOT_DIRS, PATTERNS)
if not csv_path:
    raise FileNotFoundError("ไม่พบไฟล์ CSV ใน ROOT_DIRS — กรุณาแก้ ROOT_DIRS หรือวางไฟล์ไว้โฟลเดอร์เดียวกับสคริปต์")
print(f"✅ ใช้ไฟล์: {csv_path}")

# -------------------------------------------------
# 2) อ่านข้อมูล (fallback หลาย encoding/sep)
# -------------------------------------------------
read_ok, last_err = False, None
for sep in [None, ",", ";", "\t", "|"]:
    for enc in ["utf-8-sig", "utf-8", "cp874", "latin-1"]:
        try:
            df = pd.read_csv(csv_path, sep=sep, encoding=enc, engine="python")
            read_ok = True
            print(f"   → อ่านสำเร็จด้วย sep={repr(sep)}, encoding='{enc}'")
            break
        except Exception as e:
            last_err = e
    if read_ok:
        break
if not read_ok:
    raise RuntimeError(f"อ่านไฟล์ไม่สำเร็จ: {last_err}")

# -------------------------------------------------
# 3) แยก Target / Features
# -------------------------------------------------
POSSIBLE_TARGETS = ["Diet_Recommendation", "diet_recommendation", "Target"]
target_col = next((c for c in POSSIBLE_TARGETS if c in df.columns), None)
if not target_col:
    raise KeyError(f"ไม่พบคอลัมน์ Target ใน {list(df.columns)} — กรุณาแก้ POSSIBLE_TARGETS ให้ตรงชื่อจริง")

y = df[target_col]
X = df.drop(columns=[target_col])

print("🎯 ตัวอย่าง Target:", y.unique()[:5])
print("📊 สัดส่วนคลาส:\n", y.value_counts(normalize=True).round(3))

# -------------------------------------------------
# 4) แบ่งชนิดคอลัมน์ + เข้ารหัส Target
# -------------------------------------------------
categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()
numeric_cols     = X.select_dtypes(include=[np.number]).columns.tolist()

le = LabelEncoder()
y_encoded = le.fit_transform(y)
print("🔖 Target mapping:", {cls: int(i) for i, cls in enumerate(le.classes_)})

# -------------------------------------------------
# 5) Train/Test split
# -------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.30, random_state=42, stratify=y_encoded
)

# -------------------------------------------------
# 6) Preprocessor (รองรับ sklearn เก่า/ใหม่)
# -------------------------------------------------
try:
    categorical_tf = OneHotEncoder(handle_unknown="ignore", sparse_output=False)  # >=1.2
except TypeError:
    categorical_tf = OneHotEncoder(handle_unknown="ignore", sparse=False)         # <1.2

numeric_tf = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", categorical_tf, categorical_cols),
        ("num", numeric_tf, numeric_cols),
    ],
    remainder="drop"
)

# -------------------------------------------------
# 7) Pipeline สำหรับฝึก (ใช้ SMOTE ถ้ามี)
# -------------------------------------------------
use_smote = False
try:
    from imblearn.over_sampling import SMOTE
    from imblearn.pipeline import Pipeline as ImbPipeline
    smote = SMOTE(random_state=42)
    use_smote = True
    print("🔄 จะใช้ SMOTE ระหว่างฝึก (พบ imbalanced-learn)")
except Exception:
    smote = None
    print("ℹ️ ไม่พบ imbalanced-learn → ข้าม SMOTE (ติดตั้ง: pip install imbalanced-learn)")

rf = RandomForestClassifier(random_state=42, class_weight="balanced")
if use_smote:
    train_pipe = ImbPipeline([("prep", preprocessor), ("smote", smote), ("model", rf)])
else:
    train_pipe = SkPipeline([("prep", preprocessor), ("model", rf)])

# -------------------------------------------------
# 8) RandomizedSearch + CV
# -------------------------------------------------
param_dist = {
    "model__n_estimators": [150, 250, 400, 600],
    "model__max_depth": [None, 8, 12, 20],
    "model__min_samples_split": [2, 5, 10],
    "model__min_samples_leaf": [1, 2, 4],
    "model__max_features": ["sqrt", "log2", None],
}
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

search = RandomizedSearchCV(
    estimator=train_pipe,
    param_distributions=param_dist,
    n_iter=12,
    scoring="f1_macro",
    cv=cv,
    n_jobs=-1,
    random_state=42,
    verbose=1,
)
search.fit(X_train, y_train)
print(f"\n🏆 Best CV F1_macro: {search.best_score_:.4f}")
print("🔧 Best params:", search.best_params_)

best_model = search.best_estimator_

# -------------------------------------------------
# 9) สร้าง inference_model สำหรับทำนายจริง (ไม่มี SMOTE)
# -------------------------------------------------
if "smote" in best_model.named_steps:
    prep_fitted  = best_model.named_steps["prep"]
    model_fitted = best_model.named_steps["model"]
    inference_model = SkPipeline([("prep", prep_fitted), ("model", model_fitted)])
else:
    inference_model = best_model

# -------------------------------------------------
# 10) ประเมินบน Test set
# -------------------------------------------------
y_pred = inference_model.predict(X_test)
print("\n📈 Test Accuracy :", round(accuracy_score(y_test, y_pred), 4))
print("📊 Test F1_macro :", round(f1_score(y_test, y_pred, average='macro'), 4))
print("\n📋 Classification report:\n", classification_report(y_test, y_pred, target_names=le.classes_))
print("🧩 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# -------------------------------------------------
# 11) บันทึกโมเดล + LabelEncoder
# -------------------------------------------------
joblib.dump(inference_model, "diet_recommendation_model.joblib")
joblib.dump(le, "label_encoder.joblib")
print("\n💾 Saved: diet_recommendation_model.joblib, label_encoder.joblib")

# -------------------------------------------------
# 12) ฟังก์ชันทำนาย 1 รายการ (กันพัง: บังคับคอลัมน์/ชนิดข้อมูลให้ตรงกับตอนเทรน)
# -------------------------------------------------
def predict_one(sample_dict: dict):
    """
    รับ dict 1 รายการ (คีย์ต้องตรงกับคอลัมน์ X ที่ใช้เทรน)
    คืน: (label_text, proba_by_class: dict)
    """
    sample_df = pd.DataFrame([sample_dict])

    # เรียงคอลัมน์ให้ตรงกับ X และเติมคอลัมน์ที่หายไป
    sample_df = sample_df.reindex(columns=X.columns)

    # บังคับชนิดข้อมูลให้ตรง
    for col in numeric_cols:
        sample_df[col] = pd.to_numeric(sample_df[col], errors="coerce")
    for col in categorical_cols:
        # ถ้าเป็น NaN ให้แปลงเป็น string "nan" เพื่อไม่ให้ scaler/encoder ล้ม
        sample_df[col] = sample_df[col].astype(str)

    pred_num   = inference_model.predict(sample_df)[0]
    pred_label = le.inverse_transform([pred_num])[0]
    proba      = inference_model.predict_proba(sample_df)[0]
    return pred_label, dict(zip(le.classes_, map(float, proba)))

# -------------------------------------------------
# 13) ตัวอย่างเรียกใช้งาน + บล็อก DEBUG
# -------------------------------------------------
ex = X.iloc[0].to_dict()          # หรือใช้ X_test.iloc[0].to_dict() ก็ได้
label, proba = predict_one(ex)
print("\n🧪 Example prediction:", label, proba)

# (ถ้าติด ให้ดูบล็อกตรวจสอบด้านล่าง)
print("\n[DEBUG] predict_one defined? ->", callable(globals().get("predict_one", None)))
print("[DEBUG] inference predict sanity ->", inference_model.predict(X_test.head(1)))
tmp_before = pd.DataFrame([ex]).reindex(columns=X.columns)
print("[DEBUG] dtypes of X.head(1):\n", X.head(1).dtypes)
print("[DEBUG] dtypes of sample before cast:\n", tmp_before.dtypes)
missing = set(X.columns) - set(tmp_before.columns)
extra   = set(tmp_before.columns) - set(X.columns)
print("[DEBUG] keys mismatch? missing:", missing, " extra:", extra)


✅ ใช้ไฟล์: C:\Users\ACER\OneDrive\Desktop\is3\black end\datasetis.csv
   → อ่านสำเร็จด้วย sep=None, encoding='utf-8-sig'
🎯 ตัวอย่าง Target: ['Balanced' 'Low_Carb' 'Low_Sodium']
📊 สัดส่วนคลาส:
 Diet_Recommendation
Balanced      0.426
Low_Sodium    0.316
Low_Carb      0.258
Name: proportion, dtype: float64
🔖 Target mapping: {'Balanced': 0, 'Low_Carb': 1, 'Low_Sodium': 2}
🔄 จะใช้ SMOTE ระหว่างฝึก (พบ imbalanced-learn)
Fitting 5 folds for each of 12 candidates, totalling 60 fits

🏆 Best CV F1_macro: 1.0000
🔧 Best params: {'model__n_estimators': 150, 'model__min_samples_split': 5, 'model__min_samples_leaf': 4, 'model__max_features': None, 'model__max_depth': 20}

📈 Test Accuracy : 1.0
📊 Test F1_macro : 1.0

📋 Classification report:
               precision    recall  f1-score   support

    Balanced       1.00      1.00      1.00       128
    Low_Carb       1.00      1.00      1.00        77
  Low_Sodium       1.00      1.00      1.00        95

    accuracy                           1.00 

In [7]:
# ===== XGBoost + WEKA-style summary (ต่อจาก Random Forest baseline) =====
import numpy as np, pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, f1_score, confusion_matrix,
    precision_recall_fscore_support, cohen_kappa_score
)

# 0) ติดตั้ง/นำเข้า XGBoost
try:
    from xgboost import XGBClassifier
except Exception as e:
    raise ImportError("ต้องติดตั้ง xgboost ก่อน: pip install xgboost") from e

# 1) ถ้ามีตัวแปรจาก baseline แล้ว (X, y, le, categorical_cols, numeric_cols, preprocessor)
#    จะใช้ต่อทันที; ถ้ายังไม่มี ให้โหลด/เตรียมให้แบบย่อ:
if 'X' not in globals():
    import pandas as pd
    from pathlib import Path
    csv_path = Path(r"C:\Users\ACER\OneDrive\Desktop\s3\data set is.csv")  # แก้ให้ตรงเครื่อง
    df = pd.read_csv(csv_path, encoding="utf-8-sig")
    target_col = "Diet_Recommendation"
    y_text = df[target_col]
    X = df.drop(columns=[target_col])
    le = LabelEncoder()
    y = le.fit_transform(y_text)
    categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()
    numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
    try:
        ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    except TypeError:
        ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)
    preprocessor = ColumnTransformer(
        [("cat", ohe, categorical_cols), ("num", StandardScaler(), numeric_cols)],
        remainder="drop"
    )

# 2) แบ่งชุด (คง random_state/stratify ให้เปรียบเทียบกับ baseline ได้ยุติธรรม)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)

# 3) โมเดล XGBoost + Pipeline
xgb = XGBClassifier(
    objective="multi:softprob",
    num_class=len(np.unique(y)),
    tree_method="hist",
    eval_metric="mlogloss",
    random_state=42,
    n_estimators=400,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
)
xgb_pipe = Pipeline([("prep", preprocessor), ("model", xgb)])

# 4) เทรน
xgb_pipe.fit(X_train, y_train)

# 5) ทำนาย
y_pred = xgb_pipe.predict(X_test)

# 6) ===== WEKA-style SUMMARY (ครบทุกหัวข้อ) =====
# ทำ y_true/y_pred ให้เป็น Series ผูก index กับ X_test แล้วกรอง unknown classes
classes = list(le.classes_)
y_true_s = pd.Series(y_test, index=X_test.index)       # เป็น label-encoded (ตัวเลข)
y_pred_s = pd.Series(y_pred, index=X_test.index)
mask = y_true_s.notna() & y_true_s.isin(range(len(classes)))  # ignore class unknown instances

y_true = y_true_s.loc[mask].to_numpy()
y_pred = y_pred_s.loc[mask].to_numpy()
X_eval = X_test.loc[mask]

# เมตริกหลัก
acc = accuracy_score(y_true, y_pred)
f1m = f1_score(y_true, y_pred, average="macro")
kappa = cohen_kappa_score(y_true, y_pred)
print("\n===== XGBoost (ต่อจาก Baseline) =====")
print(f"Accuracy: {acc:.4f}")
print(f"F1_macro: {f1m:.4f}")
print(f"Kappa: {kappa:.4f}")
print(f"Total number of instances (after ignore): {len(y_true)}")

# Detailed accuracy by class
prec, rec, f1, sup = precision_recall_fscore_support(
    y_true, y_pred, labels=range(len(classes)), zero_division=0
)
print("\n📊 Detailed Accuracy By Class")
print(f"{'Class':<20}{'Precision':>10}{'Recall':>10}{'F1-Score':>10}{'Support':>10}")
print("-"*60)
for i, cls in enumerate(classes):
    print(f"{cls:<20}{prec[i]:>10.3f}{rec[i]:>10.3f}{f1[i]:>10.3f}{int(sup[i]):>10}")

# Confusion matrix (เลขดิบ)
cm = confusion_matrix(y_true, y_pred, labels=range(len(classes)))
print("\n🧩 Confusion Matrix (rows=true, cols=predicted)")
print(" " * 12 + "  ".join([f"{c:>10}" for c in classes]))
for i, cls in enumerate(classes):
    row_str = "  ".join([f"{n:>10}" for n in cm[i]])
    print(f"{cls:<12}{row_str}")

# Correct / Incorrect
correct = int((y_true == y_pred).sum())
total = int(len(y_true))
incorrect = total - correct
print(f"\n✅ Correctly classified instances:   {correct} / {total}  ({correct/total*100:.2f}%)")
print(f"❌ Incorrectly classified instances: {incorrect} / {total}  ({incorrect/total*100:.2f}%)")

# MAE / RMSE (probability-based) และ RAE / RRSE (baseline = prior ของชุดประเมิน)
proba = xgb_pipe.predict_proba(X_eval)                 # [N, n_classes]
y_onehot = np.eye(len(classes))[y_true]                # one-hot ของป้ายจริง

abs_err = np.abs(y_onehot - proba).sum(axis=1) / 2.0
mae = abs_err.mean()
sq_err = ((y_onehot - proba) ** 2).sum(axis=1) / 2.0
rmse = np.sqrt(sq_err.mean())
print(f"\n📐 Mean absolute error (MAE): {mae:.6f}")
print(f"📐 Root mean squared error (RMSE): {rmse:.6f}")

prior_counts = pd.Series(y_true).value_counts().reindex(range(len(classes)), fill_value=0).values
prior_dist = prior_counts / prior_counts.sum()
abs_err_base = np.abs(y_onehot - prior_dist).sum(axis=1) / 2.0
mae_base = abs_err_base.mean()
sq_err_base = ((y_onehot - prior_dist) ** 2).sum(axis=1) / 2.0
rmse_base = np.sqrt(sq_err_base.mean())

rae  = (mae / mae_base) * 100.0 if mae_base > 0 else float("inf")
rrse = (rmse / rmse_base) * 100.0 if rmse_base > 0 else float("inf")
print(f"📏 Relative absolute error (RAE): {rae:.2f}%")
print(f"📏 Root relative squared error (RRSE): {rrse:.2f}%")



===== XGBoost (ต่อจาก Baseline) =====
Accuracy: 1.0000
F1_macro: 1.0000
Kappa: 1.0000
Total number of instances (after ignore): 300

📊 Detailed Accuracy By Class
Class                Precision    Recall  F1-Score   Support
------------------------------------------------------------
Balanced                 1.000     1.000     1.000       128
Low_Carb                 1.000     1.000     1.000        77
Low_Sodium               1.000     1.000     1.000        95

🧩 Confusion Matrix (rows=true, cols=predicted)
              Balanced    Low_Carb  Low_Sodium
Balanced           128           0           0
Low_Carb             0          77           0
Low_Sodium           0           0          95

✅ Correctly classified instances:   300 / 300  (100.00%)
❌ Incorrectly classified instances: 0 / 300  (0.00%)

📐 Mean absolute error (MAE): 0.002956
📐 Root mean squared error (RMSE): 0.002619
📏 Relative absolute error (RAE): 0.45%
📏 Root relative squared error (RRSE): 0.46%
