In [1]:
pip install catboost joblib

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-win_amd64.whl.metadata (1.5 kB)
Collecting graphviz (from catboost)
  Downloading graphviz-0.21-py3-none-any.whl.metadata (12 kB)
Collecting plotly (from catboost)
  Downloading plotly-6.3.0-py3-none-any.whl.metadata (8.5 kB)
Collecting narwhals>=1.15.1 (from plotly->catboost)
  Downloading narwhals-2.1.1-py3-none-any.whl.metadata (11 kB)
Downloading catboost-1.2.8-cp311-cp311-win_amd64.whl (102.5 MB)
   ---------------------------------------- 0.0/102.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/102.5 MB ? eta -:--:--
   ---------------------------------------- 0.1/102.5 MB 1.7 MB/s eta 0:01:02
   ---------------------------------------- 0.6/102.5 MB 5.3 MB/s eta 0:00:20
    --------------------------------------- 1.3/102.5 MB 8.2 MB/s eta 0:00:13
    --------------------------------------- 2.3/102.5 MB 11.2 MB/s eta 0:00:09
   - -------------------------------------- 3.4/102.5 MB 13.4 MB/s eta 0:00:08



[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: C:\Users\ACER\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [9]:
# ==== catboost_full_auto_weka_fixed_rmse.py ====
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    cohen_kappa_score, mean_absolute_error, mean_squared_error
)
from catboost import CatBoostClassifier, Pool
import joblib

# ---------------- 1) Locate the CSV file (fail-safe) ----------------
# If the file sits in the working directory, the bare file name is enough.
CSV_PATH = Path("data set is.csv")
if not CSV_PATH.exists():
    # Fallback search locations, in priority order.
    ROOT_DIRS = [
        Path.cwd(),
        Path.home() / "Desktop",
        Path.home() / "OneDrive" / "Desktop" / "s3",
    ]
    # Most specific pattern first; the catch-all "*.csv" is a last resort.
    PATTERNS = ["*data*set*is*.csv", "*dataset*is*.csv", "*.csv"]

    def find_csv(roots, patterns):
        """Return the resolved path of the best-matching CSV, or None.

        Patterns are tried in the given order across *all* existing roots
        before falling back to the next pattern, so a specific match in a
        later root wins over a catch-all match in an earlier root. Matches
        within one directory are sorted so the choice is deterministic.

        Args:
            roots: iterable of directories (Path) to search, non-recursively.
            patterns: iterable of glob patterns, most specific first.

        Returns:
            Path of the first regular file found, resolved; None if no
            pattern matches in any root.
        """
        existing_roots = [r for r in roots if r.exists()]
        for pat in patterns:
            for r in existing_roots:
                for p in sorted(r.glob(pat)):
                    if p.is_file():
                        return p.resolve()
        return None

    CSV_PATH = find_csv(ROOT_DIRS, PATTERNS)
    if not CSV_PATH:
        raise FileNotFoundError(
            "ไม่พบไฟล์ CSV — แก้ CSV_PATH ให้เป็นพาธเต็ม หรือวางไฟล์ไว้โฟลเดอร์เดียวกับสคริปต์"
        )

# Try several separator / encoding combinations automatically.
# sep=None is tried first and passed straight to pandas' python engine.
read_ok, last_err = False, None
for sep in [None, ",", ";", "\t", "|"]:
    for enc in ["utf-8-sig", "utf-8", "cp874", "latin-1"]:
        try:
            # Only the read itself is guarded: a failure while *reporting*
            # success must not be mistaken for a failed read.
            df = pd.read_csv(CSV_PATH, sep=sep, encoding=enc, engine="python")
        except Exception as e:
            # Remember the most recent failure for the final error message.
            last_err = e
            continue
        read_ok = True
        break
    if read_ok:
        break
if not read_ok:
    raise RuntimeError(f"อ่านไฟล์ไม่สำเร็จ: {last_err}")
# `sep` and `enc` still hold the combination that succeeded.
print(f"✅ ใช้ไฟล์: {CSV_PATH} (sep={repr(sep)}, enc='{enc}')")

# ---------------- 2) Target / Features ----------------
# Accepted spellings of the label column, checked in this order;
# the first one present in the frame is used.
POSSIBLE_TARGETS = ["Diet_Recommendation", "diet_recommendation", "Target"]
target_col = next(filter(df.columns.__contains__, POSSIBLE_TARGETS), None)
if not target_col:
    raise KeyError(f"ไม่พบคอลัมน์ Target ใน {list(df.columns)}")

# Split the frame into the feature matrix and the raw (text) label column.
X = df.drop(columns=[target_col])
y_text = df[target_col]

# ---------------- 3) Missing-value handling ----------------
# Column groups are derived from dtype: object columns are treated as
# categorical, numpy-numeric columns as numeric.
categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()
numeric_cols     = X.select_dtypes(include=[np.number]).columns.tolist()

# Categorical: values must not be NaN for CatBoost, so NaN and the usual
# textual null placeholders become the literal 'missing', always as str.
if categorical_cols:
    X[categorical_cols] = (
        X[categorical_cols]
        .astype("object")
        .mask(X[categorical_cols].isna(), other="missing")
        .replace(["nan", "NaN", "None", "NULL", "<NA>"], "missing")
        .astype(str)
    )

# Numeric: fill NaN with the column median (basic safety net).
# NOTE(review): the median is taken over the full feature frame, i.e. before
# the train/test split performed further down.
for col in numeric_cols:
    if not X[col].isna().any():
        continue
    X[col] = X[col].fillna(X[col].median())

# ---------------- 4) Encode target ----------------
# Map the text labels to integer ids; `le` is kept to translate ids back.
le = LabelEncoder()
y = le.fit_transform(y_text)
# Show which integer id was assigned to each class name.
print("🔖 Target mapping:", dict(zip(le.classes_, range(len(le.classes_)))))

# ---------------- 5) Train / validation / test split ----------------
# Step 1: hold out 30% of the rows as the test set, stratified on the label.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.30,
    random_state=42,
)
# Step 2: take 15% of the remaining rows as the validation set, again
# stratified, using the same seed.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    stratify=y_train_full,
    test_size=0.15,
    random_state=42,
)

# ---------------- 6) CatBoost Pools ----------------
# Positional indices of the categorical columns within X.
cat_idx = list(map(X.columns.get_loc, categorical_cols))
# One Pool per split, all sharing the same categorical-feature indices.
train_pool = Pool(data=X_train, label=y_train, cat_features=cat_idx)
val_pool   = Pool(data=X_val,   label=y_val,   cat_features=cat_idx)
test_pool  = Pool(data=X_test,  label=y_test,  cat_features=cat_idx)

# ---------------- 7) CatBoost model ----------------
cb = CatBoostClassifier(
    # Objective and the metric monitored on the validation pool.
    loss_function="MultiClass",
    eval_metric="TotalF1",
    # Boosting schedule.
    iterations=1200,
    learning_rate=0.05,
    early_stopping_rounds=100,
    # Tree shape and regularisation.
    depth=6,
    l2_leaf_reg=3.0,
    # Sampling settings.
    bootstrap_type="Bernoulli",  # chosen so that `subsample` can be used
    subsample=0.8,
    rsm=0.8,
    # Reproducibility and logging frequency.
    random_seed=42,
    verbose=100,
)
# Fit on the training pool; the validation pool drives early stopping and
# best-model selection.
cb.fit(X=train_pool, eval_set=val_pool, use_best_model=True)

# ---------------- 8) Predict ----------------
# Hard labels as a flat integer vector, plus the class-probability matrix.
y_pred = np.ravel(cb.predict(test_pool).astype(int))
proba  = cb.predict_proba(test_pool)

# ---------------- 9) WEKA-style Summary ----------------
n_classes = len(le.classes_)

acc  = accuracy_score(y_test, y_pred)
f1m  = classification_report(y_test, y_pred, target_names=le.classes_, output_dict=True)["macro avg"]["f1-score"]
kap  = cohen_kappa_score(y_test, y_pred)
# NOTE: MAE / RMSE are computed on the integer-encoded class ids.
mae  = mean_absolute_error(y_test, y_pred)
# Works on every sklearn version: take the square root of the MSE ourselves.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Relative errors against the prior class distribution (WEKA-style RAE / RRSE).
# minlength keeps `prior` aligned with the one-hot width even when the
# highest class id does not occur in y_test.
prior = np.bincount(y_test, minlength=n_classes) / len(y_test)
y_onehot = np.eye(n_classes)[y_test]

# Per-instance errors of the model's probabilities vs. the one-hot truth.
abs_err  = np.abs(y_onehot - proba).sum(axis=1) / 2.0
sq_err   = ((y_onehot - proba) ** 2).sum(axis=1) / 2.0

# Per-instance errors of the baseline that always predicts `prior`.
mae_base = np.abs(y_onehot - prior).sum(axis=1) / 2.0
sq_base  = ((y_onehot - prior) ** 2).sum(axis=1) / 2.0
rmse_base = np.sqrt(sq_base)  # per-instance root error, kept for inspection

rae  = (abs_err.mean() / mae_base.mean()) * 100 if mae_base.mean() > 0 else float("inf")
# RRSE compares like with like: sqrt(mean squared error / mean squared
# baseline error), rather than sqrt-of-mean over mean-of-sqrt.
rrse = np.sqrt(sq_err.mean() / sq_base.mean()) * 100 if sq_base.mean() > 0 else float("inf")

# Headline figures for the held-out test set.
print("\n========== WEKA-STYLE (CatBoost) ==========")
print(f"Total number of instances (after ignore): {len(y_test)}")
print(f"Accuracy: {acc:.4f}")
print(f"F1_macro: {f1m:.4f}")
print(f"Kappa statistic: {kap:.4f}\n")

# Per-class precision / recall / F1, labelled with the original class names.
print("📊 Detailed Accuracy By Class")
print(classification_report(y_test, y_pred, target_names=le.classes_))

# Confusion matrix over the encoded labels.
print("🧩 Confusion Matrix (rows=true, cols=predicted)")
print(confusion_matrix(y_test, y_pred))

# Hit / miss counts on the test set; percentages are derived from `acc`.
print(f"\n✅ Correctly classified instances: {(y_pred == y_test).sum()} / {len(y_test)} ({acc*100:.2f}%)")
print(f"❌ Incorrectly classified instances: {(y_pred != y_test).sum()} / {len(y_test)} ({(1-acc)*100:.2f}%)")
# Error figures computed in the metrics section.
print(f"\n📏 Mean absolute error (MAE): {mae:.6f}")
print(f"📐 Root mean squared error (RMSE): {rmse:.6f}")
print(f"📏 Relative absolute error (RAE): {rae:.2f}%")
print(f"📐 Root relative squared error (RRSE): {rrse:.2f}%")

# ---------------- 10) Save model ----------------
# Persist the fitted model and the label encoder; both paths are relative,
# so the files land in the current working directory.
cb.save_model("diet_catboost_model.cbm")
# The encoder is needed later to map predicted class ids back to label text.
joblib.dump(le, "label_encoder.joblib")
print("\n💾 Saved: diet_catboost_model.cbm, label_encoder.joblib")

# ---------------- 11) predict_one() ----------------
def predict_one(sample_dict: dict):
    """Predict the class of a single record.

    Args:
        sample_dict: mapping of feature name -> value. Keys should match the
            columns of the training frame ``X``; missing keys become NaN and
            extra keys are dropped by the reindex.

    Returns:
        (label_text, proba_by_class): the predicted label decoded back to its
        original text, and a dict mapping each class name to its predicted
        probability (as a Python float).
    """
    # Align the record to the training column order.
    sample = pd.DataFrame([sample_dict]).reindex(columns=X.columns)

    # Coerce dtypes so they match what the model was trained on.
    for col in numeric_cols:
        sample[col] = pd.to_numeric(sample[col], errors="coerce")
        if sample[col].isna().any():
            # Same fallback as in training: the column median of X.
            sample[col] = sample[col].fillna(X[col].median())
    if categorical_cols:
        sample[categorical_cols] = (
            sample[categorical_cols]
            .astype("object")
            .where(~sample[categorical_cols].isna(), other="missing")
            # Normalise textual null placeholders exactly as done for the
            # training data, so both sides use the same 'missing' token.
            .replace(["nan", "NaN", "None", "NULL", "<NA>"], "missing")
            .astype(str)
        )

    pool = Pool(sample, cat_features=cat_idx)
    # Flatten before taking the first element: the prediction for one row may
    # come back as a nested (1, 1) array, and int() on a non-scalar array is
    # deprecated in recent NumPy.
    pred = int(np.asarray(cb.predict(pool)).reshape(-1)[0])
    prob = cb.predict_proba(pool)[0]
    return le.inverse_transform([pred])[0], dict(zip(le.classes_, map(float, prob)))

# Usage example: feed the first test row back through predict_one().
demo = X_test.iloc[0].to_dict()
lbl, prob = predict_one(demo)
print("\n🧪 Example prediction:", lbl, prob)


✅ ใช้ไฟล์: C:\Users\ACER\OneDrive\Desktop\is3\black end\datasetis.csv (sep=None, enc='utf-8-sig')
🔖 Target mapping: {'Balanced': 0, 'Low_Carb': 1, 'Low_Sodium': 2}
0:	learn: 0.9966403	test: 1.0000000	best: 1.0000000 (0)	total: 61.7ms	remaining: 1m 13s
100:	learn: 1.0000000	test: 1.0000000	best: 1.0000000 (0)	total: 6.14s	remaining: 1m 6s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 1
bestIteration = 0

Shrink model to first 1 iterations.

Total number of instances (after ignore): 300
Accuracy: 1.0000
F1_macro: 1.0000
Kappa statistic: 1.0000

📊 Detailed Accuracy By Class
              precision    recall  f1-score   support

    Balanced       1.00      1.00      1.00       128
    Low_Carb       1.00      1.00      1.00        77
  Low_Sodium       1.00      1.00      1.00        95

    accuracy                           1.00       300
   macro avg       1.00      1.00      1.00       300
weighted avg       1.00      1.00      1.00       300

🧩 Confusion Matrix (

  pred = int(cb.predict(pool)[0])


In [8]:
pip install --upgrade scikit-learn


Collecting scikit-learn
  Using cached scikit_learn-1.7.1-cp311-cp311-win_amd64.whl.metadata (11 kB)
Using cached scikit_learn-1.7.1-cp311-cp311-win_amd64.whl (8.9 MB)
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.6.1
    Uninstalling scikit-learn-1.6.1:
      Successfully uninstalled scikit-learn-1.6.1
Successfully installed scikit-learn-1.7.1
Note: you may need to restart the kernel to use updated packages.


  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
sklearn-compat 0.1.3 requires scikit-learn<1.7,>=1.2, but you have scikit-learn 1.7.1 which is incompatible.

[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: C:\Users\ACER\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip
