In [1]:
# ==== diet_model_pipeline_full_final2.py ====
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline as SkPipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
import joblib

# -------------------------------------------------
# 1) ‡∏´‡∏≤‡πÑ‡∏ü‡∏•‡πå CSV ‡∏≠‡∏±‡∏ï‡πÇ‡∏ô‡∏°‡∏±‡∏ï‡∏¥ (‡∏õ‡∏£‡∏±‡∏ö ROOT_DIRS ‡πÉ‡∏´‡πâ‡∏ï‡∏£‡∏á‡πÄ‡∏Ñ‡∏£‡∏∑‡πà‡∏≠‡∏á)
# -------------------------------------------------
ROOT_DIRS = [
    Path.cwd(),                                  # ‡πÇ‡∏ü‡∏•‡πÄ‡∏î‡∏≠‡∏£‡πå‡∏ó‡∏µ‡πà‡∏£‡∏±‡∏ô‡∏™‡∏Ñ‡∏£‡∏¥‡∏õ‡∏ï‡πå
    Path(r"C:\Users\ACER\OneDrive\Desktop\s3"),  # ‡∏ï‡∏±‡∏ß‡∏≠‡∏¢‡πà‡∏≤‡∏á‡∏ï‡∏≥‡πÅ‡∏´‡∏ô‡πà‡∏á‡∏ó‡∏µ‡πà‡∏Ñ‡∏∏‡∏ì‡πÉ‡∏ä‡πâ‡∏á‡∏≤‡∏ô
]
PATTERNS = ["*data*set*is*.csv", "*dataset*is*.csv", "*.csv"]

def find_csv(roots, patterns):
    for root in roots:
        if not root.exists():
            continue
        for pat in patterns:
            for p in root.glob(pat):
                if p.is_file():
                    return p.resolve()
    return None

csv_path = find_csv(ROOT_DIRS, PATTERNS)
if not csv_path:
    raise FileNotFoundError("‡πÑ‡∏°‡πà‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå CSV ‡πÉ‡∏ô ROOT_DIRS ‚Äî ‡∏Å‡∏£‡∏∏‡∏ì‡∏≤‡πÅ‡∏Å‡πâ ROOT_DIRS ‡∏´‡∏£‡∏∑‡∏≠‡∏ß‡∏≤‡∏á‡πÑ‡∏ü‡∏•‡πå‡πÑ‡∏ß‡πâ‡πÇ‡∏ü‡∏•‡πÄ‡∏î‡∏≠‡∏£‡πå‡πÄ‡∏î‡∏µ‡∏¢‡∏ß‡∏Å‡∏±‡∏ö‡∏™‡∏Ñ‡∏£‡∏¥‡∏õ‡∏ï‡πå")
print(f"‚úÖ ‡πÉ‡∏ä‡πâ‡πÑ‡∏ü‡∏•‡πå: {csv_path}")

# -------------------------------------------------
# 2) ‡∏≠‡πà‡∏≤‡∏ô‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏• (fallback ‡∏´‡∏•‡∏≤‡∏¢ encoding/sep)
# -------------------------------------------------
read_ok, last_err = False, None
for sep in [None, ",", ";", "\t", "|"]:
    for enc in ["utf-8-sig", "utf-8", "cp874", "latin-1"]:
        try:
            df = pd.read_csv(csv_path, sep=sep, encoding=enc, engine="python")
            read_ok = True
            print(f"   ‚Üí ‡∏≠‡πà‡∏≤‡∏ô‡∏™‡∏≥‡πÄ‡∏£‡πá‡∏à‡∏î‡πâ‡∏ß‡∏¢ sep={repr(sep)}, encoding='{enc}'")
            break
        except Exception as e:
            last_err = e
    if read_ok:
        break
if not read_ok:
    raise RuntimeError(f"‡∏≠‡πà‡∏≤‡∏ô‡πÑ‡∏ü‡∏•‡πå‡πÑ‡∏°‡πà‡∏™‡∏≥‡πÄ‡∏£‡πá‡∏à: {last_err}")

# -------------------------------------------------
# 3) ‡πÅ‡∏¢‡∏Å Target / Features
# -------------------------------------------------
POSSIBLE_TARGETS = ["Diet_Recommendation", "diet_recommendation", "Target"]
target_col = next((c for c in POSSIBLE_TARGETS if c in df.columns), None)
if not target_col:
    raise KeyError(f"‡πÑ‡∏°‡πà‡∏û‡∏ö‡∏Ñ‡∏≠‡∏•‡∏±‡∏°‡∏ô‡πå Target ‡πÉ‡∏ô {list(df.columns)} ‚Äî ‡∏Å‡∏£‡∏∏‡∏ì‡∏≤‡πÅ‡∏Å‡πâ POSSIBLE_TARGETS ‡πÉ‡∏´‡πâ‡∏ï‡∏£‡∏á‡∏ä‡∏∑‡πà‡∏≠‡∏à‡∏£‡∏¥‡∏á")

y = df[target_col]
X = df.drop(columns=[target_col])

print("üéØ ‡∏ï‡∏±‡∏ß‡∏≠‡∏¢‡πà‡∏≤‡∏á Target:", y.unique()[:5])
print("üìä ‡∏™‡∏±‡∏î‡∏™‡πà‡∏ß‡∏ô‡∏Ñ‡∏•‡∏≤‡∏™:\n", y.value_counts(normalize=True).round(3))

# -------------------------------------------------
# 4) ‡πÅ‡∏ö‡πà‡∏á‡∏ä‡∏ô‡∏¥‡∏î‡∏Ñ‡∏≠‡∏•‡∏±‡∏°‡∏ô‡πå + ‡πÄ‡∏Ç‡πâ‡∏≤‡∏£‡∏´‡∏±‡∏™ Target
# -------------------------------------------------
categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()
numeric_cols     = X.select_dtypes(include=[np.number]).columns.tolist()

le = LabelEncoder()
y_encoded = le.fit_transform(y)
print("üîñ Target mapping:", {cls: int(i) for i, cls in enumerate(le.classes_)})

# -------------------------------------------------
# 5) Train/Test split
# -------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.30, random_state=42, stratify=y_encoded
)

# -------------------------------------------------
# 6) Preprocessor (‡∏£‡∏≠‡∏á‡∏£‡∏±‡∏ö sklearn ‡πÄ‡∏Å‡πà‡∏≤/‡πÉ‡∏´‡∏°‡πà)
# -------------------------------------------------
try:
    categorical_tf = OneHotEncoder(handle_unknown="ignore", sparse_output=False)  # >=1.2
except TypeError:
    categorical_tf = OneHotEncoder(handle_unknown="ignore", sparse=False)         # <1.2

numeric_tf = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", categorical_tf, categorical_cols),
        ("num", numeric_tf, numeric_cols),
    ],
    remainder="drop"
)

# -------------------------------------------------
# 7) Pipeline ‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö‡∏ù‡∏∂‡∏Å (‡πÉ‡∏ä‡πâ SMOTE ‡∏ñ‡πâ‡∏≤‡∏°‡∏µ)
# -------------------------------------------------
use_smote = False
try:
    from imblearn.over_sampling import SMOTE
    from imblearn.pipeline import Pipeline as ImbPipeline
    smote = SMOTE(random_state=42)
    use_smote = True
    print("üîÑ ‡∏à‡∏∞‡πÉ‡∏ä‡πâ SMOTE ‡∏£‡∏∞‡∏´‡∏ß‡πà‡∏≤‡∏á‡∏ù‡∏∂‡∏Å (‡∏û‡∏ö imbalanced-learn)")
except Exception:
    smote = None
    print("‚ÑπÔ∏è ‡πÑ‡∏°‡πà‡∏û‡∏ö imbalanced-learn ‚Üí ‡∏Ç‡πâ‡∏≤‡∏° SMOTE (‡∏ï‡∏¥‡∏î‡∏ï‡∏±‡πâ‡∏á: pip install imbalanced-learn)")

rf = RandomForestClassifier(random_state=42, class_weight="balanced")
if use_smote:
    train_pipe = ImbPipeline([("prep", preprocessor), ("smote", smote), ("model", rf)])
else:
    train_pipe = SkPipeline([("prep", preprocessor), ("model", rf)])

# -------------------------------------------------
# 8) RandomizedSearch + CV
# -------------------------------------------------
param_dist = {
    "model__n_estimators": [150, 250, 400, 600],
    "model__max_depth": [None, 8, 12, 20],
    "model__min_samples_split": [2, 5, 10],
    "model__min_samples_leaf": [1, 2, 4],
    "model__max_features": ["sqrt", "log2", None],
}
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

search = RandomizedSearchCV(
    estimator=train_pipe,
    param_distributions=param_dist,
    n_iter=12,
    scoring="f1_macro",
    cv=cv,
    n_jobs=-1,
    random_state=42,
    verbose=1,
)
search.fit(X_train, y_train)
print(f"\nüèÜ Best CV F1_macro: {search.best_score_:.4f}")
print("üîß Best params:", search.best_params_)

best_model = search.best_estimator_

# -------------------------------------------------
# 9) ‡∏™‡∏£‡πâ‡∏≤‡∏á inference_model ‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö‡∏ó‡∏≥‡∏ô‡∏≤‡∏¢‡∏à‡∏£‡∏¥‡∏á (‡πÑ‡∏°‡πà‡∏°‡∏µ SMOTE)
# -------------------------------------------------
if "smote" in best_model.named_steps:
    prep_fitted  = best_model.named_steps["prep"]
    model_fitted = best_model.named_steps["model"]
    inference_model = SkPipeline([("prep", prep_fitted), ("model", model_fitted)])
else:
    inference_model = best_model

# -------------------------------------------------
# 10) ‡∏õ‡∏£‡∏∞‡πÄ‡∏°‡∏¥‡∏ô‡∏ö‡∏ô Test set
# -------------------------------------------------
y_pred = inference_model.predict(X_test)
print("\nüìà Test Accuracy :", round(accuracy_score(y_test, y_pred), 4))
print("üìä Test F1_macro :", round(f1_score(y_test, y_pred, average='macro'), 4))
print("\nüìã Classification report:\n", classification_report(y_test, y_pred, target_names=le.classes_))
print("üß© Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# -------------------------------------------------
# 11) ‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡πÇ‡∏°‡πÄ‡∏î‡∏• + LabelEncoder
# -------------------------------------------------
joblib.dump(inference_model, "diet_recommendation_model.joblib")
joblib.dump(le, "label_encoder.joblib")
print("\nüíæ Saved: diet_recommendation_model.joblib, label_encoder.joblib")

# -------------------------------------------------
# 12) ‡∏ü‡∏±‡∏á‡∏Å‡πå‡∏ä‡∏±‡∏ô‡∏ó‡∏≥‡∏ô‡∏≤‡∏¢ 1 ‡∏£‡∏≤‡∏¢‡∏Å‡∏≤‡∏£ (‡∏Å‡∏±‡∏ô‡∏û‡∏±‡∏á: ‡∏ö‡∏±‡∏á‡∏Ñ‡∏±‡∏ö‡∏Ñ‡∏≠‡∏•‡∏±‡∏°‡∏ô‡πå/‡∏ä‡∏ô‡∏¥‡∏î‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡πÉ‡∏´‡πâ‡∏ï‡∏£‡∏á‡∏Å‡∏±‡∏ö‡∏ï‡∏≠‡∏ô‡πÄ‡∏ó‡∏£‡∏ô)
# -------------------------------------------------
def predict_one(sample_dict: dict):
    """
    ‡∏£‡∏±‡∏ö dict 1 ‡∏£‡∏≤‡∏¢‡∏Å‡∏≤‡∏£ (‡∏Ñ‡∏µ‡∏¢‡πå‡∏ï‡πâ‡∏≠‡∏á‡∏ï‡∏£‡∏á‡∏Å‡∏±‡∏ö‡∏Ñ‡∏≠‡∏•‡∏±‡∏°‡∏ô‡πå X ‡∏ó‡∏µ‡πà‡πÉ‡∏ä‡πâ‡πÄ‡∏ó‡∏£‡∏ô)
    ‡∏Ñ‡∏∑‡∏ô: (label_text, proba_by_class: dict)
    """
    sample_df = pd.DataFrame([sample_dict])

    # ‡πÄ‡∏£‡∏µ‡∏¢‡∏á‡∏Ñ‡∏≠‡∏•‡∏±‡∏°‡∏ô‡πå‡πÉ‡∏´‡πâ‡∏ï‡∏£‡∏á‡∏Å‡∏±‡∏ö X ‡πÅ‡∏•‡∏∞‡πÄ‡∏ï‡∏¥‡∏°‡∏Ñ‡∏≠‡∏•‡∏±‡∏°‡∏ô‡πå‡∏ó‡∏µ‡πà‡∏´‡∏≤‡∏¢‡πÑ‡∏õ
    sample_df = sample_df.reindex(columns=X.columns)

    # ‡∏ö‡∏±‡∏á‡∏Ñ‡∏±‡∏ö‡∏ä‡∏ô‡∏¥‡∏î‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡πÉ‡∏´‡πâ‡∏ï‡∏£‡∏á
    for col in numeric_cols:
        sample_df[col] = pd.to_numeric(sample_df[col], errors="coerce")
    for col in categorical_cols:
        # ‡∏ñ‡πâ‡∏≤‡πÄ‡∏õ‡πá‡∏ô NaN ‡πÉ‡∏´‡πâ‡πÅ‡∏õ‡∏•‡∏á‡πÄ‡∏õ‡πá‡∏ô string "nan" ‡πÄ‡∏û‡∏∑‡πà‡∏≠‡πÑ‡∏°‡πà‡πÉ‡∏´‡πâ scaler/encoder ‡∏•‡πâ‡∏°
        sample_df[col] = sample_df[col].astype(str)

    pred_num   = inference_model.predict(sample_df)[0]
    pred_label = le.inverse_transform([pred_num])[0]
    proba      = inference_model.predict_proba(sample_df)[0]
    return pred_label, dict(zip(le.classes_, map(float, proba)))

# -------------------------------------------------
# 13) ‡∏ï‡∏±‡∏ß‡∏≠‡∏¢‡πà‡∏≤‡∏á‡πÄ‡∏£‡∏µ‡∏¢‡∏Å‡πÉ‡∏ä‡πâ‡∏á‡∏≤‡∏ô + ‡∏ö‡∏•‡πá‡∏≠‡∏Å DEBUG
# -------------------------------------------------
ex = X.iloc[0].to_dict()          # ‡∏´‡∏£‡∏∑‡∏≠‡πÉ‡∏ä‡πâ X_test.iloc[0].to_dict() ‡∏Å‡πá‡πÑ‡∏î‡πâ
label, proba = predict_one(ex)
print("\nüß™ Example prediction:", label, proba)

# (‡∏ñ‡πâ‡∏≤‡∏ï‡∏¥‡∏î ‡πÉ‡∏´‡πâ‡∏î‡∏π‡∏ö‡∏•‡πá‡∏≠‡∏Å‡∏ï‡∏£‡∏ß‡∏à‡∏™‡∏≠‡∏ö‡∏î‡πâ‡∏≤‡∏ô‡∏•‡πà‡∏≤‡∏á)
print("\n[DEBUG] predict_one defined? ->", callable(globals().get("predict_one", None)))
print("[DEBUG] inference predict sanity ->", inference_model.predict(X_test.head(1)))
tmp_before = pd.DataFrame([ex]).reindex(columns=X.columns)
print("[DEBUG] dtypes of X.head(1):\n", X.head(1).dtypes)
print("[DEBUG] dtypes of sample before cast:\n", tmp_before.dtypes)
missing = set(X.columns) - set(tmp_before.columns)
extra   = set(tmp_before.columns) - set(X.columns)
print("[DEBUG] keys mismatch? missing:", missing, " extra:", extra)


‚úÖ ‡πÉ‡∏ä‡πâ‡πÑ‡∏ü‡∏•‡πå: C:\Users\ACER\OneDrive\Desktop\is3\black end\datasetis.csv
   ‚Üí ‡∏≠‡πà‡∏≤‡∏ô‡∏™‡∏≥‡πÄ‡∏£‡πá‡∏à‡∏î‡πâ‡∏ß‡∏¢ sep=None, encoding='utf-8-sig'
üéØ ‡∏ï‡∏±‡∏ß‡∏≠‡∏¢‡πà‡∏≤‡∏á Target: ['Balanced' 'Low_Carb' 'Low_Sodium']
üìä ‡∏™‡∏±‡∏î‡∏™‡πà‡∏ß‡∏ô‡∏Ñ‡∏•‡∏≤‡∏™:
 Diet_Recommendation
Balanced      0.426
Low_Sodium    0.316
Low_Carb      0.258
Name: proportion, dtype: float64
üîñ Target mapping: {'Balanced': 0, 'Low_Carb': 1, 'Low_Sodium': 2}
üîÑ ‡∏à‡∏∞‡πÉ‡∏ä‡πâ SMOTE ‡∏£‡∏∞‡∏´‡∏ß‡πà‡∏≤‡∏á‡∏ù‡∏∂‡∏Å (‡∏û‡∏ö imbalanced-learn)
Fitting 5 folds for each of 12 candidates, totalling 60 fits

üèÜ Best CV F1_macro: 1.0000
üîß Best params: {'model__n_estimators': 150, 'model__min_samples_split': 5, 'model__min_samples_leaf': 4, 'model__max_features': None, 'model__max_depth': 20}

üìà Test Accuracy : 1.0
üìä Test F1_macro : 1.0

üìã Classification report:
               precision    recall  f1-score   support

    Balanced       1.00      1.00      1.00       128
    Low_

In [7]:
# ===== XGBoost + WEKA-style summary (‡∏ï‡πà‡∏≠‡∏à‡∏≤‡∏Å Random Forest baseline) =====
import numpy as np, pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, f1_score, confusion_matrix,
    precision_recall_fscore_support, cohen_kappa_score
)

# 0) ‡∏ï‡∏¥‡∏î‡∏ï‡∏±‡πâ‡∏á/‡∏ô‡∏≥‡πÄ‡∏Ç‡πâ‡∏≤ XGBoost
try:
    from xgboost import XGBClassifier
except Exception as e:
    raise ImportError("‡∏ï‡πâ‡∏≠‡∏á‡∏ï‡∏¥‡∏î‡∏ï‡∏±‡πâ‡∏á xgboost ‡∏Å‡πà‡∏≠‡∏ô: pip install xgboost") from e

# 1) ‡∏ñ‡πâ‡∏≤‡∏°‡∏µ‡∏ï‡∏±‡∏ß‡πÅ‡∏õ‡∏£‡∏à‡∏≤‡∏Å baseline ‡πÅ‡∏•‡πâ‡∏ß (X, y, le, categorical_cols, numeric_cols, preprocessor)
#    ‡∏à‡∏∞‡πÉ‡∏ä‡πâ‡∏ï‡πà‡∏≠‡∏ó‡∏±‡∏ô‡∏ó‡∏µ; ‡∏ñ‡πâ‡∏≤‡∏¢‡∏±‡∏á‡πÑ‡∏°‡πà‡∏°‡∏µ ‡πÉ‡∏´‡πâ‡πÇ‡∏´‡∏•‡∏î/‡πÄ‡∏ï‡∏£‡∏µ‡∏¢‡∏°‡πÉ‡∏´‡πâ‡πÅ‡∏ö‡∏ö‡∏¢‡πà‡∏≠:
if 'X' not in globals():
    import pandas as pd
    from pathlib import Path
    csv_path = Path(r"C:\Users\ACER\OneDrive\Desktop\s3\data set is.csv")  # ‡πÅ‡∏Å‡πâ‡πÉ‡∏´‡πâ‡∏ï‡∏£‡∏á‡πÄ‡∏Ñ‡∏£‡∏∑‡πà‡∏≠‡∏á
    df = pd.read_csv(csv_path, encoding="utf-8-sig")
    target_col = "Diet_Recommendation"
    y_text = df[target_col]
    X = df.drop(columns=[target_col])
    le = LabelEncoder()
    y = le.fit_transform(y_text)
    categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()
    numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
    try:
        ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    except TypeError:
        ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)
    preprocessor = ColumnTransformer(
        [("cat", ohe, categorical_cols), ("num", StandardScaler(), numeric_cols)],
        remainder="drop"
    )

# 2) ‡πÅ‡∏ö‡πà‡∏á‡∏ä‡∏∏‡∏î (‡∏Ñ‡∏á random_state/stratify ‡πÉ‡∏´‡πâ‡πÄ‡∏õ‡∏£‡∏µ‡∏¢‡∏ö‡πÄ‡∏ó‡∏µ‡∏¢‡∏ö‡∏Å‡∏±‡∏ö baseline ‡πÑ‡∏î‡πâ‡∏¢‡∏∏‡∏ï‡∏¥‡∏ò‡∏£‡∏£‡∏°)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)

# 3) ‡πÇ‡∏°‡πÄ‡∏î‡∏• XGBoost + Pipeline
xgb = XGBClassifier(
    objective="multi:softprob",
    num_class=len(np.unique(y)),
    tree_method="hist",
    eval_metric="mlogloss",
    random_state=42,
    n_estimators=400,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
)
xgb_pipe = Pipeline([("prep", preprocessor), ("model", xgb)])

# 4) ‡πÄ‡∏ó‡∏£‡∏ô
xgb_pipe.fit(X_train, y_train)

# 5) ‡∏ó‡∏≥‡∏ô‡∏≤‡∏¢
y_pred = xgb_pipe.predict(X_test)

# 6) ===== WEKA-style SUMMARY (‡∏Ñ‡∏£‡∏ö‡∏ó‡∏∏‡∏Å‡∏´‡∏±‡∏ß‡∏Ç‡πâ‡∏≠) =====
# ‡∏ó‡∏≥ y_true/y_pred ‡πÉ‡∏´‡πâ‡πÄ‡∏õ‡πá‡∏ô Series ‡∏ú‡∏π‡∏Å index ‡∏Å‡∏±‡∏ö X_test ‡πÅ‡∏•‡πâ‡∏ß‡∏Å‡∏£‡∏≠‡∏á unknown classes
classes = list(le.classes_)
y_true_s = pd.Series(y_test, index=X_test.index)       # ‡πÄ‡∏õ‡πá‡∏ô label-encoded (‡∏ï‡∏±‡∏ß‡πÄ‡∏•‡∏Ç)
y_pred_s = pd.Series(y_pred, index=X_test.index)
mask = y_true_s.notna() & y_true_s.isin(range(len(classes)))  # ignore class unknown instances

y_true = y_true_s.loc[mask].to_numpy()
y_pred = y_pred_s.loc[mask].to_numpy()
X_eval = X_test.loc[mask]

# ‡πÄ‡∏°‡∏ï‡∏£‡∏¥‡∏Å‡∏´‡∏•‡∏±‡∏Å
acc = accuracy_score(y_true, y_pred)
f1m = f1_score(y_true, y_pred, average="macro")
kappa = cohen_kappa_score(y_true, y_pred)
print("\n===== XGBoost (‡∏ï‡πà‡∏≠‡∏à‡∏≤‡∏Å Baseline) =====")
print(f"Accuracy: {acc:.4f}")
print(f"F1_macro: {f1m:.4f}")
print(f"Kappa: {kappa:.4f}")
print(f"Total number of instances (after ignore): {len(y_true)}")

# Detailed accuracy by class
prec, rec, f1, sup = precision_recall_fscore_support(
    y_true, y_pred, labels=range(len(classes)), zero_division=0
)
print("\nüìä Detailed Accuracy By Class")
print(f"{'Class':<20}{'Precision':>10}{'Recall':>10}{'F1-Score':>10}{'Support':>10}")
print("-"*60)
for i, cls in enumerate(classes):
    print(f"{cls:<20}{prec[i]:>10.3f}{rec[i]:>10.3f}{f1[i]:>10.3f}{int(sup[i]):>10}")

# Confusion matrix (‡πÄ‡∏•‡∏Ç‡∏î‡∏¥‡∏ö)
cm = confusion_matrix(y_true, y_pred, labels=range(len(classes)))
print("\nüß© Confusion Matrix (rows=true, cols=predicted)")
print(" " * 12 + "  ".join([f"{c:>10}" for c in classes]))
for i, cls in enumerate(classes):
    row_str = "  ".join([f"{n:>10}" for n in cm[i]])
    print(f"{cls:<12}{row_str}")

# Correct / Incorrect
correct = int((y_true == y_pred).sum())
total = int(len(y_true))
incorrect = total - correct
print(f"\n‚úÖ Correctly classified instances:   {correct} / {total}  ({correct/total*100:.2f}%)")
print(f"‚ùå Incorrectly classified instances: {incorrect} / {total}  ({incorrect/total*100:.2f}%)")

# MAE / RMSE (probability-based) ‡πÅ‡∏•‡∏∞ RAE / RRSE (baseline = prior ‡∏Ç‡∏≠‡∏á‡∏ä‡∏∏‡∏î‡∏õ‡∏£‡∏∞‡πÄ‡∏°‡∏¥‡∏ô)
proba = xgb_pipe.predict_proba(X_eval)                 # [N, n_classes]
y_onehot = np.eye(len(classes))[y_true]                # one-hot ‡∏Ç‡∏≠‡∏á‡∏õ‡πâ‡∏≤‡∏¢‡∏à‡∏£‡∏¥‡∏á

abs_err = np.abs(y_onehot - proba).sum(axis=1) / 2.0
mae = abs_err.mean()
sq_err = ((y_onehot - proba) ** 2).sum(axis=1) / 2.0
rmse = np.sqrt(sq_err.mean())
print(f"\nüìê Mean absolute error (MAE): {mae:.6f}")
print(f"üìê Root mean squared error (RMSE): {rmse:.6f}")

prior_counts = pd.Series(y_true).value_counts().reindex(range(len(classes)), fill_value=0).values
prior_dist = prior_counts / prior_counts.sum()
abs_err_base = np.abs(y_onehot - prior_dist).sum(axis=1) / 2.0
mae_base = abs_err_base.mean()
sq_err_base = ((y_onehot - prior_dist) ** 2).sum(axis=1) / 2.0
rmse_base = np.sqrt(sq_err_base.mean())

rae  = (mae / mae_base) * 100.0 if mae_base > 0 else float("inf")
rrse = (rmse / rmse_base) * 100.0 if rmse_base > 0 else float("inf")
print(f"üìè Relative absolute error (RAE): {rae:.2f}%")
print(f"üìè Root relative squared error (RRSE): {rrse:.2f}%")



===== XGBoost (‡∏ï‡πà‡∏≠‡∏à‡∏≤‡∏Å Baseline) =====
Accuracy: 1.0000
F1_macro: 1.0000
Kappa: 1.0000
Total number of instances (after ignore): 300

üìä Detailed Accuracy By Class
Class                Precision    Recall  F1-Score   Support
------------------------------------------------------------
Balanced                 1.000     1.000     1.000       128
Low_Carb                 1.000     1.000     1.000        77
Low_Sodium               1.000     1.000     1.000        95

üß© Confusion Matrix (rows=true, cols=predicted)
              Balanced    Low_Carb  Low_Sodium
Balanced           128           0           0
Low_Carb             0          77           0
Low_Sodium           0           0          95

‚úÖ Correctly classified instances:   300 / 300  (100.00%)
‚ùå Incorrectly classified instances: 0 / 300  (0.00%)

üìê Mean absolute error (MAE): 0.002956
üìê Root mean squared error (RMSE): 0.002619
üìè Relative absolute error (RAE): 0.45%
üìè Root relative squared error 