In [3]:
# train_and_save_everything.py
import os
import joblib
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report

from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# -----------------------
# Configuration
# -----------------------
DATA_PATH = r"C:\Users\shiva\404_not_found\ML Pipeline\Train.csv"  # change if needed
FEATURE_COLUMNS = [
    "PatientID","Age","Gender","BloodGroup","SystolicBP","DiastolicBP",
    "HeartRate","RespiratoryRate","BodyTemperature","SpO2",
    "FastingSugar","RandomSugar","Glucose","Hemoglobin","WBC_Count",
    "RBC_Count","Platelet_Count","BMI","CholesterolTotal","LDL","HDL",
    "Triglycerides","Urea","Creatinine"
]
TARGET_COLUMN = "FinalDiagnosis"   # original text target column (we will map to categories)
SAVED_LABEL_ENCODER = "label_encoder.pkl"
SAVED_NUM_IMPUTER = "num_imputer.pkl"
SAVED_CAT_IMPUTER = "cat_imputer.pkl"
SAVED_IMPUTER_PACKAGE = "imputer.pkl"
SAVED_CAT_ENCODERS = "cat_encoders.pkl"
SAVED_TRAINING_COLUMNS = "training_columns.pkl"
SAVED_MODELS = {
    "xgb": "xgb_best.pkl",
    "lgb": "lgb_best.pkl",
    "cat": "cat_best.pkl",
    "ensemble": "ensemble_3models.pkl"
}

# -----------------------
# 1) Load dataset
# -----------------------
df = pd.read_csv(DATA_PATH)
df = df.dropna(how="all")  # drop fully-empty rows

# -----------------------
# 2) Map FinalDiagnosis -> DiagnosisCategory (coarse labels)
# -----------------------
def map_category(text):
    if not isinstance(text, str):
        return "Other"
    t = text.lower()
    if "diabetes" in t:
        return "Diabetes"
    if "hypertension" in t or "blood pressure" in t:
        return "Hypertension"
    if "infection" in t or "fever" in t:
        return "Infection"
    if "anemia" in t:
        return "Anemia"
    if "lipid" in t or "cholesterol" in t:
        return "Hyperlipidemia"
    if "normal" in t or "stable" in t:
        return "Normal"
    return "Other"

df["DiagnosisCategory"] = df[TARGET_COLUMN].apply(map_category)

# -----------------------
# 3) Prepare X, y
# -----------------------
# keep only requested feature columns (create any missing columns as blank)
for c in FEATURE_COLUMNS:
    if c not in df.columns:
        df[c] = ""

X = df[FEATURE_COLUMNS].copy()
y_text = df["DiagnosisCategory"].copy()

# -----------------------
# 4) Encode target (LabelEncoder)
# -----------------------
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y_text)
joblib.dump(label_encoder, SAVED_LABEL_ENCODER)
print("[SAVE] label encoder ->", SAVED_LABEL_ENCODER)
print("Target classes:", label_encoder.classes_)

# -----------------------
# 5) Detect numeric & categorical features (robust)
#    Rule: treat a column as numeric if >=50% values coerce to numbers
# -----------------------
numeric_cols = []
categorical_cols = []
for col in FEATURE_COLUMNS:
    # Don't consider PatientID as numeric feature for modeling (treat as categorical)
    if col == "PatientID":
        categorical_cols.append(col)
        continue
    coerced = pd.to_numeric(X[col], errors="coerce")
    frac_numeric = coerced.notna().mean()
    if frac_numeric >= 0.5:
        numeric_cols.append(col)
    else:
        categorical_cols.append(col)

print("[INFO] numeric_cols:", numeric_cols)
print("[INFO] categorical_cols:", categorical_cols)

# -----------------------
# 6) Fit imputers (median for numeric, most_frequent for categorical)
# -----------------------
num_imputer = SimpleImputer(strategy="median")
cat_imputer = SimpleImputer(strategy="most_frequent")

# For fitting, coerce numeric columns to numeric dtype
if numeric_cols:
    X_num = X[numeric_cols].apply(pd.to_numeric, errors="coerce")
    X[numeric_cols] = num_imputer.fit_transform(X_num)
else:
    X_num = pd.DataFrame()

if categorical_cols:
    # ensure categorical strings (fill NaN with empty string for fitting)
    X_cat = X[categorical_cols].astype(str).replace("nan", "")
    X[categorical_cols] = cat_imputer.fit_transform(X_cat)
else:
    X_cat = pd.DataFrame()

joblib.dump(num_imputer, SAVED_NUM_IMPUTER)
joblib.dump(cat_imputer, SAVED_CAT_IMPUTER)
print("[SAVE] num_imputer ->", SAVED_NUM_IMPUTER)
print("[SAVE] cat_imputer ->", SAVED_CAT_IMPUTER)

# Save imputer package with metadata for inference
imputer_package = {
    "numeric_cols": numeric_cols,
    "categorical_cols": categorical_cols,
    "num_imputer": num_imputer,
    "cat_imputer": cat_imputer
}
joblib.dump(imputer_package, SAVED_IMPUTER_PACKAGE)
print("[SAVE] imputer package ->", SAVED_IMPUTER_PACKAGE)

# -----------------------
# 7) Encode categorical columns with LabelEncoder (one per column)
#    (saves encoders so you can apply same transforms at inference)
# -----------------------
cat_encoders = {}
for col in categorical_cols:
    enc = LabelEncoder()
    # convert to string to avoid issues
    X[col] = X[col].astype(str)
    X[col] = enc.fit_transform(X[col])
    cat_encoders[col] = enc

joblib.dump(cat_encoders, SAVED_CAT_ENCODERS)
print("[SAVE] cat_encoders ->", SAVED_CAT_ENCODERS)

# Save training columns order
joblib.dump(FEATURE_COLUMNS, SAVED_TRAINING_COLUMNS)
print("[SAVE] training columns ->", SAVED_TRAINING_COLUMNS)


[SAVE] label encoder -> label_encoder.pkl
Target classes: ['Anemia' 'Diabetes' 'Hyperlipidemia' 'Hypertension' 'Infection' 'Normal'
 'Other']
[INFO] numeric_cols: ['Age', 'SystolicBP', 'DiastolicBP', 'HeartRate', 'RespiratoryRate', 'BodyTemperature', 'SpO2', 'FastingSugar', 'RandomSugar', 'Glucose', 'Hemoglobin', 'WBC_Count', 'RBC_Count', 'Platelet_Count', 'BMI', 'CholesterolTotal', 'LDL', 'HDL', 'Triglycerides', 'Urea', 'Creatinine']
[INFO] categorical_cols: ['PatientID', 'Gender', 'BloodGroup']
[SAVE] num_imputer -> num_imputer.pkl
[SAVE] cat_imputer -> cat_imputer.pkl
[SAVE] imputer package -> imputer.pkl
[SAVE] cat_encoders -> cat_encoders.pkl
[SAVE] training columns -> training_columns.pkl


In [4]:
# -----------------------
# 8) Train / Test split
# -----------------------
n_classes = len(label_encoder.classes_)
stratify_arg = y_encoded if n_classes < len(y_encoded) else None

X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=stratify_arg
)

print(f"[DATA] Train shape: {X_train.shape}, Test shape: {X_test.shape}")

# -----------------------
# 9) Define models (same hyperparams as your original script)
# -----------------------
xgb_model = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.9,
    colsample_bytree=0.9,
    objective="multi:softprob",
    eval_metric="mlogloss",
    random_state=42,
    tree_method="hist",
    use_label_encoder=False
)

lgb_model = LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=-1,
    num_leaves=31,
    subsample=0.9,
    colsample_bytree=0.9,
    objective="multiclass",
    num_class=n_classes,
    random_state=42
)

cat_model = CatBoostClassifier(
    iterations=500,
    learning_rate=0.05,
    depth=6,
    loss_function="MultiClass",
    random_seed=42,
    verbose=False
)

# -----------------------
# 10) Fit individual models
# -----------------------
print("\n[TRAIN] Training XGBoost...")
xgb_model.fit(X_train, y_train)

print("[TRAIN] Training LightGBM...")
lgb_model.fit(X_train, y_train)

print("[TRAIN] Training CatBoost...")
cat_model.fit(X_train, y_train)

# Save individual models
joblib.dump(xgb_model, SAVED_MODELS["xgb"])
joblib.dump(lgb_model, SAVED_MODELS["lgb"])
joblib.dump(cat_model, SAVED_MODELS["cat"])
print("[SAVE] individual models ->", list(SAVED_MODELS[c] for c in ("xgb","lgb","cat")))

# -----------------------
# 11) Evaluate individuals
# -----------------------
print("\n=== Individual Model Accuracy ===")
for name, model in [
    ("XGBoost", xgb_model),
    ("LightGBM", lgb_model),
    ("CatBoost", cat_model)
]:
    preds = model.predict(X_test)
    print(f"{name}: {accuracy_score(y_test, preds):.4f}")

# -----------------------
# 12) Soft Voting Ensemble
# -----------------------
ensemble = VotingClassifier(
    estimators=[
        ("xgb", xgb_model),
        ("lgb", lgb_model),
        ("cat", cat_model)
    ],
    voting="soft"
)

print("\n[TRAIN] Training Ensemble (soft voting)...")
ensemble.fit(X_train, y_train)

ensemble_preds = ensemble.predict(X_test)
ensemble_acc = accuracy_score(y_test, ensemble_preds)

print("\n=== ENSEMBLE PERFORMANCE ===")
print(f"Ensemble Accuracy: {ensemble_acc:.4f}")
print("\nClassification Report:\n")
print(classification_report(y_test, ensemble_preds, target_names=label_encoder.classes_))

# Save ensemble
joblib.dump(ensemble, SAVED_MODELS["ensemble"])
print("[SAVE] ensemble ->", SAVED_MODELS["ensemble"])

# -----------------------
# 13) Final saved artifacts list
# -----------------------
print("\n✅ Saved artifacts:")
print(" -", SAVED_MODELS["ensemble"])
print(" -", SAVED_MODELS["xgb"])
print(" -", SAVED_MODELS["lgb"])
print(" -", SAVED_MODELS["cat"])
print(" -", SAVED_NUM_IMPUTER)
print(" -", SAVED_CAT_IMPUTER)
print(" -", SAVED_IMPUTER_PACKAGE)
print(" -", SAVED_CAT_ENCODERS)
print(" -", SAVED_LABEL_ENCODER)
print(" -", SAVED_TRAINING_COLUMNS)

[DATA] Train shape: (9280, 24), Test shape: (2320, 24)

[TRAIN] Training XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[TRAIN] Training LightGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000381 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1591
[LightGBM] [Info] Number of data points in the train set: 9280, number of used features: 24
[LightGBM] [Info] Start training from score -4.472178
[LightGBM] [Info] Start training from score -1.187585
[LightGBM] [Info] Start training from score -2.423876
[LightGBM] [Info] Start training from score -1.255190
[LightGBM] [Info] Start training from score -5.243797
[LightGBM] [Info] Start training from score -1.517366
[LightGBM] [Info] Start training from score -2.459794
[TRAIN] Training CatBoost...
[SAVE] individual models -> ['xgb_best.pkl', 'lgb_best.pkl', 'cat_best.pkl']

=== Individual Model Accuracy ===
XGBoost: 0.6534
LightGBM: 0.6401
CatBoost: 0.6466

[TRAIN] Training Ensemble (soft voting)...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000442 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1591
[LightGBM] [Info] Number of data points in the train set: 9280, number of used features: 24
[LightGBM] [Info] Start training from score -4.472178
[LightGBM] [Info] Start training from score -1.187585
[LightGBM] [Info] Start training from score -2.423876
[LightGBM] [Info] Start training from score -1.255190
[LightGBM] [Info] Start training from score -5.243797
[LightGBM] [Info] Start training from score -1.517366
[LightGBM] [Info] Start training from score -2.459794

=== ENSEMBLE PERFORMANCE ===
Ensemble Accuracy: 0.6534

Classification Report:

                precision    recall  f1-score   support

        Anemia       0.55      0.22      0.32        27
      Diabetes       0.75      0.74      0.75       707
Hyperlipidemia       0.40      0.25      0.31       205
  Hypertension       0.65   