# In [1]:
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import Ridge
from sklearn.ensemble import GradientBoostingRegressor


# ----------------------------
# 1) Custom cleaners (inside pipeline)
# ----------------------------
class NHISCleaner(BaseEstimator, TransformerMixin):
    """
    Pipeline step that prunes unwanted columns and blanks out missing codes.

    During ``fit`` the set of columns to drop is learned:

    - columns whose upper-cased name contains any of ``drop_patterns``
      (defaults: WT / PSU / STRAT / RECTYPE);
    - object-dtype columns that look like identifiers, i.e. whose ratio of
      distinct values to rows is at least ``id_unique_ratio_threshold`` and
      which either have at most ``max_id_nunique`` distinct values or a
      ratio above 0.995.

    During ``transform`` those columns are removed and, in every remaining
    numeric column, any value listed in ``missing_codes`` is replaced by NaN.
    Note that the replacement is applied to all numeric columns alike; a
    column where e.g. 7 or 99 is a legitimate value is not exempted.

    Parameters
    ----------
    missing_codes : sequence of numbers
        Values treated as "missing" in numeric columns.
    drop_patterns : sequence of str
        Substrings matched against upper-cased column names.
    id_unique_ratio_threshold : float
        Minimum distinct/rows ratio for an object column to be ID-like.
    max_id_nunique : int
        Distinct-value cap used in the ID-like test (see above).
    verbose : bool
        When True, ``fit`` prints how many columns will be dropped.

    Attributes
    ----------
    drop_cols_ : list
        Sorted column labels selected for removal; set by ``fit``.
    """

    # The constructor must be spelled ``__init__``: with any other name the
    # keyword arguments are rejected at construction time and none of the
    # hyper-parameter attributes read by ``fit``/``transform`` would exist.
    def __init__(
        self,
        missing_codes=(7, 8, 9, 97, 98, 99, 997, 998, 999),
        drop_patterns=("WT", "PSU", "STRAT", "RECTYPE"),
        id_unique_ratio_threshold=0.98,
        max_id_nunique=5000,
        verbose=False
    ):
        self.missing_codes = missing_codes
        self.drop_patterns = drop_patterns
        self.id_unique_ratio_threshold = id_unique_ratio_threshold
        self.max_id_nunique = max_id_nunique
        self.verbose = verbose

    def fit(self, X, y=None):
        """Learn which columns of DataFrame ``X`` to drop; ``y`` is ignored.

        Returns ``self`` with ``drop_cols_`` populated. ``X`` is only read,
        never modified, so no defensive copy is taken here.
        """
        # Drop by name pattern (column name is upper-cased before matching)
        pattern_drop = [
            c for c in X.columns
            if any(pat in str(c).upper() for pat in self.drop_patterns)
        ]

        # ID-like object columns
        id_like = []
        n = max(len(X), 1)  # guard against division by zero on empty input
        for c in X.columns:
            if X[c].dtype == "object":
                nunique = X[c].nunique(dropna=True)
                ratio = nunique / n
                if (ratio >= self.id_unique_ratio_threshold) and (
                    nunique <= self.max_id_nunique or ratio > 0.995
                ):
                    id_like.append(c)

        # A column can match both rules; de-duplicate and keep a stable order.
        self.drop_cols_ = sorted(set(pattern_drop + id_like))

        if self.verbose:
            print(f"[NHISCleaner] Dropping {len(self.drop_cols_)} columns")
            if len(self.drop_cols_) < 50:
                print("Dropped columns:", self.drop_cols_)

        return self

    def transform(self, X):
        """Return a cleaned copy of ``X``; the input frame is left untouched."""
        X = X.copy()

        # Drop identified columns; errors="ignore" tolerates frames that lack
        # some of the columns seen during fit.
        X = X.drop(columns=self.drop_cols_, errors="ignore")

        # Replace missing codes in numeric columns
        codes = list(self.missing_codes)
        for c in X.columns:
            if pd.api.types.is_numeric_dtype(X[c]):
                X[c] = X[c].replace(codes, np.nan)

        return X

class DropAllMissing(BaseEstimator, TransformerMixin):
    """Remove every column that contained nothing but NaN at fit time.

    ``fit`` records two lists on the instance: ``keep_cols_`` (columns with at
    least one non-missing value) and ``drop_cols_`` (the rest). ``transform``
    returns a copy of the input restricted to ``keep_cols_``.
    """

    def fit(self, X, y=None):
        """Partition the columns of ``X`` into kept and dropped; ``y`` is unused."""
        retained = []
        discarded = []
        for label in X.columns:
            if X[label].isna().all():
                discarded.append(label)
            else:
                retained.append(label)
        self.keep_cols_ = retained
        self.drop_cols_ = discarded
        return self

    def transform(self, X):
        """Return a copy of ``X`` containing only the columns kept by ``fit``."""
        subset = X[self.keep_cols_]
        return subset.copy()


# ----------------------------
# 2) Preprocessor + model pipelines
# ----------------------------
def build_preprocessor():
    """Assemble the ColumnTransformer used by both model pipelines.

    Two branches are configured:

    - "num": columns selected by ``dtype_include=np.number`` are median-imputed
      and then standard-scaled;
    - "cat": columns selected by ``dtype_include=object`` are imputed with the
      most frequent value and then one-hot encoded (unknown categories are
      ignored, output is dense).

    Columns matched by neither selector are dropped (``remainder="drop"``).
    """
    num_steps = [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
    cat_steps = [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]

    branches = [
        ("num", Pipeline(steps=num_steps), make_column_selector(dtype_include=np.number)),
        ("cat", Pipeline(steps=cat_steps), make_column_selector(dtype_include=object)),
    ]

    return ColumnTransformer(
        transformers=branches,
        remainder="drop",
        verbose_feature_names_out=False,
    )


def build_ridge_pipeline(alpha=1000.0):
    """Return the cleaning -> preprocessing -> Ridge pipeline.

    ``alpha`` is forwarded to ``Ridge``; the regressor's ``random_state`` is
    fixed at 42.
    """
    regressor = Ridge(alpha=alpha, random_state=42)
    stages = [
        ("clean", NHISCleaner(verbose=False)),
        ("drop_all_missing", DropAllMissing()),
        ("prep", build_preprocessor()),
        ("model", regressor),
    ]
    return Pipeline(steps=stages)


def build_gb_pipeline(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=3,
    min_samples_leaf=10,
    subsample=0.8
):
    """Return the cleaning -> preprocessing -> gradient-boosting pipeline.

    All five arguments are passed straight through to
    ``GradientBoostingRegressor``, which is configured with squared-error loss
    and ``random_state=42``.
    """
    booster_kwargs = {
        "loss": "squared_error",
        "n_estimators": n_estimators,
        "learning_rate": learning_rate,
        "max_depth": max_depth,
        "min_samples_leaf": min_samples_leaf,
        "subsample": subsample,
        "random_state": 42,
    }
    stages = [
        ("clean", NHISCleaner(verbose=False)),
        ("drop_all_missing", DropAllMissing()),
        ("prep", build_preprocessor()),
        ("model", GradientBoostingRegressor(**booster_kwargs)),
    ]
    return Pipeline(steps=stages)



# ----------------------------
# 3) Evaluation helper
# ----------------------------
def evaluate_pipeline(pipe, X, y, name="model"):
    """Score ``pipe`` two ways and print both results.

    1. Mean MSE over a shuffled 5-fold cross-validation on all of ``X``/``y``.
    2. MSE on a 20% hold-out after fitting ``pipe`` on the other 80%.

    Both splits use ``random_state=42``. ``pipe`` itself is fitted on the
    training portion as a side effect. Returns ``(cv_mse, test_mse)``.
    """
    folds = KFold(n_splits=5, shuffle=True, random_state=42)
    fold_scores = cross_val_score(
        pipe, X, y, scoring="neg_mean_squared_error", cv=folds
    )
    # The scorer reports negated MSE, so flip the sign of the mean.
    cv_mse = -fold_scores.mean()

    parts = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train, X_test, y_train, y_test = parts

    pipe.fit(X_train, y_train)
    holdout_pred = pipe.predict(X_test)
    test_mse = mean_squared_error(y_test, holdout_pred)

    print(f"\n[{name}] CV MSE:   {cv_mse:.4f}")
    print(f"[{name}] Test MSE: {test_mse:.4f}")
    return cv_mse, test_mse


# ----------------------------
# 4) Main
# ----------------------------
# Script entry point. The guard must use the double-underscore names
# ``__name__`` / ``"__main__"``; the single-underscore spelling refers to an
# undefined variable and raises NameError before anything runs.
if __name__ == "__main__":
    # Load your Excel
    excel_path = "training_dataset.xlsx"
    df = pd.read_excel(excel_path, sheet_name="train")  # adjust if needed

    target = "WEIGHTLBTC_A"
    if target not in df.columns:
        raise ValueError(f"Target '{target}' not found. Check your sheet name/column names.")

    # Split the frame into the regression target and the feature matrix.
    y = df[target]
    X = df.drop(columns=[target])

    # Baseline Ridge
    ridge_pipe = build_ridge_pipeline(alpha=1000.0)
    evaluate_pipeline(ridge_pipe, X, y, name="Ridge")

    # Base GB
    gb_pipe = build_gb_pipeline()
    evaluate_pipeline(gb_pipe, X, y, name="GradientBoosting (base, L2)")

    # ---- Tune GB (GridSearchCV) ----
    # Keys use the "<step>__<param>" form so they reach the "model" step of
    # the pipeline. 2 * 3 * 3 * 2 * 2 = 72 candidates, each scored with 5 folds.
    param_grid = {
        "model__n_estimators": [300, 600],
        "model__learning_rate": [0.03, 0.05, 0.1],
        "model__max_depth": [2, 3, 4],
        "model__min_samples_leaf": [10, 20],
        "model__subsample": [0.8, 1.0],
    }

    gb_search = GridSearchCV(
        estimator=gb_pipe,
        param_grid=param_grid,
        scoring="neg_mean_squared_error",
        cv=KFold(n_splits=5, shuffle=True, random_state=42),
        n_jobs=-1,
        verbose=2
    )

    gb_search.fit(X, y)

    # best_score_ is a negated MSE (see ``scoring`` above); flip the sign.
    print("\n[Tuned GB] Best CV MSE:", -gb_search.best_score_)
    print("[Tuned GB] Best parameters:")
    for k, v in gb_search.best_params_.items():
        print(f"  {k}: {v}")

    # Evaluate the tuned best estimator on a holdout split.
    # NOTE(review): the search above was fitted on all of X, and the holdout
    # is carved out of that same X inside evaluate_pipeline, so the reported
    # test MSE for the tuned model is likely optimistic.
    best_gb = gb_search.best_estimator_
    evaluate_pipeline(best_gb, X, y, name="GradientBoosting (tuned, L2)")

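# Recorded notebook output: NameError: name '_name_' is not defined
# (the entry-point guard has to be written as: if __name__ == "__main__":)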