In [None]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.compose import ColumnTransformer, make_column_selector as selector, TransformedTargetRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.linear_model import HuberRegressor
from sklearn.feature_selection import SelectFromModel, SelectPercentile, mutual_info_regression
from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error
from xgboost import XGBRegressor
!pip install catboost
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from joblib import dump, load

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [None]:
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    Short alias for ``root_mean_squared_error`` so the rest of the notebook
    can refer to the metric by a compact name.
    """
    score = root_mean_squared_error(y_true, y_pred)
    return score

# Tuned hyper-parameters, one dictionary per base model. Each dictionary is
# unpacked as keyword arguments into the matching estimator constructor.
XGB_TUNED = dict(
    n_estimators=1491,
    learning_rate=0.07704671891646622,
    max_depth=4,
    min_child_weight=2,
    subsample=0.8545641645055122,
    colsample_bytree=0.7024273291045295,
    reg_lambda=0.01286124731516405,
    reg_alpha=0.00034630370261191863,
    gamma=2.1060501876471487e-08,
    objective="reg:squarederror",
    tree_method="hist",
    random_state=42,
    n_jobs=-1,
)

HUBER_TUNED = dict(
    alpha=2e-6,
    epsilon=2.58,
    max_iter=5000,
    tol=1e-4,
    fit_intercept=True,
)

CAT_TUNED = dict(
    subsample=0.7,
    rsm=0.9,
    random_strength=1.0,
    n_estimators=1500,
    learning_rate=0.032199564518489585,
    l2_leaf_reg=0.8733261623828433,
    depth=6,
    bootstrap_type="Bernoulli",
    loss_function="RMSE",
    random_state=42,
    verbose=0,
    allow_writing_files=False,
)

LGB_TUNED = dict(
    n_estimators=979,
    learning_rate=0.028251215820188597,
    num_leaves=48,
    max_depth=6,
    min_child_samples=28,
    subsample=0.97658592351061,
    subsample_freq=3,
    colsample_bytree=0.513255655270811,
    reg_alpha=0.00013313002526446513,
    reg_lambda=0.0008138626231308923,
    min_split_gain=0.01243125804272925,
    objective="regression",
    random_state=42,
    n_jobs=-1,
    verbosity=-1,
)

# Metrics
def mdape(y_true, y_pred, eps: float = 1e-8) -> float:
    """Median absolute percentage error, expressed in percent.

    Parameters
    ----------
    y_true, y_pred : array-like
        Observed and predicted values; both are converted to float arrays.
    eps : float
        Lower bound applied to ``|y_true|`` in the denominator so that a zero
        target does not cause a division by zero.

    Returns
    -------
    float
        ``median(|y_true - y_pred| / max(|y_true|, eps)) * 100``.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    scale = np.maximum(np.abs(actual), eps)
    relative_error = np.abs((actual - predicted) / scale)
    return float(np.median(relative_error) * 100.0)

def metrics_report(name, y_true, y_pred):
    """Print a one-line summary (RMSE, MAE, MdAPE, R²) for one set of predictions.

    ``name`` is left-aligned in a 28-character column so that consecutive
    reports line up in the cell output.
    """
    rmse_value = rmse(y_true, y_pred)
    mae_value = mean_absolute_error(y_true, y_pred)
    mdape_value = mdape(y_true, y_pred)
    r2_value = r2_score(y_true, y_pred)
    line = (
        f"{name:28s} | RMSE: {rmse_value:,.2f} | "
        f"MAE: {mae_value:,.2f} | "
        f"MdAPE: {mdape_value:,.2f}% | R²: {r2_value:.4f}"
    )
    print(line)

# Drop high missing values columns
def drop_high_missing(frame: pd.DataFrame, threshold=0.40):
    """Remove columns whose share of missing values is strictly above ``threshold``.

    Returns
    -------
    (pd.DataFrame, list)
        A new frame without the sparse columns, and the list of column labels
        that were removed (in their original column order).
    """
    na_share = frame.isna().mean()
    too_sparse = [label for label, share in na_share.items() if share > threshold]
    trimmed = frame.drop(columns=too_sparse, errors='ignore')
    return trimmed, too_sparse

# Ordinal map
# Rating scales shared by several columns. Each column receives its own copy
# so that editing one column's mapping can never affect another's.
_QUALITY_SCALE = {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
_BSMT_FINISH_SCALE = {'None': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6}

# Column name -> {category label -> ordinal code}. Larger codes correspond to
# the labels listed later on each scale.
ORDINAL_MAP = {
    **{
        column: dict(_QUALITY_SCALE)
        for column in (
            'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
            'HeatingQC', 'KitchenQual', 'GarageQual', 'GarageCond',
        )
    },
    'BsmtExposure': {'None': 0, 'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4},
    **{column: dict(_BSMT_FINISH_SCALE) for column in ('BsmtFinType1', 'BsmtFinType2')},
    'LandSlope': {'Gtl': 0, 'Mod': 1, 'Sev': 2},
    'LotShape': {'Reg': 3, 'IR1': 2, 'IR2': 1, 'IR3': 0},
    'PavedDrive': {'N': 0, 'P': 1, 'Y': 2},
    'Functional': {
        'Sal': 0, 'Sev': 1, 'Maj2': 2, 'Maj1': 3,
        'Mod': 4, 'Min2': 5, 'Min1': 6, 'Typ': 7,
    },
    'Utilities': {'ELO': 0, 'NoSeWa': 1, 'NoSewr': 2, 'AllPub': 3},
}

def apply_ordinal_maps(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with the ``ORDINAL_MAP`` columns encoded as floats.

    For every column named in ``ORDINAL_MAP`` that exists in ``df``, missing
    values are first replaced by the label ``'None'`` and the labels are then
    translated through that column's mapping. Labels absent from the mapping
    (including ``'None'`` for scales that do not define it) come out as NaN.
    The input frame is not modified.
    """
    encoded = df.copy()
    present = [name for name in ORDINAL_MAP if name in encoded.columns]
    for name in present:
        labels = encoded[name].astype('object').fillna('None')
        encoded[name] = labels.map(ORDINAL_MAP[name]).astype('float64')
    return encoded

class QuantileClipper(BaseEstimator, TransformerMixin):
    """Clip every column to per-column quantile bounds learned in ``fit``.

    Parameters
    ----------
    q_low, q_high : float
        Lower and upper quantiles (in ``[0, 1]``) used as clipping bounds.

    Attributes
    ----------
    bounds_ : tuple of (ndarray, ndarray)
        ``(lower, upper)`` per-column bounds. Only exists after ``fit``.
    """

    def __init__(self, q_low=0.005, q_high=0.995):
        # Only hyper-parameters are stored here. Learned state (trailing
        # underscore) is created in ``fit`` so that fitted-state checks work:
        # pre-setting ``bounds_ = None`` made an unfitted clipper look fitted.
        self.q_low = q_low
        self.q_high = q_high

    def fit(self, X, y=None):
        """Learn the per-column clipping bounds, ignoring NaNs."""
        X = np.asarray(X, float)
        lo = np.nanquantile(X, self.q_low, axis=0)
        hi = np.nanquantile(X, self.q_high, axis=0)
        # Guard against an upper bound below the lower one.
        self.bounds_ = (lo, np.maximum(hi, lo))
        return self

    def transform(self, X):
        """Clip ``X`` to the learned bounds and return a float ndarray.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If called before ``fit``.
        """
        # Function-scope import keeps this block self-contained.
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, 'bounds_')
        X = np.asarray(X, float)
        lo, hi = self.bounds_
        return np.clip(X, lo, hi)

class RareCategoryGrouper(BaseEstimator, TransformerMixin):
    """Collapse infrequent category levels into a single ``other_label``.

    Missing values are treated as their own level, ``'None'``, in BOTH ``fit``
    and ``transform``. Previously ``fit`` counted raw NaN while ``transform``
    looked up the string ``'None'``, so missing values were always sent to
    ``other_label`` regardless of how frequent they were.

    Parameters
    ----------
    min_freq : int
        Minimum number of training rows a level needs in order to be kept.
    other_label : str
        Replacement label for levels that are not kept.

    Attributes
    ----------
    keep_levels_ : dict
        Column label -> set of levels kept for that column. Rebuilt from
        scratch on every ``fit`` so refitting never leaves stale columns.
    """

    def __init__(self, min_freq=12, other_label="Other"):
        # Only hyper-parameters are stored here; learned state is set in fit.
        self.min_freq = min_freq
        self.other_label = other_label

    @staticmethod
    def _as_labels(X):
        """Return ``X`` as an object-dtype DataFrame with NaN replaced by 'None'."""
        return pd.DataFrame(X).apply(lambda s: s.astype('object').fillna('None'))

    def fit(self, X, y=None):
        """Record, per column, the levels seen at least ``min_freq`` times."""
        X_ = self._as_labels(X)
        keep_levels = {}
        for c in X_.columns:
            vc = X_[c].value_counts(dropna=False)
            keep_levels[c] = set(vc[vc >= self.min_freq].index.astype('object'))
        self.keep_levels_ = keep_levels
        return self

    def transform(self, X):
        """Replace every level not kept during ``fit`` with ``other_label``.

        Columns that were not present during ``fit`` are passed through with
        only the NaN -> 'None' replacement applied.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If called before ``fit``.
        """
        # Function-scope import keeps this block self-contained.
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, 'keep_levels_')
        X_ = self._as_labels(X)
        for c in X_.columns:
            keep = self.keep_levels_.get(c)
            if keep is not None:
                X_.loc[~X_[c].isin(keep), c] = self.other_label
        return X_



In [None]:
# Preprocessors
def make_preprocessor_tree():
    """Build the preprocessing pipeline used by the tree-based models.

    Numeric columns are quantile-clipped and median-imputed. Object, string,
    category and bool columns go through rare-level grouping, most-frequent
    imputation and one-hot encoding. No scaling is applied.
    """
    categorical_branch = Pipeline(steps=[
        ('rare',    RareCategoryGrouper(min_freq=12, other_label='Other')),
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ohe',     OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ])
    numeric_branch = Pipeline(steps=[
        ('clip',    QuantileClipper(0.005, 0.995)),
        ('imputer', SimpleImputer(strategy='median')),
    ])

    # Columns are routed to a branch by dtype when the transformer is fitted.
    router = ColumnTransformer(
        transformers=[
            ('num', numeric_branch, selector(dtype_include=[np.number])),
            ('cat', categorical_branch,
             selector(dtype_include=['object', 'string', 'category', 'bool'])),
        ],
        verbose_feature_names_out=False,
    )
    return Pipeline(steps=[('cols', router)])

def make_preprocessor_linear():
    """Build the preprocessing pipeline used by the linear (Huber) model.

    Same as the tree preprocessor, except that numeric columns are also
    standardised (zero mean, unit variance) after clipping and imputation.
    Columns matched by neither dtype selector are dropped.
    """
    categorical_branch = Pipeline(steps=[
        ('rare',    RareCategoryGrouper(min_freq=12, other_label='Other')),
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ohe',     OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ])
    numeric_branch = Pipeline(steps=[
        ('clip',    QuantileClipper(0.005, 0.995)),
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler',  StandardScaler(with_mean=True, with_std=True)),
    ])

    # Columns are routed to a branch by dtype when the transformer is fitted.
    router = ColumnTransformer(
        transformers=[
            ('num', numeric_branch, selector(dtype_include=[np.number])),
            ('cat', categorical_branch,
             selector(dtype_include=['object', 'string', 'category', 'bool'])),
        ],
        remainder='drop',
        verbose_feature_names_out=False,
    )
    return Pipeline(steps=[('cols', router)])

def score_func_42(X, y):
    """Mutual-information score function with a fixed seed (42).

    Defined as a named module-level function so it can be passed as
    ``score_func`` to ``SelectPercentile`` with a fixed ``random_state``.
    """
    mi_scores = mutual_info_regression(X, y, random_state=42)
    return mi_scores
# Pipelines
def make_pipeline_xgb(selector_threshold='0.75*median', xgb_kwargs=None):
    """Build the XGBoost pipeline: preprocess -> MI filter -> model-based select -> regress.

    Parameters
    ----------
    selector_threshold : str or float
        Importance threshold passed to ``SelectFromModel``.
    xgb_kwargs : dict or None
        Keyword arguments for both the selector's and the final
        ``XGBRegressor``. ``None`` (the default) means library defaults;
        previously the default crashed on ``**None``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with steps ``prep``, ``filter_mi``, ``select``, ``reg``.
    """
    # Copy so a missing argument becomes {} and the caller's dict is not shared.
    params = dict(xgb_kwargs or {})
    steps = [
        ('prep',   make_preprocessor_tree()),
        ('filter_mi', SelectPercentile(score_func=score_func_42, percentile=80)),
        ('select', SelectFromModel(
            estimator=XGBRegressor(**params),
            threshold=selector_threshold,
            importance_getter='feature_importances_'
        )),
        ('reg', XGBRegressor(**params)),
    ]
    return Pipeline(steps)

def make_pipeline_huber(huber_kwargs):
    """Build the Huber-regression model, trained on a log1p-transformed target.

    The inner pipeline is linear preprocessing -> mutual-information filter
    (top 85 percent) -> ``HuberRegressor(**huber_kwargs)``. It is wrapped in a
    ``TransformedTargetRegressor`` that fits on ``log1p(y)`` and maps
    predictions back with ``expm1``.
    """
    inner_pipeline = Pipeline([
        ('prep', make_preprocessor_linear()),
        ('filter_mi',  SelectPercentile(score_func=score_func_42, percentile=85)),
        ('reg',  HuberRegressor(**huber_kwargs)),
    ])
    return TransformedTargetRegressor(
        regressor=inner_pipeline,
        func=np.log1p,
        inverse_func=np.expm1,
    )

def make_pipeline_cat(selector_threshold='0.75*median', cat_kwargs=None):
    """Build the CatBoost pipeline: preprocess -> MI filter -> ExtraTrees select -> regress.

    Parameters
    ----------
    selector_threshold : str or float
        Importance threshold passed to ``SelectFromModel``.
    cat_kwargs : dict or None
        Keyword arguments for ``CatBoostRegressor``. ``None`` (the default)
        means library defaults; previously the default crashed on ``**None``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with steps ``prep``, ``filter_mi``, ``select``, ``reg``.
    """
    # Copy so a missing argument becomes {} and the caller's dict is not shared.
    params = dict(cat_kwargs or {})
    # Feature importances for selection come from a separate ExtraTrees model.
    importance_model = ExtraTreesRegressor(
        n_estimators=800,
        max_depth=8,
        min_samples_leaf=10,
        min_samples_split=20,
        max_features=0.5,
        random_state=42,
        n_jobs=-1
    )
    steps = [
        ('prep',   make_preprocessor_tree()),
        ('filter_mi',  SelectPercentile(score_func=score_func_42, percentile=85)),
        ('select', SelectFromModel(
            estimator=importance_model,
            threshold=selector_threshold, importance_getter='feature_importances_'
        )),
        ('reg',    CatBoostRegressor(**params)),
    ]
    return Pipeline(steps)

def make_pipeline_lgb(selector_threshold='0.75*median', lgb_kwargs=None):
    """Build the LightGBM pipeline: preprocess -> MI filter -> ExtraTrees select -> regress.

    Parameters
    ----------
    selector_threshold : str or float
        Importance threshold passed to ``SelectFromModel``.
    lgb_kwargs : dict or None
        Keyword arguments for ``LGBMRegressor``. ``None`` (the default) means
        library defaults; previously the default crashed on ``**None``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with steps ``prep``, ``filter_mi``, ``select``, ``reg``.
    """
    # Copy so a missing argument becomes {} and the caller's dict is not shared.
    params = dict(lgb_kwargs or {})
    # Feature importances for selection come from a separate ExtraTrees model.
    importance_model = ExtraTreesRegressor(
        n_estimators=800,
        max_depth=8,
        min_samples_leaf=10,
        min_samples_split=20,
        max_features=0.5,
        random_state=42,
        n_jobs=-1
    )
    steps = [
        ('prep',   make_preprocessor_tree()),
        ('filter_mi',  SelectPercentile(score_func=score_func_42, percentile=80)),
        ('select', SelectFromModel(
            estimator=importance_model,
            threshold=selector_threshold, importance_getter='feature_importances_'
        )),
        ('reg',    LGBMRegressor(**params)),
    ]
    return Pipeline(steps)

In [None]:
# CV / Utils
def make_stratified_folds(y, n_splits=5, n_repeats=1, random_state=42, n_bins=10):
    """Create K-fold splits stratified on quantile bins of a continuous target.

    The target is cut into up to ``n_bins`` quantile bins (duplicate quantile
    edges are merged) and ``StratifiedKFold`` is run on the bin labels. Each
    repeat draws its own shuffle seed from a ``RandomState(random_state)``.

    Returns
    -------
    list of (ndarray, ndarray)
        ``n_splits * n_repeats`` pairs of positional (train, validation) indices.
    """
    seed_source = np.random.RandomState(random_state)
    target = np.asarray(y, dtype=float)

    quantile_levels = np.linspace(0, 1, n_bins + 1)
    cut_points = np.unique(np.quantile(target, quantile_levels))
    # Only interior edges are needed to assign each sample to a bin.
    strata = np.digitize(target, cut_points[1:-1], right=True)
    # StratifiedKFold only needs X for its length; the content is irrelevant.
    placeholder_X = np.zeros_like(strata)

    all_splits = []
    for _ in range(n_repeats):
        splitter = StratifiedKFold(
            n_splits=n_splits,
            shuffle=True,
            random_state=seed_source.randint(0, 10**6),
        )
        all_splits.extend(
            (train_idx, valid_idx)
            for train_idx, valid_idx in splitter.split(placeholder_X, strata)
        )
    return all_splits

def find_best_weights_grid4(y, a, b, c, d, metric="rmse", grid=41):
    """Grid-search non-negative blend weights (summing to 1) for four predictors.

    Every combination ``(wa, wb, wc, wd)`` taken from ``grid`` evenly spaced
    values in ``[0, 1]`` with ``wa + wb + wc + wd == 1`` is evaluated, and the
    one with the lowest loss for ``wa*a + wb*b + wc*c + wd*d`` is returned.

    The search iterates over integer grid indices. The previous float
    comparisons (e.g. ``wb > 1.0 - wa``) silently skipped valid boundary
    combinations because of rounding: with ``grid=11`` the point
    ``(0.7, 0.3, 0, 0)`` was never evaluated.

    Parameters
    ----------
    y : array-like
        Target values.
    a, b, c, d : array-like
        Predictions of the four models, same shape as ``y``.
    metric : {"rmse", "mae"}
        Loss to minimise. This argument used to be ignored (RMSE was always
        used); it is now honoured.
    grid : int
        Number of grid points per weight (>= 2).

    Returns
    -------
    (tuple of 4 floats, float)
        Best weights ``(wa, wb, wc, wd)`` and the corresponding loss. Ties are
        resolved in favour of the first combination in ascending
        ``(wa, wb, wc)`` order.

    Raises
    ------
    ValueError
        On empty input, mismatching shapes, non-finite values, ``grid < 2`` or
        an unknown ``metric``.
    """
    y = np.asarray(y, float)
    a, b, c, d = (np.asarray(p, float) for p in (a, b, c, d))

    if y.size == 0:
        raise ValueError("find_best_weights_grid4 needs at least one sample")
    if any(p.shape != y.shape for p in (a, b, c, d)):
        raise ValueError("y, a, b, c and d must all have the same shape")
    if not all(np.isfinite(arr).all() for arr in (y, a, b, c, d)):
        raise ValueError("y, a, b, c and d must not contain NaN or infinity")
    grid = int(grid)
    if grid < 2:
        raise ValueError("grid must be at least 2")

    metric_key = str(metric).lower()
    if metric_key == "rmse":
        def loss(residual):
            return float(np.sqrt(np.mean(residual ** 2)))
    elif metric_key == "mae":
        def loss(residual):
            return float(np.mean(np.abs(residual)))
    else:
        raise ValueError(f"unsupported metric {metric!r}; expected 'rmse' or 'mae'")

    ws = np.linspace(0.0, 1.0, grid)
    top = grid - 1  # index of weight 1.0; indices of one combination sum to this
    best = (0.25, 0.25, 0.25, 0.25)
    best_score = np.inf
    for i in range(grid):
        wa_a = ws[i] * a
        for j in range(grid - i):
            partial = wa_a + ws[j] * b  # invariant over the innermost loop
            for k in range(grid - i - j):
                m = top - i - j - k  # remaining share, exact and never negative
                yhat = partial + ws[k] * c + ws[m] * d
                sc = loss(y - yhat)
                if sc < best_score:
                    best_score = sc
                    best = (float(ws[i]), float(ws[j]), float(ws[k]), float(ws[m]))
    return best, float(best_score)

# SegBlend wrapper
class WeightedSegmentedBlend4:
    """Blend four regressors with weights that depend on the predicted value range.

    At prediction time the per-row median of the four base predictions is used
    as a proxy for the target, the proxy is assigned to a segment delimited by
    ``edges``, and that segment's weights are applied to the base predictions.

    Parameters
    ----------
    xgb_pipe, hub_pipe, cat_pipe, lgb_pipe : estimators
        Objects exposing ``fit(X, y)`` and ``predict(X)``.
    edges : array-like
        Monotonic segment edges. Only the interior edges (``edges[1:-1]``) are
        used, so there are ``len(edges) - 1`` segments.
    weights_per_bin : sequence of 4-sequences
        One ``(w_xgb, w_hub, w_cat, w_lgb)`` entry per segment, lowest first.

    Raises
    ------
    ValueError
        If there are fewer weight entries than segments, or an entry has fewer
        than four weights. Without this check the problem only surfaced as an
        ``IndexError`` inside ``predict``.
    """

    def __init__(self, xgb_pipe, hub_pipe, cat_pipe, lgb_pipe, edges, weights_per_bin):
        self.xgb = xgb_pipe
        self.hub = hub_pipe
        self.cat = cat_pipe
        self.lgb = lgb_pipe
        self.edges = np.asarray(edges, dtype=float)
        self.w_per_bin = [tuple(map(float, w)) for w in weights_per_bin]

        n_bins = len(self.edges[1:-1]) + 1
        if len(self.w_per_bin) < n_bins:
            raise ValueError(
                f"edges define {n_bins} segment(s) but only "
                f"{len(self.w_per_bin)} weight tuple(s) were given"
            )
        if any(len(w) < 4 for w in self.w_per_bin):
            raise ValueError("every entry of weights_per_bin needs four weights")

    def fit(self, X, y):
        """Fit all four base models on the same data and return ``self``."""
        for model in (self.xgb, self.hub, self.cat, self.lgb):
            model.fit(X, y)
        return self

    def predict(self, X):
        """Return the segment-weighted blend of the four base predictions."""
        px = self.xgb.predict(X)
        ph = self.hub.predict(X)
        pc = self.cat.predict(X)
        pl = self.lgb.predict(X)

        # The median of the base predictions stands in for the unknown target
        # when choosing a segment.
        proxy = np.median(np.vstack([px, ph, pc, pl]), axis=0)
        bins_test = np.digitize(proxy, self.edges[1:-1], right=True)

        # One vectorised lookup replaces four per-row Python loops; the
        # result has shape (n_rows, 4).
        weight_table = np.array([w[:4] for w in self.w_per_bin], dtype=float)
        row_weights = weight_table[bins_test]
        return (row_weights[:, 0] * px + row_weights[:, 1] * ph
                + row_weights[:, 2] * pc + row_weights[:, 3] * pl)


In [None]:
# Main
def main(csv_path='housing_train.csv', model_path=None):
    """Train, evaluate and refit the four-model segmented blend.

    Steps:

    1. Load the CSV, split off ``SalePrice`` as the target, drop ``Id``, drop
       columns with more than 40% missing values and ordinal-encode the
       columns listed in ``ORDINAL_MAP``.
    2. Hold out 20% of the rows for validation.
    3. Produce out-of-fold predictions for each base model on the training part.
    4. Split the training target into three quantile segments and grid-search
       per-segment blend weights on the out-of-fold predictions.
    5. Fit the base models on the training part and print holdout metrics for
       each model and for the segmented blend.
    6. Refit the base models on all rows and wrap them in a
       ``WeightedSegmentedBlend4`` using the weights from step 4.

    Parameters
    ----------
    csv_path : str
        Location of the training CSV. Defaults to the previously hard-coded
        ``'housing_train.csv'``.
    model_path : str or None
        If given, the full-data blend is saved there with ``joblib.dump``
        (compression level 3). ``None`` (the default) skips saving, matching
        the earlier behaviour where the save call was commented out.
    """
    def build_base_models():
        """Return fresh, unfitted (xgb, huber, catboost, lightgbm) models."""
        return (
            make_pipeline_xgb(selector_threshold='0.75*median', xgb_kwargs=XGB_TUNED),
            make_pipeline_huber(huber_kwargs=HUBER_TUNED),
            make_pipeline_cat(selector_threshold='0.75*median', cat_kwargs=CAT_TUNED),
            make_pipeline_lgb(selector_threshold='0.75*median', lgb_kwargs=LGB_TUNED),
        )

    # Load and prepare features
    df = pd.read_csv(csv_path)
    y = df['SalePrice'].astype(float)

    X = df.drop(columns=['SalePrice', 'Id'], errors='ignore')
    X, dropped_missing = drop_high_missing(X, threshold=0.40)
    X = apply_ordinal_maps(X)
    print(f"Columns dropped (>40% missing): {dropped_missing}")

    # Holdout split
    X_tr, X_va, y_tr, y_va = train_test_split(X, y, test_size=0.20, random_state=42)

    # Out-of-fold predictions: one row per base model, NaN until filled
    folds = make_stratified_folds(y_tr, n_splits=5, n_repeats=1, random_state=42, n_bins=10)
    oof = np.full((4, len(y_tr)), np.nan, dtype=float)

    for tr_idx, va_idx in folds:
        X_tr_i, X_va_i = X_tr.iloc[tr_idx], X_tr.iloc[va_idx]
        y_tr_i = y_tr.iloc[tr_idx]
        for row, model in enumerate(build_base_models()):
            oof[row, va_idx] = model.fit(X_tr_i, y_tr_i).predict(X_va_i)

    # Every training row must have received a prediction from every model.
    if np.isnan(oof).any():
        raise RuntimeError("out-of-fold predictions are incomplete (NaN left)")
    oof_x, oof_h, oof_c, oof_l = oof

    # Segment edges + per-bin weights
    q_edges = np.quantile(y_tr, [0.0, 0.33, 0.66, 1.0])
    bins_tr = np.digitize(y_tr, q_edges[1:-1], right=True)
    y_tr_values = y_tr.values

    w_per_bin = []
    # The number of segments follows from the edges instead of a literal 3.
    for b in range(len(q_edges) - 1):
        m = (bins_tr == b)
        (wx, wh, wc, wl), _ = find_best_weights_grid4(
            y_tr_values[m], oof_x[m], oof_h[m], oof_c[m], oof_l[m],
            metric="rmse", grid=41
        )
        w_per_bin.append((wx, wh, wc, wl))
    print("[Segmented-4] weights per bin (low, mid, high):",
          [tuple(round(x, 3) for x in w) for w in w_per_bin])
    print()

    # Holdout training & report
    xgb, hub, cat, lgb = build_base_models()
    for model in (xgb, hub, cat, lgb):
        model.fit(X_tr, y_tr)
    metrics_report("XGBoost",  y_va, xgb.predict(X_va))
    metrics_report("Huber",    y_va, hub.predict(X_va))
    metrics_report("CatBoost", y_va, cat.predict(X_va))
    metrics_report("LightGBM", y_va, lgb.predict(X_va))
    seg4 = WeightedSegmentedBlend4(xgb, hub, cat, lgb, q_edges, w_per_bin)
    pred_seg4 = seg4.predict(X_va)
    metrics_report("Segmented Blend x4", y_va, pred_seg4)
    print()

    # Refit on ALL data
    xgb_full, hub_full, cat_full, lgb_full = build_base_models()
    for model in (xgb_full, hub_full, cat_full, lgb_full):
        model.fit(X, y)

    # Build a blended model using the full-data refits
    seg4_full = WeightedSegmentedBlend4(
        xgb_pipe=xgb_full,
        hub_pipe=hub_full,
        cat_pipe=cat_full,
        lgb_pipe=lgb_full,
        edges=q_edges,
        weights_per_bin=w_per_bin
    )

    # Attach a few attributes that help with sanity checks after loading
    seg4_full.feature_names_ = X.columns.tolist()
    seg4_full.versions_ = {
        "numpy": np.__version__,
        "pandas": pd.__version__,
    }

    # Optional persistence (compression level 3 trades size against speed)
    if model_path is not None:
        dump(seg4_full, model_path, compress=3)
        print(f"Saved {model_path}")

# Run the full workflow when this cell executes: load data, build
# out-of-fold predictions, print holdout metrics, then refit on all rows.
main()

Columns dropped (>40% missing): ['Alley', 'MasVnrType', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']




[Segmented-4] weights per bin (low, mid, high): [(0.0, 0.5, 0.5, 0.0), (0.0, 0.0, 1.0, 0.0), (0.375, 0.625, 0.0, 0.0)]

XGBoost                      | RMSE: 25,256.61 | MAE: 16,162.49 | MdAPE: 6.37% | R²: 0.9168
Huber                        | RMSE: 20,789.04 | MAE: 13,626.45 | MdAPE: 5.58% | R²: 0.9437
CatBoost                     | RMSE: 25,529.03 | MAE: 15,434.66 | MdAPE: 5.50% | R²: 0.9150




LightGBM                     | RMSE: 27,525.03 | MAE: 15,996.62 | MdAPE: 5.57% | R²: 0.9012




Segmented Blend x4           | RMSE: 20,949.73 | MAE: 13,196.60 | MdAPE: 4.73% | R²: 0.9428

