In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm

import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from sklearn.base import clone
from lightgbm import LGBMRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

train_path = "C:\\Users\\USER\\Desktop\\House price\\dataset.csv"
test_path = "C:\\Users\\USER\\Desktop\\House price\\test.csv"
sam_path = "C:\\Users\\USER\\Desktop\\House price\\sample_submission.csv"

print("Libraries installed successfully!")

Libraries installed successfully!


import data

In [95]:
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
sample_df = pd.read_csv(sam_path)

print(train_df.shape)
print(train_df.columns)
train_df.head()

(200000, 47)
       'join_status', 'join_year', 'latitude', 'longitude', 'area', 'city',
       'zoning', 'subdivision', 'present_use', 'land_val', 'imp_val',
       'year_built', 'year_reno', 'sqft_lot', 'sqft', 'sqft_1', 'sqft_fbsmt',
       'grade', 'fbsmt_grade', 'condition', 'stories', 'beds', 'bath_full',
       'bath_3qtr', 'bath_half', 'garb_sqft', 'gara_sqft', 'wfnt', 'golf',
       'greenbelt', 'noise_traffic', 'view_rainier', 'view_olympics',
       'view_cascades', 'view_territorial', 'view_skyline', 'view_sound',
       'view_lakewash', 'view_lakesamm', 'view_otherwater', 'view_other',
       'submarket'],
      dtype='object')


Unnamed: 0,id,sale_date,sale_price,sale_nbr,sale_warning,join_status,join_year,latitude,longitude,area,...,view_olympics,view_cascades,view_territorial,view_skyline,view_sound,view_lakewash,view_lakesamm,view_otherwater,view_other,submarket
0,0,2014-11-15,236000,2.0,,nochg,2025,47.2917,-122.3658,53,...,0,0,0,0,0,0,0,0,0,I
1,1,1999-01-15,313300,,26.0,nochg,2025,47.6531,-122.1996,74,...,0,0,0,0,0,1,0,0,0,Q
2,2,2006-08-15,341000,1.0,,nochg,2025,47.4733,-122.1901,30,...,0,0,0,0,0,0,0,0,0,K
3,3,1999-12-15,267000,1.0,,nochg,2025,47.4739,-122.3295,96,...,0,0,0,0,0,0,0,0,0,G
4,4,2018-07-15,1650000,2.0,,miss99,2025,47.7516,-122.1222,36,...,0,0,0,0,0,0,0,0,0,P


In [96]:
#計算缺失值
missing_values = train_df.isnull().sum()
missing_values[missing_values > 0].sort_values(ascending=False)

sale_nbr       42182
subdivision    17550
submarket       1717
dtype: int64

feature engineering

In [97]:
# 建立 train_encoded
processed_cols = []
train_encoded = pd.DataFrame()

# One-hot 
confirmed_one_hot = [
    'join_status', 'condition', 'stories', 'grade', 'fbsmt_grade', 'present_use'
]
onehot_df = pd.get_dummies(train_df[confirmed_one_hot], drop_first=False)
train_encoded = pd.concat([train_encoded, onehot_df], axis=1)
processed_cols += confirmed_one_hot


train_df['sale_date'] = pd.to_datetime(train_df['sale_date'], errors='coerce')
train_encoded['sale_year'] = train_df['sale_date'].dt.year
train_encoded['sale_month'] = train_df['sale_date'].dt.month
train_encoded['sale_season'] = ((train_encoded['sale_month'] - 1) // 3 + 1)
processed_cols += ['sale_date']

# 原始數值直接加入
direct_add_cols = [
    'id', 'sale_price', 'join_year', 'latitude', 'longitude',
    'area', 'land_val', 'imp_val', 'year_built', 'year_reno',
    'sqft_lot', 'sqft', 'sqft_1', 'sqft_fbsmt',
    'beds', 'garb_sqft', 'gara_sqft', 'golf', 'greenbelt',

    'bath_full', 'bath_3qtr', 'bath_half', 'wfnt', 'noise_traffic',
    'view_rainier', 'view_olympics', 'view_cascades', 'view_territorial',
    'view_skyline', 'view_sound', 'view_lakewash', 'view_lakesamm',
    'view_otherwater', 'view_other'
    #'subdivision','sale_nbr'  to much missing value
]
for col in direct_add_cols:
    train_encoded[col] = train_df[col]
processed_cols += direct_add_cols

# 統整城市、市場與銷售警告資訊
top_cities = train_df['city'].value_counts().nlargest(10).index.tolist()
top_supermarket = train_df['submarket'].value_counts().nlargest(10).index.tolist()
top_sale_warning = train_df['sale_warning'].value_counts().nlargest(15).index.tolist()

train_encoded['city_simplified'] = train_df['city'].apply(lambda x: x if x in top_cities else 'other')
train_encoded['submarket_simplified'] = train_df['submarket'].apply(lambda x: x if x in top_supermarket else 'other')
train_encoded['sale_warning_simplified'] = train_df['sale_warning'].apply(lambda x: x if x in top_sale_warning else 'other')

city_dummy = pd.get_dummies(train_encoded['city_simplified'], prefix='city', drop_first=False)
submarket_dummy = pd.get_dummies(train_encoded['submarket_simplified'], prefix='submarket', drop_first=False)
sale_warning_dummy = pd.get_dummies(train_encoded['sale_warning_simplified'], prefix='sale_warning', drop_first=False)
train_encoded = pd.concat([train_encoded, city_dummy, submarket_dummy, sale_warning_dummy], axis=1)
processed_cols += ['city', 'submarket', 'sale_warning']

# Zoning 群組分類
def zoning_group_classify(z):
    if pd.isna(z): return 'other'
    z = z.upper()
    if 'SF' in z: return 'SF'
    elif 'MR' in z: return 'MR'
    elif 'NC' in z: return 'NC'
    elif 'HR' in z or 'IG' in z: return 'other'
    elif 'P' in z: return 'P'
    return 'other'

train_encoded['zoning_group'] = train_df['zoning'].apply(zoning_group_classify)
zoning_dummy = pd.get_dummies(train_encoded['zoning_group'], prefix='zoning_group', drop_first=False)
train_encoded = pd.concat([train_encoded, zoning_dummy], axis=1)
train_encoded.drop(columns=['zoning_group'], inplace=True)
processed_cols += ['zoning']


# 碎片化資訊統整成新欄位
train_encoded['age'] = train_encoded['sale_year'] - train_encoded['year_built']
train_encoded['renovated'] = np.where(train_encoded['year_reno'] > 0, 1, 0)
train_encoded['years_since_reno'] = np.where(train_encoded['renovated'], train_encoded['sale_year'] - train_encoded['year_reno'], 0)
train_encoded['total_baths'] = train_encoded['bath_full'] + 0.75 * train_encoded['bath_3qtr'] + 0.5 * train_encoded['bath_half']
train_encoded['total_value'] = train_encoded['land_val'] + train_encoded['imp_val']
train_encoded['living_area'] = train_encoded['sqft'] + train_encoded['sqft_fbsmt']

# 刪除用完的簡化文字類欄位
for col in ['city_simplified', 'submarket_simplified', 'sale_warning_simplified']:
    train_encoded.drop(columns=[col], inplace=True)


#新增特徵
non_zero_lot = train_encoded.loc[train_encoded["sqft_lot"] > 0, "sqft_lot"]
min_val = non_zero_lot.min()
median_val = non_zero_lot.median()

train_encoded["sqft_lot"] = train_encoded["sqft_lot"].replace(0, median_val)

#新增特徵
train_encoded["floor_ratio"] = np.where(
    train_encoded["sqft_lot"] == 0,
    0,
    train_encoded["sqft"] / train_encoded["sqft_lot"]
)

train_encoded["is_large_house"] = (train_encoded["sqft"] > 3000).astype(int)
train_encoded["is_recent_reno"] = (train_encoded["years_since_reno"] <= 5).astype(int)
train_encoded["bath_per_bed"] = train_encoded["total_baths"] / train_encoded["beds"]
train_encoded["bath_per_bed"] = train_encoded["bath_per_bed"].replace([np.inf, -np.inf], 0).fillna(0)

# 屋齡區間
train_encoded["age_bin"] = pd.cut(
    train_encoded["age"],
    bins=[-1, 10, 30, 60, 200],
    labels=["0-10", "11-30", "31-60", "60+"]
)
age_dummies = pd.get_dummies(train_encoded["age_bin"], prefix="age_bin")
train_encoded = pd.concat([train_encoded, age_dummies], axis=1)
train_encoded.drop(columns=["age_bin"], inplace=True)


In [98]:
cluster_features = ['latitude', 'longitude', 'area', 'sqft', 'total_value']

# 標準化
scaler = StandardScaler()
train_cluster_scaled = scaler.fit_transform(train_encoded[cluster_features])

# 建立 KMeans 群組（建議先從 10 群開始）
kmeans = KMeans(n_clusters=10, random_state=42, n_init='auto')
train_encoded['region_cluster'] = kmeans.fit_predict(train_cluster_scaled)

# One-hot 編碼
cluster_ohe = pd.get_dummies(train_encoded['region_cluster'], prefix="region")
train_encoded = pd.concat([train_encoded, cluster_ohe], axis=1)

# 移除原始 cluster id（因為是類別）
train_encoded.drop(columns=['region_cluster'], inplace=True)

In [99]:
pca_features = ['latitude', 'longitude', 'sqft', 'area', 'total_value', 'imp_val']

# 🔃 標準化 → PCA → KMeans
scaler = StandardScaler()
X_scaled = scaler.fit_transform(train_encoded[pca_features])

pca = PCA(n_components=3, random_state=42)
X_pca = pca.fit_transform(X_scaled)

kmeans = KMeans(n_clusters=10, random_state=42)
train_encoded['pca_region_cluster'] = kmeans.fit_predict(X_pca)

# 🔄 One-hot 編碼
region_dummies = pd.get_dummies(train_encoded['pca_region_cluster'], prefix='pca_region')
train_encoded = pd.concat([train_encoded, region_dummies], axis=1)

train_encoded.drop(columns=['pca_region_cluster'], inplace=True)

  super()._check_params_vs_input(X, default_n_init=10)


In [100]:
def clean_features(df):
    import numpy as np

    # 建議 log1p 處理（避免極端偏態影響模型）
    log_cols = ['land_val', 'imp_val', 'sqft_lot', 'garb_sqft', 'floor_ratio', 'total_value']
    for col in log_cols:
        if col in df.columns:
            df[col] = np.log1p(df[col])

    # clip 上限值（可選，如果你不 log）
    clip_cols = ['land_val', 'imp_val', 'sqft_lot']
    for col in clip_cols:
        if col in df.columns:
            df[col] = df[col].clip(upper=1_000_000)
            
    return df

In [101]:
train_encoded = clean_features(train_encoded)

In [102]:
direct_add_cols = [
    'id', 'join_year', 'latitude', 'longitude',
    'area', 'land_val', 'imp_val', 'year_built', 'year_reno',
    'sqft_lot', 'sqft', 'sqft_1', 'sqft_fbsmt',
    'beds', 'garb_sqft', 'gara_sqft', 'golf', 'greenbelt',

    'bath_full', 'bath_3qtr', 'bath_half',
    'wfnt', 'noise_traffic',
    'view_rainier', 'view_olympics', 'view_cascades', 'view_territorial',
    'view_skyline', 'view_sound', 'view_lakewash', 'view_lakesamm',
    'view_otherwater', 'view_other'
    #'subdivision','sale_nbr'沒有做這個 用意不大
]

In [103]:
# 建立 test_encoded 空表
test_encoded = pd.DataFrame()

# 1. One-hot 欄位
test_onehot_df = pd.get_dummies(test_df[confirmed_one_hot], drop_first=False)
test_encoded = pd.concat([test_encoded, test_onehot_df], axis=1)

# 2. 日期處理
test_df['sale_date'] = pd.to_datetime(test_df['sale_date'], errors='coerce')
test_encoded['sale_year'] = test_df['sale_date'].dt.year
test_encoded['sale_month'] = test_df['sale_date'].dt.month
test_encoded['sale_season'] = ((test_encoded['sale_month'] - 1) // 3 + 1)

# 3. 加入 direct_add_cols 欄位
for col in direct_add_cols:
    test_encoded[col] = test_df[col]

# 4. city / submarket / sale_warning (simplified)
test_encoded['city_simplified'] = test_df['city'].apply(lambda x: x if x in top_cities else 'other')
city_dummy = pd.get_dummies(test_encoded['city_simplified'], prefix='city', drop_first=False)
test_encoded = pd.concat([test_encoded, city_dummy], axis=1)

test_encoded['submarket_simplified'] = test_df['submarket'].apply(lambda x: x if x in top_supermarket else 'other')
submarket_dummy = pd.get_dummies(test_encoded['submarket_simplified'], prefix='submarket', drop_first=False)
test_encoded = pd.concat([test_encoded, submarket_dummy], axis=1)

test_encoded['sale_warning_simplified'] = test_df['sale_warning'].apply(lambda x: x if x in top_sale_warning else 'other')
sale_warning_dummy = pd.get_dummies(test_encoded['sale_warning_simplified'], prefix='sale_warning', drop_first=False)
test_encoded = pd.concat([test_encoded, sale_warning_dummy], axis=1)

# 5. Zoning 分群 One-hot
test_encoded['zoning_group'] = test_df['zoning'].apply(zoning_group_classify)
zoning_dummy = pd.get_dummies(test_encoded['zoning_group'], prefix='zoning_group', drop_first=False)
test_encoded = pd.concat([test_encoded, zoning_dummy], axis=1)
test_encoded.drop(columns=['zoning_group', 'city_simplified', 'submarket_simplified', 'sale_warning_simplified'], inplace=True)


#碎片化資訊統整成新欄位
test_encoded['age'] = test_encoded['sale_year'] - test_encoded['year_built']
test_encoded['renovated'] = np.where(test_encoded['year_reno'] > 0, 1, 0)
test_encoded['years_since_reno'] = np.where(test_encoded['renovated'], test_encoded['sale_year'] - test_encoded['year_reno'], 0)
test_encoded['total_baths'] = test_encoded['bath_full'] + 0.75 * test_encoded['bath_3qtr'] + 0.5 * test_encoded['bath_half']
test_encoded['total_value'] = test_encoded['land_val'] + test_encoded['imp_val']
test_encoded['living_area'] = test_encoded['sqft'] + test_encoded['sqft_fbsmt']

non_zero_lot = test_encoded.loc[test_encoded["sqft_lot"] > 0, "sqft_lot"]
min_val = non_zero_lot.min()
median_val = non_zero_lot.median()

test_encoded["sqft_lot"] = test_encoded["sqft_lot"].replace(0, median_val)


test_encoded["floor_ratio"] = np.where(
    test_encoded["sqft_lot"] == 0,
    0,  # 或其他替代值，例如平均值
    test_encoded["sqft"] / test_encoded["sqft_lot"]
)

test_encoded["is_large_house"] = (test_encoded["sqft"] > 3000).astype(int)
test_encoded["is_recent_reno"] = (test_encoded["years_since_reno"] <= 5).astype(int)
test_encoded["bath_per_bed"] = test_encoded["total_baths"] / test_encoded["beds"]
test_encoded["bath_per_bed"] = test_encoded["bath_per_bed"].replace([np.inf, -np.inf], 0).fillna(0)

# 年齡區間分箱
test_encoded["age_bin"] = pd.cut(
    test_encoded["age"],
    bins=[-1, 10, 30, 60, 200],
    labels=["0-10", "11-30", "31-60", "60+"]
)
age_dummies = pd.get_dummies(test_encoded["age_bin"], prefix="age_bin")
test_encoded = pd.concat([test_encoded, age_dummies], axis=1)
test_encoded.drop(columns=["age_bin"], inplace=True)


In [104]:
cluster_features = ['latitude', 'longitude', 'area', 'sqft', 'total_value']

# 標準化
scaler = StandardScaler()
test_cluster_scaled = scaler.fit_transform(test_encoded[cluster_features])

# 建立 KMeans 群組（建議先從 10 群開始）
kmeans = KMeans(n_clusters=10, random_state=42, n_init='auto')
test_encoded['region_cluster'] = kmeans.fit_predict(test_cluster_scaled)

# One-hot 編碼
cluster_ohe = pd.get_dummies(test_encoded['region_cluster'], prefix="region")
test_encoded = pd.concat([test_encoded, cluster_ohe], axis=1)

# 移除原始 cluster id（因為是類別）
test_encoded.drop(columns=['region_cluster'], inplace=True)

In [105]:
pca_features = ['latitude', 'longitude', 'sqft', 'area', 'total_value', 'imp_val']

# 🔃 標準化 → PCA → KMeans
scaler = StandardScaler()
X_scaled = scaler.fit_transform(test_encoded[pca_features])

pca = PCA(n_components=3, random_state=42)
X_pca = pca.fit_transform(X_scaled)

kmeans = KMeans(n_clusters=10, random_state=42)
test_encoded['pca_region_cluster'] = kmeans.fit_predict(X_pca)

# 🔄 One-hot 編碼
region_dummies = pd.get_dummies(test_encoded['pca_region_cluster'], prefix='pca_region')
test_encoded = pd.concat([test_encoded, region_dummies], axis=1)

test_encoded.drop(columns=['pca_region_cluster'], inplace=True)

  super()._check_params_vs_input(X, default_n_init=10)


In [106]:
test_encoded = clean_features(test_encoded)

In [107]:
#確認資料類型
print(train_encoded.dtypes.value_counts())
print(test_encoded.dtypes.value_counts())

bool       75
int64      35
float64    11
int32       6
Name: count, dtype: int64
bool       75
int64      34
float64    11
int32       6
Name: count, dtype: int64


In [108]:
test_numeric = train_encoded.select_dtypes(include=[np.number])

# 檢查是否有 inf 或 NaN
print("has inf:", np.isinf(test_numeric.to_numpy()).any())
print("has NaN:", test_numeric.isnull().any().any())

# 看是哪些欄位出問題
print("inf columns:", test_numeric.columns[np.isinf(test_numeric.to_numpy()).any(axis=0)].tolist())
print("NaN columns:", test_numeric.columns[test_numeric.isnull().any()].tolist())

has inf: False
has NaN: False
inf columns: []
NaN columns: []


stack model to predict the best lower and upper

LGBM Quantile

In [109]:
# 分割訓練特徵與目標
X = train_encoded.drop(columns=['sale_price', 'id'])  # id 可留給最後輸出
y = train_encoded['sale_price']
test_encoded = test_encoded.drop(columns=['id'])

test_encoded = test_encoded[X.columns]

In [110]:
def get_oof_preds(X, y, X_test, model_name='xgb', n_splits=5, random_state=42):
    """
    為指定的 model_name 做上下限 OOF 預測 + test 預測
    傳回 oof_lower, oof_upper, test_lower, test_upper 四組預測結果
    """
    oof_lower = np.zeros(len(X))
    oof_upper = np.zeros(len(X))
    test_preds_lower = []
    test_preds_upper = []

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train = y.iloc[train_idx]

        y_lower_train = y_train * 0.9
        y_upper_train = y_train * 1.1

        # 決定模型
        if model_name == 'xgb':
            model_lower = xgb.XGBRegressor(n_estimators=500, max_depth=6, learning_rate=0.01,
                                           eval_metric='rmse', tree_method='gpu_hist', random_state=fold)
            model_upper = clone(model_lower)

        elif model_name == 'lgb':
            model_lower = lgb.LGBMRegressor(n_estimators=500, max_depth=6, learning_rate=0.01,
                                            objective='quantile', alpha=0.05, device='gpu', random_state=fold)
            model_upper = lgb.LGBMRegressor(n_estimators=500, max_depth=6, learning_rate=0.01,
                                            objective='quantile', alpha=0.95, device='gpu', random_state=fold)
            
        elif model_name == 'ridge':
            model_lower = Ridge(alpha=1.0)
            model_upper = Ridge(alpha=1.0)
        else:
            raise ValueError(f"Unknown model_name: {model_name}")

        # 訓練與預測
        model_lower.fit(X_train, y_lower_train)
        model_upper.fit(X_train, y_upper_train)

        oof_lower[val_idx] = model_lower.predict(X_val)
        oof_upper[val_idx] = model_upper.predict(X_val)

        test_preds_lower.append(model_lower.predict(X_test))
        test_preds_upper.append(model_upper.predict(X_test))

    # 對 test 預測平均（n fold）
    test_lower = np.mean(test_preds_lower, axis=0)
    test_upper = np.mean(test_preds_upper, axis=0)

    return oof_lower, oof_upper, test_lower, test_upper

In [111]:
# 為 XGBoost 建立 OOF 預測
xgb_lower_oof, xgb_upper_oof, test_xgb_lower, test_xgb_upper = get_oof_preds(X, y, test_encoded, model_name='xgb')

# 為 LightGBM 建立 OOF 預測
lgb_lower_oof, lgb_upper_oof, test_lgb_lower, test_lgb_upper = get_oof_preds(X, y, test_encoded, model_name='lgb')

# 為 Ridge 建立 OOF 預測
ridge_lower_oof, ridge_upper_oof, test_ridge_lower, test_ridge_upper = get_oof_preds(X, y, test_encoded, model_name='ridge')


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 4246
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 125
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 2060, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 32 dense feature groups (4.88 MB) transferred to GPU in 0.008933 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 166500.000000
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 4246
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 125
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 2060, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry:

In [112]:
def winkler_score(y_true, lower, upper, alpha=0.1):
    width = upper - lower
    below = np.maximum(lower - y_true, 0)
    above = np.maximum(y_true - upper, 0)
    return width + (2 / alpha) * (below + above)

In [113]:
def oof_and_hill_climb_with_meta(X_meta, y, alpha=0.1, n_splits=5, seed=42, steps=100):
    

    n = len(X_meta)
    oof_lowers = np.zeros(n)
    oof_uppers = np.zeros(n)

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)

    # ---- [1] 建立 meta learner（用 LGBM 或 Ridge 都可以） ----
    meta_lower_model = LGBMRegressor(random_state=seed, n_estimators=500, learning_rate=0.05)
    meta_upper_model = LGBMRegressor(random_state=seed, n_estimators=500, learning_rate=0.05)

    for fold, (train_idx, val_idx) in enumerate(kf.split(X_meta)):
        X_train, X_val = X_meta.iloc[train_idx], X_meta.iloc[val_idx]
        y_train = y.iloc[train_idx]

        y_lower_train = y_train * 0.9 - 1000
        y_upper_train = y_train * 1.1 + 1000

        lower_model = clone(meta_lower_model)
        upper_model = clone(meta_upper_model)

        lower_model.fit(X_train, y_lower_train)
        upper_model.fit(X_train, y_upper_train)

        oof_lowers[val_idx] = lower_model.predict(X_val)
        oof_uppers[val_idx] = upper_model.predict(X_val)

    # ---- [2] hill climbing 尋找最佳 w1, w2 組合 ----
    current_w1 = 0.4
    current_w2 = 0.6
    best_score = np.inf
    best_weights = (current_w1, current_w2)

    for step in range(steps):
        perturb1 = np.random.dirichlet([9])[0] - 0.9
        perturb2 = np.random.dirichlet([9])[0] - 0.9

        w1 = np.clip(current_w1 + 0.1 * perturb1, 0, 1)
        w2 = np.clip(current_w2 + 0.1 * perturb2, 0, 1)

        lower_combined = w1 * oof_lowers + (1 - w1) * oof_uppers
        upper_combined = w2 * oof_uppers + (1 - w2) * oof_lowers

        lower_combined, upper_combined = np.minimum(lower_combined, upper_combined), np.maximum(lower_combined, upper_combined)

        score = np.mean(winkler_score(y, lower_combined, upper_combined, alpha))

        if score < best_score:
            best_score = score
            best_weights = (w1, w2)
            current_w1, current_w2 = w1, w2
            print(f"[Step {step}] ✅ Improved Score: {score:.2f} (w1: {w1:.4f}, w2: {w2:.4f})")

    return oof_lowers, oof_uppers, best_weights, best_score

In [114]:
X_meta = pd.DataFrame({
    'xgb_lower': xgb_lower_oof,
    'xgb_upper': xgb_upper_oof,
    'lgb_lower': lgb_lower_oof,
    'lgb_upper': lgb_upper_oof,
    'ridge_lower': ridge_lower_oof,
    'ridge_upper': ridge_upper_oof,

    # 平均預測（提升穩定性）
    'avg_lower': (xgb_lower_oof + lgb_lower_oof + ridge_lower_oof) / 3,
    'avg_upper': (xgb_upper_oof + lgb_upper_oof + ridge_upper_oof) / 3,

    # 區間寬度（模型可能學會自動拉寬）
    'interval_width_xgb': xgb_upper_oof - xgb_lower_oof,
    'interval_width_lgb': lgb_upper_oof - lgb_lower_oof,
    'interval_width_ridge': ridge_upper_oof - ridge_lower_oof,

    # 極端判斷（判斷是否有反轉）
    'lower_gt_upper_lgb': (lgb_lower_oof > lgb_upper_oof).astype(int)
})

In [115]:
oof_lowers, oof_uppers, best_weight, best_score = oof_and_hill_climb_with_meta(X_meta, y)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006058 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2805
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 11
[LightGBM] [Info] Start training from score 525018.728175
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007067 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2805
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 11
[LightGBM] [Info] Start training from score 643911.778866
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003591 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2805
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 11
[LightGBM] [In

before plus floor ratio~bath per bed score = 341636.83
after plus new feature score = 342013
after adjust model = 387816

輸出test LGBM

In [15]:
#填補缺漏欄位（對齊訓練集欄位）
missing_cols = set(X.columns) - set(test_encoded.columns)
for col in missing_cols:
    test_encoded[col] = 0

# 確保欄位順序一致
test_encoded = test_encoded[X.columns]

In [None]:
model_lower = models["lower"]
model_upper = models["upper"]

final_model_lower = clone(model_lower).fit(X, y)
final_model_upper = clone(model_upper).fit(X, y)

'''test_lower = final_model_lower.predict(test_encoded)
test_upper = final_model_upper.predict(test_encoded)

final_lower = best_weight * test_lower
final_upper = best_weight * test_upper

final_lower, final_upper = np.minimum(final_lower, final_upper), np.maximum(final_lower, final_upper)
final_lower = np.maximum(final_lower, 0)'''


test_lower = final_model_lower.predict(test_encoded)
test_upper = final_model_upper.predict(test_encoded)

final_lower = w1 * test_lower + (1 - w1) * test_upper
final_upper = w2 * test_upper + (1 - w2) * test_lower

final_lower, final_upper = np.minimum(final_lower, final_upper), np.maximum(final_lower, final_upper)
final_lower = np.maximum(final_lower, 0)  # optional

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 3845
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 97
[LightGBM] [Info] Using GPU Device: Intel(R) UHD Graphics 630, Vendor: Intel(R) Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 27 dense feature groups (4.27 MB) transferred to GPU in 0.005452 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 185000.000000
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 3845
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 97
[LightGBM] [Info] Using GPU Device: Intel(R) UHD Graphics 630, Vendor: Intel(R) Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin 

In [21]:
submission_df = pd.read_csv('sample_submission.csv')
submission_df.head()
test_encoded['id'] = test_df['id']  # 這行先補上 id

In [22]:
submission_df = pd.DataFrame({
    'id': test_encoded['id'],  # 必須與 sample_submission 對齊
    'pi_lower': final_lower,
    'pi_upper': final_upper
})

# 輸出成 CSV
submission_df.to_csv('lgbm_predict.csv', index=False)
print(submission_df.head())

       id       pi_lower      pi_upper
0  200000  823500.777056  1.125383e+06
1  200001  590158.900174  7.270068e+05
2  200002  469229.953050  6.502398e+05
3  200003  315274.697956  4.363184e+05
4  200004  413919.532224  6.498657e+05
