In [239]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import KFold
from sklearn.base import clone
import lightgbm as lgb
from tqdm import tqdm

import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA


train_path = "C:\\Users\\USER\\Desktop\\House price\\dataset.csv"
test_path = "C:\\Users\\USER\\Desktop\\House price\\test.csv"
sam_path = "C:\\Users\\USER\\Desktop\\House price\\sample_submission.csv"

print("Libraries installed successfully!")

Libraries installed successfully!


import data

In [240]:
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
sample_df = pd.read_csv(sam_path)

print(train_df.shape)
print(train_df.columns)
train_df.head()

(200000, 47)
       'join_status', 'join_year', 'latitude', 'longitude', 'area', 'city',
       'zoning', 'subdivision', 'present_use', 'land_val', 'imp_val',
       'year_built', 'year_reno', 'sqft_lot', 'sqft', 'sqft_1', 'sqft_fbsmt',
       'grade', 'fbsmt_grade', 'condition', 'stories', 'beds', 'bath_full',
       'bath_3qtr', 'bath_half', 'garb_sqft', 'gara_sqft', 'wfnt', 'golf',
       'greenbelt', 'noise_traffic', 'view_rainier', 'view_olympics',
       'view_cascades', 'view_territorial', 'view_skyline', 'view_sound',
       'view_lakewash', 'view_lakesamm', 'view_otherwater', 'view_other',
       'submarket'],
      dtype='object')


Unnamed: 0,id,sale_date,sale_price,sale_nbr,sale_warning,join_status,join_year,latitude,longitude,area,...,view_olympics,view_cascades,view_territorial,view_skyline,view_sound,view_lakewash,view_lakesamm,view_otherwater,view_other,submarket
0,0,2014-11-15,236000,2.0,,nochg,2025,47.2917,-122.3658,53,...,0,0,0,0,0,0,0,0,0,I
1,1,1999-01-15,313300,,26.0,nochg,2025,47.6531,-122.1996,74,...,0,0,0,0,0,1,0,0,0,Q
2,2,2006-08-15,341000,1.0,,nochg,2025,47.4733,-122.1901,30,...,0,0,0,0,0,0,0,0,0,K
3,3,1999-12-15,267000,1.0,,nochg,2025,47.4739,-122.3295,96,...,0,0,0,0,0,0,0,0,0,G
4,4,2018-07-15,1650000,2.0,,miss99,2025,47.7516,-122.1222,36,...,0,0,0,0,0,0,0,0,0,P


In [241]:
#計算缺失值
missing_values = train_df.isnull().sum()
missing_values[missing_values > 0].sort_values(ascending=False)

sale_nbr       42182
subdivision    17550
submarket       1717
dtype: int64

feature engineering

In [262]:
# 建立 train_encoded
processed_cols = []
train_encoded = pd.DataFrame()

# One-hot 
confirmed_one_hot = [
    'join_status', 'condition', 'stories', 'grade', 'fbsmt_grade', 'present_use'
]
onehot_df = pd.get_dummies(train_df[confirmed_one_hot], drop_first=False)
train_encoded = pd.concat([train_encoded, onehot_df], axis=1)
processed_cols += confirmed_one_hot


train_df['sale_date'] = pd.to_datetime(train_df['sale_date'], errors='coerce')
train_encoded['sale_year'] = train_df['sale_date'].dt.year
train_encoded['sale_month'] = train_df['sale_date'].dt.month
train_encoded['sale_season'] = ((train_encoded['sale_month'] - 1) // 3 + 1)
processed_cols += ['sale_date']

# 原始數值直接加入
direct_add_cols = [
    'id', 'sale_price', 'join_year', 'latitude', 'longitude',
    'area', 'land_val', 'imp_val', 'year_built', 'year_reno',
    'sqft_lot', 'sqft', 'sqft_1', 'sqft_fbsmt',
    'beds', 'garb_sqft', 'gara_sqft', 'golf', 'greenbelt',

    'bath_full', 'bath_3qtr', 'bath_half', 'wfnt', 'noise_traffic',
    'view_rainier', 'view_olympics', 'view_cascades', 'view_territorial',
    'view_skyline', 'view_sound', 'view_lakewash', 'view_lakesamm',
    'view_otherwater', 'view_other'
    #'subdivision','sale_nbr'  to much missing value
]
for col in direct_add_cols:
    train_encoded[col] = train_df[col]
processed_cols += direct_add_cols

# 統整城市、市場與銷售警告資訊
top_cities = train_df['city'].value_counts().nlargest(10).index.tolist()
top_supermarket = train_df['submarket'].value_counts().nlargest(10).index.tolist()
top_sale_warning = train_df['sale_warning'].value_counts().nlargest(15).index.tolist()

train_encoded['city_simplified'] = train_df['city'].apply(lambda x: x if x in top_cities else 'other')
train_encoded['submarket_simplified'] = train_df['submarket'].apply(lambda x: x if x in top_supermarket else 'other')
train_encoded['sale_warning_simplified'] = train_df['sale_warning'].apply(lambda x: x if x in top_sale_warning else 'other')

city_dummy = pd.get_dummies(train_encoded['city_simplified'], prefix='city', drop_first=False)
submarket_dummy = pd.get_dummies(train_encoded['submarket_simplified'], prefix='submarket', drop_first=False)
sale_warning_dummy = pd.get_dummies(train_encoded['sale_warning_simplified'], prefix='sale_warning', drop_first=False)
train_encoded = pd.concat([train_encoded, city_dummy, submarket_dummy, sale_warning_dummy], axis=1)
#train_encoded = pd.concat([train_encoded, city_dummy], axis=1)
processed_cols += ['city', 'submarket', 'sale_warning']

# Zoning 群組分類
def zoning_group_classify(z):
    if pd.isna(z): return 'other'
    z = z.upper()
    if 'SF' in z: return 'SF'
    elif 'MR' in z: return 'MR'
    elif 'NC' in z: return 'NC'
    elif 'HR' in z or 'IG' in z: return 'other'
    elif 'P' in z: return 'P'
    return 'other'

train_encoded['zoning_group'] = train_df['zoning'].apply(zoning_group_classify)
zoning_dummy = pd.get_dummies(train_encoded['zoning_group'], prefix='zoning_group', drop_first=False)
train_encoded = pd.concat([train_encoded, zoning_dummy], axis=1)
train_encoded.drop(columns=['zoning_group'], inplace=True)
processed_cols += ['zoning']


# 碎片化資訊統整成新欄位
train_encoded['age'] = train_encoded['sale_year'] - train_encoded['year_built']
train_encoded['renovated'] = np.where(train_encoded['year_reno'] > 0, 1, 0)
train_encoded['years_since_reno'] = np.where(train_encoded['renovated'], train_encoded['sale_year'] - train_encoded['year_reno'], 0)
train_encoded['total_baths'] = train_encoded['bath_full'] + 0.75 * train_encoded['bath_3qtr'] + 0.5 * train_encoded['bath_half']
train_encoded['total_value'] = train_encoded['land_val'] + train_encoded['imp_val']
train_encoded['living_area'] = train_encoded['sqft'] + train_encoded['sqft_fbsmt']

# 刪除用完的簡化文字類欄位
for col in ['city_simplified', 'submarket_simplified', 'sale_warning_simplified']:
    train_encoded.drop(columns=[col], inplace=True)

#for col in ['city_simplified']:
#    train_encoded.drop(columns=[col], inplace=True)
#新增特徵
non_zero_lot = train_encoded.loc[train_encoded["sqft_lot"] > 0, "sqft_lot"]
min_val = non_zero_lot.min()
median_val = non_zero_lot.median()

train_encoded["sqft_lot"] = train_encoded["sqft_lot"].replace(0, median_val)

#新增特徵
train_encoded["floor_ratio"] = np.where(
    train_encoded["sqft_lot"] == 0,
    0,
    train_encoded["sqft"] / train_encoded["sqft_lot"]
)

train_encoded["is_large_house"] = (train_encoded["sqft"] > 3000).astype(int)
train_encoded["is_recent_reno"] = (train_encoded["years_since_reno"] <= 5).astype(int)
train_encoded["bath_per_bed"] = train_encoded["total_baths"] / train_encoded["beds"]
train_encoded["bath_per_bed"] = train_encoded["bath_per_bed"].replace([np.inf, -np.inf], 0).fillna(0)


In [None]:
pca_features = ['latitude', 'longitude', 'sqft', 'area', 'total_value', 'imp_val']

# 🔃 標準化 → PCA → KMeans
scaler = StandardScaler()
X_scaled = scaler.fit_transform(train_encoded[pca_features])

pca = PCA(n_components=3, random_state=42)
X_pca = pca.fit_transform(X_scaled)

kmeans = KMeans(n_clusters=10, random_state=42)
train_encoded['pca_region_cluster'] = kmeans.fit_predict(X_pca)

# 🔄 One-hot 編碼
region_dummies = pd.get_dummies(train_encoded['pca_region_cluster'], prefix='pca_region')
train_encoded = pd.concat([train_encoded, region_dummies], axis=1)

train_encoded.drop(columns=['pca_region_cluster'], inplace=True)

  super()._check_params_vs_input(X, default_n_init=10)


In [265]:
def clean_features(df):
    import numpy as np

    # 建議 log1p 處理（避免極端偏態影響模型）
    log_cols = ['land_val', 'imp_val', 'sqft_lot', 'garb_sqft', 'floor_ratio', 'total_value']
    #log_cols = ['land_val', 'imp_val', 'sqft_lot', 'garb_sqft', 'total_value']
    for col in log_cols:
        if col in df.columns:
            df[col] = np.log1p(df[col])

    # clip 上限值（可選，如果你不 log）
    clip_cols = ['land_val', 'imp_val', 'sqft_lot']
    for col in clip_cols:
        if col in df.columns:
            df[col] = df[col].clip(upper=1_000_000)
            
    return df

In [266]:
train_encoded = clean_features(train_encoded)

In [267]:
direct_add_cols = [
    'id', 'join_year', 'latitude', 'longitude',
    'area', 'land_val', 'imp_val', 'year_built', 'year_reno',
    'sqft_lot', 'sqft', 'sqft_1', 'sqft_fbsmt',
    'beds', 'garb_sqft', 'gara_sqft', 'golf', 'greenbelt',

    'bath_full', 'bath_3qtr', 'bath_half',
    'wfnt', 'noise_traffic',
    'view_rainier', 'view_olympics', 'view_cascades', 'view_territorial',
    'view_skyline', 'view_sound', 'view_lakewash', 'view_lakesamm',
    'view_otherwater', 'view_other'
    #'subdivision','sale_nbr'沒有做這個 用意不大
]

In [268]:
# 建立 test_encoded 空表
test_encoded = pd.DataFrame()

# 1. One-hot 欄位
test_onehot_df = pd.get_dummies(test_df[confirmed_one_hot], drop_first=False)
test_encoded = pd.concat([test_encoded, test_onehot_df], axis=1)

# 2. 日期處理
test_df['sale_date'] = pd.to_datetime(test_df['sale_date'], errors='coerce')
test_encoded['sale_year'] = test_df['sale_date'].dt.year
test_encoded['sale_month'] = test_df['sale_date'].dt.month
test_encoded['sale_season'] = ((test_encoded['sale_month'] - 1) // 3 + 1)

# 3. 加入 direct_add_cols 欄位
for col in direct_add_cols:
    test_encoded[col] = test_df[col]

# 4. city / submarket / sale_warning (simplified)
test_encoded['city_simplified'] = test_df['city'].apply(lambda x: x if x in top_cities else 'other')
city_dummy = pd.get_dummies(test_encoded['city_simplified'], prefix='city', drop_first=False)
test_encoded = pd.concat([test_encoded, city_dummy], axis=1)

test_encoded['submarket_simplified'] = test_df['submarket'].apply(lambda x: x if x in top_supermarket else 'other')
submarket_dummy = pd.get_dummies(test_encoded['submarket_simplified'], prefix='submarket', drop_first=False)
test_encoded = pd.concat([test_encoded, submarket_dummy], axis=1)

test_encoded['sale_warning_simplified'] = test_df['sale_warning'].apply(lambda x: x if x in top_sale_warning else 'other')
sale_warning_dummy = pd.get_dummies(test_encoded['sale_warning_simplified'], prefix='sale_warning', drop_first=False)
test_encoded = pd.concat([test_encoded, sale_warning_dummy], axis=1)

# 5. Zoning 分群 One-hot
test_encoded['zoning_group'] = test_df['zoning'].apply(zoning_group_classify)
zoning_dummy = pd.get_dummies(test_encoded['zoning_group'], prefix='zoning_group', drop_first=False)
test_encoded = pd.concat([test_encoded, zoning_dummy], axis=1)
test_encoded.drop(columns=['zoning_group', 'city_simplified', 'submarket_simplified', 'sale_warning_simplified'], inplace=True)
#test_encoded.drop(columns=['zoning_group', 'city_simplified'], inplace=True)

#碎片化資訊統整成新欄位
test_encoded['age'] = test_encoded['sale_year'] - test_encoded['year_built']
test_encoded['renovated'] = np.where(test_encoded['year_reno'] > 0, 1, 0)
test_encoded['years_since_reno'] = np.where(test_encoded['renovated'], test_encoded['sale_year'] - test_encoded['year_reno'], 0)
test_encoded['total_baths'] = test_encoded['bath_full'] + 0.75 * test_encoded['bath_3qtr'] + 0.5 * test_encoded['bath_half']
test_encoded['total_value'] = test_encoded['land_val'] + test_encoded['imp_val']
test_encoded['living_area'] = test_encoded['sqft'] + test_encoded['sqft_fbsmt']

non_zero_lot = test_encoded.loc[test_encoded["sqft_lot"] > 0, "sqft_lot"]
min_val = non_zero_lot.min()
median_val = non_zero_lot.median()

test_encoded["sqft_lot"] = test_encoded["sqft_lot"].replace(0, median_val)


test_encoded["floor_ratio"] = np.where(
    test_encoded["sqft_lot"] == 0,
    0,  # 或其他替代值，例如平均值
    test_encoded["sqft"] / test_encoded["sqft_lot"]
)

test_encoded["is_large_house"] = (test_encoded["sqft"] > 3000).astype(int)
test_encoded["is_recent_reno"] = (test_encoded["years_since_reno"] <= 5).astype(int)
test_encoded["bath_per_bed"] = test_encoded["total_baths"] / test_encoded["beds"]
test_encoded["bath_per_bed"] = test_encoded["bath_per_bed"].replace([np.inf, -np.inf], 0).fillna(0)


In [270]:
pca_features = ['latitude', 'longitude', 'sqft', 'area', 'total_value', 'imp_val']

# 🔃 標準化 → PCA → KMeans
scaler = StandardScaler()
X_scaled = scaler.fit_transform(test_encoded[pca_features])

pca = PCA(n_components=3, random_state=42)
X_pca = pca.fit_transform(X_scaled)

kmeans = KMeans(n_clusters=10, random_state=42)
test_encoded['pca_region_cluster'] = kmeans.fit_predict(X_pca)

# 🔄 One-hot 編碼
region_dummies = pd.get_dummies(test_encoded['pca_region_cluster'], prefix='pca_region')
test_encoded = pd.concat([test_encoded, region_dummies], axis=1)

test_encoded.drop(columns=['pca_region_cluster'], inplace=True)

  super()._check_params_vs_input(X, default_n_init=10)


In [271]:
test_encoded = clean_features(test_encoded)

In [272]:
# 分割訓練特徵與目標
del_train = train_encoded.drop(columns=['sale_price', 'id'])  # id 可留給最後輸出

# 計算每個欄位與 y 的皮爾森相關係數
correlations = del_train.corrwith(train_encoded['sale_price']).abs().sort_values(ascending=False)

In [273]:

def reduce_to_top_features(df, top_features, target_col='sale_price', id_col='id'):
    """
    根據 top_features 保留資料中的前幾重要特徵，加上 id 及 target 欄位（若存在）供後續使用
    """
    cols_to_keep = [col for col in [id_col, target_col] if col in df.columns] + top_features
    return df[cols_to_keep].copy()

In [274]:
# 先取得前 60 個特徵名稱
top_60_features = correlations.head(60).index.tolist()

train_encoded = reduce_to_top_features(train_encoded, top_60_features)
test_encoded = reduce_to_top_features(test_encoded, top_60_features)

In [275]:
#確認資料類型
print(train_encoded.dtypes.value_counts())
print(test_encoded.dtypes.value_counts())

bool       27
int64      22
float64     9
int32       4
Name: count, dtype: int64
bool       27
int64      21
float64     9
int32       4
Name: count, dtype: int64


In [276]:
# train/test 欄位補齊
missing_cols = set(train_encoded.columns) - set(test_encoded.columns) - {"sale_price"}
'''for col in missing_cols:
    test_encoded[col] = 0'''
print(missing_cols)
# ✅ 欄位順序一致化
test_encoded = test_encoded[train_encoded.drop(columns=["sale_price"]).columns]

set()


model(XGBoost)

In [150]:
# 分割訓練特徵與目標
X = train_encoded.drop(columns=['sale_price', 'id'])  # id 可留給最後輸出
y = train_encoded['sale_price']

In [151]:
def winkler_score(y_true, y_lower, y_upper, alpha=0.1):
    interval_width = y_upper - y_lower
    is_covered = (y_true >= y_lower) & (y_true <= y_upper)
    penalty = 2 / alpha * ((y_lower - y_true).clip(lower=0) + (y_true - y_upper).clip(lower=0))
    score = interval_width + penalty
    return score

In [None]:
def oof_and_hill_climb_xgb(X, y, model_lower, model_upper, alpha=0.1, n_splits=5, steps=100, seed=42):
    oof_lowers = np.zeros(len(X))
    oof_uppers = np.zeros(len(X))

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)

    for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train = y.iloc[train_idx]
        y_lower_train = y_train * (1 - alpha)
        y_upper_train = y_train * (1 + alpha)
        y_val_true = y.iloc[val_idx]
        y_lower_val = y_val_true * (1 - alpha)
        y_upper_val = y_val_true * (1 + alpha)

        # XGBoost clone
        lower = clone(model_lower)
        upper = clone(model_upper)

        lower.fit(X_train, y_lower_train,
                  eval_set=[(X_val, y_lower_val)],
                  verbose=False)
        upper.fit(X_train, y_upper_train,
                  eval_set=[(X_val, y_upper_val)],
                  verbose=False)

        oof_lowers[val_idx] = lower.predict(X_val)
        oof_uppers[val_idx] = upper.predict(X_val)

    # 開始雙權重尋找（Hill Climb）
    best_score = np.inf
    best_weights = (0.4, 0.6)

    for w1 in np.linspace(0.0, 1.0, 25):
        for w2 in np.linspace(0.0, 1.0, 25):
            lower_comb = w1 * oof_lowers + (1 - w1) * oof_uppers
            upper_comb = w2 * oof_uppers + (1 - w2) * oof_lowers
            lower_comb, upper_comb = np.minimum(lower_comb, upper_comb), np.maximum(lower_comb, upper_comb)
            score = np.mean(winkler_score(y, lower_comb, upper_comb, alpha))
            if score < best_score:
                best_score = score
                best_weights = (w1, w2)

    print(f"✅ Best Winkler score: {best_score:.2f}, weights: w1={best_weights[0]:.4f}, w2={best_weights[1]:.4f}")
    return oof_lowers, oof_uppers, best_weights, best_score

In [None]:
model_lower = xgb.XGBRegressor(
    n_estimators=500, max_depth=6, learning_rate=0.01,
    early_stopping_rounds=20, eval_metric='rmse', tree_method='gpu_hist', random_state=42
)

model_upper = xgb.XGBRegressor(
    n_estimators=500, max_depth=6, learning_rate=0.01,
    early_stopping_rounds=20, eval_metric='rmse', tree_method='gpu_hist', random_state=42
)

oof_lowers, oof_uppers, best_weight, best_score = oof_and_hill_climb_xgb(
    X, y, model_lower, model_upper, alpha=0.1, n_splits=5
)

LGBM Quantile

In [277]:
# 分割訓練特徵與目標
X = train_encoded.drop(columns=['sale_price', 'id'])  # id 可留給最後輸出
y = train_encoded['sale_price']

In [278]:
def winkler_score(y_true, lower, upper, alpha=0.1):
    width = upper - lower
    below = np.maximum(lower - y_true, 0)
    above = np.maximum(y_true - upper, 0)
    return width + (2 / alpha) * (below + above)

In [279]:
def oof_and_hill_climb_two_weights(X, y, model_lower, model_upper, alpha=0.1, n_splits=5, seed=42, steps=100 , grid_steps=25):
    oof_lowers = np.zeros(len(X))
    oof_uppers = np.zeros(len(X))

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)

    for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train = y.iloc[train_idx]

        lower_model = clone(model_lower)
        upper_model = clone(model_upper)

        lower_model.fit(X_train, y_train)
        upper_model.fit(X_train, y_train)

        oof_lowers[val_idx] = lower_model.predict(X_val)
        oof_uppers[val_idx] = upper_model.predict(X_val)

    '''# 初始化雙權重
    current_w1 = 0.4  # 下限 weight
    current_w2 = 0.6  # 上限 weight

    best_score = np.inf
    best_weights = (current_w1, current_w2)'''

    best_score = np.inf
    best_weights = (0.4, 0.6)  # 預設初始點
    grid_range = np.linspace(0.0, 1.0, grid_steps)

    for w1 in grid_range:
        for w2 in grid_range:
            lower_comb = w1 * oof_lowers + (1 - w1) * oof_uppers
            upper_comb = w2 * oof_uppers + (1 - w2) * oof_lowers
            lower_comb, upper_comb = np.minimum(lower_comb, upper_comb), np.maximum(lower_comb, upper_comb)
            score = np.mean(winkler_score(y, lower_comb, upper_comb, alpha))
            if score < best_score:
                best_score = score
                best_weights = (w1, w2)

    current_w1, current_w2 = best_weights
    no_improve_count = 0

    for step in range(steps):
        '''# 微調 perturbation
        dw1, dw2 = np.random.uniform(-0.1, 0.1, size=2)
        w1 = np.clip(current_w1 + dw1, 0.1, 0.9)  # 限制在合理區間
        w2 = np.clip(current_w2 + dw2, 0.1, 0.9)'''

        # 微調 perturbation，讓 weight 有隨機性（避免卡住）
        perturb1 = np.random.dirichlet([9])[0] - 0.9
        perturb2 = np.random.dirichlet([9])[0] - 0.9

        w1 = np.clip(current_w1 + 0.1 * perturb1, 0, 1)
        w2 = np.clip(current_w2 + 0.1 * perturb2, 0, 1)

        # 雙權重組合
        lower_combined = w1 * oof_lowers + (1 - w1) * oof_uppers
        upper_combined = w2 * oof_uppers + (1 - w2) * oof_lowers

        # 修正：確保上下限方向正確（防止預測範圍錯位）
        lower_combined, upper_combined = np.minimum(lower_combined, upper_combined), np.maximum(lower_combined, upper_combined)

        score = np.mean(winkler_score(y, lower_combined, upper_combined, alpha))

        if score < best_score:
            best_score = score
            best_weights = (w1, w2)
            current_w1, current_w2 = w1, w2
            print(f"[Step {step}] ✅ Improved Score: {best_score:.2f} (w1: {w1:.4f}, w2: {w2:.4f})")
        else:
            no_improve_count += 1
            print(f"[Step {step}] no improvement. score = {score:.2f} (w1: {w1:.4f}, w2: {w2:.4f})")

        if no_improve_count > 50:  # 超過 50 步沒進步就提早停
            break

    return oof_lowers, oof_uppers, best_weights, best_score

In [280]:
models = {
    "lower": lgb.LGBMRegressor(
        objective="quantile",
        alpha=0.05,
        device="cpu",
        n_estimators=1500,
        learning_rate=0.05,
        num_leaves=63,
        subsample=0.8,
        subsample_freq=1,
        random_state=42
    ),
    "upper": lgb.LGBMRegressor(
        objective="quantile",
        alpha=0.95,
        device="cpu",
        n_estimators=1500,
        learning_rate=0.05,
        num_leaves=63,
        subsample=0.8,
        subsample_freq=1,
        random_state=42
    )
}

In [281]:
oof_lowers, oof_uppers, (w1, w2), best_score = oof_and_hill_climb_two_weights(
    X, y,
    model_lower=models["lower"],
    model_upper=models["upper"],
    alpha=0.1, 
    n_splits=5,
    steps=120
)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015961 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3213
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 60
[LightGBM] [Info] Start training from score 185000.000000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016802 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3213
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 60
[LightGBM] [Info] Start training from score 1435000.000000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016611 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory

before plus floor ratio~bath per bed score = 341590  (original type)
after plus new feature score = 342013
after adjust model = 387816

輸出test xgboost

In [83]:
#填補缺漏欄位（對齊訓練集欄位）
missing_cols = set(X.columns) - set(test_encoded.columns)
for col in missing_cols:
    test_encoded[col] = 0

# 確保欄位順序一致
test_encoded = test_encoded[X.columns]

In [None]:
print(set(X.columns) - set(test_encoded.columns))

set()


In [171]:
test_encoded['id'] = test_df['id']  # 這行先補上 id

In [None]:
# 分割訓練特徵與目標
X = train_encoded.drop(columns=['sale_price', 'id'])  # id 可留給最後輸出
y = train_encoded['sale_price']

In [None]:
final_model_lower = clone(model_lower).fit(X, y * 0.9)
final_model_upper = clone(model_upper).fit(X, y * 1.1)

test_lower = final_model_lower.predict(test_encoded.drop(columns=['id']))
test_upper = final_model_upper.predict(test_encoded.drop(columns=['id']))

w1, w2 = best_weight
final_lower = w1 * test_lower + (1 - w1) * test_upper
final_upper = w2 * test_upper + (1 - w2) * test_lower

final_lower, final_upper = np.minimum(final_lower, final_upper), np.maximum(final_lower, final_upper)
final_lower = np.maximum(final_lower, 0)

In [None]:
submission_df = pd.read_csv('sample_submission.csv')
submission_df.head()


# 建立提交檔
submission_df = pd.DataFrame({
    'id': test_encoded['id'],  # 必須與 sample_submission 對齊
    'pi_lower': final_lower,
    'pi_upper': final_upper
})

# 輸出成 CSV
submission_df.to_csv('xgb_predict2.csv', index=False)
print(submission_df.head())

輸出test LGBM

In [None]:
knn_cols = ['latitude', 'longitude'] + [col for col in X.columns if col.startswith('zoning_group_')]

for col in knn_cols:
    if col not in test_encoded.columns:
        test_encoded[col] = 0

knn_model = KNeighborsRegressor(n_neighbors=15, weights='distance')
knn_model.fit(X[knn_cols], y)

test_encoded['knn_price'] = knn_model.predict(test_encoded[knn_cols])

In [84]:
#填補缺漏欄位（對齊訓練集欄位）
missing_cols = set(X.columns) - set(test_encoded.columns)
for col in missing_cols:
    test_encoded[col] = 0

# 確保欄位順序一致
test_encoded = test_encoded[X.columns]

In [85]:
model_lower = models["lower"]
model_upper = models["upper"]

final_model_lower = clone(model_lower).fit(X, y)
final_model_upper = clone(model_upper).fit(X, y)

'''test_lower = final_model_lower.predict(test_encoded)
test_upper = final_model_upper.predict(test_encoded)

final_lower = best_weight * test_lower
final_upper = best_weight * test_upper

final_lower, final_upper = np.minimum(final_lower, final_upper), np.maximum(final_lower, final_upper)
final_lower = np.maximum(final_lower, 0)'''


test_lower = final_model_lower.predict(test_encoded)
test_upper = final_model_upper.predict(test_encoded)

final_lower = w1 * test_lower + (1 - w1) * test_upper
final_upper = w2 * test_upper + (1 - w2) * test_lower

final_lower, final_upper = np.minimum(final_lower, final_upper), np.maximum(final_lower, final_upper)
final_lower = np.maximum(final_lower, 0)  # optional

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 4219
[LightGBM] [Info] Number of data points in the train set: 200000, number of used features: 105
[LightGBM] [Info] Using GPU Device: Intel(R) UHD Graphics 630, Vendor: Intel(R) Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 30 dense feature groups (6.10 MB) transferred to GPU in 0.005310 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 185000.000000
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 4219
[LightGBM] [Info] Number of data points in the train set: 200000, number of used features: 105
[LightGBM] [Info] Using GPU Device: Intel(R) UHD Graphics 630, Vendor: Intel(R) Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bi

In [86]:
submission_df = pd.read_csv('sample_submission.csv')
submission_df.head()
test_encoded['id'] = test_df['id']  # 這行先補上 id

In [87]:
submission_df = pd.DataFrame({
    'id': test_encoded['id'],  # 必須與 sample_submission 對齊
    'pi_lower': final_lower,
    'pi_upper': final_upper
})

# 輸出成 CSV
submission_df.to_csv('lgbm_predict_2.csv', index=False)
print(submission_df.head())

       id       pi_lower      pi_upper
0  200000  810711.497651  1.115071e+06
1  200001  579579.106918  7.605256e+05
2  200002  452341.251537  7.000233e+05
3  200003  334000.297807  4.349514e+05
4  200004  425286.025519  6.336247e+05
