In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.base import clone
import lightgbm as lgb
import catboost as cb
import xgboost as xgb
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings("ignore")

In [4]:
MODEL_NAMES = ["LightGBM", "CatBoost", "XGBoost"]

MODELS = {
    "LightGBM": {
        "lower": lgb.LGBMRegressor(objective="quantile", alpha=0.05, device="gpu", n_estimators=1500,
                                 learning_rate=0.05, num_leaves=60, max_depth=9, min_child_samples=30,
                                 subsample=0.8, random_state=42, n_jobs=8),
        "upper": lgb.LGBMRegressor(objective="quantile", alpha=0.95, device="gpu", n_estimators=1500,
                                 learning_rate=0.05, num_leaves=60, max_depth=9, min_child_samples=30,
                                 subsample=0.8, random_state=42, n_jobs=8)
    },
    "CatBoost": cb.CatBoostRegressor(loss_function="RMSEWithUncertainty", task_type="GPU", iterations=2000,
                                   learning_rate=0.05, depth=8, l2_leaf_reg=5, min_data_in_leaf=30,
                                   verbose=0, random_seed=42, thread_count=4),
    "XGBoost": {
        "lower": xgb.XGBRegressor(objective="reg:quantileerror", quantile_alpha=0.05, tree_method="gpu_hist",
                                n_estimators=1500, learning_rate=0.05, max_depth=8, min_child_weight=50,
                                subsample=0.8, random_state=42, n_jobs=8),
        "upper": xgb.XGBRegressor(objective="reg:quantileerror", quantile_alpha=0.95, tree_method="gpu_hist",
                                n_estimators=1500, learning_rate=0.05, max_depth=8, min_child_weight=50,
                                subsample=0.8, random_state=42, n_jobs=8)
    }
}

In [9]:
def preprocess_data(df):
    df['sale_date'] = pd.to_datetime(df['sale_date'])
    df['sale_year'] = df['sale_date'].dt.year
    df['sale_month'] = df['sale_date'].dt.month

    df['total_val'] = df['land_val'] + df['imp_val']
    df['land_ratio'] = df['land_val'] / df['total_val']
    df['imp_ratio'] = df['imp_val'] / df['total_val']
    df['total_baths'] = df['bath_full'] + 0.75*df['bath_3qtr'] + 0.5*df['bath_half']
    df['sqft_area'] = df['sqft'] + df['sqft_fbsmt']
    df['house_age'] = df['sale_year'] - df['year_built']
    df['years_since_reno'] = np.where(df['year_reno'] > 0, df['sale_year'] - df['year_reno'], 0)

    df['view_score'] = df[['view_rainier', 'view_olympics', 'view_cascades', 'view_territorial',
                          'view_skyline', 'view_sound', 'view_lakewash', 'view_lakesamm',
                          'view_otherwater', 'view_other']].sum(axis=1)
    
    df['lat_bin'] = pd.qcut(df['latitude'], q=50, labels=False, duplicates='drop')
    df['lon_bin'] = pd.qcut(df['longitude'], q=50, labels=False, duplicates='drop')
    df['lat_lon_bin'] = df['lat_bin'].astype(str) + '_' + df['lon_bin'].astype(str)
    
    cat_cols = ['sale_warning', 'join_status', 'city', 'zoning', 'lat_lon_bin']
    df[cat_cols] = df[cat_cols].astype('category')
    
    return df.drop(columns=['sale_date', 'id', 'lat_bin', 'lon_bin'])

In [6]:
def winkler_score(y_true, lower, upper, alpha = 0.10):
    width = upper - lower
    below = np.maximum(lower - y_true, 0)
    above = np.maximum(y_true - upper, 0)
    return width + (2/alpha) * (below + above)

In [7]:
def train_and_predict(model, X_train, y_train, X_val):
    if isinstance(model, dict):  # for lgb and xgb
        lower_model = clone(model["lower"])
        lower_model.fit(X_train, y_train)
        lower_pred = lower_model.predict(X_val)
        
        upper_model = clone(model["upper"])
        upper_model.fit(X_train, y_train)
        upper_pred = upper_model.predict(X_val)
        
        return lower_pred, upper_pred
    else:  # for catboost
        model = clone(model)
        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        means, stds = preds[:, 0], np.sqrt(preds[:, 1])
        return means - 1.645 * stds, means + 1.645 * stds

In [10]:
# load  data
train_df = pd.read_csv('dataset.csv').drop(columns=["submarket", "subdivision", "sale_nbr"])
test_df = pd.read_csv('test.csv').drop(columns=["submarket", "subdivision", "sale_nbr"])

train_df = preprocess_data(train_df)
X_test = preprocess_data(test_df)

X, y = train_df.drop(columns=['sale_price']), train_df['sale_price']

# encode categorical var
cat_cols = ['sale_warning', 'join_status', 'city', 'zoning', 'lat_lon_bin']
encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
X[cat_cols] = encoder.fit_transform(X[cat_cols])
X_test[cat_cols] = encoder.transform(X_test[cat_cols])

In [11]:
# cross validation
N_FOLDS = 5
pred_lower = {model: np.zeros(len(X)) for model in MODEL_NAMES}
pred_upper = {model: np.zeros(len(X)) for model in MODEL_NAMES}
kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

for train_idx, val_idx in tqdm(kf.split(X, y), total=N_FOLDS):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train = y.iloc[train_idx]
    
    for model_name in MODEL_NAMES:
        model = MODELS[model_name]
        lower, upper = train_and_predict(model, X_train, y_train, X_val)
        pred_lower[model_name][val_idx] = lower
        pred_upper[model_name][val_idx] = upper

  0%|          | 0/5 [00:00<?, ?it/s]

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 4827
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Using GPU Device: Tesla V100S-PCIE-32GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 29 dense feature groups (4.88 MB) transferred to GPU in 0.006956 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 185000.000000
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 4827
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Using GPU Device: Tesla V100S-PCIE-32GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[Li



[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 4820
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Using GPU Device: Tesla V100S-PCIE-32GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 29 dense feature groups (4.88 MB) transferred to GPU in 0.005816 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 185000.000000
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 4820
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Using GPU Device: Tesla V100S-PCIE-32GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[Li



[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 4827
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Using GPU Device: Tesla V100S-PCIE-32GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 29 dense feature groups (4.88 MB) transferred to GPU in 0.006337 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 185000.000000
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 4827
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Using GPU Device: Tesla V100S-PCIE-32GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[Li



[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 4826
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Using GPU Device: Tesla V100S-PCIE-32GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 29 dense feature groups (4.88 MB) transferred to GPU in 0.006339 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 185000.000000
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 4826
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Using GPU Device: Tesla V100S-PCIE-32GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[Li



[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 4824
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Using GPU Device: Tesla V100S-PCIE-32GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 29 dense feature groups (4.88 MB) transferred to GPU in 0.005780 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 185000.000000
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 4824
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 52
[LightGBM] [Info] Using GPU Device: Tesla V100S-PCIE-32GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[Li



In [13]:
# optimize_ensemble
weights = np.array([0.3, 0.4, 0.3])
best_weights, best_score = weights.copy(), float('inf')
STEP = 100
for _ in tqdm(range(STEP)):
    candidate = np.maximum(weights + 0.1 * (np.random.dirichlet([5] * len(MODEL_NAMES)) - 0.5/3), 0)
    candidate /= candidate.sum()
    new_lower = sum(w * pred_lower[m] for w, m in zip(candidate, MODEL_NAMES))
    new_upper = sum(w * pred_upper[m] for w, m in zip(candidate, MODEL_NAMES))
    score = np.mean(winkler_score(y, new_lower, new_upper))
    
    if score < best_score:
        best_score = score
        best_weights = candidate.copy()
        weights = candidate.copy()
        print(f"best MWIS: {best_score:.4f}, weights: {best_weights}")

  0%|          | 0/100 [00:00<?, ?it/s]

best MWIS: 326958.6038, weights: [0.29404979 0.39437897 0.31157124]
best MWIS: 326950.0907, weights: [0.27483092 0.39566299 0.3295061 ]
best MWIS: 326892.1625, weights: [0.2805227  0.38445556 0.33502174]
best MWIS: 326880.0329, weights: [0.30566937 0.3772745  0.31705614]
best MWIS: 326827.0119, weights: [0.30866216 0.36093681 0.33040103]
best MWIS: 326811.3727, weights: [0.30292448 0.35574325 0.34133227]
best MWIS: 326808.9204, weights: [0.31099193 0.35024653 0.33876154]
best MWIS: 326802.1815, weights: [0.30922208 0.34368974 0.34708818]
best MWIS: 326800.9152, weights: [0.29842684 0.34892945 0.35264371]
best MWIS: 326799.4232, weights: [0.30223646 0.34394172 0.35382182]
best MWIS: 326799.0163, weights: [0.29327037 0.3435912  0.36313843]
best MWIS: 326798.9493, weights: [0.29484352 0.34507272 0.36008376]
best MWIS: 326798.9196, weights: [0.29581849 0.34249529 0.36168622]


In [17]:
# train final model
test_preds = {"lower": {}, "upper": {}}
for model_name in tqdm(MODEL_NAMES):
    model = MODELS[model_name]
    lower, upper = train_and_predict(model, X, y, X_test)
    test_preds["lower"][model_name] = lower
    test_preds["upper"][model_name] = upper

final_lower = sum(best_weights[i] * test_preds["lower"][m] for i, m in enumerate(MODEL_NAMES))
final_upper = sum(best_weights[i] * test_preds["upper"][m] for i, m in enumerate(MODEL_NAMES))
final_lower, final_upper = np.minimum(final_lower, final_upper), np.maximum(final_lower, final_upper)
final_lower = np.maximum(final_lower, 0)

  0%|          | 0/3 [00:00<?, ?it/s]

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 4849
[LightGBM] [Info] Number of data points in the train set: 200000, number of used features: 52
[LightGBM] [Info] Using GPU Device: Tesla V100S-PCIE-32GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 29 dense feature groups (6.10 MB) transferred to GPU in 0.007273 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 185000.000000
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 4849
[LightGBM] [Info] Number of data points in the train set: 200000, number of used features: 52
[LightGBM] [Info] Using GPU Device: Tesla V100S-PCIE-32GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[Li



In [None]:
print(f'{"Model":<10}{"MWIS Score":>15}')
print('-' * 25)
print(f'{"Ensemble":<10}{best_score:>15.4f}')

Model          MWIS Score
-------------------------
Ensemble      326798.9196


In [None]:
submission = pd.DataFrame({
    "id": test_df["id"],
    "pi_lower": final_lower,
    "pi_upper": final_upper
})
submission.to_csv("submission.csv", index=False)