# Introduction:
I feel like removing outliers makes results worse. We are better off just using feature engineering alone, with maybe just setting upper bounds for extreme outlier values.

In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import os
from pathlib import Path
import xgboost as xgb
import lightgbm as lgbm
import catboost
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_error
from IPython.display import display
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
import optuna
from sklearn.preprocessing import StandardScaler

In [4]:
from warnings import filterwarnings
filterwarnings("ignore")

# Loading Data

In [5]:
# setting a base path variable for easy access
BASE_PATH = Path("/kaggle/input/playground-series-s3e6")
train = pd.read_csv(BASE_PATH / "train.csv").drop(columns=["id"])

test = pd.read_csv(BASE_PATH / "test.csv")
# we need the test id column to make the submission
test_idx = test.id
test = test.drop(columns=["id"])

# we also load original dataset and will try to gauge if adding it to the competition datasets boosts our score or not
original = pd.read_csv("/kaggle/input/paris-housing-price-prediction/ParisHousing.csv")

### Features Presence Check
Checkingg if the competition dataset and original dataset contain the same features in same order, so we can concatenate them easily. Otherwise we'll have to manually make them consistent

In [6]:
all(original.columns == train.columns)

True

#### Result:
Well both dataset indeed contain the same features in the same order. No need for any manual work!

In [7]:
all_datasets = {"train": train, "test": test,"original": original}

# Setting bounds

In [8]:
for dataset in all_datasets.values():
    dataset["attic"] = dataset.attic.clip(upper=10000, lower=10)
    dataset["floors"] = dataset.floors.clip(upper=100)
    dataset["squareMeters"] = dataset.squareMeters.clip(upper=99999)
    dataset["basement"] = dataset.basement.clip(upper=10000, lower=10)
    dataset["garage"] = dataset.garage.clip(upper=1000, lower=100)
    dataset["made"] = dataset.made.clip(upper=2021)
    dataset["cityCode"] = dataset.cityCode.clip(upper=99999)

# Preprocess

In [9]:
def preprocess(datasets:dict, target:str):
    """Preprocess the data
        datasets: Expects a dict with datasets name as keys and datasets as values
            Valid names for dataets are: [train, test, original]
        
        target: name of the target column
    """
    for dataset_name, dataset in datasets.items():
        if dataset_name == "train":
            y = train[target]
            X = train.drop(columns=[target])
        elif dataset_name == "original":
            y_org = original[target]
            X_org = original.drop(columns=[target])
        elif dataset_name == "test":
            X_test = test.copy()
        
        
    X_combined = pd.concat([X, X_org], axis=0)
    y_combined = pd.concat([y, y_org], axis=0)

    return {"train": {"data": X, "target": y}, "original": {"data": X_org,  "target": y_org},
            "combined": {"data": X_combined, "target": y_combined}, "test": {"data": X_test}}    

# Feature Engineering

In [10]:
#  ------------ We need this for our feature number 8 -----------------
train_copy = all_datasets["train"]
train_copy["block_number"] = train_copy.cityCode // 100
# lets find the average house prices for blocks
avg_house_cost_per_block = train_copy.groupby("block_number")["price"].mean().to_dict()
# ---------------------------------------------------------------------------


for _, dataset in all_datasets.items():
    # FeatIdea #1: guestRooms/numberOfRooms
    dataset["guestRooms_/_numberOfRooms"] = dataset.hasGuestRoom / dataset.numberOfRooms
    
    # FeatIdea #2: Number of non guest rooms: NumOfRooms - GuestRooms
    dataset["non_guest_rooms"] = dataset.numberOfRooms - dataset.hasGuestRoom
    
    # FeatIdea #3: Rooms/Floors: Number of rooms per floor
    dataset["NEW_TEST_FEATURE"] = dataset.numberOfRooms // dataset.floors
    
    # FeatIdea #4: Attic/squareMeters:  Attic size to total house size ratio
    dataset["attic_/_squareMeters"] = dataset.attic / dataset.squareMeters
    
    # FeatIdea #5: Basement/squareMeters: Basement size to total house size ratio
    dataset["basement_/_squareMeters"] = dataset.basement / dataset.squareMeters
    
    # FeatIdea #6. Garage/squareMeters: Garage size to total house size ratio
    dataset["garage_/_squareMeters"] = dataset.garage / dataset.squareMeters
    
    # FeatIdea #7: Grouping zip codes
    dataset["block_number"] = dataset.cityCode // 100
    
    # FeatIdea #8: Add avg price per block as a feature.
    dataset["avg_house_price_in_block"] = dataset.block_number.map(avg_house_cost_per_block)

In [11]:
cols_to_drop = ["cityCode", "block_number"]
for _, dataset in all_datasets.items():
    dataset.drop(columns=cols_to_drop, inplace=True)

# Training models

In [12]:
processed_datasets = preprocess(all_datasets, "price")
X_combined, y_combined = processed_datasets["combined"].values()
X_test = processed_datasets["test"]["data"]


X, y = processed_datasets["train"]["data"], processed_datasets["train"]["target"]
X_org, y_org = processed_datasets["original"]["data"], processed_datasets["original"]["target"]

# Tuning XGBoost

In [13]:
# def objective_xgb(trial, X, y, X_original, y_original):
#     params = {
#         'tree_method': "gpu_hist",
#         'n_estimators': trial.suggest_int('n_estimators', 50, 400),
#         'max_depth': trial.suggest_int('max_depth', 2, 20),
#         'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
#         'min_child_weight': trial.suggest_int('min_child_weight', 1, 15),
#         'gamma': trial.suggest_loguniform('gamma', 0.00001, 0.3),
#         'subsample': trial.suggest_float('subsample', 0.2, 1.0, step=0.05),
#         'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 1.0, step=0.05),
#         'early_stopping_rounds': trial.suggest_int("early_stoppig_rounds", 40, 100)
#     }
#     # we're gonna train on the combined dataset but, we'll only calculate the validation score only on comp data

#     N_FOLDS = 5
#     all_scores = np.zeros(N_FOLDS)

#     skf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=1337)

#     for fold_id, (train_idx, val_idx) in enumerate(skf.split(X)):
        
#         X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
#         y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
#         X_tr = pd.concat([X_tr, X_original], axis=0)
#         y_tr = pd.concat([y_tr, y_original], axis=0)
        
#         model = xgb.XGBRegressor(**params)
#         model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)
        
#         y_pred = model.predict(X_val)
                
#         rmse = mean_squared_error(y_val, y_pred, squared=False)        
#         all_scores[fold_id] = rmse
    
#     avg_rmse = np.mean(all_scores)
    
#     print(f"Avg RMSE: {avg_rmse}")
    
#     return avg_rmse

# study_xgb = optuna.create_study(study_name="xgboost_tuning", direction="minimize")
# func = lambda trial: objective_xgb(trial, X, y, X_org, y_org)
# study_xgb.optimize(func, n_trials=50)

[32m[I 2023-02-20 19:11:11,942][0m A new study created in memory with name: xgboost_tuning[0m
[32m[I 2023-02-20 19:11:15,249][0m Trial 0 finished with value: 245720.95075908146 and parameters: {'n_estimators': 60, 'max_depth': 5, 'learning_rate': 0.06905483990591715, 'min_child_weight': 13, 'gamma': 0.004579524875897699, 'subsample': 0.45, 'colsample_bytree': 0.6000000000000001, 'early_stoppig_rounds': 95}. Best is trial 0 with value: 245720.95075908146.[0m


Avg RMSE: 245720.95075908146


[32m[I 2023-02-20 19:11:17,726][0m Trial 1 finished with value: 169219.9462123244 and parameters: {'n_estimators': 373, 'max_depth': 3, 'learning_rate': 0.13572208399682192, 'min_child_weight': 11, 'gamma': 0.029081204538277682, 'subsample': 0.8, 'colsample_bytree': 0.9000000000000001, 'early_stoppig_rounds': 60}. Best is trial 1 with value: 169219.9462123244.[0m


Avg RMSE: 169219.9462123244


[32m[I 2023-02-20 19:13:52,465][0m Trial 2 finished with value: 375173.1221821708 and parameters: {'n_estimators': 184, 'max_depth': 20, 'learning_rate': 0.1803421933978423, 'min_child_weight': 4, 'gamma': 0.000351612535664332, 'subsample': 0.5, 'colsample_bytree': 0.30000000000000004, 'early_stoppig_rounds': 91}. Best is trial 1 with value: 169219.9462123244.[0m


Avg RMSE: 375173.1221821708


[32m[I 2023-02-20 19:14:08,416][0m Trial 3 finished with value: 309426.52506476245 and parameters: {'n_estimators': 325, 'max_depth': 12, 'learning_rate': 0.20615840782481032, 'min_child_weight': 8, 'gamma': 4.301341322751574e-05, 'subsample': 0.30000000000000004, 'colsample_bytree': 0.35000000000000003, 'early_stoppig_rounds': 51}. Best is trial 1 with value: 169219.9462123244.[0m


Avg RMSE: 309426.52506476245


[32m[I 2023-02-20 19:16:15,494][0m Trial 4 finished with value: 494898.52974354813 and parameters: {'n_estimators': 279, 'max_depth': 17, 'learning_rate': 0.2403325567095468, 'min_child_weight': 3, 'gamma': 0.002466921826469305, 'subsample': 0.4, 'colsample_bytree': 0.2, 'early_stoppig_rounds': 67}. Best is trial 1 with value: 169219.9462123244.[0m


Avg RMSE: 494898.52974354813


[32m[I 2023-02-20 19:16:19,079][0m Trial 5 finished with value: 168380.96314197715 and parameters: {'n_estimators': 287, 'max_depth': 7, 'learning_rate': 0.19783560518451207, 'min_child_weight': 14, 'gamma': 0.05833609823676724, 'subsample': 0.75, 'colsample_bytree': 1.0, 'early_stoppig_rounds': 76}. Best is trial 5 with value: 168380.96314197715.[0m


Avg RMSE: 168380.96314197715


[32m[I 2023-02-20 19:16:51,960][0m Trial 6 finished with value: 566360.1748287161 and parameters: {'n_estimators': 131, 'max_depth': 20, 'learning_rate': 0.019284943550683616, 'min_child_weight': 9, 'gamma': 0.05105223139186849, 'subsample': 0.65, 'colsample_bytree': 0.45, 'early_stoppig_rounds': 45}. Best is trial 5 with value: 168380.96314197715.[0m


Avg RMSE: 566360.1748287161


[32m[I 2023-02-20 19:19:05,723][0m Trial 7 finished with value: 198501.4840885659 and parameters: {'n_estimators': 327, 'max_depth': 15, 'learning_rate': 0.05480637332444728, 'min_child_weight': 7, 'gamma': 0.00669009397519363, 'subsample': 0.9000000000000001, 'colsample_bytree': 0.45, 'early_stoppig_rounds': 47}. Best is trial 5 with value: 168380.96314197715.[0m


Avg RMSE: 198501.4840885659


[32m[I 2023-02-20 19:21:59,117][0m Trial 8 finished with value: 210742.6887593958 and parameters: {'n_estimators': 236, 'max_depth': 17, 'learning_rate': 0.07838596772590824, 'min_child_weight': 5, 'gamma': 0.2256350513348959, 'subsample': 0.95, 'colsample_bytree': 0.45, 'early_stoppig_rounds': 90}. Best is trial 5 with value: 168380.96314197715.[0m


Avg RMSE: 210742.6887593958


[32m[I 2023-02-20 19:22:31,513][0m Trial 9 finished with value: 163554.83536811703 and parameters: {'n_estimators': 167, 'max_depth': 13, 'learning_rate': 0.028703817331406277, 'min_child_weight': 1, 'gamma': 0.0036833013533367097, 'subsample': 0.8500000000000001, 'colsample_bytree': 0.8500000000000001, 'early_stoppig_rounds': 79}. Best is trial 9 with value: 163554.83536811703.[0m


Avg RMSE: 163554.83536811703


[32m[I 2023-02-20 19:22:44,526][0m Trial 10 finished with value: 315296.4295315222 and parameters: {'n_estimators': 136, 'max_depth': 10, 'learning_rate': 0.02215692314127578, 'min_child_weight': 1, 'gamma': 0.0003546267004256193, 'subsample': 1.0, 'colsample_bytree': 0.75, 'early_stoppig_rounds': 79}. Best is trial 9 with value: 163554.83536811703.[0m


Avg RMSE: 315296.4295315222


[32m[I 2023-02-20 19:22:52,131][0m Trial 11 finished with value: 175635.19409231082 and parameters: {'n_estimators': 224, 'max_depth': 8, 'learning_rate': 0.03463236085896126, 'min_child_weight': 15, 'gamma': 0.23142370225027808, 'subsample': 0.75, 'colsample_bytree': 1.0, 'early_stoppig_rounds': 76}. Best is trial 9 with value: 163554.83536811703.[0m


Avg RMSE: 175635.19409231082


[32m[I 2023-02-20 19:22:56,828][0m Trial 12 finished with value: 960790.962809844 and parameters: {'n_estimators': 168, 'max_depth': 7, 'learning_rate': 0.010531878729925845, 'min_child_weight': 15, 'gamma': 0.02176348204986958, 'subsample': 0.7, 'colsample_bytree': 0.8, 'early_stoppig_rounds': 81}. Best is trial 9 with value: 163554.83536811703.[0m


Avg RMSE: 960790.962809844


[32m[I 2023-02-20 19:23:02,725][0m Trial 13 finished with value: 167025.38951741843 and parameters: {'n_estimators': 60, 'max_depth': 12, 'learning_rate': 0.10421996710193106, 'min_child_weight': 11, 'gamma': 0.0007698657612256213, 'subsample': 0.8500000000000001, 'colsample_bytree': 1.0, 'early_stoppig_rounds': 67}. Best is trial 9 with value: 163554.83536811703.[0m


Avg RMSE: 167025.38951741843


[32m[I 2023-02-20 19:23:13,897][0m Trial 14 finished with value: 178854.12086232667 and parameters: {'n_estimators': 54, 'max_depth': 13, 'learning_rate': 0.11468550555129, 'min_child_weight': 11, 'gamma': 0.0008357694419726839, 'subsample': 0.8500000000000001, 'colsample_bytree': 0.75, 'early_stoppig_rounds': 62}. Best is trial 9 with value: 163554.83536811703.[0m


Avg RMSE: 178854.12086232667


[32m[I 2023-02-20 19:23:20,088][0m Trial 15 finished with value: 251771.46583979507 and parameters: {'n_estimators': 91, 'max_depth': 10, 'learning_rate': 0.036306566589717225, 'min_child_weight': 1, 'gamma': 8.735907157373183e-05, 'subsample': 0.55, 'colsample_bytree': 0.8500000000000001, 'early_stoppig_rounds': 58}. Best is trial 9 with value: 163554.83536811703.[0m


Avg RMSE: 251771.46583979507


[32m[I 2023-02-20 19:23:30,671][0m Trial 16 finished with value: 206218.1490732839 and parameters: {'n_estimators': 103, 'max_depth': 14, 'learning_rate': 0.0894166780285831, 'min_child_weight': 11, 'gamma': 1.530301145854112e-05, 'subsample': 0.2, 'colsample_bytree': 0.6000000000000001, 'early_stoppig_rounds': 69}. Best is trial 9 with value: 163554.83536811703.[0m


Avg RMSE: 206218.1490732839


[32m[I 2023-02-20 19:23:48,533][0m Trial 17 finished with value: 163753.4203736899 and parameters: {'n_estimators': 196, 'max_depth': 11, 'learning_rate': 0.04891808077285906, 'min_child_weight': 6, 'gamma': 0.0011483602131867848, 'subsample': 0.6000000000000001, 'colsample_bytree': 0.9000000000000001, 'early_stoppig_rounds': 85}. Best is trial 9 with value: 163554.83536811703.[0m


Avg RMSE: 163753.4203736899


[32m[I 2023-02-20 19:24:48,809][0m Trial 18 finished with value: 171464.01047994103 and parameters: {'n_estimators': 203, 'max_depth': 16, 'learning_rate': 0.04387170261150067, 'min_child_weight': 6, 'gamma': 0.008640262304608926, 'subsample': 0.55, 'colsample_bytree': 0.7, 'early_stoppig_rounds': 85}. Best is trial 9 with value: 163554.83536811703.[0m


Avg RMSE: 171464.01047994103


[32m[I 2023-02-20 19:24:59,478][0m Trial 19 finished with value: 158708.59744329026 and parameters: {'n_estimators': 155, 'max_depth': 9, 'learning_rate': 0.05666973996583201, 'min_child_weight': 3, 'gamma': 0.0023987554979290056, 'subsample': 0.65, 'colsample_bytree': 0.8500000000000001, 'early_stoppig_rounds': 100}. Best is trial 19 with value: 158708.59744329026.[0m


Avg RMSE: 158708.59744329026


[32m[I 2023-02-20 19:25:00,753][0m Trial 20 finished with value: 242278.7959703228 and parameters: {'n_estimators': 150, 'max_depth': 2, 'learning_rate': 0.06108290286200504, 'min_child_weight': 3, 'gamma': 0.0026136225604175183, 'subsample': 0.7, 'colsample_bytree': 0.65, 'early_stoppig_rounds': 99}. Best is trial 19 with value: 158708.59744329026.[0m


Avg RMSE: 242278.7959703228


[32m[I 2023-02-20 19:25:29,449][0m Trial 21 finished with value: 156685.56834408705 and parameters: {'n_estimators': 248, 'max_depth': 11, 'learning_rate': 0.049525887510269956, 'min_child_weight': 2, 'gamma': 0.0017158921880220434, 'subsample': 0.6000000000000001, 'colsample_bytree': 0.9000000000000001, 'early_stoppig_rounds': 87}. Best is trial 21 with value: 156685.56834408705.[0m


Avg RMSE: 156685.56834408705


[32m[I 2023-02-20 19:25:41,566][0m Trial 22 finished with value: 154923.43542581 and parameters: {'n_estimators': 253, 'max_depth': 8, 'learning_rate': 0.029350316563757962, 'min_child_weight': 2, 'gamma': 0.0022962890040330386, 'subsample': 0.65, 'colsample_bytree': 0.9000000000000001, 'early_stoppig_rounds': 94}. Best is trial 22 with value: 154923.43542581.[0m


Avg RMSE: 154923.43542581


[32m[I 2023-02-20 19:25:55,367][0m Trial 23 finished with value: 158227.77565038638 and parameters: {'n_estimators': 256, 'max_depth': 9, 'learning_rate': 0.06447199571292693, 'min_child_weight': 3, 'gamma': 0.0014217639691435841, 'subsample': 0.6000000000000001, 'colsample_bytree': 0.9000000000000001, 'early_stoppig_rounds': 100}. Best is trial 22 with value: 154923.43542581.[0m


Avg RMSE: 158227.77565038638


[32m[I 2023-02-20 19:25:58,925][0m Trial 24 finished with value: 155673.81025800674 and parameters: {'n_estimators': 261, 'max_depth': 6, 'learning_rate': 0.08052962782250447, 'min_child_weight': 2, 'gamma': 0.0005007078869219283, 'subsample': 0.4, 'colsample_bytree': 0.95, 'early_stoppig_rounds': 93}. Best is trial 22 with value: 154923.43542581.[0m


Avg RMSE: 155673.81025800674


[32m[I 2023-02-20 19:26:01,860][0m Trial 25 finished with value: 155604.37054692121 and parameters: {'n_estimators': 308, 'max_depth': 5, 'learning_rate': 0.08397325597185383, 'min_child_weight': 2, 'gamma': 0.0003082251994472865, 'subsample': 0.35000000000000003, 'colsample_bytree': 0.95, 'early_stoppig_rounds': 90}. Best is trial 22 with value: 154923.43542581.[0m


Avg RMSE: 155604.37054692121


[32m[I 2023-02-20 19:26:05,189][0m Trial 26 finished with value: 165108.58386059338 and parameters: {'n_estimators': 318, 'max_depth': 5, 'learning_rate': 0.07912895229383585, 'min_child_weight': 5, 'gamma': 0.00027771195423122576, 'subsample': 0.35000000000000003, 'colsample_bytree': 0.95, 'early_stoppig_rounds': 94}. Best is trial 22 with value: 154923.43542581.[0m


Avg RMSE: 165108.58386059338


[32m[I 2023-02-20 19:26:08,262][0m Trial 27 finished with value: 171158.48164709625 and parameters: {'n_estimators': 392, 'max_depth': 5, 'learning_rate': 0.13326008922976013, 'min_child_weight': 2, 'gamma': 0.00014584854123083244, 'subsample': 0.25, 'colsample_bytree': 0.8, 'early_stoppig_rounds': 93}. Best is trial 22 with value: 154923.43542581.[0m


Avg RMSE: 171158.48164709625


[32m[I 2023-02-20 19:26:10,733][0m Trial 28 finished with value: 159373.73216027772 and parameters: {'n_estimators': 291, 'max_depth': 4, 'learning_rate': 0.09233702646528157, 'min_child_weight': 4, 'gamma': 0.0005641572097598617, 'subsample': 0.4, 'colsample_bytree': 0.95, 'early_stoppig_rounds': 87}. Best is trial 22 with value: 154923.43542581.[0m


Avg RMSE: 159373.73216027772


[32m[I 2023-02-20 19:26:17,172][0m Trial 29 finished with value: 165301.0200748696 and parameters: {'n_estimators': 358, 'max_depth': 6, 'learning_rate': 0.06938213501304409, 'min_child_weight': 2, 'gamma': 0.0001629882139143471, 'subsample': 0.45, 'colsample_bytree': 0.7, 'early_stoppig_rounds': 96}. Best is trial 22 with value: 154923.43542581.[0m


Avg RMSE: 165301.0200748696


[32m[I 2023-02-20 19:26:19,415][0m Trial 30 finished with value: 208736.92755347354 and parameters: {'n_estimators': 272, 'max_depth': 2, 'learning_rate': 0.1453369301269449, 'min_child_weight': 4, 'gamma': 0.0005958558626874087, 'subsample': 0.30000000000000004, 'colsample_bytree': 0.55, 'early_stoppig_rounds': 95}. Best is trial 22 with value: 154923.43542581.[0m


Avg RMSE: 208736.92755347354


[32m[I 2023-02-20 19:26:22,713][0m Trial 31 finished with value: 157652.5761478444 and parameters: {'n_estimators': 239, 'max_depth': 7, 'learning_rate': 0.29489498921606316, 'min_child_weight': 2, 'gamma': 0.001252477669321163, 'subsample': 0.5, 'colsample_bytree': 0.95, 'early_stoppig_rounds': 89}. Best is trial 22 with value: 154923.43542581.[0m


Avg RMSE: 157652.5761478444


[32m[I 2023-02-20 19:26:26,944][0m Trial 32 finished with value: 158261.15029455724 and parameters: {'n_estimators': 254, 'max_depth': 6, 'learning_rate': 0.07375312833171532, 'min_child_weight': 2, 'gamma': 0.004163022815049272, 'subsample': 0.4, 'colsample_bytree': 0.9000000000000001, 'early_stoppig_rounds': 82}. Best is trial 22 with value: 154923.43542581.[0m


Avg RMSE: 158261.15029455724


[32m[I 2023-02-20 19:26:30,149][0m Trial 33 finished with value: 161868.12083153185 and parameters: {'n_estimators': 305, 'max_depth': 4, 'learning_rate': 0.04565112445432115, 'min_child_weight': 1, 'gamma': 0.0017206029451370247, 'subsample': 0.5, 'colsample_bytree': 0.8, 'early_stoppig_rounds': 91}. Best is trial 22 with value: 154923.43542581.[0m


Avg RMSE: 161868.12083153185


[32m[I 2023-02-20 19:26:37,003][0m Trial 34 finished with value: 158889.5955213773 and parameters: {'n_estimators': 350, 'max_depth': 8, 'learning_rate': 0.10737387452588151, 'min_child_weight': 4, 'gamma': 0.0003777084501779549, 'subsample': 0.45, 'colsample_bytree': 0.95, 'early_stoppig_rounds': 86}. Best is trial 22 with value: 154923.43542581.[0m


Avg RMSE: 158889.5955213773


[32m[I 2023-02-20 19:26:39,295][0m Trial 35 finished with value: 172964.5964480895 and parameters: {'n_estimators': 218, 'max_depth': 4, 'learning_rate': 0.0635492901459273, 'min_child_weight': 5, 'gamma': 0.0009291881413404328, 'subsample': 0.30000000000000004, 'colsample_bytree': 0.9000000000000001, 'early_stoppig_rounds': 72}. Best is trial 22 with value: 154923.43542581.[0m


Avg RMSE: 172964.5964480895


[32m[I 2023-02-20 19:26:42,742][0m Trial 36 finished with value: 153069.70175736025 and parameters: {'n_estimators': 268, 'max_depth': 6, 'learning_rate': 0.08085988097537695, 'min_child_weight': 2, 'gamma': 0.0002391802642514539, 'subsample': 0.55, 'colsample_bytree': 1.0, 'early_stoppig_rounds': 97}. Best is trial 36 with value: 153069.70175736025.[0m


Avg RMSE: 153069.70175736025


[32m[I 2023-02-20 19:26:44,605][0m Trial 37 finished with value: 157224.02497818897 and parameters: {'n_estimators': 268, 'max_depth': 6, 'learning_rate': 0.15834717201048526, 'min_child_weight': 3, 'gamma': 5.3870341451844695e-05, 'subsample': 0.35000000000000003, 'colsample_bytree': 1.0, 'early_stoppig_rounds': 40}. Best is trial 36 with value: 153069.70175736025.[0m


Avg RMSE: 157224.02497818897


[32m[I 2023-02-20 19:26:46,728][0m Trial 38 finished with value: 164031.82485123366 and parameters: {'n_estimators': 298, 'max_depth': 3, 'learning_rate': 0.08742577159589214, 'min_child_weight': 7, 'gamma': 0.00021106267851778154, 'subsample': 0.5, 'colsample_bytree': 1.0, 'early_stoppig_rounds': 98}. Best is trial 36 with value: 153069.70175736025.[0m


Avg RMSE: 164031.82485123366


[32m[I 2023-02-20 19:27:01,259][0m Trial 39 finished with value: 262841.6426550046 and parameters: {'n_estimators': 341, 'max_depth': 8, 'learning_rate': 0.11404831577793964, 'min_child_weight': 4, 'gamma': 0.00012809103788615297, 'subsample': 0.2, 'colsample_bytree': 0.25, 'early_stoppig_rounds': 92}. Best is trial 36 with value: 153069.70175736025.[0m


Avg RMSE: 262841.6426550046


[32m[I 2023-02-20 19:27:03,851][0m Trial 40 finished with value: 153648.03980264041 and parameters: {'n_estimators': 313, 'max_depth': 5, 'learning_rate': 0.08143413112890761, 'min_child_weight': 1, 'gamma': 0.0004442215438432291, 'subsample': 0.45, 'colsample_bytree': 0.95, 'early_stoppig_rounds': 96}. Best is trial 36 with value: 153069.70175736025.[0m


Avg RMSE: 153648.03980264041


[32m[I 2023-02-20 19:27:06,734][0m Trial 41 finished with value: 153818.84315777762 and parameters: {'n_estimators': 321, 'max_depth': 5, 'learning_rate': 0.08163167976939684, 'min_child_weight': 1, 'gamma': 0.0003966319834415326, 'subsample': 0.45, 'colsample_bytree': 0.95, 'early_stoppig_rounds': 96}. Best is trial 36 with value: 153069.70175736025.[0m


Avg RMSE: 153818.84315777762


[32m[I 2023-02-20 19:27:09,615][0m Trial 42 finished with value: 160578.71323945784 and parameters: {'n_estimators': 377, 'max_depth': 3, 'learning_rate': 0.07003729803227007, 'min_child_weight': 1, 'gamma': 0.00027307042775131103, 'subsample': 0.55, 'colsample_bytree': 0.8500000000000001, 'early_stoppig_rounds': 97}. Best is trial 36 with value: 153069.70175736025.[0m


Avg RMSE: 160578.71323945784


[32m[I 2023-02-20 19:27:12,370][0m Trial 43 finished with value: 155805.7085081822 and parameters: {'n_estimators': 312, 'max_depth': 5, 'learning_rate': 0.09620938725316416, 'min_child_weight': 1, 'gamma': 0.0004168204616140537, 'subsample': 0.35000000000000003, 'colsample_bytree': 0.95, 'early_stoppig_rounds': 96}. Best is trial 36 with value: 153069.70175736025.[0m


Avg RMSE: 155805.7085081822


[32m[I 2023-02-20 19:27:16,253][0m Trial 44 finished with value: 155094.13370981687 and parameters: {'n_estimators': 287, 'max_depth': 7, 'learning_rate': 0.12690948497075977, 'min_child_weight': 1, 'gamma': 0.000801329917477393, 'subsample': 0.45, 'colsample_bytree': 1.0, 'early_stoppig_rounds': 90}. Best is trial 36 with value: 153069.70175736025.[0m


Avg RMSE: 155094.13370981687


[32m[I 2023-02-20 19:27:20,576][0m Trial 45 finished with value: 156131.12959416836 and parameters: {'n_estimators': 330, 'max_depth': 7, 'learning_rate': 0.18144489138920292, 'min_child_weight': 1, 'gamma': 0.0006197200750334946, 'subsample': 0.65, 'colsample_bytree': 1.0, 'early_stoppig_rounds': 96}. Best is trial 36 with value: 153069.70175736025.[0m


Avg RMSE: 156131.12959416836


[32m[I 2023-02-20 19:27:43,256][0m Trial 46 finished with value: 238865.26913451785 and parameters: {'n_estimators': 283, 'max_depth': 9, 'learning_rate': 0.12623437494177295, 'min_child_weight': 3, 'gamma': 0.0009388563469833189, 'subsample': 0.45, 'colsample_bytree': 0.35000000000000003, 'early_stoppig_rounds': 89}. Best is trial 36 with value: 153069.70175736025.[0m


Avg RMSE: 238865.26913451785


[32m[I 2023-02-20 19:27:47,580][0m Trial 47 finished with value: 155508.1358940178 and parameters: {'n_estimators': 279, 'max_depth': 7, 'learning_rate': 0.15889808853312887, 'min_child_weight': 1, 'gamma': 9.789325906863846e-05, 'subsample': 0.5, 'colsample_bytree': 1.0, 'early_stoppig_rounds': 100}. Best is trial 36 with value: 153069.70175736025.[0m


Avg RMSE: 155508.1358940178


[32m[I 2023-02-20 19:28:01,034][0m Trial 48 finished with value: 182280.33855926702 and parameters: {'n_estimators': 335, 'max_depth': 8, 'learning_rate': 0.10343786650873088, 'min_child_weight': 9, 'gamma': 0.00024293689093412196, 'subsample': 0.7, 'colsample_bytree': 0.5, 'early_stoppig_rounds': 82}. Best is trial 36 with value: 153069.70175736025.[0m


Avg RMSE: 182280.33855926702


[32m[I 2023-02-20 19:28:04,224][0m Trial 49 finished with value: 180531.56852194102 and parameters: {'n_estimators': 363, 'max_depth': 4, 'learning_rate': 0.11808387289270539, 'min_child_weight': 13, 'gamma': 0.0007089656994741378, 'subsample': 0.55, 'colsample_bytree': 0.8500000000000001, 'early_stoppig_rounds': 92}. Best is trial 36 with value: 153069.70175736025.[0m


Avg RMSE: 180531.56852194102


In [14]:
study_xgb.best_value

153069.70175736025

In [15]:
study_xgb.best_params

{'n_estimators': 268,
 'max_depth': 6,
 'learning_rate': 0.08085988097537695,
 'min_child_weight': 2,
 'gamma': 0.0002391802642514539,
 'subsample': 0.55,
 'colsample_bytree': 1.0,
 'early_stoppig_rounds': 97}

In [17]:
tuned_xgb_params = {'n_estimators': 268,
                         'max_depth': 6,
                         'learning_rate': 0.08085988097537695,
                         'min_child_weight': 2,
                         'gamma': 0.0002391802642514539,
                         'subsample': 0.55,
                         'colsample_bytree': 1.0,
                         'early_stoppig_rounds': 97}

In [19]:
# we need a validation set, but only of those samples that are from teh competition dataset
X_t, X_val, y_t, y_val = train_test_split(X, y, 
                                          shuffle=True, random_state=1337,
                                         test_size=0.15)

X_combined.drop(X_val.index, inplace=True)
y_combined.drop(y_val.index, inplace=True)

In [20]:
xgb_model = xgb.XGBRegressor(**tuned_xgb_params)
xgb_model.fit(X_t, y_t,
              eval_set=[(X_val, y_val)],
              eval_metric="rmse", verbose=False)

Parameters: { "early_stoppig_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1.0,
             early_stoppig_rounds=97, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None,
             gamma=0.0002391802642514539, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.08085988097537695, max_bin=256,
             max_cat_to_onehot=4, max_delta_step=0, max_depth=6, max_leaves=0,
             min_child_weight=2, missing=nan, monotone_constraints='()',
             n_estimators=268, n_jobs=0, num_parallel_tree=1, predictor='auto',
             random_state=0, reg_alpha=0, ...)

In [21]:
y_preds = xgb_model.predict(X_test)

In [22]:
submission = pd.DataFrame({'id': test_idx, 'price': y_preds})
submission.head()

Unnamed: 0,id,price
0,22730,4769977.0
1,22731,6227576.0
2,22732,9072999.0
3,22733,1601274.75
4,22734,6754006.5


In [23]:
submission.to_csv("submission.csv", index=False)