- [previous file - EDA](2022-03-31_train-test_EDA.ipynb)
- [next file - models with pre-tuned parameters and ensembles](2022-04-15_ensemble.ipynb)

## imports

In [1]:
from warnings import filterwarnings, simplefilter

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, StackingRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
import xgboost as xgb
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
import optuna
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_percentage_error

from lib.model_related import *

# Apply seaborn's default plot theme to all matplotlib figures.
sns.set()
# Blanket filter: silences every warning category for the whole session.
# NOTE(review): this also hides deprecation warnings from pandas/sklearn/lightgbm/xgboost.
filterwarnings("ignore")
# FutureWarning is already covered by the blanket filter above; kept for explicitness.
simplefilter(action='ignore', category=FutureWarning)

  from pandas import MultiIndex, Int64Index


## reading data

In [2]:
# Load the pre-model train and test tables from parquet.
train_raw, test_raw = map(
    pd.read_parquet,
    (
        "data/2022-04-08_train_pre-model.parquet",
        "data/2022-04-08_test_pre-model.parquet",
    ),
)

# Display both shapes as a quick sanity check (train carries extra columns, e.g. the target).
train_raw.shape, test_raw.shape

((115367, 30), (34686, 28))

## functions

In [3]:
# Lookup table keyed by ISO date string ("YYYY-MM-DD") with one float rate per date.
# NOTE(review): presumably RUB-per-USD exchange rates for the dates the listings were
# collected -- confirm against the data source. The dict is not referenced anywhere in
# the visible cells of this notebook.
currency_dict = {
    "2020-10-20": 77.9241,
    "2020-10-19": 77.9644,
    "2020-10-21": 77.7780,
    "2020-10-25": 76.4667,
    "2020-10-24": 76.4667,
    "2020-10-26": 76.4667,
    "2020-09-09": 75.9645,
    "2021-09-27": 73.0081,
    "2021-09-30": 72.7608,
    "2021-09-26": 73.0081,
    "2021-09-28": 72.6613,
    "2021-09-29": 72.5083,
    "2021-10-01": 72.6642,
}


def _write_submission(preds, name, sample_path):
    """Fill the ``price`` column of the sample submission with ``preds`` and save it as ``<name>.csv``."""
    submission = pd.read_csv(sample_path)
    submission["price"] = preds
    submission.to_csv(f"{name}.csv", index=False)


def submit(
    hold_out: pd.DataFrame,
    model,
    name="submission",
    sample_path="data/sample_submission.csv",
):
    """Write a submission file from a model that predicts on the price scale.

    Parameters
    ----------
    hold_out : pd.DataFrame
        Feature matrix passed unchanged to ``model.predict``.
    model
        Any fitted object exposing ``predict``.
    name : str
        Output file stem; the file is written to ``f"{name}.csv"``.
    sample_path : str
        Template CSV whose ``price`` column is overwritten with the predictions.
    """
    _write_submission(model.predict(hold_out), name, sample_path)


def submit_log(
    hold_out: pd.DataFrame,
    model,
    name="submission",
    sample_path="data/sample_submission.csv",
):
    """Write a submission file from a model that was trained on ``log(price)``.

    Same as :func:`submit`, except the predictions are passed through ``np.exp``
    before being stored, mapping them back to the price scale.
    """
    _write_submission(np.exp(model.predict(hold_out)), name, sample_path)

In [4]:
# Total of the raw target before any adjustment; compared with the post-encoding total further down.
train_raw["price"].sum()

158409758714.0

## encoding

In [5]:
# Tag each split so both frames can be encoded together and separated again afterwards.
train_raw["train/test"] = "train"
test_raw["train/test"] = "test"

# pd.concat replaces DataFrame.append, which is deprecated and removed in pandas 2.0.
data = pd.concat([train_raw, test_raw])
# Assign the result back instead of using chained `inplace=True`, which may act on a temporary.
data["ptc"] = data["ptc"].fillna("Оригинал")

# Force every object column to plain strings (missing values become their string form).
object_cols = data.select_dtypes("object").columns.tolist()
data[object_cols] = data[object_cols].astype(str)

# Scale every non-object column except the target, one column at a time.
# NOTE(review): each scaler is fitted on train and test rows together -- confirm this is intended.
numeric_cols = [
    col for col in data.select_dtypes(exclude="object").columns if col != "price"
]
for col in numeric_cols:
    scaled = RobustScaler().fit_transform(data[col].values.reshape(-1, 1))
    data[col] = scaled.ravel()  # assign a 1-D array rather than an (n, 1) matrix

# High-cardinality column: integer codes instead of one-hot columns.
for col in ["model_name"]:
    data[col] = LabelEncoder().fit_transform(data[col].astype("str"))

data = pd.get_dummies(
    data,
    columns=[
        "vehicle_transmission",
        "vendor",
        "brand",
        "fuel_type",
        "body_type",
        "color",
        "ptc",
        "drive",
        "wheel",
        "age_cat",
    ],
)

# Multiplier applied to the target of the "jane" sample only.
# NOTE(review): presumably aligns its price level with the "sokolov" sample -- document the origin of 0.86.
JANE_PRICE_FACTOR = 0.86

train = data.loc[data["train/test"] == "train"]

# `.assign` returns a new frame, avoiding a write into a `.loc` slice (SettingWithCopyWarning).
train_jane = train.loc[train["sample"] == "jane"].assign(
    price=lambda frame: frame["price"] * JANE_PRICE_FACTOR
)
train_sokolov = train.loc[train["sample"] == "sokolov"]

# Row order (jane first, then sokolov) is kept so the seeded split downstream is unchanged.
train = pd.concat([train_jane, train_sokolov]).drop(
    columns=["sample", "description", "train/test"]
)
test = data.loc[data["train/test"] == "test"].drop(
    columns=["sample", "description", "train/test", "price"]
)


In [6]:
# Total of the target after the "jane" rows were rescaled in the encoding cell.
train["price"].sum()

151033359642.84

In [7]:
# Shuffled, seeded hold-out split; test_size is left at the sklearn default.
X_train, X_valid, y_train, y_valid = train_test_split(
    train.drop(columns="price"),
    train["price"],
    shuffle=True,
    random_state=42,
)

# Show the four shapes to confirm features and target line up.
tuple(part.shape for part in (X_train, y_train, X_valid, y_valid))

((86525, 112), (86525,), (28842, 112), (28842,))

## modelling

In [11]:
# Baseline estimators with library defaults (seeded where the estimator accepts a seed).
lr = LinearRegression()
knn = KNeighborsRegressor()
lightgbm = LGBMRegressor(random_state=42, silent=True)
catboost = CatBoostRegressor(random_state=42, silent=True)
rf = RandomForestRegressor(random_state=42)
etr = ExtraTreesRegressor(random_state=42)
# The `_log` variants use the same configuration but learn log(price).
rf_log = RandomForestRegressor(random_state=42)
etr_log = ExtraTreesRegressor(random_state=42)

for price_model in (lr, knn, lightgbm, catboost, rf, etr):
    price_model.fit(X_train, y_train)

for log_price_model in (rf_log, etr_log):
    log_price_model.fit(X_train, np.log(y_train))

In [12]:
# Validation MAPE of every baseline, always measured on the price scale.
# Each entry: (printed label, fitted estimator, whether it was trained on log(price)).
baseline_report = [
    ("lr", lr, False),
    ("knn", knn, False),
    ("lightgbm", lightgbm, False),
    ("catboost", catboost, False),
    ("rf", rf, False),
    ("rf_log", rf_log, True),
    ("etr", etr, False),
    # Fixed: this row previously exponentiated the predictions of `etr`
    # (a price-scale model) instead of using `etr_log`.
    ("etr_log", etr_log, True),
]

for est_label, est, on_log_scale in baseline_report:
    valid_pred = est.predict(X_valid)
    if on_log_scale:
        # Map log-price predictions back to prices before scoring.
        valid_pred = np.exp(valid_pred)
    print(est_label, mean_absolute_percentage_error(y_valid, valid_pred))

lr 0.7946185682561607
knn 0.16951330279094057
lightgbm 0.19102466397466208
catboost 0.1584574973807607
rf 0.13804863667174702


## dumb model submission

In [13]:
# One submission file per baseline; the file stem doubles as the model label.
for file_stem, price_model in (
    ("lr", lr),
    ("knn", knn),
    ("lightgbm", lightgbm),
    ("catboost", catboost),
    ("rf", rf),
    ("etr", etr),
):
    submit(test, price_model, file_stem)

# Models trained on log(price) go through submit_log, which exponentiates the predictions.
for file_stem, log_price_model in (("rf_log", rf_log), ("etr_log", etr_log)):
    submit_log(test, log_price_model, file_stem)

In [24]:
# Repeats the two ExtraTrees submissions from the previous cell, overwriting etr.csv and etr_log.csv.
# NOTE(review): looks like a leftover manual re-run -- confirm whether this cell is still needed.
submit(test, etr, "etr")
submit_log(test, etr_log, "etr_log")

## model tuning

### lightgbm

[notebook with related experiments](model_LightGBM_optuna.ipynb)

In [17]:
def objective(trial):
    """Optuna objective for LGBMRegressor.

    Samples one LightGBM configuration from ``trial``, evaluates it with 8-fold
    cross-validation on ``X_train``/``y_train`` and returns the mean fold score.
    The scorer is sklearn's ``neg_mean_absolute_percentage_error``, so values are
    negative MAPE and higher is better.
    """
    search_space = dict(
        objective="regression",
        metric="mape",
        learning_rate=trial.suggest_uniform("learning_rate", 0.001, 1.0),
        lambda_l1=trial.suggest_loguniform("lambda_l1", 1e-8, 10.0),
        lambda_l2=trial.suggest_loguniform("lambda_l2", 1e-8, 10.0),
        num_leaves=trial.suggest_int("num_leaves", 2, 256),
        feature_fraction=trial.suggest_uniform("feature_fraction", 0.4, 1.0),
        bagging_fraction=trial.suggest_uniform("bagging_fraction", 0.4, 1.0),
        bagging_freq=trial.suggest_int("bagging_freq", 1, 7),
        min_child_samples=trial.suggest_int("min_child_samples", 5, 100),
        verbosity=-1,
    )

    candidate = LGBMRegressor(silent=True, **search_space)
    fold_scores = cross_val_score(
        candidate,
        X_train,
        y_train,
        scoring="neg_mean_absolute_percentage_error",
        cv=8,
        n_jobs=-1,
    )
    return fold_scores.mean()


# The study is persisted in SQLite, so re-running the cell resumes it instead of starting over.
study = optuna.create_study(
    study_name="LGBMRegressor",
    storage="sqlite:///LGBMRegressor.db",
    load_if_exists=True,
    direction="maximize",
)
# Stop after 10 new trials or 600 seconds, whichever comes first.
study.optimize(objective, n_trials=10, timeout=600)

[32m[I 2022-04-12 22:35:33,204][0m Using an existing study with name 'LGBMClassifier' instead of creating a new one.[0m





[32m[I 2022-04-12 22:35:47,095][0m Trial 5 finished with value: -0.20253670583254396 and parameters: {'learning_rate': 0.07992724445501892, 'lambda_l1': 1.8270468476213303e-05, 'lambda_l2': 0.012791850966944924, 'num_leaves': 28, 'feature_fraction': 0.7698606143353843, 'bagging_fraction': 0.8695690383435876, 'bagging_freq': 2, 'min_child_samples': 48}. Best is trial 5 with value: -0.20253670583254396.[0m




[32m[I 2022-04-12 22:35:59,590][0m Trial 6 finished with value: -0.257565168367874 and parameters: {'learning_rate': 0.744263685304623, 'lambda_l1': 7.552664420954056e-07, 'lambda_l2': 1.9963207632140944e-05, 'num_leaves': 201, 'feature_fraction': 0.7340966128567723, 'bagging_fraction': 0.6079090066839168, 'bagging_freq': 1, 'min_child_samples': 52}. Best is trial 5 with value: -0.20253670583254396.[0m




[32m[I 2022-04-12 22:36:08,280][0m Trial 7 finished with value: -0.1745456552518454 and parameters: {'learning_rate': 0.4216546930140595, 'lambda_l1': 5.529044942764546e-08, 'lambda_l2': 0.49107163604631077, 'num_leaves': 142, 'feature_fraction': 0.5426396065593337, 'bagging_fraction': 0.6354145080468188, 'bagging_freq': 3, 'min_child_samples': 12}. Best is trial 7 with value: -0.1745456552518454.[0m




[32m[I 2022-04-12 22:36:18,834][0m Trial 8 finished with value: -0.2432717613049581 and parameters: {'learning_rate': 0.7172956964597691, 'lambda_l1': 5.858478690290897e-06, 'lambda_l2': 3.9449322532268776e-08, 'num_leaves': 154, 'feature_fraction': 0.679286165875125, 'bagging_fraction': 0.6779233529792716, 'bagging_freq': 4, 'min_child_samples': 70}. Best is trial 7 with value: -0.1745456552518454.[0m




[32m[I 2022-04-12 22:36:28,581][0m Trial 9 finished with value: -0.1567960005144171 and parameters: {'learning_rate': 0.24273738931459424, 'lambda_l1': 0.0007127314011370048, 'lambda_l2': 1.4991431139899208e-08, 'num_leaves': 129, 'feature_fraction': 0.716472706585253, 'bagging_fraction': 0.9079273070338828, 'bagging_freq': 4, 'min_child_samples': 27}. Best is trial 9 with value: -0.1567960005144171.[0m




[32m[I 2022-04-12 22:36:38,610][0m Trial 10 finished with value: -0.3571169344261295 and parameters: {'learning_rate': 0.9494423379774705, 'lambda_l1': 8.976874193475037e-05, 'lambda_l2': 0.00024184845376432643, 'num_leaves': 160, 'feature_fraction': 0.6342281343794924, 'bagging_fraction': 0.52462564650187, 'bagging_freq': 5, 'min_child_samples': 74}. Best is trial 9 with value: -0.1567960005144171.[0m




[32m[I 2022-04-12 22:36:50,387][0m Trial 11 finished with value: -0.22698765690534695 and parameters: {'learning_rate': 0.773164422222387, 'lambda_l1': 1.8437884236649082, 'lambda_l2': 5.646369546324295e-07, 'num_leaves': 225, 'feature_fraction': 0.6137958863887523, 'bagging_fraction': 0.7728914806928449, 'bagging_freq': 6, 'min_child_samples': 22}. Best is trial 9 with value: -0.1567960005144171.[0m





[32m[I 2022-04-12 22:37:03,343][0m Trial 12 finished with value: -0.25967795465332394 and parameters: {'learning_rate': 0.02716689002528746, 'lambda_l1': 0.655717669029221, 'lambda_l2': 3.8521960471096605e-07, 'num_leaves': 225, 'feature_fraction': 0.7442760753734905, 'bagging_fraction': 0.4972661166651129, 'bagging_freq': 2, 'min_child_samples': 78}. Best is trial 9 with value: -0.1567960005144171.[0m





[32m[I 2022-04-12 22:37:13,613][0m Trial 13 finished with value: -0.15895263105959045 and parameters: {'learning_rate': 0.25542847231451854, 'lambda_l1': 5.5278340137009336e-08, 'lambda_l2': 5.616679588596495e-07, 'num_leaves': 112, 'feature_fraction': 0.7415789372960532, 'bagging_fraction': 0.9710703037180636, 'bagging_freq': 4, 'min_child_samples': 42}. Best is trial 9 with value: -0.1567960005144171.[0m








[32m[I 2022-04-12 22:37:24,776][0m Trial 14 finished with value: -0.25642212193797126 and parameters: {'learning_rate': 0.9889916200650344, 'lambda_l1': 1.31641269168062e-07, 'lambda_l2': 0.001308364765353021, 'num_leaves': 181, 'feature_fraction': 0.814929774720756, 'bagging_fraction': 0.8175936493895224, 'bagging_freq': 6, 'min_child_samples': 21}. Best is trial 9 with value: -0.1567960005144171.[0m


In [18]:
# Hyperparameters of the best trial stored in the LightGBM study so far.
study.best_params

{'bagging_fraction': 0.9079273070338828,
 'bagging_freq': 4,
 'feature_fraction': 0.716472706585253,
 'lambda_l1': 0.0007127314011370048,
 'lambda_l2': 1.4991431139899208e-08,
 'learning_rate': 0.24273738931459424,
 'min_child_samples': 27,
 'num_leaves': 129}

In [19]:
# Refit LightGBM on the price target with the best parameters reported by the study above,
# hard-coded here so the model can be rebuilt without the study database.
lightgbm_optuned = LGBMRegressor(
    bagging_fraction=0.9079273070338828,
    bagging_freq=4,
    feature_fraction=0.716472706585253,
    lambda_l1=0.0007127314011370048,
    lambda_l2=1.4991431139899208e-08,
    learning_rate=0.24273738931459424,
    min_child_samples=27,
    num_leaves=129,
    random_state=42,
    silent=True,
)
lightgbm_optuned.fit(X_train, y_train)

print(
    "lightgbm_optuned",
    mean_absolute_percentage_error(y_valid, lightgbm_optuned.predict(X_valid)),
)
submit(test, lightgbm_optuned, "lightgbm_optuned")

lightgbm_optuned 0.1562352982059385


In [12]:
# LightGBM with a second hard-coded parameter set, this time trained on log(price).
# NOTE(review): the "1899" suffix presumably refers to a trial number in the linked experiments notebook.
lightgbm_optuned_1899 = LGBMRegressor(
    learning_rate=0.2200394016092361,
    lambda_l1=3.6405456215002115e-08,
    lambda_l2=3.9256724979441087,
    num_leaves=251,
    feature_fraction=0.7849386830734889,
    bagging_fraction=0.999471799816821,
    bagging_freq=7,
    min_child_samples=5,
    random_state=42,
    silent=True,
)
lightgbm_optuned_1899.fit(X_train, np.log(y_train))

# Predictions are exponentiated so the MAPE is measured on the price scale.
print(
    "lightgbm_optuned_1899_log",
    mean_absolute_percentage_error(
        y_valid, np.exp(lightgbm_optuned_1899.predict(X_valid))
    ),
)
submit_log(test, lightgbm_optuned_1899, "lightgbm_optuned_log")

lightgbm_optuned_1899_log 0.1266083430966481


### KNN

[notebook with related experiments](model_KNN_optuna.ipynb)

In [9]:
def objective(trial):
    """Optuna objective for KNeighborsRegressor.

    Samples a neighbour count, leaf size, Minkowski power, weighting scheme and
    metric, scores the model with 3-fold cross-validation on
    ``X_train``/``y_train`` and returns the mean negative MAPE (higher is better).
    """
    search_space = dict(
        n_neighbors=trial.suggest_int("n_neighbors", 1, 30),
        leaf_size=trial.suggest_int("leaf_size", 1, 50),
        # NOTE(review): `p` should only matter when metric == "minkowski" -- confirm.
        p=trial.suggest_int("p", 1, 2),
        weights=trial.suggest_categorical("weights", ['uniform', 'distance']),
        metric=trial.suggest_categorical("metric", ['euclidean', 'manhattan', 'minkowski']),
    )

    candidate = KNeighborsRegressor(**search_space)
    fold_scores = cross_val_score(
        candidate,
        X_train,
        y_train,
        scoring="neg_mean_absolute_percentage_error",
        cv=3,
        n_jobs=-1,
    )
    return fold_scores.mean()


# Persisted study: re-running the cell appends trials to the existing SQLite database.
study = optuna.create_study(
    study_name="KNNRegressor",
    storage="sqlite:///KNNRegressor.db",
    load_if_exists=True,
    direction="maximize",
)
study.optimize(objective, n_trials=2)

[32m[I 2022-04-13 18:13:06,186][0m Using an existing study with name 'KNNRegressor' instead of creating a new one.[0m
[32m[I 2022-04-13 18:23:22,123][0m Trial 47 finished with value: -0.1642623637534808 and parameters: {'n_neighbors': 7, 'leaf_size': 38, 'p': 2, 'weights': 'uniform', 'metric': 'manhattan'}. Best is trial 44 with value: -0.15464020043822252.[0m
[32m[I 2022-04-13 18:33:32,685][0m Trial 48 finished with value: -0.1582071567694789 and parameters: {'n_neighbors': 8, 'leaf_size': 45, 'p': 2, 'weights': 'distance', 'metric': 'manhattan'}. Best is trial 44 with value: -0.15464020043822252.[0m


In [10]:
# Hyperparameters of the best trial stored in the KNN study so far.
study.best_params

{'leaf_size': 41,
 'metric': 'manhattan',
 'n_neighbors': 8,
 'p': 2,
 'weights': 'distance'}

In [11]:
# KNN with a hard-coded parameter set, trained on log(price).
# NOTE(review): these values differ from `study.best_params` shown above (n_neighbors, p);
# the "32" suffix presumably refers to a specific trial -- confirm.
knn_optuned_32 = KNeighborsRegressor(
    n_neighbors=9,
    leaf_size=41,
    p=1,
    weights='distance',
    metric='manhattan',
)
knn_optuned_32.fit(X_train, np.log(y_train))

# Exponentiate before scoring so the MAPE is on the price scale.
print(
    "knn_optuned_32",
    mean_absolute_percentage_error(y_valid, np.exp(knn_optuned_32.predict(X_valid))),
)
submit_log(test, knn_optuned_32, "knn_optuned_32")

knn_optuned_32 0.14258679506496774


### RandomForest

[notebook with related experiments](model_RandomForest_optuna.ipynb)

In [None]:
def objective(trial):
    """Optuna objective for RandomForestRegressor.

    Samples forest size, split/leaf minimums, bootstrap sample fraction and the
    feature-subset rule; depth is unlimited and bootstrapping is always on.
    Returns the mean negative MAPE over 3-fold cross-validation on
    ``X_train``/``y_train`` (higher is better).
    """
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 100, 1000),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 14),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 14),
        max_samples=trial.suggest_uniform('max_samples', 0.6, 0.99),
        # NOTE(review): "auto" is rejected by recent scikit-learn releases; it is kept because
        # the categorical choices of the persisted study cannot be changed safely.
        max_features=trial.suggest_categorical("max_features", ["auto", "sqrt", 'log2']),
        max_depth=None,
        bootstrap=True,
        random_state=42,
    )

    candidate = RandomForestRegressor(**search_space)
    fold_scores = cross_val_score(
        candidate,
        X_train,
        y_train,
        scoring="neg_mean_absolute_percentage_error",
        cv=3,
        n_jobs=-1,
    )
    return fold_scores.mean()


# Persisted study: re-running the cell resumes from the existing SQLite database.
study = optuna.create_study(
    study_name="RFRRegressor",
    storage="sqlite:///RFRRegressor.db",
    load_if_exists=True,
    direction="maximize",
)
study.optimize(objective, n_trials=200)

[32m[I 2022-04-13 18:46:09,425][0m Using an existing study with name 'RFRRegressor' instead of creating a new one.[0m
[32m[I 2022-04-13 18:46:22,263][0m Trial 7 finished with value: -0.2503358825689884 and parameters: {'n_estimators': 209, 'min_samples_split': 12, 'min_samples_leaf': 7, 'max_samples': 0.6170731668968928, 'max_features': 'log2'}. Best is trial 3 with value: -0.15965928060336795.[0m
[32m[I 2022-04-13 18:46:41,167][0m Trial 8 finished with value: -0.20956902989733972 and parameters: {'n_estimators': 239, 'min_samples_split': 8, 'min_samples_leaf': 11, 'max_samples': 0.6960737412613612, 'max_features': 'sqrt'}. Best is trial 3 with value: -0.15965928060336795.[0m
[32m[I 2022-04-13 18:47:20,333][0m Trial 9 finished with value: -0.18754078091628576 and parameters: {'n_estimators': 472, 'min_samples_split': 3, 'min_samples_leaf': 2, 'max_samples': 0.9109759310947507, 'max_features': 'log2'}. Best is trial 3 with value: -0.15965928060336795.[0m
[32m[I 2022-04-13 1

#### tuned by hand

In [25]:
# Hand-picked random forest: 800 unrestricted-depth trees with log2 feature sub-sampling,
# trained on log(price).
rf_tuned = RandomForestRegressor(
    n_estimators=800,
    max_depth=None,
    max_features='log2',
    min_samples_split=2,
    min_samples_leaf=1,
    bootstrap=True,
    random_state=42,
)
rf_tuned.fit(X_train, np.log(y_train))

In [26]:
# Validation MAPE on the price scale; exp undoes the log target rf_tuned was trained on.
print("rf_tuned_log", mean_absolute_percentage_error(y_valid, np.exp(rf_tuned.predict(X_valid))))

rf_tuned_log 0.1315354887566232


### Extra Trees Regressor

[notebook with related experiments](model_ExtraTrees_optuna.ipynb)

In [None]:
def objective(trial):
    """Optuna objective for ExtraTreesRegressor.

    Uses the same search space as the random-forest study (forest size,
    split/leaf minimums, bootstrap sample fraction, feature-subset rule) and
    returns the mean negative MAPE over 3-fold cross-validation on
    ``X_train``/``y_train`` (higher is better).
    """
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 100, 1000),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 14),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 14),
        max_samples=trial.suggest_uniform('max_samples', 0.6, 0.99),
        # NOTE(review): "auto" is rejected by recent scikit-learn releases; it is kept because
        # the categorical choices of the persisted study cannot be changed safely.
        max_features=trial.suggest_categorical("max_features", ["auto", "sqrt", 'log2']),
        max_depth=None,
        bootstrap=True,
        random_state=42,
    )

    candidate = ExtraTreesRegressor(**search_space)
    fold_scores = cross_val_score(
        candidate,
        X_train,
        y_train,
        scoring="neg_mean_absolute_percentage_error",
        cv=3,
        n_jobs=-1,
    )
    return fold_scores.mean()


# Persisted study: re-running the cell resumes from the existing SQLite database.
study = optuna.create_study(
    study_name="ETRRegressor",
    storage="sqlite:///ETRRegressor.db",
    load_if_exists=True,
    direction="maximize",
)
study.optimize(objective, n_trials=200)

In [34]:
# Hand-configured ExtraTrees: 800 bootstrapped trees capped at depth 15, trained on log(price).
# verbose=1 makes joblib print progress while fitting and predicting.
etr_cust = ExtraTreesRegressor(
    n_estimators=800,
    max_depth=15,
    max_features='auto',
    min_samples_split=2,
    min_samples_leaf=1,
    bootstrap=True,
    random_state=42,
    n_jobs=-1,
    verbose=1,
)
etr_cust.fit(X_train, np.log(y_train))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   18.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done 800 out of 800 | elapsed:  5.1min finished


In [35]:
# Validation MAPE on the price scale; exp undoes the log target etr_cust was trained on.
print("etr_cust_log", mean_absolute_percentage_error(y_valid, np.exp(etr_cust.predict(X_valid))))

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.9s


etr_cust_log 0.1376278076519815


[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    1.7s
[Parallel(n_jobs=4)]: Done 800 out of 800 | elapsed:    1.8s finished


### XGBoost

[notebook with related experiments](2022-04-12_experiments-sklearn.ipynb)

#### manual

In [30]:
# Manually configured XGBoost: many deep trees with a small learning rate, trained on log(price).
xbgr_custom = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=12,
    colsample_bytree=0.5,
    alpha=1,
    random_state=42,
    n_jobs=-1,
)
xbgr_custom.fit(X_train, np.log(y_train))

In [31]:
# Validation MAPE on the price scale; exp undoes the log target xbgr_custom was trained on.
print("xbgr_custom_log", mean_absolute_percentage_error(y_valid, np.exp(xbgr_custom.predict(X_valid))))

xbgr_custom_log 0.1196754346388126


In [32]:
# Write the submission file for the manual XGBoost; submit_log exponentiates its log-price predictions.
submit_log(test, xbgr_custom, "xbgr_custom_log")

#### tuning

In [8]:
def objective(trial):
    """Optuna objective for XGBRegressor trained on ``log(price)``.

    Samples booster type, regularisation, sub-sampling and tree-shape
    parameters, then returns the mean of 5-fold cross-validated
    ``neg_mean_absolute_percentage_error`` (higher is better).

    Note that the target passed to cross-validation is ``np.log(y_train)``, so
    the returned MAPE is measured on the log scale. It is therefore not
    comparable with the price-scale validation MAPE printed elsewhere in this
    notebook.
    """
    param = {
        "verbosity": 0,
        "gpu_id": 0,
        "tree_method": "gpu_hist",
        "random_state": 42,
        "silent": True,
        "n_jobs": -1,
        "objective": trial.suggest_categorical("objective", ["reg:linear", "reg:squaredlogerror", "reg:squarederror"]),
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0),
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1200),
        "learning_rate": trial.suggest_float("learning_rate", 1e-8, 1.0, log=True),
    }

    # Tree-based boosters get tree-shape parameters on top of the shared ones.
    if param["booster"] in ["gbtree", "dart"]:
        param["max_depth"] = trial.suggest_int("max_depth", 3, 15)
        param["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 10)
        # `eta` is no longer sampled: XGBoost treats it as an alias of `learning_rate`,
        # which is already sampled above. Sampling both made the effective step size
        # ambiguous and added a dead dimension to the search. The stored best trials pair
        # eta ~ 3e-8 with learning_rate ~ 0.02, indicating that eta had no effect.
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])

    # DART additionally needs its dropout configuration.
    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    bst = xgb.XGBRegressor(**param)
    fold_scores = cross_val_score(
        bst,
        X_train,
        np.log(y_train),
        cv=5,
        scoring="neg_mean_absolute_percentage_error",
        n_jobs=-1,
    )

    return np.mean(fold_scores)


# Persisted study: with n_trials=0 this call only loads the stored trials and runs nothing new.
study = optuna.create_study(
    direction="maximize",
    storage="sqlite:///XGBRegressor.db",
    study_name="XGBRegressor",
    load_if_exists=True,
)
study.optimize(objective, timeout=300000, n_trials=0)

[32m[I 2022-04-17 10:00:54,618][0m Using an existing study with name 'XGBRegressor' instead of creating a new one.[0m


In [9]:
# Five best stored trials (highest value = lowest MAPE), transposed so each column is one trial.
study.trials_dataframe().sort_values(["value"], ascending=False).head(5).T

Unnamed: 0,132,139,134,138,135
number,132,139,134,138,135
value,-0.009002,-0.009009,-0.00901,-0.00901,-0.009012
datetime_start,2022-04-16 12:05:04.824973,2022-04-16 13:10:35.521157,2022-04-16 12:22:37.886102,2022-04-16 13:00:47.757864,2022-04-16 12:32:32.251526
datetime_complete,2022-04-16 12:14:01.571815,2022-04-16 13:21:37.360942,2022-04-16 12:32:32.163352,2022-04-16 13:10:35.454031,2022-04-16 12:41:59.566287
duration,0 days 00:08:56.746842,0 days 00:11:01.839785,0 days 00:09:54.277250,0 days 00:09:47.696167,0 days 00:09:27.314761
params_alpha,0.039254,0.044903,0.040829,0.051765,0.038873
params_booster,gbtree,gbtree,gbtree,gbtree,gbtree
params_colsample_bytree,0.260759,0.278286,0.226234,0.221294,0.225607
params_eta,0.0,0.0,0.0,0.0,0.0
params_gamma,0.001405,0.00048,0.000689,0.000608,0.002135


#### xgb_top_5_trials_2022-04-17_02:33

- 132, MAPE: 11.966969441437784, kaggle: 11.96653
- 139, MAPE: 11.97832901549609,  kaggle: 11.74262
- 134, MAPE: 11.985533638729201, kaggle: 11.95685
- 138, MAPE: 11.966599705599973, kaggle: 11.96863
- 135, MAPE: 11.981782802335061, kaggle: 11.96509

In [41]:
# Refit each of the five best stored trials on the full training split, report its
# price-scale validation MAPE and write one submission file per trial.
top_trial_numbers = (
    study.trials_dataframe()
    .sort_values(["value"], ascending=False)
    .head(5)
    .number
)

for p in top_trial_numbers:
    # The trial number is used as a position in `study.trials`.
    trial_params = study.trials[p].params
    print(p, "\n", trial_params)

    xgboost_log = xgb.XGBRegressor(
        verbosity=0,
        gpu_id=0,
        tree_method="gpu_hist",
        random_state=42,
        silent=True,
        n_jobs=-1,
        **trial_params,
    )
    xgboost_log.fit(X_train, np.log(y_train))

    # The model predicts log(price); exponentiate before scoring.
    valid_prices = np.exp(xgboost_log.predict(X_valid))
    print(
        f"xgb_v2_trial_{p}",
        mean_absolute_percentage_error(y_valid, valid_prices),
        end="\n\n",
    )

    submit_log(test, xgboost_log, f"xgb_v2_trial_{p}")

132 
 {'alpha': 0.03925389731467428, 'booster': 'gbtree', 'colsample_bytree': 0.26075928432491174, 'eta': 3.3548685374081315e-08, 'gamma': 0.001404801600815511, 'grow_policy': 'depthwise', 'lambda': 1.0060252873701642e-07, 'learning_rate': 0.024232938602382983, 'max_depth': 15, 'min_child_weight': 7, 'n_estimators': 690, 'objective': 'reg:squarederror', 'subsample': 0.7188003743982521}
xgb_v2_trial_132 0.11966969441437784

139 
 {'alpha': 0.044903341303693216, 'booster': 'gbtree', 'colsample_bytree': 0.2782856821187278, 'eta': 3.4353303842042365e-08, 'gamma': 0.00048033429580361897, 'grow_policy': 'depthwise', 'lambda': 4.940599898474283e-07, 'learning_rate': 0.029621998365714833, 'max_depth': 15, 'min_child_weight': 6, 'n_estimators': 736, 'objective': 'reg:squarederror', 'subsample': 0.7203114713416401}
xgb_v2_trial_139 0.1197832901549609

134 
 {'alpha': 0.04082934708230299, 'booster': 'gbtree', 'colsample_bytree': 0.22623415245800638, 'eta': 1.0664131679536447e-07, 'gamma': 0.00068

## Ensemble models

In [20]:
# Stack the baseline price-scale models; a default CatBoostRegressor combines their predictions.
# NOTE(review): the final estimator is created without `silent` or `random_state`, so it logs
# every boosting iteration -- confirm this is intended.
estimators = tuple(
    zip(
        ("lr", "knn", "lightgbm", "catboost", "rf"),
        (lr, knn, lightgbm, catboost, rf),
    )
)

meta = StackingRegressor(
    estimators=estimators,
    final_estimator=CatBoostRegressor(),
    n_jobs=-1,
)
meta.fit(X_train, y_train)

meta_valid_pred = meta.predict(X_valid)
print("meta", mean_absolute_percentage_error(y_valid, meta_valid_pred))
submit(test, meta, "meta")

Learning rate set to 0.082841
0:	learn: 1707761.0355068	total: 7.34ms	remaining: 7.34s
1:	learn: 1591308.9956367	total: 18.8ms	remaining: 9.37s
2:	learn: 1485016.6045297	total: 27.9ms	remaining: 9.26s
3:	learn: 1389132.0663577	total: 33.5ms	remaining: 8.34s
4:	learn: 1301083.1721720	total: 39.4ms	remaining: 7.85s
5:	learn: 1222258.3138665	total: 49.4ms	remaining: 8.19s
6:	learn: 1151623.2575788	total: 60.6ms	remaining: 8.59s
7:	learn: 1087972.9192920	total: 68ms	remaining: 8.43s
8:	learn: 1031155.5343144	total: 75.5ms	remaining: 8.32s
9:	learn: 979258.6823138	total: 87.1ms	remaining: 8.62s
10:	learn: 932809.8153230	total: 97.4ms	remaining: 8.75s
11:	learn: 891337.1185731	total: 104ms	remaining: 8.53s
12:	learn: 854549.2845555	total: 115ms	remaining: 8.75s
13:	learn: 821883.9703170	total: 125ms	remaining: 8.78s
14:	learn: 792350.4434531	total: 131ms	remaining: 8.58s
15:	learn: 766710.8522203	total: 137ms	remaining: 8.4s
16:	learn: 744290.4031460	total: 147ms	remaining: 8.52s
17:	learn: 

[next file - models with pre-tuned parameters and ensembles](2022-04-15_ensemble.ipynb)