# This notebook is for the 30 Days of Machine Learning contest on Kaggle.

#### This notebook was created by following Abhishek Thakur's YouTube video tutorial: 
https://www.youtube.com/watch?v=m5YSKPMjkrk&list=PL98nY_tJQXZnP-k3qCDd1hljVSciDV9_N&index=22


In [3]:
# Imports
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
import optuna

In [4]:
df = pd.read_csv("data/train_folds.csv")
df_test = pd.read_csv("data/test.csv")
sample_submission = pd.read_csv("data/sample_submission.csv")

useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if 'cat' in col]
df_test = df_test[useful_features]

for col in object_cols:
    temp_df = []
    temp_test_feat = None
    for fold in range(5):
        xtrain = df[df.kfold != fold].reset_index(drop=True)
        xvalid = df[df.kfold == fold].reset_index(drop=True)
        feat = xtrain.groupby(col)["target"].agg("mean")
        feat = feat.to_dict()
        xvalid.loc[:, f"tar_enc_{col}"] = xvalid[col].map(feat)
        temp_df.append(xvalid)
        if temp_test_feat is None:
            temp_test_feat = df_test[col].map(feat)
        else:
            temp_test_feat += df_test[col].map(feat)
            
    temp_test_feat /= 5
    df_test.loc[:, f"tar_enc_{col}"] = temp_test_feat
    df = pd.concat(temp_df)
    
useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if col.startswith("cat")]
df_test = df_test[useful_features]


In [6]:
def run(trial):
    fold = 0
    learning_rate = trial.suggest_float("learning_rate", 1e-2, 0.25, log=True)
    reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
    reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
    subsample = trial.suggest_float("subsample", 0.1, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.0)
    max_depth = trial.suggest_int("max_depth", 1, 7)

    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    ordinal_encoder = preprocessing.OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])

    model = XGBRegressor(
        random_state=42,
        tree_method='gpu_hist',
        gpu_id=1,
        predictor="gpu_predictor",
        n_estimators=7000,
        learning_rate=learning_rate,
        reg_lambda=reg_lambda,
        reg_alpha=reg_alpha,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        max_depth=max_depth,
    )
    model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)
    preds_valid = model.predict(xvalid)
    rmse = mean_squared_error(yvalid, preds_valid, squared=False)
    return rmse

In [7]:
study = optuna.create_study(direction="minimize")
study.optimize(run, n_trials=5) #100

[32m[I 2021-08-28 14:05:24,980][0m A new study created in memory with name: no-name-965b8e91-f246-4507-ae43-81be23fbe0ff[0m


[0]	validation_0-rmse:7.31282
[1000]	validation_0-rmse:0.72623
[2000]	validation_0-rmse:0.72307
[3000]	validation_0-rmse:0.72176
[4000]	validation_0-rmse:0.72097
[5000]	validation_0-rmse:0.72049
[6000]	validation_0-rmse:0.72010
[6999]	validation_0-rmse:0.71977


[32m[I 2021-08-28 14:06:25,501][0m Trial 0 finished with value: 0.71976837039695 and parameters: {'learning_rate': 0.06064204370094016, 'reg_lambda': 0.0008943762844024488, 'reg_alpha': 73.40222297361952, 'subsample': 0.31260504251715354, 'colsample_bytree': 0.11793184041891933, 'max_depth': 2}. Best is trial 0 with value: 0.71976837039695.[0m


[0]	validation_0-rmse:7.70119
[1000]	validation_0-rmse:0.73402
[2000]	validation_0-rmse:0.73033
[3000]	validation_0-rmse:0.72806
[4000]	validation_0-rmse:0.72641
[5000]	validation_0-rmse:0.72504
[6000]	validation_0-rmse:0.72396
[6999]	validation_0-rmse:0.72310


[32m[I 2021-08-28 14:07:34,049][0m Trial 1 finished with value: 0.7230953815507253 and parameters: {'learning_rate': 0.010208801417660018, 'reg_lambda': 17.700862581841477, 'reg_alpha': 9.521414469945798e-05, 'subsample': 0.5816500232152734, 'colsample_bytree': 0.4712752973765638, 'max_depth': 2}. Best is trial 0 with value: 0.71976837039695.[0m


[0]	validation_0-rmse:7.41992
[1000]	validation_0-rmse:0.72031
[2000]	validation_0-rmse:0.71884
[2688]	validation_0-rmse:0.71887


[32m[I 2021-08-28 14:08:09,564][0m Trial 2 finished with value: 0.7187456855336856 and parameters: {'learning_rate': 0.0468266829587717, 'reg_lambda': 0.6404835599499219, 'reg_alpha': 8.39014824241283e-05, 'subsample': 0.8799182498337184, 'colsample_bytree': 0.3603951177342468, 'max_depth': 4}. Best is trial 2 with value: 0.7187456855336856.[0m


[0]	validation_0-rmse:7.48136
[1000]	validation_0-rmse:0.73174
[2000]	validation_0-rmse:0.72923
[3000]	validation_0-rmse:0.72769
[4000]	validation_0-rmse:0.72658
[5000]	validation_0-rmse:0.72576
[6000]	validation_0-rmse:0.72502
[6999]	validation_0-rmse:0.72445


[32m[I 2021-08-28 14:08:57,222][0m Trial 3 finished with value: 0.7244518697713013 and parameters: {'learning_rate': 0.0387481252564759, 'reg_lambda': 2.491613799395933, 'reg_alpha': 0.008247439107796408, 'subsample': 0.3392530519002951, 'colsample_bytree': 0.42545327455893456, 'max_depth': 1}. Best is trial 2 with value: 0.7187456855336856.[0m


[0]	validation_0-rmse:7.21481
[1000]	validation_0-rmse:0.71975
[1562]	validation_0-rmse:0.71976


[32m[I 2021-08-28 14:09:17,428][0m Trial 4 finished with value: 0.7195197455715279 and parameters: {'learning_rate': 0.07352989392680517, 'reg_lambda': 0.8704018173031131, 'reg_alpha': 7.781585497086379e-08, 'subsample': 0.6514338296346952, 'colsample_bytree': 0.5758849457205628, 'max_depth': 4}. Best is trial 2 with value: 0.7187456855336856.[0m


In [8]:
study.best_params

{'learning_rate': 0.0468266829587717,
 'reg_lambda': 0.6404835599499219,
 'reg_alpha': 8.39014824241283e-05,
 'subsample': 0.8799182498337184,
 'colsample_bytree': 0.3603951177342468,
 'max_depth': 4}