In [55]:
import pandas as pd
import xgboost as xgb
import optuna

In [56]:
# Encoded train/test feature tables used for modelling.
train = pd.read_csv("Datasets/train_le_ohe.csv")
test = pd.read_csv("Datasets/test_le_ohe.csv")
# Only the identifier column of the raw test file is needed, so skip parsing the rest.
ids = pd.read_csv("Datasets/test.csv", usecols=["Id"])["Id"]
# The ids are paired row by row with predictions made on `test` when the
# submission is built, so the two files must describe the same rows.
assert len(ids) == len(test), "test.csv and test_le_ohe.csv have different row counts"

In [57]:
# Select the target by name (not by position) so that a reordered CSV can never
# leak SalePrice into the feature matrix.
TARGET = "SalePrice"
X_train = train.drop(columns=TARGET)
X_test = test
y = train[TARGET]
# The model fitted on X_train is applied to X_test, so both frames must expose
# the same feature columns in the same order.
assert list(X_train.columns) == list(X_test.columns), "train/test feature columns differ"

In [61]:
def objective(trial):
    """Optuna objective: cross-validated RMSLE of an XGBoost model for one trial.

    Samples a boosting-round cap and booster hyper-parameters from ``trial``,
    runs 5-fold ``xgb.cv`` with early stopping (patience 10) on the module-level
    ``X_train`` / ``y``, and returns the ``test-rmsle-mean`` of the last row of
    the CV history (lower is better).

    Side effect: stores the number of rows in the CV history on the trial as the
    user attribute ``"cv_rounds"``. With early stopping the history ends at the
    best iteration, so this is the number of boosting rounds that actually
    produced the returned score; the sampled ``num_boost_round`` is only an
    upper bound.

    Args:
        trial: The ``optuna`` trial used to sample hyper-parameters.

    Returns:
        Mean test RMSLE across folds at the final (best) CV iteration.
    """
    # The suggest_* calls are kept in their original order (round cap first).
    cv_params = {
        "early_stopping_rounds": 10,
        "nfold": 5,
        "metrics": 'rmsle',
        "num_boost_round": trial.suggest_int('num_boost_round', 500, 2000),
    }
    model_params = {
        "eval_metric": 'rmsle',
        "eta": trial.suggest_float('eta', 0.01, 0.2),
        "max_depth": trial.suggest_int('max_depth', 2, 16),
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
        "n_jobs": 4,
        'tree_method': 'gpu_hist'
    }
    data_dmatrix = xgb.DMatrix(data=X_train, label=y)
    cv_history = xgb.cv(dtrain=data_dmatrix, params=model_params, **cv_params)
    # Keep the effective ensemble size; otherwise only the sampled cap is
    # retrievable from the study afterwards.
    trial.set_user_attr("cv_rounds", len(cv_history))
    return cv_history['test-rmsle-mean'].iloc[-1]

In [62]:
# Seed the sampler so the hyper-parameter search proposes the same trials on
# every run. TPESampler is what create_study uses by default for a
# single-objective study, so only the seeding changes.
SEED = 42
N_TRIALS = 69
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study.optimize(objective, n_trials=N_TRIALS)

[I 2023-08-01 11:24:28,514] A new study created in memory with name: no-name-c1debd25-2218-48f5-b1ac-c9ed2c939aa7
[I 2023-08-01 11:24:32,995] Trial 0 finished with value: 0.12854094421855572 and parameters: {'num_boost_round': 536, 'eta': 0.07052987336465426, 'max_depth': 9, 'lambda': 6.621830866103717, 'alpha': 9.67040590338482, 'colsample_bytree': 0.6, 'subsample': 1.0, 'min_child_weight': 24}. Best is trial 0 with value: 0.12854094421855572.
[I 2023-08-01 11:24:34,319] Trial 1 finished with value: 0.2051309092230536 and parameters: {'num_boost_round': 1423, 'eta': 0.1642450989426877, 'max_depth': 16, 'lambda': 4.827904231328236, 'alpha': 4.912328474993059, 'colsample_bytree': 0.7, 'subsample': 0.4, 'min_child_weight': 190}. Best is trial 0 with value: 0.12854094421855572.
[I 2023-08-01 11:24:39,016] Trial 2 finished with value: 0.1484022202791415 and parameters: {'num_boost_round': 535, 'eta': 0.07289881388137831, 'max_depth': 16, 'lambda': 9.001160018309543, 'alpha': 8.531722570686

In [63]:
# study.best_params holds the names sampled for the native xgb.cv API. Translate
# them into XGBRegressor constructor arguments instead of forwarding them as-is.
best_params = dict(study.best_params)
# The sampled round cap is an xgb.cv argument, not an estimator parameter.
max_rounds = best_params.pop("num_boost_round")
SKLEARN_NAMES = {"eta": "learning_rate", "lambda": "reg_lambda", "alpha": "reg_alpha"}
estimator_params = {SKLEARN_NAMES.get(name, name): value for name, value in best_params.items()}
# The CV score was measured with early stopping, so prefer an early-stopped round
# count stored on the best trial under "cv_rounds" when one exists; otherwise
# train the full sampled cap.
n_rounds = study.best_trial.user_attrs.get("cv_rounds", max_rounds)

model = xgb.XGBRegressor(
    **estimator_params,
    n_estimators=n_rounds,
    verbosity=0,
    tree_method="gpu_hist",
)
model.fit(X_train, y)

In [64]:
prediction = model.predict(X_test)
# Name the identifier column "Id", exactly as it is spelled in the test.csv
# column the ids were read from.
submission = pd.DataFrame({
    'Id': ids,
    'SalePrice': prediction,
})
submission.to_csv('Submissions/xgboost_optuna_fe_le_ohe.csv', index=False)
print('Successfully made a prediction!')

Successfully made a prediction!
