In [1]:
### A great library, but unfortunately only compatible with Linux.
# import ydf

import xgboost as xgb
import pandas as pd
import optuna
import optunahub
import seaborn as sns
from sklearn.model_selection import train_test_split # To divide data into training and validation sets
from sklearn.metrics import mean_squared_error # To evaluate the model
from sklearn.model_selection import KFold # For dividing data into training and validation sets using the Kfold method
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_ds_full = pd.read_csv("../future_engenerring/recorded_analyzed_data/Normalne_dane_treningowe.csv")

test_ds = pd.read_csv("../future_engenerring/recorded_analyzed_data/Test_ZimputowaneDane_MetodaLGB_zakodowaneWartościTekstowe.csv")

# Manually filter columns by type because for some reason the search function is not working properly.
object_cols = []

for col in train_ds_full:
    if train_ds_full[col].dtype.name != 'float64' and train_ds_full[col].dtype.name != 'int64':
        object_cols.append(col)

# We need to convert columns containing categorical values.
# This is a necessary step in order to automate the encoding of these values, which are used in the XGBoost function.
for col in object_cols:
    test_ds[col] = test_ds[col].astype('category')
    train_ds_full[col] = train_ds_full[col].astype('category')

train_ds_full = train_ds_full.drop(columns=['Unnamed: 0'])
test_ds = test_ds.drop(columns=['Unnamed: 0'])

# The MSSubClass column contains numeric values but is described in the document as a categorical attribute, so we need to change its data type.
train_ds_full['MSSubClass'] = train_ds_full['MSSubClass'].astype('category')
test_ds['MSSubClass'] = test_ds['MSSubClass'].astype('category')

object_cols.clear()

for col in test_ds:
    if test_ds[col].dtype.name == 'object':
        object_cols.append(col)

print(object_cols)

for col in object_cols:
    print(col)
    test_ds[col] = pd.to_numeric(test_ds[col], errors='coerce').astype('float64')

print(train_ds_full.shape)
print(test_ds.shape)

['BsmtFullBath', 'BsmtHalfBath', 'GarageCars', 'GarageArea', 'count_of_porch', 'sum_of_area_porch']
BsmtFullBath
BsmtHalfBath
GarageCars
GarageArea
count_of_porch
sum_of_area_porch
(1445, 65)
(1459, 64)


# Using K-Fold instead of StratifiedKFold
I used the standard k-fold splitting method instead of stratified k-fold for two reasons:
1. StratifiedKFold can only be used on classification data, not regression data.
2. Direct use of StratifiedKFold was not possible because the model solves a regression problem (predicting a continuous price) rather than a classification problem. Artificially dividing prices into intervals (binning) to enforce stratification would introduce unnecessary complexity, so standard KFold with random shuffling was chosen.

# Data breakdown for optiuna
The data must be divided so that Optiuna can compare its analyses.

# K-Fold data brakedown
I used a single division using the K-Fold Cross-Validation method (5 divisions). This ensures that each record in the dataset is used exactly once as a test set and four times as a training set. This eliminates the problem of ‘unlucky sampling’, where difficult cases would be omitted from the learning or validation process.

In [3]:
# K-fold splitting
kf = KFold(n_splits=5, shuffle=True)

# Things you need to do for Optiuna to work
1. You need to define the “objective” function with the “trial” argument.
2. Create a dictionary of hyperparameters and use the “trial” function to set the ranges of numbers that will be inserted into these hyperparameters.
3. This function must return a metric according to which the hyperparameters will be changed.

In [4]:
def objective(trial):
    # Suggesting hyperparameter values for Optuna to test
    params_to_learn = {
        # "max_depth": trial.suggest_int("max_depth", 4, 13),
        "n_estimators": trial.suggest_int("n_estimators", 300, 900),
        "max_leaves" : trial.suggest_int("max_leaves", 15, 100),
        "max_bin": trial.suggest_int("max_bin", 10, 500),
        "learning_rate": trial.suggest_float("learning_rate", 0.1, 0.5, log=True),
        "booster": trial.suggest_categorical("booster", ["gbtree", "dart"]),
        "reg_alpha": trial.suggest_float("reg_alpha", 1, 5.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1, 5.0, log=True),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10)
    }

    model = xgb.XGBRegressor(
        enable_categorical = True,
        **params_to_learn
    )

    table_of_mean_kfold_predictions = []
    
    for i, (train_index, validation_index) in enumerate(kf.split(train_ds_full)):

        train_ds_subset = train_ds_full.iloc[train_index]
        # This value is required by the XGBoost regression function to identify the target variable.
        train_ds_subset_saleprice = train_ds_subset["SalePrice"]

        validation_ds_subset = train_ds_full.iloc[validation_index]
        validation_ds_subset_saleprice = validation_ds_subset["SalePrice"]

        # Training the model.
        model.fit(train_ds_subset, train_ds_subset_saleprice)

        # Predicts values on the validation set
        predictions = model.predict(validation_ds_subset)
        
        # Calculates the error (RMSE - Root Mean Squared Error) that we want to minimize.
        rmse = np.sqrt(mean_squared_error(validation_ds_subset_saleprice, predictions))
        
        table_of_mean_kfold_predictions.append(rmse)
        print(np.mean(table_of_mean_kfold_predictions))

    return np.mean(table_of_mean_kfold_predictions)

In [5]:
# We want to minimize the error, so direction='minimize'.
study = optuna.create_study(storage="sqlite:///db.sqlite3",
                            study_name="home-price-prediction2",
                            direction='minimize')
study.optimize(objective, n_trials=30, show_progress_bar=True)

print("--------------------------------------")
print("Optimization completed.")
print(f"Najlepsza próba: {study.best_trial.number}")
print(f"Najlepszy wynik (RMSE): {study.best_value}")
print("Best hyperparameters:")
for key, value in study.best_params.items():
    print(f"  {key}: {value}")

# Train the final model with the best parameters found on the FULL training set.
print("--------------------------------------")
print("Training the final model with the best parameters...")
final_model = xgb.XGBRegressor(
    enable_categorical = True,
    **study.best_params
)

final_model = final_model.fit(train_ds_full.drop(columns=['SalePrice']), train_ds_full['SalePrice'])

[I 2026-01-12 14:27:02,258] A new study created in RDB with name: home-price-prediction
  0%|                                                                                                                                            | 0/30 [00:39<?, ?it/s]


[W 2026-01-12 14:27:41,825] Trial 0 failed with parameters: {'n_estimators': 538, 'max_leaves': 77, 'max_bin': 186, 'learning_rate': 0.12727204765027955, 'booster': 'dart', 'reg_alpha': 1.4654238245894475, 'reg_lambda': 1.8572699677181645, 'gamma': 2.586658197952358, 'min_child_weight': 9} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/home/demo/.local/share/pipx/venvs/jupyterlab/lib/python3.13/site-packages/optuna/study/_optimize.py", line 205, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_23733/2990506710.py", line 33, in objective
    model.fit(train_ds_subset, train_ds_subset_saleprice)
    ~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/demo/.local/share/pipx/venvs/jupyterlab/lib/python3.13/site-packages/xgboost/core.py", line 774, in inner_f
    return func(**kwargs)
  File "/home/demo/.local/share/pipx/venvs/jupyterlab/lib/python3.13/site-packages/xgboost/sklearn.py", line 1370, in fit

KeyboardInterrupt: 

In [None]:
# Plot feature importance
xgb.plot_importance(final_model, max_num_features=10)

In [None]:
# Perform predictions on the test set and save the results.
print("Generating forecasts on the test set...")
prognozy = final_model.predict(test_ds)
print(prognozy)
prognozy = np.expm1(prognozy)

lista = []
for i in prognozy:
    lista.append(i)

lista_ids = []
for i in range(1461,2920):
    lista_ids.append(i)

df = pd.DataFrame({'Id': lista_ids, 'SalePrice': lista})
df.to_csv('../data_returned_by_model/out_optuna.csv', index=False)

print("The results were saved to the file out_optuna.csv")

In [None]:
# Terminal command for Optiuna to read the completed hyperparameter analysis. You must specify the main path to the file.
# optuna-dashboard sqlite:///"C:\Users\Joint\Documents\Moje projekty\Przewidywanie cen domów\db.sqlite3"