In [29]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import math
from xgboost import XGBRegressor
import optuna
from joblib import dump

In [3]:
df = pd.read_csv("../data/cleaned_data.csv")

In [7]:
X = df.drop(columns = 'price')
y = df['price']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=105)

In [14]:
def objective(trial):
    param = {
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0.01, 1.0),
        'subsample': trial.suggest_float('subsample', 0.01, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 1.0),
        'random_state': trial.suggest_int('random_state', 1, 1000)
    }
    model = XGBRegressor(**param)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return mean_squared_error(y_test, y_pred)


In [15]:
study = optuna.create_study(direction='minimize', study_name='regression')
study.optimize(objective, n_trials=100)

[I 2024-02-07 14:50:35,294] A new study created in memory with name: regression
[I 2024-02-07 14:50:39,742] Trial 0 finished with value: 26297096.697333615 and parameters: {'max_depth': 2, 'learning_rate': 0.8915509810069852, 'n_estimators': 891, 'min_child_weight': 5, 'gamma': 0.41444798749191897, 'subsample': 0.25488313007686436, 'colsample_bytree': 0.2543929441834099, 'reg_alpha': 0.8717651052201966, 'reg_lambda': 0.7670335682190615, 'random_state': 457}. Best is trial 0 with value: 26297096.697333615.
[I 2024-02-07 14:50:45,008] Trial 1 finished with value: 18927453.83868386 and parameters: {'max_depth': 10, 'learning_rate': 0.6657899455368863, 'n_estimators': 842, 'min_child_weight': 7, 'gamma': 0.8472203998649239, 'subsample': 0.9318926261574589, 'colsample_bytree': 0.6567930293836112, 'reg_alpha': 0.09953476276517552, 'reg_lambda': 0.20435834387646823, 'random_state': 6}. Best is trial 1 with value: 18927453.83868386.
[I 2024-02-07 14:50:48,967] Trial 2 finished with value: 1382

[I 2024-02-07 14:51:49,284] Trial 19 finished with value: 21994810.709575854 and parameters: {'max_depth': 3, 'learning_rate': 0.25940256634249603, 'n_estimators': 101, 'min_child_weight': 4, 'gamma': 0.04135378242254273, 'subsample': 0.8974314197598603, 'colsample_bytree': 0.6202189415319657, 'reg_alpha': 0.6172405304491866, 'reg_lambda': 0.15428553817850277, 'random_state': 150}. Best is trial 12 with value: 13050403.59709807.
[I 2024-02-07 14:51:55,784] Trial 20 finished with value: 24024860.296575632 and parameters: {'max_depth': 7, 'learning_rate': 0.8113167206489557, 'n_estimators': 996, 'min_child_weight': 10, 'gamma': 0.5898023842657636, 'subsample': 0.49067992718806436, 'colsample_bytree': 0.8958130629386687, 'reg_alpha': 0.7679715774548921, 'reg_lambda': 0.3997695281496109, 'random_state': 578}. Best is trial 12 with value: 13050403.59709807.
[I 2024-02-07 14:51:59,469] Trial 21 finished with value: 15587739.58686266 and parameters: {'max_depth': 5, 'learning_rate': 0.3678081

[I 2024-02-07 14:53:17,865] Trial 39 finished with value: 16057583.512650432 and parameters: {'max_depth': 4, 'learning_rate': 0.06376097516603177, 'n_estimators': 896, 'min_child_weight': 3, 'gamma': 0.8878263678414977, 'subsample': 0.9895572980293528, 'colsample_bytree': 0.84190512597994, 'reg_alpha': 0.040845912761340086, 'reg_lambda': 0.7796082222112435, 'random_state': 425}. Best is trial 12 with value: 13050403.59709807.
[I 2024-02-07 14:53:23,266] Trial 40 finished with value: 18830748.685407348 and parameters: {'max_depth': 2, 'learning_rate': 0.17786492348481078, 'n_estimators': 872, 'min_child_weight': 1, 'gamma': 0.4636947479679519, 'subsample': 0.3741797311846057, 'colsample_bytree': 0.6092738840775085, 'reg_alpha': 0.387745321769968, 'reg_lambda': 0.9995637888287835, 'random_state': 287}. Best is trial 12 with value: 13050403.59709807.
[I 2024-02-07 14:53:28,235] Trial 41 finished with value: 14389718.65697673 and parameters: {'max_depth': 7, 'learning_rate': 0.29448031811

[I 2024-02-07 14:54:51,604] Trial 59 finished with value: 16192099.213803079 and parameters: {'max_depth': 5, 'learning_rate': 0.46453454421555085, 'n_estimators': 783, 'min_child_weight': 2, 'gamma': 0.8133290477358465, 'subsample': 0.7667290497118378, 'colsample_bytree': 0.4569678515342848, 'reg_alpha': 0.7439261749968937, 'reg_lambda': 0.8004267474225322, 'random_state': 195}. Best is trial 12 with value: 13050403.59709807.
[I 2024-02-07 14:54:53,147] Trial 60 finished with value: 23753805.379853915 and parameters: {'max_depth': 3, 'learning_rate': 0.9551702082036051, 'n_estimators': 78, 'min_child_weight': 9, 'gamma': 0.5937517012795221, 'subsample': 0.9272826284665181, 'colsample_bytree': 0.8968009116106135, 'reg_alpha': 0.29338483184030095, 'reg_lambda': 0.7113156429647831, 'random_state': 344}. Best is trial 12 with value: 13050403.59709807.
[I 2024-02-07 14:54:58,180] Trial 61 finished with value: 13341917.564816225 and parameters: {'max_depth': 4, 'learning_rate': 0.2623199012

[I 2024-02-07 14:56:17,295] Trial 79 finished with value: 13604410.339857157 and parameters: {'max_depth': 5, 'learning_rate': 0.39606712575103753, 'n_estimators': 685, 'min_child_weight': 3, 'gamma': 0.47031127004075535, 'subsample': 0.810145819832245, 'colsample_bytree': 0.6382096122021768, 'reg_alpha': 0.6313888913351828, 'reg_lambda': 0.9745345726075259, 'random_state': 139}. Best is trial 73 with value: 12459655.914287673.
[I 2024-02-07 14:56:20,307] Trial 80 finished with value: 13772870.23162327 and parameters: {'max_depth': 6, 'learning_rate': 0.4775740146171852, 'n_estimators': 479, 'min_child_weight': 1, 'gamma': 0.5003029023891794, 'subsample': 0.6565601356840725, 'colsample_bytree': 0.6754781213550153, 'reg_alpha': 0.5768879302941892, 'reg_lambda': 0.9989570236411162, 'random_state': 242}. Best is trial 73 with value: 12459655.914287673.
[I 2024-02-07 14:56:23,892] Trial 81 finished with value: 15090347.398357822 and parameters: {'max_depth': 5, 'learning_rate': 0.409080826

[I 2024-02-07 14:57:44,541] Trial 98 finished with value: 16824009.083889972 and parameters: {'max_depth': 4, 'learning_rate': 0.5509449577876592, 'n_estimators': 708, 'min_child_weight': 3, 'gamma': 0.3122411702412798, 'subsample': 0.5941791544335011, 'colsample_bytree': 0.471023561220677, 'reg_alpha': 0.6356790650027843, 'reg_lambda': 0.8847699035344003, 'random_state': 89}. Best is trial 73 with value: 12459655.914287673.
[I 2024-02-07 14:57:48,346] Trial 99 finished with value: 15099042.196851837 and parameters: {'max_depth': 6, 'learning_rate': 0.4243943660368765, 'n_estimators': 544, 'min_child_weight': 1, 'gamma': 0.5615833644070768, 'subsample': 0.8437497344492211, 'colsample_bytree': 0.6809059264984624, 'reg_alpha': 0.6082386875032036, 'reg_lambda': 0.41383800103447105, 'random_state': 228}. Best is trial 73 with value: 12459655.914287673.


In [16]:
# Print the best parameters
print('Best parameters', study.best_params)

Best parameters {'max_depth': 4, 'learning_rate': 0.4023884054905842, 'n_estimators': 602, 'min_child_weight': 1, 'gamma': 0.6844674302270423, 'subsample': 0.8489996803685931, 'colsample_bytree': 0.6775783225413308, 'reg_alpha': 0.6421588199445127, 'reg_lambda': 0.8264437333231882, 'random_state': 523}


In [17]:
model = XGBRegressor(**study.best_params)

In [19]:
model.fit(X_train, y_train)

In [20]:
y_pred = model.predict(X_test)

In [21]:
y_pred

array([ 8821.459 ,  5719.6724, 32118.826 , ...,  9613.323 ,  2926.7153,
        2254.5186], dtype=float32)

In [22]:
mse = mean_squared_error(y_test, y_pred)

In [23]:
math.sqrt(mse)

3529.823779494902

In [30]:
model

In [25]:
dump(xgb_model, 'xgb_model.joblib')

AttributeError: 'XGBRegressor' object has no attribute 'dump_model'