In [1]:
pip install xgboost scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [6]:

# Import necessary libraries
import xgboost as xgb
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Load the California housing dataset
california_housing = fetch_california_housing()
X = california_housing.data
y = california_housing.target

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the XGBoost regressor model with all hyperparameters
xg_reg = xgb.XGBRegressor(
    objective='reg:squarederror',  # Regression with squared loss
    colsample_bytree=0.7,          # Subsample ratio of columns when constructing each tree
    learning_rate=0.1,             # Step size shrinkage to prevent overfitting
    max_depth=5,                   # Maximum depth of a tree
    alpha=10,                      # L1 regularization term on weights
    subsample=0.8,                 # Subsample ratio of the training instance
    gamma=0.1,                     # Minimum loss reduction required to make a further partition on a leaf node
    min_child_weight=5,            # Minimum sum of instance weight (hessian) needed in a child
    n_estimators=100,              # Number of boosting rounds
    booster='gbtree',              # Specify which booster to use: gbtree, gblinear, or dart
    tree_method='hist',device="cuda",            # Tree construction algorithm used in XGBoost
    n_jobs=-1,                     # Number of parallel threads used to run XGBoost
    random_state=42                # Random number seed
)

# Train the model on the training data
xg_reg.fit(X_train, y_train)

# Make predictions on the test data
y_pred = xg_reg.predict(X_test)

# Calculate the mean squared error of the predictions
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse:.2f}")


Mean Squared Error: 0.25


# With Grid Search cv


In [1]:
import xgboost as xgb
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
import time

# Load the California housing dataset
california_housing = fetch_california_housing()
X = california_housing.data
y = california_housing.target

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the XGBoost regressor model
xg_reg = xgb.XGBRegressor(objective='reg:squarederror', tree_method='hist', device='cuda', n_jobs=-1, random_state=42)  # Regression with squared loss

# Define a smaller hyperparameter grid
param_grid = {
    'colsample_bytree': [0.3, 0.7],  # Subsample ratio of columns when constructing each tree
    'learning_rate': [0.1, 0.2],  # Step size shrinkage to prevent overfitting
    'max_depth': [3, 5],  # Maximum depth of a tree
    'alpha': [1, 10],  # L1 regularization term on weights
    'lambda': [1, 10],  # L2 regularization term on weights
    'subsample': [0.8, 1.0],  # Subsample ratio of the training instance
    'gamma': [0, 0.1],  # Minimum loss reduction required to make a further partition on a leaf node
    'min_child_weight': [1, 5],  # Minimum sum of instance weight (hessian) needed in a child
    'n_estimators': [100, 200],  # Number of boosting rounds
    'booster': ['gbtree'],  # Specify which booster to use: gbtree, gblinear, or dart
}

# Calculate the total number of iterations
total_iterations = np.prod([len(param_grid[key]) for key in param_grid.keys()]) * 3  # Number of parameter combinations * 3 (cv folds)

# Setup the GridSearchCV
grid_search = GridSearchCV(estimator=xg_reg, param_grid=param_grid,
                           scoring='neg_mean_squared_error', cv=2, verbose=1, n_jobs=-1)

# Track the start time
start_time = time.time()

# Fit the model using grid search
grid_search.fit(X_train, y_train)

# Calculate the total time taken
total_time = time.time() - start_time
average_time_per_iteration = total_time / total_iterations

# Print the time estimation information
print(f"Total time taken: {total_time:.2f} seconds")
print(f"Average time per iteration: {average_time_per_iteration:.2f} seconds")
print(f"Estimated remaining time per iteration during execution.")

# Get the best parameters from the grid search
best_params = grid_search.best_params_
print(f"Best parameters found: {best_params}")

# Make predictions on the test data using the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Calculate the mean squared error of the predictions
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse:.2f}")


Fitting 2 folds for each of 512 candidates, totalling 1024 fits
Total time taken: 386.36 seconds
Average time per iteration: 0.25 seconds
Estimated remaining time per iteration during execution.
Best parameters found: {'alpha': 1, 'booster': 'gbtree', 'colsample_bytree': 0.7, 'gamma': 0, 'lambda': 10, 'learning_rate': 0.2, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 1.0}
Mean Squared Error: 0.20


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




# using hyperopt

In [2]:
pip install hyperopt

Collecting hyperopt
  Downloading hyperopt-0.2.7-py2.py3-none-any.whl (1.6 MB)
     ---------------------------------------- 1.6/1.6 MB 812.2 kB/s eta 0:00:00
Collecting py4j
  Downloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
     -------------------------------------- 200.5/200.5 kB 6.1 MB/s eta 0:00:00
Installing collected packages: py4j, hyperopt
Successfully installed hyperopt-0.2.7 py4j-0.10.9.7
Note: you may need to restart the kernel to use updated packages.


In [14]:
import xgboost as xgb
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import time

# Load the California housing dataset
california_housing = fetch_california_housing()
X = california_housing.data
y = california_housing.target

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the objective function for hyperopt
def objective(params):
    # Set the parameters that are not optimized
    params['objective'] = 'reg:squarederror'
    params['tree_method'] = 'hist'
    params['device'] = 'cuda'
    params['n_jobs'] = -1
    params['random_state'] = 42

    # Create and train the model
    model = xgb.XGBRegressor(**params)
    model.fit(X_train, y_train)

    # Make predictions and calculate the mean squared error
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)

    # Return the loss (MSE) and status
    return {'loss': mse, 'status': STATUS_OK}

# Define the hyperparameter space
param_space = {
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 0.7),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
    'max_depth': hp.choice('max_depth', [3, 5, 7]),
    'alpha': hp.choice('alpha', [1, 10, 100]),
    'lambda': hp.choice('lambda', [1, 10, 100]),
    'subsample': hp.uniform('subsample', 0.8, 1.0),
    'gamma': hp.uniform('gamma', 0, 0.2),
    'min_child_weight': hp.choice('min_child_weight', [1, 5, 10]),
    'n_estimators': hp.choice('n_estimators', [50, 100, 200]),
    'booster': hp.choice('booster', ['gbtree', 'gblinear', 'dart'])
}

# Run the hyperparameter optimization using Hyperopt
trials = Trials()
start_time = time.time()
best_params = fmin(fn=objective, space=param_space, algo=tpe.suggest, max_evals=100, trials=trials)
end_time = time.time()

# Print the best parameters and the total time taken
print(f"Best parameters: {best_params}")
print(f"Total time taken: {end_time - start_time:.2f} seconds")

# Convert hyperopt results to the corresponding values
best_params['max_depth'] = [3, 5, 7][best_params['max_depth']]
best_params['alpha'] = [1, 10, 100][best_params['alpha']]
best_params['lambda'] = [1, 10, 100][best_params['lambda']]
best_params['min_child_weight'] = [1, 5, 10][best_params['min_child_weight']]
best_params['n_estimators'] = [50, 100, 200][best_params['n_estimators']]
best_params['booster'] = ['gbtree', 'gblinear', 'dart'][best_params['booster']]

# Train the model with the best hyperparameters
final_model = xgb.XGBRegressor(objective='reg:squarederror', tree_method='hist', device='cuda', n_jobs=-1, random_state=42, **best_params)
final_model.fit(X_train, y_train)

# Make predictions and calculate the mean squared error on the test set
y_pred = final_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error with best parameters: {mse:.2f}")


  9%|▉         | 9/100 [00:06<00:37,  2.45trial/s, best loss: 0.2099090195688917]

Parameters: { "colsample_bytree", "gamma", "max_depth", "min_child_weight", "subsample", "tree_method" } are not used.




 14%|█▍        | 14/100 [00:07<00:19,  4.53trial/s, best loss: 0.20709528372344008]

Parameters: { "colsample_bytree", "gamma", "max_depth", "min_child_weight", "subsample", "tree_method" } are not used.




 20%|██        | 20/100 [00:15<01:45,  1.32s/trial, best loss: 0.20709528372344008]

Parameters: { "colsample_bytree", "gamma", "max_depth", "min_child_weight", "subsample", "tree_method" } are not used.




 21%|██        | 21/100 [00:21<03:23,  2.57s/trial, best loss: 0.1956644517966557] 

Parameters: { "colsample_bytree", "gamma", "max_depth", "min_child_weight", "subsample", "tree_method" } are not used.




 33%|███▎      | 33/100 [01:15<04:35,  4.12s/trial, best loss: 0.19527053312735057]

Parameters: { "colsample_bytree", "gamma", "max_depth", "min_child_weight", "subsample", "tree_method" } are not used.




 37%|███▋      | 37/100 [01:33<04:16,  4.08s/trial, best loss: 0.19527053312735057]

Parameters: { "colsample_bytree", "gamma", "max_depth", "min_child_weight", "subsample", "tree_method" } are not used.




 41%|████      | 41/100 [01:46<03:41,  3.75s/trial, best loss: 0.19527053312735057]

Parameters: { "colsample_bytree", "gamma", "max_depth", "min_child_weight", "subsample", "tree_method" } are not used.




 46%|████▌     | 46/100 [01:54<01:37,  1.80s/trial, best loss: 0.19527053312735057]

Parameters: { "colsample_bytree", "gamma", "max_depth", "min_child_weight", "subsample", "tree_method" } are not used.




 51%|█████     | 51/100 [01:57<00:34,  1.43trial/s, best loss: 0.19527053312735057]

Parameters: { "colsample_bytree", "gamma", "max_depth", "min_child_weight", "subsample", "tree_method" } are not used.




 55%|█████▌    | 55/100 [02:09<01:40,  2.23s/trial, best loss: 0.19527053312735057]

Parameters: { "colsample_bytree", "gamma", "max_depth", "min_child_weight", "subsample", "tree_method" } are not used.




 61%|██████    | 61/100 [02:19<00:58,  1.51s/trial, best loss: 0.19527053312735057]

Parameters: { "colsample_bytree", "gamma", "max_depth", "min_child_weight", "subsample", "tree_method" } are not used.




 77%|███████▋  | 77/100 [03:34<01:33,  4.05s/trial, best loss: 0.19409336216737874]

Parameters: { "colsample_bytree", "gamma", "max_depth", "min_child_weight", "subsample", "tree_method" } are not used.




 80%|████████  | 80/100 [03:46<01:11,  3.56s/trial, best loss: 0.19409336216737874]

Parameters: { "colsample_bytree", "gamma", "max_depth", "min_child_weight", "subsample", "tree_method" } are not used.




 86%|████████▌ | 86/100 [04:05<00:59,  4.25s/trial, best loss: 0.19409336216737874]

Parameters: { "colsample_bytree", "gamma", "max_depth", "min_child_weight", "subsample", "tree_method" } are not used.




 92%|█████████▏| 92/100 [04:17<00:17,  2.16s/trial, best loss: 0.19409336216737874]

Parameters: { "colsample_bytree", "gamma", "max_depth", "min_child_weight", "subsample", "tree_method" } are not used.




 98%|█████████▊| 98/100 [04:36<00:04,  2.46s/trial, best loss: 0.19409336216737874]

Parameters: { "colsample_bytree", "gamma", "max_depth", "min_child_weight", "subsample", "tree_method" } are not used.




100%|██████████| 100/100 [04:41<00:00,  2.82s/trial, best loss: 0.19409336216737874]
Best parameters: {'alpha': 0, 'booster': 2, 'colsample_bytree': 0.6778712864572316, 'gamma': 0.14787863172474586, 'lambda': 0, 'learning_rate': 0.11614975739858949, 'max_depth': 2, 'min_child_weight': 1, 'n_estimators': 2, 'subsample': 0.8045746149447889}
Total time taken: 281.52 seconds
Mean Squared Error with best parameters: 0.19
