In [9]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np


df = pd.read_csv("./data20241116c.csv")

# Split the data
X = df.drop(columns=['demand','timestamp','Unnamed: 0'])
y = df['demand']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)
categorical_features = ['hour_of_day', 'day_of_week','day_of_month','month_of_year']
dtrain = xgb.DMatrix(X_train, label=y_train,enable_categorical=True)
dtest = xgb.DMatrix(X_test, label=y_test,enable_categorical=True)

In [20]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error


# Define parameter grid for RandomizedSearch
param_dist = {
    'max_depth': [3, 5, 7, 10],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300]
}

# Initialize XGBoost model
xgb_model = xgb.XGBRegressor(objective='reg:squarederror')

# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    n_iter=10,  # Number of parameter settings sampled
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=1,
    n_jobs=-1
)

# Run RandomizedSearchCV
random_search.fit(X_train, y_train)

# Display best parameters
print("Best Parameters:", random_search.best_params_)

# Evaluate on test set
best_model = random_search.best_estimator_
y_test_pred = best_model.predict(X_test)
test_rmse = mean_squared_error(y_test, y_test_pred, squared=False)
print(f"Test RMSE: {test_rmse:.2f}")


Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best Parameters: {'subsample': 0.6, 'n_estimators': 200, 'min_child_weight': 3, 'max_depth': 10, 'learning_rate': 0.1, 'colsample_bytree': 0.6}
Test RMSE: 1.51


In [22]:
from sklearn.model_selection import GridSearchCV

# Define a smaller parameter grid for GridSearchCV
param_grid = {
    'max_depth': [3, 5],
    'min_child_weight': [1, 5],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'learning_rate': [0.1, 0.2],
    'n_estimators': [100, 200]
}

# Set up GridSearchCV
xgb_model = xgb.XGBRegressor(objective='reg:squarederror')
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=1,
    n_jobs=-1
)

# Run GridSearchCV
grid_search.fit(X_train, y_train)

# Display best parameters
print("Best Parameters:", grid_search.best_params_)

# Evaluate on test set
best_model = grid_search.best_estimator_
y_test_pred = best_model.predict(X_test)
test_rmse = mean_squared_error(y_test, y_test_pred, squared=False)
print(f"Test RMSE: {test_rmse:.2f}")


Fitting 3 folds for each of 64 candidates, totalling 192 fits
Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 5, 'n_estimators': 100, 'subsample': 0.8}
Test RMSE: 1.47


In [23]:
import optuna

# Define the objective function
def objective(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
        'n_estimators': trial.suggest_int('n_estimators', 100, 300)
    }
    
    # Define the XGBoost model
    xgb_model = xgb.XGBRegressor(objective='reg:squarederror', **params)
    
    # Train and evaluate using cross-validation
    xgb_model.fit(X_train, y_train)
    y_pred = xgb_model.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    return rmse

# Set up and run the Optuna study
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=20)

# Display the best parameters and score
print("Best Parameters:", study.best_params)
print("Best RMSE:", study.best_value)


[I 2024-11-17 20:24:53,281] A new study created in memory with name: no-name-5371f5e9-a2ed-46d7-bb32-56a6c24099e5
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
[I 2024-11-17 20:24:53,368] Trial 0 finished with value: 1.6652731079460816 and parameters: {'max_depth': 3, 'min_child_weight': 6, 'subsample': 0.7736707893542558, 'colsample_bytree': 0.8163892149081823, 'learning_rate': 0.06080470563035991, 'n_estimators': 220}. Best is trial 0 with value: 1.6652731079460816.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
[I 2024-11-17 20:24:53,513] Trial 1 finished with value: 1.5208791990383654 and parameters: {'max_depth': 5, 'min_child_weight': 9, 'subsample': 0.672443135851356, 'colsample_bytree': 0.8149424850492875, 'learning_rate': 0.021325884379183632, 'n_estimators': 292}. Best is trial 1 with value: 1.5208791990383654.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
[I 2024-11-17 20:24:53,725] Trial 2 finis

Best Parameters: {'max_depth': 8, 'min_child_weight': 1, 'subsample': 0.9972621730657976, 'colsample_bytree': 0.6429678916412152, 'learning_rate': 0.03567325422386057, 'n_estimators': 194}
Best RMSE: 1.3422410983133028
