In [0]:
# Read the persisted train/test splits from the sandbox schema and mark each
# one for caching as it is loaded (cache() is lazy and returns the same frame).
train_df = spark.table("marketingdata_prod.ds_sandbox.train_df_optuna_yw").cache()
test_df = spark.table("marketingdata_prod.ds_sandbox.test_df_optuna_yw").cache()

# The count() actions both report the split sizes and trigger the first
# evaluation of the cached frames.
print(train_df.count())
print(test_df.count())

In [0]:
from pyspark.sql import functions as F
from pyspark.sql import Window
from pyspark.sql.functions import collect_list, concat_ws, udf ,lit, col, when, split, size, lower, explode, dayofmonth
from pyspark.sql.types import *
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import col, when, lit, avg, stddev, month, year, unix_timestamp
#from pyspark.ml.regression import LinearRegression, RandomForestRegressor, GBTRegressor
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from xgboost.spark import SparkXGBRegressor
from xgboost.spark import SparkXGBClassifier
from pyspark.ml.feature import VectorAssembler, StandardScaler, StringIndexer, OneHotEncoder
from pyspark.sql import DataFrame
import optuna

In [0]:
import warnings
from xgboost import XGBRegressor

# Ignore warnings whose message begins with the text below (the `message`
# argument is treated as a regular expression matched against the start of
# the warning text), so repeated model fits do not flood the cell output.
warnings.filterwarnings("ignore", message="Loading a native XGBoost model with Scikit-Learn interface.")

In [0]:
# Load the assembled-feature table, mark it for caching, and expose the target
# column under the name "label" that the estimator and evaluator below expect.
df_optuna = (
    spark.table("marketingdata_prod.ds_sandbox.df_assembled_feature_optuna_yw")
    .cache()
    .withColumnRenamed("storedelivery%_label_col", "label")
)

In [0]:
# Draw a seeded 5% sample of the feature table (note: despite the "_10" in the
# variable name, the sampling fraction is 0.05) and split it 75/25.
# NOTE: this rebinds `train_df` / `test_df`, replacing the frames that were
# loaded from the sandbox tables at the top of the notebook.
chunk_df_10 = df_optuna.sample(fraction=0.05, seed=7)
train_df, test_df = chunk_df_10.randomSplit([0.75, 0.25], seed=7)

# Both splits are reused by every Optuna trial (one fit + one transform each)
# and by the count() cells below. Persist them so the sample/randomSplit
# lineage is evaluated once instead of being replayed for each action.
train_df.cache()
test_df.cache()

In [0]:
# Row count of the training split (runs a Spark action).
print(train_df.count())

384861


In [0]:
# Row count of the held-out split (runs a Spark action).
print(test_df.count())

128124


In [0]:
def objective(trial):
    """Score one hyperparameter configuration of a Spark XGBoost regressor.

    The regressor is trained on the notebook-level ``train_df`` and scored on
    the notebook-level ``test_df``; both frames must provide a ``features``
    column and a ``label`` column.

    Args:
        trial: Object exposing ``suggest_float`` / ``suggest_int``, used to
            draw ``learning_rate``, ``max_depth``, ``subsample``,
            ``colsample_bytree`` and ``min_child_weight``.

    Returns:
        The R² reported by ``RegressionEvaluator`` for the predictions on
        ``test_df``.
    """
    # Draw the tunable values first, in a fixed order, so the sequence of
    # suggest_* calls made on the trial stays the same from run to run.
    learning_rate = trial.suggest_float("learning_rate", 1e-3, 0.1, log=True)
    max_depth = trial.suggest_int("max_depth", 1, 10)
    subsample = trial.suggest_float("subsample", 0.05, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.05, 1.0)
    min_child_weight = trial.suggest_int("min_child_weight", 1, 20)

    regressor = SparkXGBRegressor(
        features_col="features",
        label_col="label",
        objective="reg:squarederror",
        n_estimators=1000,
        verbosity=0,
        learning_rate=learning_rate,
        max_depth=max_depth,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        min_child_weight=min_child_weight,
    )

    fitted = regressor.fit(train_df)
    scored = fitted.transform(test_df)

    # NOTE(review): the score being optimised is computed on `test_df`, so that
    # split acts as the tuning/validation set for the study.
    r2_evaluator = RegressionEvaluator(
        labelCol="label",
        predictionCol="prediction",
        metricName="r2",
    )
    return r2_evaluator.evaluate(scored)


# Run a 30-trial study that maximises the R² returned by `objective`.
# `create_study` is reached through the `optuna` module: the notebook only
# does `import optuna`, so the bare name `create_study` is undefined on a
# fresh kernel and would raise NameError.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)

# Report the best configuration found and its score.
print("Best hyperparameters:", study.best_params)
print("Best R²:", study.best_value)

[I 2024-08-09 10:33:00,295] A new study created in memory with name: no-name-f728efc8-18b9-4576-b6c1-fc5b9e97d261
[I 2024-08-09 10:35:34,591] Trial 0 finished with value: 0.39816917597214896 and parameters: {'learning_rate': 0.01448081390889342, 'max_depth': 7, 'subsample': 0.8021223078586418, 'colsample_bytree': 0.09448444520592443, 'min_child_weight': 6}. Best is trial 0 with value: 0.39816917597214896.
[I 2024-08-09 10:37:11,433] Trial 1 finished with value: 0.342033816453192 and parameters: {'learning_rate': 0.0012725128369047815, 'max_depth': 4, 'subsample': 0.5747884197112461, 'colsample_bytree': 0.9838884998623835, 'min_child_weight': 11}. Best is trial 0 with value: 0.39816917597214896.
[I 2024-08-09 10:39:52,019] Trial 2 finished with value: 0.3045530319258961 and parameters: {'learning_rate': 0.0014858279106874006, 'max_depth': 7, 'subsample': 0.7023845805831728, 'colsample_bytree': 0.20587044699771684, 'min_child_weight': 16}. Best is trial 0 with value: 0.39816917597214896.

Best hyperparameters: {'learning_rate': 0.008208194485535055, 'max_depth': 8, 'subsample': 0.8785733038553678, 'colsample_bytree': 0.7346658223239586, 'min_child_weight': 10}
Best R²: 0.41222038469325706


In [0]:
def objective(trial):
    """Evaluate one trial of the narrowed search with 2500 estimators.

    Trains a ``SparkXGBRegressor`` on the notebook-level ``train_df`` and
    scores its predictions on the notebook-level ``test_df``. Compared with a
    plain tree-parameter search, this space also tunes ``alpha``.

    Args:
        trial: Object exposing ``suggest_float`` / ``suggest_int``, used to
            draw ``learning_rate``, ``max_depth``, ``subsample``,
            ``colsample_bytree``, ``min_child_weight`` and ``alpha``.

    Returns:
        The R² reported by ``RegressionEvaluator`` for the predictions on
        ``test_df``.
    """
    # Settings that are held constant across every trial.
    fixed = {
        "objective": "reg:squarederror",
        "n_estimators": 2500,
        "verbosity": 0,
    }

    # Tunable settings, suggested in a fixed order.
    tuned = {}
    tuned["learning_rate"] = trial.suggest_float("learning_rate", 1e-3, 0.1, log=True)
    tuned["max_depth"] = trial.suggest_int("max_depth", 4, 7)
    tuned["subsample"] = trial.suggest_float("subsample", 0.2, 1.0)
    tuned["colsample_bytree"] = trial.suggest_float("colsample_bytree", 0.6, 1.0)
    tuned["min_child_weight"] = trial.suggest_int("min_child_weight", 5, 15)
    tuned["alpha"] = trial.suggest_float("alpha", 0.2, 0.7)

    estimator = SparkXGBRegressor(
        features_col="features",
        label_col="label",
        **fixed,
        **tuned,
    )
    scored = estimator.fit(train_df).transform(test_df)

    # NOTE(review): the score being optimised is computed on `test_df`, so that
    # split acts as the tuning/validation set for the study.
    return RegressionEvaluator(
        labelCol="label",
        predictionCol="prediction",
        metricName="r2",
    ).evaluate(scored)

# Run a 20-trial study that maximises the R² returned by `objective`; this
# rebinds `study`, so later cells see this study's trials.
# `create_study` is reached through the `optuna` module: the notebook only
# does `import optuna`, so the bare name `create_study` is undefined on a
# fresh kernel and would raise NameError.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)

# Report the best configuration found and its score.
print("Best hyperparameters:", study.best_params)
print("Best R²:", study.best_value)

[I 2024-08-09 14:02:10,965] A new study created in memory with name: no-name-ccb3bd0c-e237-4adc-8102-f5813e5b2223
[I 2024-08-09 14:06:44,268] Trial 0 finished with value: 0.4118753486510053 and parameters: {'learning_rate': 0.012872497546016786, 'max_depth': 6, 'subsample': 0.9391041691172688, 'colsample_bytree': 0.6141939459405702, 'min_child_weight': 15, 'alpha': 0.23468337509956055}. Best is trial 0 with value: 0.4118753486510053.
[I 2024-08-09 14:11:02,930] Trial 1 finished with value: 0.38755324887250997 and parameters: {'learning_rate': 0.07650705283870045, 'max_depth': 6, 'subsample': 0.4533522978308178, 'colsample_bytree': 0.8591121455956505, 'min_child_weight': 10, 'alpha': 0.589618908662295}. Best is trial 0 with value: 0.4118753486510053.
[I 2024-08-09 14:14:22,842] Trial 2 finished with value: 0.3921364440723213 and parameters: {'learning_rate': 0.09170188096934946, 'max_depth': 5, 'subsample': 0.33342411102548886, 'colsample_bytree': 0.7540328021603389, 'min_child_weight':

---

In [0]:
from optuna.visualization import plot_contour
from optuna.visualization import plot_edf
from optuna.visualization import plot_intermediate_values
from optuna.visualization import plot_optimization_history
from optuna.visualization import plot_parallel_coordinate
from optuna.visualization import plot_param_importances
from optuna.visualization import plot_rank
from optuna.visualization import plot_slice
from optuna.visualization import plot_timeline

In [0]:
# Parallel-coordinate view of the trials in whichever study `study` is
# currently bound to.
plot_parallel_coordinate(study)

In [0]:
# Hyperparameter importance plot for the trials recorded in `study`.
plot_param_importances(study)

Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

In [0]:
# Contour plot restricted to the "alpha" and "learning_rate" parameters.
# "alpha" is only suggested by the second objective (the 2500-estimator
# search), so `study` must be the study produced by that cell.
plot_contour(study, params=["alpha", "learning_rate"])

In [0]:
# Objective value (R²) per trial over the course of the study.
plot_optimization_history(study)