# Fine-tuning XGBoost hyperparameters with Optuna

In [0]:
from pyspark.sql import functions as F
from pyspark.sql import Window
from pyspark.sql.functions import collect_list, concat_ws, udf ,lit, col, when, split, size, lower, explode, dayofmonth
from pyspark.sql.types import *
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import col, when, lit, avg, stddev, month, year, unix_timestamp
#from pyspark.ml.regression import LinearRegression, RandomForestRegressor, GBTRegressor
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from xgboost.spark import SparkXGBRegressor as XGBRegressor
from xgboost.spark import SparkXGBClassifier as XGBClassifier
from pyspark.ml.feature import VectorAssembler, StandardScaler, StringIndexer, OneHotEncoder
from pyspark.sql import DataFrame
import optuna

In [0]:
import warnings

warnings.filterwarnings("ignore", message="Loading a native XGBoost model with Scikit-Learn interface.")

# 1. Minimize RMSE

In [0]:
def objective(trial):
    """Optuna objective: train one Spark XGBoost regressor and score it by RMSE.

    The trial supplies learning_rate, max_depth, subsample, colsample_bytree
    and min_child_weight; objective, n_estimators and verbosity are fixed.
    The estimator is fitted on the notebook-level ``train_df`` and scored on
    the notebook-level ``test_df``.

    Args:
        trial: the Optuna trial used to draw the hyperparameter values.

    Returns:
        The RMSE reported by ``RegressionEvaluator`` for the predictions on
        ``test_df``.
    """
    label = "storedelivery%_label_col"

    # Draw the tunable values first, in a fixed order, then build the model.
    learning_rate = trial.suggest_float("learning_rate", 1e-3, 0.1, log=True)
    max_depth = trial.suggest_int("max_depth", 1, 10)
    subsample = trial.suggest_float("subsample", 0.05, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.05, 1.0)
    min_child_weight = trial.suggest_int("min_child_weight", 1, 20)

    regressor = XGBRegressor(
        features_col="features",
        label_col=label,
        objective="reg:squarederror",
        n_estimators=1000,
        verbosity=0,
        learning_rate=learning_rate,
        max_depth=max_depth,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        min_child_weight=min_child_weight,
    )

    scored = regressor.fit(train_df).transform(test_df)

    return RegressionEvaluator(
        labelCol=label,
        predictionCol="prediction",
        metricName="rmse",
    ).evaluate(scored)


# Run the RMSE search. The sampler is seeded so the sequence of proposed
# hyperparameters is repeatable across "Restart & Run All"; TPE is the sampler
# Optuna uses by default, so only the seeding is new. The objective values
# themselves may still vary if model training is not deterministic.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=27),
)
study.optimize(objective, n_trials=30)

print("Best hyperparameters:", study.best_params)
print("Best RMSE:", study.best_value)

# 2. Maximize R²

In [0]:
def objective(trial):
    """Optuna objective: train one Spark XGBoost regressor and score it by R².

    NOTE: this redefines the ``objective`` used by the RMSE study; after this
    cell runs, any ``study.optimize(objective, ...)`` call uses this version.

    The trial supplies learning_rate, max_depth, subsample, colsample_bytree,
    min_child_weight and alpha; objective, n_estimators and verbosity are
    fixed. The estimator is fitted on the notebook-level ``train_df`` and
    scored on the notebook-level ``test_df``, using the ``label`` column.

    Args:
        trial: the Optuna trial used to draw the hyperparameter values.

    Returns:
        The R² reported by ``RegressionEvaluator`` for the predictions on
        ``test_df``.
    """
    params = {
        "objective": "reg:squarederror",
        "n_estimators": 2500,
        "verbosity": 0,
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "max_depth": trial.suggest_int("max_depth", 4, 7),
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 5, 15),
        "alpha": trial.suggest_float("alpha", 0.2, 0.7),
    }

    # The notebook imports the estimator as `XGBRegressor`
    # (`from xgboost.spark import SparkXGBRegressor as XGBRegressor`), so the
    # un-aliased name `SparkXGBRegressor` is not bound and raised NameError.
    xgb = XGBRegressor(features_col="features", label_col="label", **params)

    model = xgb.fit(train_df)
    predictions = model.transform(test_df)

    evaluator_r2 = RegressionEvaluator(
        labelCol="label",
        predictionCol="prediction",
        metricName="r2",
    )

    return evaluator_r2.evaluate(predictions)

# Run the R² search. `create_study` is only reachable through the `optuna`
# module (the notebook does `import optuna`), so the call must be qualified.
# The sampler is seeded so the proposed hyperparameters are repeatable across
# "Restart & Run All"; TPE is the sampler Optuna uses by default. The objective
# values themselves may still vary if model training is not deterministic.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=27),
)
study.optimize(objective, n_trials=20)

print("Best hyperparameters:", study.best_params)
print("Best R²:", study.best_value)

## Train and evaluate the model with the tuned hyperparameters

In [0]:
def xgb_evaluate(model, model_name, train_df, test_df, feature_columns, params=None, label_col="label"):
    """Fit an assemble -> scale -> model pipeline and print RMSE, R² and feature importances.

    The pipeline assembles ``feature_columns`` into ``features`` and scales
    them into ``scaledFeatures``. Which of the two the model trains on is
    decided by the model's own features-column setting, not by this function.

    ``xgb_feature_importance`` must already be defined in the kernel when this
    function is called.

    Args:
        model: the estimator placed as the last pipeline stage.
        model_name: label used in the printed report.
        train_df: DataFrame the pipeline is fitted on.
        test_df: DataFrame the fitted pipeline is evaluated on.
        feature_columns: input column names passed to the VectorAssembler.
        params: optional dict of estimator parameters applied to ``model``
            before fitting.
        label_col: name of the ground-truth column used by the evaluator.

    Returns:
        None. Results are printed.
    """
    assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
    scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")

    if params:
        # Apply all overrides in one call via setParams(**kwargs), the same
        # API the notebook uses when configuring the estimator directly; the
        # previous per-key `setParam` call is not an estimator method.
        model.setParams(**params)

    pipeline = Pipeline(stages=[assembler, scaler, model])
    pipeline_model = pipeline.fit(train_df)
    predictions = pipeline_model.transform(test_df)

    evaluator = RegressionEvaluator(
        labelCol=label_col,
        predictionCol="prediction",
        metricName="rmse",
    )

    rmse = evaluator.evaluate(predictions)
    # Same evaluator, metric overridden for this one call.
    r2 = evaluator.evaluate(predictions, {evaluator.metricName: "r2"})

    print(f"{model_name} - RMSE: {rmse}, R2: {r2}")
    xgb_feature_importance(pipeline_model, model_name, feature_columns)

In [0]:
# Hyperparameters for the final model (presumably the best values found by
# the R² study above; confirm against its output).
# Keys use the Python `xgboost.spark` names, i.e. the same snake_case names the
# Optuna objectives pass. The earlier camelCase keys ("learningRate",
# "maxDepth", "numRound", ...) are the JVM xgboost4j-spark spellings, which the
# Python estimator does not recognise as its own parameters, so the tuned
# values would not have been applied.
xgb_params = {
    "learning_rate": 0.0043,
    "max_depth": 6,
    "subsample": 0.7372,
    "colsample_bytree": 0.785,
    "min_child_weight": 6,
    "alpha": 0.5725,
    "objective": "reg:squarederror",
    "n_estimators": 2500,
}

exclude_columns = ["label"]

# Remove a previously assembled vector column so the pipeline's assembler can
# write its own "features" output.
if "features" in df_xgb.columns:
    df_xgb = df_xgb.drop("features")

feature_columns = get_all_column_names(df_xgb, exclude_columns)
train_df, test_df = df_xgb.randomSplit([0.7, 0.3], seed=27)

xgb_model = XGBRegressor(label_col="label")
xgb_model.setParams(**xgb_params)

xgb_evaluate(xgb_model, "XGBoost", train_df, test_df, feature_columns)

# Visualisation
See the [Optuna visualization tutorial](https://optuna.readthedocs.io/en/stable/tutorial/10_key_features/005_visualization.html) for the available plots.

In [0]:
# NOTE(review): `import torch.nn.functional as F` below rebinds the name `F`,
# which the notebook's first import cell binds to `pyspark.sql.functions`.
# After this cell runs, any `F.<spark function>` call resolves to torch
# instead. The torch / torchvision imports do not appear to be used by the
# cells shown in this notebook — confirm before relying on either binding.
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision


import optuna

# You can use Matplotlib instead of Plotly for visualization by simply replacing `optuna.visualization` with
# `optuna.visualization.matplotlib` in the following examples.
from optuna.visualization import plot_contour
from optuna.visualization import plot_edf
from optuna.visualization import plot_intermediate_values
from optuna.visualization import plot_optimization_history
from optuna.visualization import plot_parallel_coordinate
from optuna.visualization import plot_param_importances
from optuna.visualization import plot_rank
from optuna.visualization import plot_slice
from optuna.visualization import plot_timeline

## Baseline: XGBoost without hyperparameter tuning

In [0]:
def xgb_feature_importance(model, model_name, feature_columns):
    """Print the booster's "weight" importance for each feature, highest first.

    The booster is taken from the last stage of ``model`` and asked for
    ``importance_type="weight"`` scores. Scores are looked up under the keys
    ``f0``, ``f1``, ... by position in ``feature_columns``; a feature whose
    key is missing from the score dict is reported as 0.0.

    Args:
        model: fitted pipeline model whose last stage exposes ``get_booster()``.
        model_name: label used in the printed header.
        feature_columns: feature names, in the order matching ``f0..fN``.

    Returns:
        None. The ranking is printed, followed by a blank line.
    """
    scores = model.stages[-1].get_booster().get_score(importance_type="weight")

    # Pair every feature name with its score, then rank in descending order.
    ranked = sorted(
        (
            (name, scores.get(f"f{position}", 0.0))
            for position, name in enumerate(feature_columns)
        ),
        key=lambda pair: pair[1],
        reverse=True,
    )

    print(f"Feature Importances for {model_name}:")
    for name, weight in ranked:
        print(f"{name}: {weight}")
    print()

def xgb_evaluate(model, model_name, train_df, test_df, feature_columns, params=None,
                 label_col="storedelivery%_label_col"):
    """Fit an assemble -> scale -> model pipeline and print RMSE, R² and feature importances.

    NOTE: this definition replaces any earlier ``xgb_evaluate`` in the kernel.
    ``params`` is accepted (optional) so calls written against a variant that
    takes a params dict keep working after this cell has run, and
    ``label_col`` defaults to the column this version has always evaluated
    against.

    The pipeline assembles ``feature_columns`` into ``features`` and scales
    them into ``scaledFeatures``. Which of the two the model trains on is
    decided by the model's own features-column setting.

    Args:
        model: the estimator placed as the last pipeline stage.
        model_name: label used in the printed report.
        train_df: DataFrame the pipeline is fitted on.
        test_df: DataFrame the fitted pipeline is evaluated on.
        feature_columns: input column names passed to the VectorAssembler.
        params: optional dict of estimator parameters applied to ``model``
            through ``setParams(**params)`` before fitting.
        label_col: name of the ground-truth column used by the evaluator.

    Returns:
        None. Results are printed.
    """
    if params:
        model.setParams(**params)

    assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
    scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")
    pipeline = Pipeline(stages=[assembler, scaler, model])
    pipeline_model = pipeline.fit(train_df)
    predictions = pipeline_model.transform(test_df)

    evaluator = RegressionEvaluator(
        labelCol=label_col, predictionCol="prediction", metricName="rmse"
    )
    rmse = evaluator.evaluate(predictions)
    # Same evaluator, metric overridden for this one call.
    r2 = evaluator.evaluate(predictions, {evaluator.metricName: "r2"})

    print(f"{model_name} - RMSE: {rmse}, R2: {r2}")
    xgb_feature_importance(pipeline_model, model_name, feature_columns)

In [0]:
# Check the same DataFrame that is modified: the guard previously inspected
# `df` while dropping from `df_3`.
if "features" in df_3.columns:
    df_3 = df_3.drop("features")

# NOTE(review): `exclude_columns` comes from an earlier cell, where it is
# ["label"]. The label used here is "storedelivery%_label_col"; confirm that
# get_all_column_names does not return it as a feature (label leakage).
feature_columns = get_all_column_names(df_3, exclude_columns)
train_df, test_df = df_3.randomSplit([0.7, 0.3], seed=27)

# The estimator is imported under the alias `XGBRegressor`; the un-aliased
# name `SparkXGBRegressor` is not bound in this notebook.
xgb_4 = XGBRegressor(features_col="scaledFeatures", label_col="storedelivery%_label_col")
xgb_evaluate(xgb_4, "XGBoost", train_df, test_df, feature_columns)