In [1]:
# Xgboost implementation of Extreme Gradient Boosted trees model tuned with Tree Parzen Estimator algorithm
# This approach can be used against other models such as tensorflow nets built with keras

In [2]:
import numpy as np
import pandas as pd
import xgboost as xgb
import mlflow
from mlflow.models.signature import infer_signature
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from hyperopt import fmin, tpe, hp
from hyperopt.pyll import scope

In [3]:
"""hyperparameter search spaces"""
params = {
  'max_depth': scope.int(hp.quniform('max_depth', 4, 27, 1)),
  'learning_rate': hp.loguniform('learning_rate', -3, 0),
  'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
  'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
  'min_split_loss': hp.loguniform('min_split_loss', -6, 0),
  'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
  'objective': 'binary:logistic',
    'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05),
    'colsample_bylevel': hp.quniform('colsample_bylevel', 0.5, 1, 0.05),
    'colsample_bynode': hp.quniform('colsample_bynode', 0.5, 1, 0.05),
    'subsample': hp.quniform('subsample', 0.4, 1, 0.05),
    'grow_policy': hp.choice('grow_policy', ['depthwise', 'lossguide']),
    'tree_method': 'gpu_hist',
  'seed': 123
}

In [4]:
"""Load training data using pandas - for purposes of dev this uses subset of training data"""
"""The data loaded would contain engineered features where nan values were handled"""
def load_data():
    data = pd.read_csv("datasets/avazu-ctr-prediction/train.csv", nrows=100)
    x_data = data["id"]
    y_labels = data["click"]
    x_train, x_val, y_train, y_val = train_test_split(x_data, y_labels, test_size=.3)  # split train/val 70/30
    return x_train, x_val, y_train, y_val
    
def xgb_matrices(x_train, x_val, y_train, y_val):
    train = xgb.DMatrix(data=x_train, label=y_train)
    val = xgb.DMatrix(data=X_val, label=y_val)
    return train, val

In [5]:
"""Train xgboost model - hyperopt passes different param confiurations to this function and mlflow logs results"""
def train_model(params, data):
    train, val = data
    mlflow.xgboost.autolog()
    with mlflow.start_run(nested=True, experiment_id=1):
        mod = xgb.train(params=params, dtrain=train, num_boost_round=1000,
                        early_stopping_rounds=20,
                        verbose_eval=True, evals=[(val, 'val')])

In [6]:
"""Start mlflow experiment and save logloss & AUC performance metrics to tracking server"""
x_train, x_val, y_train, y_val = load_data()
train, val = xgb_matrices(x_train, x_val, y_train, y_val)
fmin_train_model = partial(train_model, data=[train, val])
with mlflow.start_run(run_name='xgboost_models', experiment_id=1):
    best_params = fmin(
        fn=fmin_train_model,
        space=params,
        algo=tpe.suggest,
        max_evals=96,
        rstate=np.random.RandomState(123)
    )
      

ValueError: ('Expecting 2 dimensional numpy.ndarray, got: ', (70,))

In [None]:
"""TODO take winning model and log performance metrics against test dataset"""b

In [None]:
"""Results of trained models can be seen on the mlflow tracking server 127.0.0.1:5000"""