In [None]:
# Xgboost implementation of Extreme Gradient Boosted trees model tuned with Tree Parzen Estimator algorithm
# This approach can be used against other models such as tensorflow nets built with keras

In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
import mlflow
from mlflow.models.signature import infer_signature
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from hyperopt import fmin, tpe, hp, STATUS_OK
from hyperopt.pyll import scope
from functools import partial

In [2]:
"""hyperparameter search spaces"""
params = {
  'max_depth': scope.int(hp.quniform('max_depth', 4, 27, 1)),
  'learning_rate': hp.loguniform('learning_rate', -3, 0),
  'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
  'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
  'min_split_loss': hp.loguniform('min_split_loss', -6, 0),
  'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
  'objective': 'binary:logistic',
    'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05),
    'colsample_bylevel': hp.quniform('colsample_bylevel', 0.5, 1, 0.05),
    'colsample_bynode': hp.quniform('colsample_bynode', 0.5, 1, 0.05),
    'subsample': hp.quniform('subsample', 0.4, 1, 0.05),
    'grow_policy': hp.choice('grow_policy', ['depthwise', 'lossguide']),
    'tree_method': 'hist',
  'seed': 123
}

In [3]:
"""Load training data using pandas - for purposes of dev this uses subset of training data"""
"""The data loaded would contain engineered features where nan values were handled"""
def load_data():
    data = pd.read_csv("datasets/avazu-ctr-prediction/train.csv", nrows=100000)
    y_labels = data["click"]
    x_data = data.drop(["id", "click"], axis=1)
    x_data = x_data.apply(lambda x: x.astype("category").cat.codes if x.name in ["site_id", 
                                                                                 "site_domain",
                                                                                "site_category",
                                                                                "app_id",
                                                                                "app_domain",
                                                                                "app_category",
                                                                                "device_id",
                                                                                "device_ip",
                                                                                "device_model"] else x)
    x_train, x_val, y_train, y_val = train_test_split(x_data, y_labels, test_size=.3)  # split train/val 70/30
    return x_train, x_val, y_train, y_val
    
def xgb_matrices(x_train, x_val, y_train, y_val):
    train = xgb.DMatrix(data=x_train, label=y_train)
    val = xgb.DMatrix(data=x_val, label=y_val)
    return train, val

In [4]:
"""Train xgboost model - hyperopt passes different param confiurations to this function and mlflow logs results"""
def train_model(params, data):
    train, val = data
    mlflow.xgboost.autolog()
    with mlflow.start_run(nested=True, experiment_id=1):
        mod = xgb.train(params=params, dtrain=train, num_boost_round=1000,
                        early_stopping_rounds=40,
                        verbose_eval=True, evals=[(val, 'val')])
        mlflow.xgboost.log_model(mod, "model")
        return {'status': STATUS_OK, 'loss': mod.best_score, 'booster': mod.attributes()}

In [None]:
"""Start mlflow experiment and save logloss & AUC performance metrics to tracking server"""
x_train, x_val, y_train, y_val = load_data()
train, val = xgb_matrices(x_train, x_val, y_train, y_val)
fmin_train_model = partial(train_model, data=[train, val])
with mlflow.start_run(run_name='xgboost_models', experiment_id=1):
    best_params = fmin(
        fn=fmin_train_model,
        space=params,
        algo=tpe.suggest,
        max_evals=96,
        rstate=np.random.RandomState(123)
    )
      

  0%|          | 0/96 [00:00<?, ?trial/s, best loss=?]





[0]	val-logloss:0.62633                               
[1]	val-logloss:0.57738                               
[2]	val-logloss:0.54062                               
[3]	val-logloss:0.51231                               
[4]	val-logloss:0.49084                               
[5]	val-logloss:0.47407                               
[6]	val-logloss:0.46071                               
[7]	val-logloss:0.45039                               
[8]	val-logloss:0.44205                               
[9]	val-logloss:0.43602                               
[10]	val-logloss:0.43075                              
[11]	val-logloss:0.42675                              
[12]	val-logloss:0.42347                              
[13]	val-logloss:0.42107                              
[14]	val-logloss:0.41910                              
[15]	val-logloss:0.41748                              
[16]	val-logloss:0.41620                              
[17]	val-logloss:0.41505                              
[18]	val-l



  1%|1         | 1/96 [00:06<09:35,  6.05s/trial, best loss: 0.409446]





[0]	val-logloss:0.62370                                               
[1]	val-logloss:0.57367                                               
[2]	val-logloss:0.53636                                               
[3]	val-logloss:0.50826                                               
[4]	val-logloss:0.48681                                               
[5]	val-logloss:0.47053                                               
[6]	val-logloss:0.45808                                               
[7]	val-logloss:0.44851                                               
[8]	val-logloss:0.44100                                               
[9]	val-logloss:0.43524                                               
[10]	val-logloss:0.43044                                              
[11]	val-logloss:0.42689                                              
[12]	val-logloss:0.42404                                              
[13]	val-logloss:0.42166                                              
[14]	v



  2%|2         | 2/96 [00:24<20:41, 13.21s/trial, best loss: 0.409446]





[0]	val-logloss:0.56581                                               
[1]	val-logloss:0.50068                                               
[2]	val-logloss:0.46423                                               
[3]	val-logloss:0.44315                                               
[4]	val-logloss:0.43088                                               
[5]	val-logloss:0.42374                                               
[6]	val-logloss:0.41953                                               
[7]	val-logloss:0.41690                                               
[8]	val-logloss:0.41448                                               
[9]	val-logloss:0.41351                                               
[10]	val-logloss:0.41271                                              
[11]	val-logloss:0.41251                                              
[12]	val-logloss:0.41214                                              
[13]	val-logloss:0.41170                                              
[14]	v



  3%|3         | 3/96 [00:31<15:57, 10.29s/trial, best loss: 0.409446]





[0]	val-logloss:0.63924                                               
[1]	val-logloss:0.59725                                               
[2]	val-logloss:0.56429                                               
[3]	val-logloss:0.53796                                               
[4]	val-logloss:0.51692                                               
[5]	val-logloss:0.49959                                               
[6]	val-logloss:0.48493                                               
[7]	val-logloss:0.47339                                               
[8]	val-logloss:0.46388                                               
[9]	val-logloss:0.45620                                               
[10]	val-logloss:0.45004                                              
[11]	val-logloss:0.44495                                              
[12]	val-logloss:0.44057                                              
[13]	val-logloss:0.43703                                              
[14]	v



  4%|4         | 4/96 [01:04<29:44, 19.40s/trial, best loss: 0.409446]





[0]	val-logloss:0.63817                                               
[1]	val-logloss:0.59529                                               
[2]	val-logloss:0.56131                                               
[3]	val-logloss:0.53402                                               
[4]	val-logloss:0.51196                                               
[5]	val-logloss:0.49375                                               
[6]	val-logloss:0.47887                                               
[7]	val-logloss:0.46671                                               
[8]	val-logloss:0.45675                                               
[9]	val-logloss:0.44847                                               
[10]	val-logloss:0.44172                                              
[11]	val-logloss:0.43604                                              
[12]	val-logloss:0.43138                                              
[13]	val-logloss:0.42742                                              
[14]	v



  5%|5         | 5/96 [01:12<23:03, 15.20s/trial, best loss: 0.409446]





[0]	val-logloss:0.64494                                               
[1]	val-logloss:0.60598                                               
[2]	val-logloss:0.57384                                               
[3]	val-logloss:0.54741                                               
[4]	val-logloss:0.52548                                               
[5]	val-logloss:0.50720                                               
[6]	val-logloss:0.49172                                               
[7]	val-logloss:0.47868                                               
[8]	val-logloss:0.46757                                               
[9]	val-logloss:0.45810                                               
[10]	val-logloss:0.45040                                              
[11]	val-logloss:0.44381                                              
[12]	val-logloss:0.43821                                              
[13]	val-logloss:0.43343                                              
[14]	v

In [None]:
"""TODO take winning model and log performance metrics against test dataset"""

In [None]:
"""Results of trained models can be seen on the mlflow tracking server 127.0.0.1:5000"""