In [1]:
import pandas as pd
import sklearn.model_selection as sk
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import xgboost as xgb
import mlflow
import numpy as np
import hyperopt
from mlflow.tracking.client import MlflowClient

In [13]:
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK

def train(params):
    """
    An example train method that computes the square of the input.
    This method will be passed to `hyperopt.fmin()`.

    :param params: hyperparameters. Its structure is consistent with how search space is defined. See below.
    :return: dict with fields 'loss' (scalar loss) and 'status' (success/failure status of run)
    """
    
    max_depth, alpha = params
    
    max_depth = int(max_depth)
    df = pd.read_csv("1996_airline.csv", names=["Year","Month",
                                             "DayofMonth","DayofWeek",
                                             "CRSDepTime","CRSArrTime","UniqueCarer",
                                             "FlightNum","ActualElapsedTime",
                                             "Origin","Dest","Distance","Diverted","ArrDelay"])
    # Split the data into training and test sets. (0.75, 0.25) split.
    df['ArrDelayBinary'] = 1.0* (df["ArrDelay"] > 10)

    df['Dest'] = df['Dest'].astype('category').cat.codes.astype('int')
    df['Origin'] = df['Origin'].astype('category').cat.codes.astype('int')
    df['UniqueCarer'] = df['UniqueCarer'].astype('category').cat.codes.astype('int')

    X = df[df.columns.difference(["ArrDelay", "ArrDelayBinary"])]
    y = df["ArrDelayBinary"]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    
    mod = xgb.XGBClassifier(tree_method='hist',
                              max_depth=max_depth,
                              alpha=alpha,
                           n_estimators=10)
    mod.fit(X_train, y_train)
    preds = mod.predict(X_test)
    
    def eval_metrics(actual, pred):
        acc = accuracy_score(actual, pred)
        return acc
    
    acc = eval_metrics(y_test, preds)
    
    return {'loss': acc, 'status': STATUS_OK}

In [14]:
search_space = [
        hp.uniform('max_depth', 5,10),
        hp.uniform('alpha', .0, 1.0),
    ]
algo=tpe.suggest
import mlflow

with mlflow.start_run():
    argmin = fmin(
      fn=train,
      space=search_space,
      algo=algo,
      max_evals=2)

100%|██████████| 2/2 [01:01<00:00, 30.70s/trial, best loss: 0.7472832398792167]
