We will start by loading the data and splitting it into **train_X**, **test_X**, **train_y**, **test_y**.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import mlflow

# Point every subsequent MLflow tracking call at this local address.
# NOTE(review): assumes an MLflow tracking server is already listening on 127.0.0.1:5000 — confirm.
mlflow.set_tracking_uri("http://127.0.0.1:5000")

In [2]:
# `mlflow.list_experiments()` was removed in MLflow 2.0. Prefer its replacement,
# `search_experiments()`, and fall back to the old name on older installs so the
# cell runs on either version.
_list_experiments = getattr(mlflow, "search_experiments", None) or mlflow.list_experiments
_list_experiments()

[<Experiment: artifact_location='./artifacts/0', experiment_id='0', lifecycle_stage='active', name='Default', tags={}>,
 <Experiment: artifact_location='./artifacts/1', experiment_id='1', lifecycle_stage='active', name='Xgboost Mlops', tags={}>]

In [3]:
# Make "Xgboost Mlops" the active experiment for the runs started below.
mlflow.set_experiment("Xgboost Mlops")

<Experiment: artifact_location='./artifacts/1', experiment_id='1', lifecycle_stage='active', name='Xgboost Mlops', tags={}>

In [12]:

# Load the training data and discard rows without a target value, since they
# cannot be used for supervised training or evaluation. The chained call
# returns a new frame instead of mutating one in place, so re-running the
# cell always starts from the file contents.
data = pd.read_csv('train.csv').dropna(axis=0, subset=['SalePrice'])
y = data.SalePrice

# Keep only non-object (numeric) predictors; 'Id' is a row identifier and
# 'SalePrice' is the target, so neither belongs in the feature matrix.
X = data.drop(['Id', 'SalePrice'], axis=1).select_dtypes(exclude=['object'])

# A fixed random_state makes the train/test split (and therefore every metric
# computed on it) reproducible across kernel restarts.
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# No imputation is applied here: missing feature values are passed through as
# NaN. (The previously commented-out `sklearn.preprocessing.Imputer` no longer
# exists in scikit-learn; `sklearn.impute.SimpleImputer` is its replacement
# should imputation be needed.)


In [None]:
# Preview only the first rows rather than rendering the whole training frame.
train_X.head()

We build and fit a model just as we would in scikit-learn.

In [15]:
import xgboost as xgb

In [18]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
from sklearn.metrics import mean_squared_error

In [9]:
# Turn on MLflow's autologging for XGBoost so training calls are recorded
# without explicit log_* calls (see the MLflow docs for exactly what is captured).
mlflow.xgboost.autolog()

In [16]:
# Wrap the splits in XGBoost's DMatrix container, attaching the targets as labels.
# `valid` is built from the held-out test split; it is the evaluation set passed
# to `xgb.train` inside `objective`.
train = xgb.DMatrix(train_X, label=train_y)
valid = xgb.DMatrix(test_X, label=test_y) 

In [19]:
def objective(params):
    """Train one XGBoost model for a hyperopt trial and report its validation RMSE.

    Each call opens its own MLflow run, trains a booster on the module-level
    `train` DMatrix with early stopping against `valid`, and logs the resulting
    RMSE on the held-out targets (`test_y`) as the "rmse" metric.

    Args:
        params: dict of XGBoost training parameters for this trial, as sampled
            by hyperopt from the search space.

    Returns:
        dict with 'loss' (the validation RMSE that hyperopt minimizes) and
        'status' (STATUS_OK).
    """
    with mlflow.start_run():
        # No manual set_tag/log_params here: parameter logging is presumably
        # handled by `mlflow.xgboost.autolog()` enabled earlier in the notebook.
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=[(valid, 'validation')],
            early_stopping_rounds=50
        )
        # NOTE(review): depending on the xgboost version, predict() may use all
        # trained trees rather than only those up to the best iteration — confirm.
        y_pred = booster.predict(valid)
        # Take the square root explicitly: the `squared=False` shortcut is
        # deprecated since scikit-learn 1.4 and removed in later releases, while
        # plain MSE followed by a root works on every version.
        rmse = mean_squared_error(test_y, y_pred) ** 0.5
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}


In [None]:
# Hyperparameter search space handed to hyperopt.
# `hp.loguniform(label, low, high)` samples exp(uniform(low, high)), so the
# bounds below are exponents:
#   learning_rate    in [e^-3, e^0]  ~ [0.05, 1.0]
#   reg_alpha        in [e^-5, e^-1] ~ [0.0067, 0.37]
#   reg_lambda       in [e^-6, e^-1] ~ [0.0025, 0.37]
#   min_child_weight in [e^-1, e^3]  ~ [0.37, 20.1]
# `max_depth` is drawn in steps of 1 from 4..100 and cast to int, because
# `hp.quniform` yields floats.
search_space = dict(
    max_depth=scope.int(hp.quniform('max_depth', 4, 100, 1)),
    learning_rate=hp.loguniform('learning_rate', -3, 0),
    reg_alpha=hp.loguniform('reg_alpha', -5, -1),
    reg_lambda=hp.loguniform('reg_lambda', -6, -1),
    min_child_weight=hp.loguniform('min_child_weight', -1, 3),
    # Fixed (non-searched) settings forwarded unchanged to XGBoost.
    objective='reg:squarederror',
    seed=42,
)

# Run 50 TPE-guided trials, each one a call to `objective`; `best_result`
# holds the best sampled values reported by hyperopt.
best_result = fmin(
    objective,
    search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials(),
)


### MLflow's Model Registry

In [None]:
from sklearn.metrics import mean_absolute_error

# `my_model` is not defined in any earlier cell, so on a fresh kernel the
# prediction below raised NameError. Fit the scikit-learn style XGBoost
# regressor here so this cell is self-contained.
my_model = xgb.XGBRegressor()
my_model.fit(train_X, train_y)

# make predictions
predictions = my_model.predict(test_X)

# scikit-learn metrics take (y_true, y_pred); MAE is symmetric, so the reported
# value is unchanged, but the conventional order avoids surprises if the metric
# is later swapped for an asymmetric one.
print("Mean Absolute Error : " + str(mean_absolute_error(test_y, predictions)))