We start by loading the data and splitting it into **train_X**, **test_X**, **train_y**, **test_y**.

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import mlflow

# Point the MLflow client at the tracking server URI used for this notebook.
# NOTE(review): assumes a tracking server is already listening on this local address.
mlflow.set_tracking_uri("http://127.0.0.1:5000")

In [4]:
# Show the experiments known to the configured tracking server.
# NOTE(review): `mlflow.list_experiments` was removed in MLflow 2.x in favour of
# `mlflow.search_experiments()` — confirm against the installed MLflow version.
mlflow.list_experiments()

[<Experiment: artifact_location='./artifacts/0', experiment_id='0', lifecycle_stage='active', name='Default', tags={}>,
 <Experiment: artifact_location='./artifacts/1', experiment_id='1', lifecycle_stage='active', name='Xgboost Mlops', tags={}>]

In [5]:
# Select the experiment under which the following runs are recorded.
# NOTE(review): the saved output of this cell mentions 'house-price-predictor',
# not this name, so the cell appears to have been edited after it was last run.
mlflow.set_experiment("Xgboost Mlops")

2022/08/24 09:17:52 INFO mlflow.tracking.fluent: Experiment with name 'house-price-predictor' does not exist. Creating a new experiment.


<Experiment: artifact_location='./artifacts/2', experiment_id='2', lifecycle_stage='active', name='house-price-predictor', tags={}>

In [27]:

# Load the training data and discard rows that have no target value.
# dropna() returns a new frame, so re-running this cell is idempotent.
data = pd.read_csv('train.csv').dropna(axis=0, subset=['SalePrice'])
y = data.SalePrice

# Features: everything except the row id and the target, restricted to
# non-object (numeric) columns.
X = data.drop(['Id', 'SalePrice'], axis=1).select_dtypes(exclude=['object'])

# Fix the random state so the train/validation split, and therefore every RMSE
# computed from it later in the notebook, is reproducible across kernel restarts.
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# No imputation step is applied here: any missing values in the numeric
# columns are passed through unchanged to the model.


In [29]:
# Inspect the column dtypes of the training features (object columns were excluded above).
train_X.dtypes

MSSubClass         int64
LotFrontage      float64
LotArea            int64
OverallQual        int64
OverallCond        int64
YearBuilt          int64
YearRemodAdd       int64
MasVnrArea       float64
BsmtFinSF1         int64
BsmtFinSF2         int64
BsmtUnfSF          int64
TotalBsmtSF        int64
1stFlrSF           int64
2ndFlrSF           int64
LowQualFinSF       int64
GrLivArea          int64
BsmtFullBath       int64
BsmtHalfBath       int64
FullBath           int64
HalfBath           int64
BedroomAbvGr       int64
KitchenAbvGr       int64
TotRmsAbvGrd       int64
Fireplaces         int64
GarageYrBlt      float64
GarageCars         int64
GarageArea         int64
WoodDeckSF         int64
OpenPorchSF        int64
EnclosedPorch      int64
3SsnPorch          int64
ScreenPorch        int64
PoolArea           int64
MiscVal            int64
MoSold             int64
YrSold             int64
dtype: object

In [10]:
# Read test.csv into a DataFrame.
# NOTE(review): this rebinds `data`, which previously held the training frame;
# cells that expect the training data must not be re-run after this one.
data = pd.read_csv('test.csv')

In [24]:
# Render the first row of `data` as a JSON string.
data.iloc[0].to_json()

'{"Id":1461,"MSSubClass":20,"MSZoning":"RH","LotFrontage":80.0,"LotArea":11622,"Street":"Pave","Alley":null,"LotShape":"Reg","LandContour":"Lvl","Utilities":"AllPub","LotConfig":"Inside","LandSlope":"Gtl","Neighborhood":"NAmes","Condition1":"Feedr","Condition2":"Norm","BldgType":"1Fam","HouseStyle":"1Story","OverallQual":5,"OverallCond":6,"YearBuilt":1961,"YearRemodAdd":1961,"RoofStyle":"Gable","RoofMatl":"CompShg","Exterior1st":"VinylSd","Exterior2nd":"VinylSd","MasVnrType":"None","MasVnrArea":0.0,"ExterQual":"TA","ExterCond":"TA","Foundation":"CBlock","BsmtQual":"TA","BsmtCond":"TA","BsmtExposure":"No","BsmtFinType1":"Rec","BsmtFinSF1":468.0,"BsmtFinType2":"LwQ","BsmtFinSF2":144.0,"BsmtUnfSF":270.0,"TotalBsmtSF":882.0,"Heating":"GasA","HeatingQC":"TA","CentralAir":"Y","Electrical":"SBrkr","1stFlrSF":896,"2ndFlrSF":0,"LowQualFinSF":0,"GrLivArea":896,"BsmtFullBath":0.0,"BsmtHalfBath":0.0,"FullBath":1,"HalfBath":0,"BedroomAbvGr":2,"KitchenAbvGr":1,"KitchenQual":"TA","TotRmsAbvGrd":5,"Fu

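We build and fit the model with XGBoost's native training API (`xgb.train` on `DMatrix` objects), using Hyperopt to search the hyperparameters and MLflow to track each run.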

In [5]:
import xgboost as xgb

In [6]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
from sklearn.metrics import mean_squared_error

In [7]:
# Turn on MLflow autologging for XGBoost before any training call is made.
mlflow.xgboost.autolog()

In [8]:
# Wrap the two splits in DMatrix objects for the native xgb.train API.
# Note that `valid` is built from the test split: the same data is used for
# early stopping and for the RMSE reported later.
train = xgb.DMatrix(train_X, label=train_y)
valid = xgb.DMatrix(test_X, label=test_y) 

In [19]:
def objective(params):
    """Train one XGBoost candidate and return its validation RMSE for Hyperopt.

    Each call opens its own MLflow run, trains a booster on the module-level
    ``train`` DMatrix with early stopping evaluated on ``valid``, and logs the
    resulting RMSE under the metric name ``rmse``.

    Parameters
    ----------
    params : dict
        XGBoost training parameters, as sampled from the Hyperopt search space.

    Returns
    -------
    dict
        ``{'loss': <rmse>, 'status': STATUS_OK}``, the result mapping that
        ``hyperopt.fmin`` minimises on.
    """
    with mlflow.start_run():
        # Parameters are not logged explicitly here; the notebook enables
        # mlflow.xgboost.autolog() before this function is called.
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=[(valid, 'validation')],
            early_stopping_rounds=50,
        )
        y_pred = booster.predict(valid)
        # Take the square root explicitly instead of passing `squared=False`,
        # which is deprecated and no longer accepted by recent scikit-learn.
        rmse = mean_squared_error(test_y, y_pred) ** 0.5
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}


In [None]:
# Hyperopt search space for the XGBoost booster.
# hp.loguniform takes its bounds in log space: a value is drawn so that its
# logarithm is uniform on [low, high], i.e. the value lies in [exp(low), exp(high)].
search_space = {
    # Integer tree depth sampled in steps of 1.
    # NOTE(review): an upper bound of 100 is very deep for boosted trees — confirm it is intended.
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    'objective': 'reg:squarederror',
    'seed': 42
}

# Keep a handle on the Trials object so the per-trial losses and sampled
# parameters can still be inspected after the search has finished.
search_trials = Trials()

best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=search_trials
)


### Train best model

In [12]:
# Retrain a single model with fixed hyperparameters inside its own MLflow run,
# then log its parameters, validation RMSE and the fitted booster.
with mlflow.start_run():

    # Rebuild the DMatrix objects so this cell does not depend on the search cells.
    train = xgb.DMatrix(train_X, label=train_y)
    valid = xgb.DMatrix(test_X, label=test_y)

    # Hard-coded hyperparameters, presumably copied from the best run of the
    # search (confirm against the tracking UI). They are stored with their
    # real numeric types rather than as strings.
    best_params = {
        'learning_rate': 0.42649323727247074,
        'max_depth': 4,
        'min_child_weight': 2.823388102868533,
        'objective': 'reg:squarederror',
        'reg_alpha': 0.08394020560827088,
        'reg_lambda': 0.1989734719817237,
        'seed': 42,
    }

    mlflow.log_params(best_params)

    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=1000,
        evals=[(valid, 'validation')],
        early_stopping_rounds=50,
    )

    y_pred = booster.predict(valid)
    # Take the square root explicitly instead of passing `squared=False`,
    # which is deprecated and no longer accepted by recent scikit-learn.
    rmse = mean_squared_error(test_y, y_pred) ** 0.5
    mlflow.log_metric("rmse", rmse)

    # Store the fitted booster as a run artifact under "models_mlflow".
    mlflow.xgboost.log_model(booster, artifact_path="models_mlflow")


[0]	validation-rmse:119564.69310
[1]	validation-rmse:76398.16653
[2]	validation-rmse:52530.96600
[3]	validation-rmse:40110.38098
[4]	validation-rmse:34256.56197
[5]	validation-rmse:31581.67299
[6]	validation-rmse:30945.68676
[7]	validation-rmse:30049.62556
[8]	validation-rmse:29251.87455
[9]	validation-rmse:28921.00821
[10]	validation-rmse:28552.13384
[11]	validation-rmse:28432.95239
[12]	validation-rmse:28752.21077
[13]	validation-rmse:28701.93433
[14]	validation-rmse:28725.42415
[15]	validation-rmse:29038.95417
[16]	validation-rmse:29027.25971
[17]	validation-rmse:28828.82494
[18]	validation-rmse:28634.54927
[19]	validation-rmse:28665.19839
[20]	validation-rmse:28612.91402
[21]	validation-rmse:28646.70754
[22]	validation-rmse:28670.67273
[23]	validation-rmse:28876.77747
[24]	validation-rmse:28844.61152
[25]	validation-rmse:28750.79896
[26]	validation-rmse:28808.15076
[27]	validation-rmse:28754.18357
[28]	validation-rmse:28687.95048
[29]	validation-rmse:28662.37963
[30]	validation-rms

