In [None]:
import pandas as pd
import os
dir_path='C:\\Data Science\\xgb_native_api\\Training'
csv_file_name = 'Features_Variant_1.csv'
df=pd.read_csv(os.path.join(dir_path,csv_file_name),header=None)
df.sample(n=5)

In [None]:
print("Dataset has {} entries and {} features".format(*df.shape))

In [None]:
X, y = df.loc[:,:52].values, df.loc[:,53].values

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.1, random_state=42)

In [None]:
import xgboost as xgb
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [None]:
from sklearn.metrics import mean_absolute_error
import numpy as np
# "Learn" the mean from the training data
mean_train = np.mean(y_train)

# Get predictions on the test set
baseline_predictions = np.ones(y_test.shape) * mean_train

# Compute MAE
mae_baseline = mean_absolute_error(y_test, baseline_predictions)

print("Baseline MAE is {:.2f}".format(mae_baseline))

In [None]:
params = {
    # Parameters that we are going to tune.
    'max_depth':6,
    'min_child_weight': 1,
    'eta':.3,
    'subsample': 1,
    'colsample_bytree': 1,
    # Other parameters
    'objective':'reg:linear',
}

In [None]:
params['eval_metric'] = "mae"
num_boost_round = 999

In [18]:
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)

[0]	Test-mae:5.97478
Will train until Test-mae hasn't improved in 10 rounds.
[1]	Test-mae:5.03359
[2]	Test-mae:4.64572
[3]	Test-mae:4.42331
[4]	Test-mae:4.39328
[5]	Test-mae:4.35544
[6]	Test-mae:4.31315
[7]	Test-mae:4.33087
[8]	Test-mae:4.37164
[9]	Test-mae:4.38774
[10]	Test-mae:4.39443
[11]	Test-mae:4.40661
[12]	Test-mae:4.39124
[13]	Test-mae:4.39088
[14]	Test-mae:4.39827
[15]	Test-mae:4.39104
[16]	Test-mae:4.40307
Stopping. Best iteration:
[6]	Test-mae:4.31315



In [19]:
print("Best MAE: {:.2f} with {} rounds".format(
                 model.best_score,
                 model.best_iteration+1))

Best MAE: 4.31 with 7 rounds


In [20]:
cv_results = xgb.cv(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    seed=42,
    nfold=5,
    metrics={'mae'},
    early_stopping_rounds=10
)

cv_results

Unnamed: 0,test-mae-mean,test-mae-std,train-mae-mean,train-mae-std
0,5.689189,0.270149,5.604765,0.064495
1,4.849525,0.271883,4.622477,0.065106
2,4.468342,0.239475,4.05971,0.065772
3,4.268584,0.224462,3.722983,0.06082
4,4.192448,0.189762,3.510303,0.061203
5,4.172856,0.189612,3.367213,0.061021
6,4.15786,0.192572,3.245549,0.060276
7,4.143254,0.19444,3.151495,0.062612
8,4.147843,0.196197,3.082321,0.05902
9,4.144657,0.189785,3.016803,0.057321


In [21]:
cv_results['test-mae-mean'].min()

4.0827876000000005

In [22]:
# You can try wider intervals with a larger step between
# each value and then narrow it down. Here after several
# iteration I found that the optimal value was in the
# following ranges.
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(9,12)
    for min_child_weight in range(5,8)
]

In [23]:
# Define initial best params and MAE
min_mae = float("Inf")
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))

    # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight

    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )

    # Update best MAE
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (max_depth,min_child_weight)

print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))

CV with max_depth=9, min_child_weight=5




	MAE 4.04524 for 6 rounds
CV with max_depth=9, min_child_weight=6
	MAE 4.0764622 for 5 rounds
CV with max_depth=9, min_child_weight=7
	MAE 4.0753928 for 5 rounds
CV with max_depth=10, min_child_weight=5
	MAE 4.0805826000000005 for 5 rounds
CV with max_depth=10, min_child_weight=6
	MAE 4.035100600000001 for 5 rounds
CV with max_depth=10, min_child_weight=7
	MAE 4.0872416000000005 for 5 rounds
CV with max_depth=11, min_child_weight=5
	MAE 4.062633 for 5 rounds
CV with max_depth=11, min_child_weight=6
	MAE 4.054831999999999 for 5 rounds
CV with max_depth=11, min_child_weight=7
	MAE 4.0581036 for 5 rounds
Best params: 10, 6, MAE: 4.035100600000001


In [25]:
params['max_depth'] = 10
params['min_child_weight'] = 6

In [26]:
gridsearch_params = [
    (subsample, colsample)
    for subsample in [i/10. for i in range(7,11)]
    for colsample in [i/10. for i in range(7,11)]
]

In [27]:
min_mae = float("Inf")
best_params = None

# We start by the largest values and go down to the smallest
for subsample, colsample in reversed(gridsearch_params):
    print("CV with subsample={}, colsample={}".format(
                             subsample,
                             colsample))

    # We update our parameters
    params['subsample'] = subsample
    params['colsample_bytree'] = colsample

    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )

    # Update best score
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (subsample,colsample)

print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))


CV with subsample=1.0, colsample=1.0




	MAE 4.035100600000001 for 5 rounds
CV with subsample=1.0, colsample=0.9
	MAE 4.053543 for 5 rounds
CV with subsample=1.0, colsample=0.8
	MAE 4.0752204 for 5 rounds
CV with subsample=1.0, colsample=0.7
	MAE 4.126817600000001 for 5 rounds
CV with subsample=0.9, colsample=1.0
	MAE 4.0747636 for 6 rounds
CV with subsample=0.9, colsample=0.9
	MAE 4.301792 for 5 rounds
CV with subsample=0.9, colsample=0.8
	MAE 4.2715806 for 4 rounds
CV with subsample=0.9, colsample=0.7
	MAE 4.4203404 for 11 rounds
CV with subsample=0.8, colsample=1.0
	MAE 4.058284800000001 for 5 rounds
CV with subsample=0.8, colsample=0.9
	MAE 4.3079596 for 7 rounds
CV with subsample=0.8, colsample=0.8
	MAE 4.3022836 for 4 rounds
CV with subsample=0.8, colsample=0.7
	MAE 4.419911600000001 for 8 rounds
CV with subsample=0.7, colsample=1.0
	MAE 4.0859668 for 5 rounds
CV with subsample=0.7, colsample=0.9
	MAE 4.2999216 for 8 rounds
CV with subsample=0.7, colsample=0.8
	MAE 4.3593908 for 7 rounds
CV with subsample=0.7, colsampl

In [33]:
# This can take some time…
%time
min_mae = float("Inf")
best_params = None

for eta in [.3, .2, .1, .05, .01, .005]:
    print("CV with eta={}".format(eta))

    # We update our parameters
    params['eta'] = eta

# Run and time CV
    %time 
    cv_results = xgb.cv(
            params,
            dtrain,
            num_boost_round=num_boost_round,
            seed=42,
            nfold=5,
            metrics=['mae'],
            early_stopping_rounds=10
          )

    # Update best score
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds\n".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = eta

    print("Best params: {}, MAE: {}".format(best_params, min_mae))

Wall time: 0 ns
CV with eta=0.3
Wall time: 0 ns




	MAE 4.4839162 for 8 rounds

Best params: 0.3, MAE: 4.4839162
CV with eta=0.2
Wall time: 0 ns




	MAE 4.322296 for 12 rounds

Best params: 0.2, MAE: 4.322296
CV with eta=0.1
Wall time: 0 ns




	MAE 4.0556636 for 25 rounds

Best params: 0.1, MAE: 4.0556636
CV with eta=0.05
Wall time: 0 ns




	MAE 3.9623722 for 55 rounds

Best params: 0.05, MAE: 3.9623722
CV with eta=0.01
Wall time: 0 ns




	MAE 3.895253399999999 for 269 rounds

Best params: 0.01, MAE: 3.895253399999999
CV with eta=0.005
Wall time: 0 ns
	MAE 3.8831075999999998 for 530 rounds

Best params: 0.005, MAE: 3.8831075999999998




In [34]:
params['eta'] = .01

In [35]:
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)

[0]	Test-mae:7.69225
Will train until Test-mae hasn't improved in 10 rounds.
[1]	Test-mae:7.62402
[2]	Test-mae:7.56704
[3]	Test-mae:7.5034
[4]	Test-mae:7.43531
[5]	Test-mae:7.37917
[6]	Test-mae:7.31423
[7]	Test-mae:7.25928
[8]	Test-mae:7.20004
[9]	Test-mae:7.14086
[10]	Test-mae:7.08814
[11]	Test-mae:7.03727
[12]	Test-mae:6.99024
[13]	Test-mae:6.94458
[14]	Test-mae:6.88791
[15]	Test-mae:6.8311
[16]	Test-mae:6.78486
[17]	Test-mae:6.72895
[18]	Test-mae:6.68487
[19]	Test-mae:6.63521
[20]	Test-mae:6.5867
[21]	Test-mae:6.53298
[22]	Test-mae:6.48231
[23]	Test-mae:6.44161
[24]	Test-mae:6.39147
[25]	Test-mae:6.34277
[26]	Test-mae:6.29427
[27]	Test-mae:6.24632
[28]	Test-mae:6.19971
[29]	Test-mae:6.16322
[30]	Test-mae:6.11903
[31]	Test-mae:6.07511
[32]	Test-mae:6.0299
[33]	Test-mae:5.98261
[34]	Test-mae:5.93708
[35]	Test-mae:5.8973
[36]	Test-mae:5.85705
[37]	Test-mae:5.815
[38]	Test-mae:5.77409
[39]	Test-mae:5.73228
[40]	Test-mae:5.69574
[41]	Test-mae:5.6606
[42]	Test-mae:5.62511
[43]	Test-mae:5.

In [36]:
print("Best MAE: {:.2f} in {} rounds".format(model.best_score, model.best_iteration+1))

Best MAE: 3.89 in 214 rounds


In [37]:
num_boost_round = model.best_iteration + 1

best_model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")]
)

[0]	Test-mae:7.69225
[1]	Test-mae:7.62402
[2]	Test-mae:7.56704
[3]	Test-mae:7.5034
[4]	Test-mae:7.43531
[5]	Test-mae:7.37917
[6]	Test-mae:7.31423
[7]	Test-mae:7.25928
[8]	Test-mae:7.20004
[9]	Test-mae:7.14086
[10]	Test-mae:7.08814
[11]	Test-mae:7.03727
[12]	Test-mae:6.99024
[13]	Test-mae:6.94458
[14]	Test-mae:6.88791
[15]	Test-mae:6.8311
[16]	Test-mae:6.78486
[17]	Test-mae:6.72895
[18]	Test-mae:6.68487
[19]	Test-mae:6.63521
[20]	Test-mae:6.5867
[21]	Test-mae:6.53298
[22]	Test-mae:6.48231
[23]	Test-mae:6.44161
[24]	Test-mae:6.39147
[25]	Test-mae:6.34277
[26]	Test-mae:6.29427
[27]	Test-mae:6.24632
[28]	Test-mae:6.19971
[29]	Test-mae:6.16322
[30]	Test-mae:6.11903
[31]	Test-mae:6.07511
[32]	Test-mae:6.0299
[33]	Test-mae:5.98261
[34]	Test-mae:5.93708
[35]	Test-mae:5.8973
[36]	Test-mae:5.85705
[37]	Test-mae:5.815
[38]	Test-mae:5.77409
[39]	Test-mae:5.73228
[40]	Test-mae:5.69574
[41]	Test-mae:5.6606
[42]	Test-mae:5.62511
[43]	Test-mae:5.59235
[44]	Test-mae:5.55786
[45]	Test-mae:5.53048
[46]	T

In [38]:
mean_absolute_error(best_model.predict(dtest), y_test)

3.8889745183475544

In [39]:
best_model.save_model("my_model.model")

In [40]:
loaded_model = xgb.Booster()
loaded_model.load_model("my_model.model")

# And use it for predictions.
loaded_model.predict(dtest)

array([3.7648475 , 0.30940798, 1.8136443 , ..., 3.2530515 , 0.09870756,
       3.8920937 ], dtype=float32)