In this notebook we train and save a GBDT model for the HELOC dataset.

## Setup

In [1]:
import os

# The paths used later in this notebook are relative, so climb out of the
# (presumably) "notebooks" folder before touching any file.
# Whole path components are compared instead of searching the raw string, so
# a project directory that merely contains the word (e.g. "heloc-notebooks")
# does not make the loop climb out of the repository.
while "notebooks" in os.path.normpath(os.getcwd()).split(os.sep):
    os.chdir(os.pardir)

from pathlib import Path
import xgboost as xgb
import pandas as pd

%load_ext autoreload
%autoreload 2


In [10]:
# Locations of the input CSV files and of the persisted model artefacts,
# all relative to the current working directory.
dataset_path = Path("data/HELOC")
train_path = dataset_path / "heloc-train.csv"
test_path = dataset_path / "heloc-test.csv"
model_path = Path("models/HELOC")
# Train only when no saved model is on disk yet; otherwise the saved model is
# loaded instead. Deriving the flag from the file system keeps
# "Restart Kernel -> Run All" working on a fresh checkout.
# Assign True by hand to force a retrain.
first_run = not (model_path / "gbdt_saved.json").exists()


## Load the data

In [3]:
# Read the train split first, then the test split, each into its own frame.
train_df, test_df = (
    pd.read_csv(csv_file) for csv_file in (train_path, test_path)
)


### Data inspection

In [4]:
# Bare last expression: the notebook renders the training frame as a table.
train_df


Unnamed: 0,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MSinceMostRecentDelq,MaxDelq2PublicRecLast12M,...,MSinceMostRecentInqexcl7days,NumInqLast6M,NumInqLast6Mexcl7days,NetFractionRevolvingBurden,NetFractionInstallBurden,NumRevolvingTradesWBalance,NumInstallTradesWBalance,NumBank2NatlTradesWHighUtilization,PercentTradesWBalance,RiskPerformance
0,67.0,282.0,11.0,108.0,37.0,0.0,0.0,95.0,3.0,4.0,...,0.0,1.0,1.0,68.0,58.000000,5.0,3.000000,2.0,80.0,0.0
1,58.0,164.0,4.0,62.0,21.0,2.0,1.0,70.0,8.0,4.0,...,0.0,5.0,5.0,27.0,94.000000,5.0,4.000000,0.0,82.0,0.0
2,71.0,103.0,15.0,94.0,6.0,1.0,0.0,86.0,13.0,6.0,...,24.0,0.0,0.0,48.0,70.210422,2.0,2.563951,0.0,67.0,0.0
3,85.0,139.0,11.0,78.0,10.0,0.0,0.0,100.0,83.0,7.0,...,24.0,2.0,1.0,19.0,66.516262,1.0,2.397238,1.0,50.0,1.0
4,79.0,803.0,3.0,101.0,38.0,0.0,0.0,100.0,83.0,7.0,...,3.0,2.0,2.0,14.0,66.516262,4.0,2.000000,0.0,46.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7891,69.0,292.0,6.0,121.0,28.0,3.0,3.0,90.0,64.0,6.0,...,0.0,4.0,4.0,18.0,93.000000,1.0,4.000000,0.0,55.0,0.0
7892,76.0,201.0,8.0,71.0,38.0,0.0,0.0,97.0,51.0,6.0,...,0.0,0.0,0.0,0.0,47.000000,1.0,5.000000,0.0,35.0,1.0
7893,58.0,211.0,5.0,54.0,39.0,0.0,0.0,95.0,2.0,4.0,...,0.0,1.0,1.0,25.0,43.000000,5.0,3.000000,3.0,47.0,0.0
7894,69.0,78.0,5.0,40.0,16.0,0.0,0.0,100.0,83.0,7.0,...,0.0,2.0,2.0,69.0,70.210422,3.0,2.000000,2.0,83.0,0.0


In [5]:
# Print the column list with non-null counts and dtypes of the training frame.
train_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7896 entries, 0 to 7895
Data columns (total 24 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   ExternalRiskEstimate                7896 non-null   float64
 1   MSinceOldestTradeOpen               7896 non-null   float64
 2   MSinceMostRecentTradeOpen           7896 non-null   float64
 3   AverageMInFile                      7896 non-null   float64
 4   NumSatisfactoryTrades               7896 non-null   float64
 5   NumTrades60Ever2DerogPubRec         7896 non-null   float64
 6   NumTrades90Ever2DerogPubRec         7896 non-null   float64
 7   PercentTradesNeverDelq              7896 non-null   float64
 8   MSinceMostRecentDelq                7896 non-null   float64
 9   MaxDelq2PublicRecLast12M            7896 non-null   float64
 10  MaxDelqEver                         7896 non-null   float64
 11  NumTotalTrades                      7896 no

## Train the model
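Training only runs when `first_run` is `True`; there is no need to train again if a saved model already exists, since it is loaded in the "Save/load the model" section below.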

In [6]:
# Booster configuration for training.
param = dict(
    eta=0.01,
    max_depth=4,
    objective='binary:logistic',
    seed=42,
)
# Number of boosting rounds.
steps = 500


In [7]:
# Wrap each split in a DMatrix: every column except the last one is a feature,
# the last column is the label.
dtrain, dtest = (
    xgb.DMatrix(frame.iloc[:, :-1], label=frame.iloc[:, -1])
    for frame in (train_df, test_df)
)


In [8]:
%%time
if first_run:
    # Fit the booster, reporting the metric on both splits every 50 rounds.
    gbdt_model = xgb.train(
        params=param,
        dtrain=dtrain,
        num_boost_round=steps,
        evals=[(dtest, 'test'), (dtrain, 'train')],
        verbose_eval=50,
    )


[0]	test-logloss:0.68849	train-logloss:0.68836
[50]	test-logloss:0.53465	train-logloss:0.53052
[100]	test-logloss:0.46158	train-logloss:0.45494
[150]	test-logloss:0.42393	train-logloss:0.41524
[200]	test-logloss:0.40428	train-logloss:0.39178
[250]	test-logloss:0.38737	train-logloss:0.37176
[300]	test-logloss:0.37664	train-logloss:0.35775
[350]	test-logloss:0.36911	train-logloss:0.34699
[400]	test-logloss:0.36353	train-logloss:0.33859
[450]	test-logloss:0.35928	train-logloss:0.33150
[499]	test-logloss:0.35674	train-logloss:0.32597
CPU times: user 3h 21min 22s, sys: 9min 54s, total: 3h 31min 16s
Wall time: 3min 32s


### Save/load the model
* `save_model()` writes the model in a format that `load_model()` can read back.
* `dump_model()` exports a human-readable text dump of the trees, which is useful for model interpretation (for example visualization) but cannot be loaded back into XGBoost.

In [9]:
if first_run:
    # Create the output directory up front: on a fresh checkout it may not
    # exist yet and the writes below would fail.
    model_path.mkdir(parents=True, exist_ok=True)
    # Snapshot that the else-branch reads back on later runs.
    gbdt_model.save_model(model_path / "gbdt_saved.json")
    # Text dump of the trees, including per-split statistics.
    gbdt_model.dump_model(model_path / "gbdt_dumped.txt", with_stats=True)
else:
    # Reuse the model persisted by an earlier training run.
    gbdt_model = xgb.Booster()
    gbdt_model.load_model(model_path / "gbdt_saved.json")
    