In [2]:
import xgboost
import pkgutil
import os
import numpy as np
import importlib, inspect
import warnings
warnings.filterwarnings("ignore")
import pickle

In [8]:
# Load the pre-split COMPAS tables.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only
# load pickles that come from a trusted source.
with open("./data/pickle_pandas_tabular_compas_training.sav", "rb") as f:
    compas_training = pickle.load(f)
with open("./data/pickle_pandas_tabular_compas_testing.sav", "rb") as f:
    compas_testing = pickle.load(f)

# TODO: consider running additional tests on the cardio dataset.
# cardio_testing = pickle.load(open("./data/pickle_pandas_tabular_cardio_testing.sav", "rb"))
# cardio_training = pickle.load(open("./data/pickle_pandas_tabular_cardio_training.sav", "rb"))

# Split each table into features and the "two_year_recid" target.
# The target is selected with a list so it stays a single-column frame.
X_train = compas_training.drop("two_year_recid", axis=1)
y_train = compas_training[["two_year_recid"]]
X_test = compas_testing.drop("two_year_recid", axis=1)
y_test = compas_testing[["two_year_recid"]]

# Models are stored per installed xgboost version. makedirs(exist_ok=True)
# creates missing parent directories and lets the cell be re-run without
# raising FileExistsError.
save_dir = "./models/xgboost/{0}".format(xgboost.__version__)
os.makedirs(save_dir, exist_ok=True)

# Generate xgboost model samples for rocket

In [4]:
# Display the public names exported by the installed xgboost package.
xgboost.__all__

['DMatrix',
 'DeviceQuantileDMatrix',
 'QuantileDMatrix',
 'Booster',
 'DataIter',
 'train',
 'cv',
 'RabitTracker',
 'build_info',
 'plot_importance',
 'plot_tree',
 'to_graphviz',
 'set_config',
 'get_config',
 'config_context',
 'XGBModel',
 'XGBClassifier',
 'XGBRegressor',
 'XGBRanker',
 'XGBRFClassifier',
 'XGBRFRegressor',
 'dask',
 'collective']

In [9]:
# Train a native Booster on the COMPAS split with the binary logistic objective.
param = {
    "max_depth": 2,
    "eta": 1,
    "objective": "binary:logistic"
}
# xgboost.train works on DMatrix containers rather than on the frames directly.
dtrain = xgboost.DMatrix(X_train, y_train)
dtest = xgboost.DMatrix(X_test, y_test)

# Every (DMatrix, label) pair in the watchlist gets its metric printed per round.
watchlist = [(dtest, "eval"), (dtrain, "train")]
bst = xgboost.train(param, dtrain, evals=watchlist)
print("type: {0}".format(type(bst)))

# Record the fully qualified class name of the model that was produced.
with open("algorithm.txt", "a+") as f:
    f.write("xgboost.core.Booster\n")

# Use a context manager so the pickle is flushed and the handle is closed
# as soon as the dump finishes.
model_path = "{0}/binary_classification_compas_logistic_xgboost.core.Booster.sav".format(save_dir)
with open(model_path, "wb") as f:
    pickle.dump(bst, f)

[0]	eval-logloss:0.63330	train-logloss:0.62683
[1]	eval-logloss:0.62669	train-logloss:0.61763
[2]	eval-logloss:0.62169	train-logloss:0.61114
[3]	eval-logloss:0.61613	train-logloss:0.60872
[4]	eval-logloss:0.61560	train-logloss:0.60734
[5]	eval-logloss:0.61458	train-logloss:0.60498
[6]	eval-logloss:0.61250	train-logloss:0.60392
[7]	eval-logloss:0.61243	train-logloss:0.60355
[8]	eval-logloss:0.61119	train-logloss:0.60307
[9]	eval-logloss:0.61126	train-logloss:0.60280
type: <class 'xgboost.core.Booster'>


In [10]:
# Train a native Booster on the COMPAS split with the binary:logitraw objective.
# NOTE(review): the logloss captured for this run is large and rises across
# rounds; presumably the metric is being applied to untransformed scores --
# confirm before comparing it with the binary:logistic run.
param = {
    "max_depth": 2,
    "eta": 1,
    "objective": "binary:logitraw"
}

# Rebuild the COMPAS matrices here: other cells rebind dtrain/dtest to
# different datasets, so relying on whatever is currently bound would train
# on the wrong data when cells are run out of order.
dtrain = xgboost.DMatrix(X_train, y_train)
dtest = xgboost.DMatrix(X_test, y_test)

watchlist = [(dtest, "eval"), (dtrain, "train")]
bst = xgboost.train(param, dtrain, evals=watchlist)
print("type: {0}".format(type(bst)))

# Record the fully qualified class name of the model that was produced.
with open("algorithm.txt", "a+") as f:
    f.write("xgboost.core.Booster\n")

# Context manager guarantees the pickle is flushed and the handle closed.
model_path = "{0}/binary_classification_compas_logitraw_xgboost.core.Booster.sav".format(save_dir)
with open(model_path, "wb") as f:
    pickle.dump(bst, f)

[0]	eval-logloss:6.13873	train-logloss:6.01503
[1]	eval-logloss:7.25572	train-logloss:7.24620
[2]	eval-logloss:7.04316	train-logloss:6.95921
[3]	eval-logloss:7.28631	train-logloss:7.29401
[4]	eval-logloss:7.72789	train-logloss:7.73330
[5]	eval-logloss:7.61887	train-logloss:7.62365
[6]	eval-logloss:7.62035	train-logloss:7.58637
[7]	eval-logloss:7.68383	train-logloss:7.62564
[8]	eval-logloss:7.63306	train-logloss:7.62119
[9]	eval-logloss:7.72224	train-logloss:7.67107
type: <class 'xgboost.core.Booster'>


## Classifier with reference to sklearn

In [11]:
# Fit the scikit-learn style XGBClassifier wrapper (default hyperparameters)
# on the COMPAS split.
classifier = xgboost.XGBClassifier().fit(X_train, y_train)
predictions = classifier.predict(X_test)
print("type: {0}".format(type(classifier)))
print("score: {0}".format(classifier.score(X_test, y_test)))

# Record the fully qualified class name of the model that was produced.
with open("algorithm.txt", "a+") as f:
    f.write("xgboost.sklearn.XGBClassifier\n")

# Context manager guarantees the pickle is flushed and the handle closed.
model_path = "{0}/binary_classification_compas_xgboost.sklearn.XGBClassifier.sav".format(save_dir)
with open(model_path, "wb") as f:
    pickle.dump(classifier, f)

type: <class 'xgboost.sklearn.XGBClassifier'>
score: 0.6647773279352227


In [12]:
# Fit the scikit-learn style XGBRFClassifier wrapper (default hyperparameters)
# on the COMPAS split.
fclassifier = xgboost.XGBRFClassifier().fit(X_train, y_train)
predictions = fclassifier.predict(X_test)
print("type: {0}".format(type(fclassifier)))
print("score: {0}".format(fclassifier.score(X_test, y_test)))

# Record the fully qualified class name of the model that was produced.
with open("algorithm.txt", "a+") as f:
    f.write("xgboost.sklearn.XGBRFClassifier\n")

# Context manager guarantees the pickle is flushed and the handle closed.
model_path = "{0}/binary_classification_compas_xgboost.sklearn.XGBRFClassifier.sav".format(save_dir)
with open(model_path, "wb") as f:
    pickle.dump(fclassifier, f)

type: <class 'xgboost.sklearn.XGBRFClassifier'>
score: 0.6696356275303643


## Multiclass Classification with reference to sklearn

In [13]:
# Fixed seed so the subsample (and therefore the saved model) is the same on
# every run of this cell.
SAMPLE_SEED = 0

# Load the loan tables and keep a random subsample of each.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open("./data/pickle_pandas_tabular_loan_training.sav", "rb") as f:
    loan_training = pickle.load(f).sample(n=5000, random_state=SAMPLE_SEED)
with open("./data/pickle_pandas_tabular_loan_testing.sav", "rb") as f:
    loan_testing = pickle.load(f).sample(n=2000, random_state=SAMPLE_SEED)

# Split each table into features and the "Interest_Rate" target.
multiclass_X_train = loan_training.drop("Interest_Rate", axis=1)
multiclass_y_train = loan_training[["Interest_Rate"]]
multiclass_X_test = loan_testing.drop("Interest_Rate", axis=1)
multiclass_y_test = loan_testing[["Interest_Rate"]]

# multi:softmax needs the number of classes up front.
# Assumes Interest_Rate is encoded as three integer classes -- TODO confirm.
param = {
    "max_depth": 2,
    "eta": 1,
    "objective": "multi:softmax",
    "num_class": 3
}

dtrain = xgboost.DMatrix(multiclass_X_train, multiclass_y_train)
dtest = xgboost.DMatrix(multiclass_X_test, multiclass_y_test)

watchlist = [(dtest, "eval"), (dtrain, "train")]
bst = xgboost.train(param, dtrain, evals=watchlist)
print("type: {0}".format(type(bst)))

# Record the fully qualified class name of the model that was produced.
with open("algorithm.txt", "a+") as f:
    f.write("xgboost.core.Booster\n")

# NOTE(review): the file name says "insurance" although this model is trained
# on the loan data; kept as-is in case consumers look the file up by name.
model_path = "{0}/multiclass_classification_insurance_softmax_xgboost.core.Booster.sav".format(save_dir)
with open(model_path, "wb") as f:
    pickle.dump(bst, f)

[0]	eval-mlogloss:1.01658	train-mlogloss:1.01468
[1]	eval-mlogloss:0.98784	train-mlogloss:0.98607
[2]	eval-mlogloss:0.97057	train-mlogloss:0.96665
[3]	eval-mlogloss:0.96594	train-mlogloss:0.95502
[4]	eval-mlogloss:0.95925	train-mlogloss:0.94334
[5]	eval-mlogloss:0.95362	train-mlogloss:0.93342
[6]	eval-mlogloss:0.95234	train-mlogloss:0.92574
[7]	eval-mlogloss:0.94932	train-mlogloss:0.92016
[8]	eval-mlogloss:0.94830	train-mlogloss:0.91428
[9]	eval-mlogloss:0.94736	train-mlogloss:0.90989
type: <class 'xgboost.core.Booster'>


In [14]:
# Train a native Booster on the loan subsample with the multi:softprob objective.
# Assumes Interest_Rate is encoded as three integer classes -- TODO confirm.
param = {
    "max_depth": 2,
    "eta": 1,
    "objective": "multi:softprob",
    "num_class": 3
}

# Rebuilt in this cell so it does not depend on what dtrain/dtest currently hold.
dtrain = xgboost.DMatrix(multiclass_X_train, multiclass_y_train)
dtest = xgboost.DMatrix(multiclass_X_test, multiclass_y_test)

watchlist = [(dtest, "eval"), (dtrain, "train")]
bst = xgboost.train(param, dtrain, evals=watchlist)
print("type: {0}".format(type(bst)))

# Record the fully qualified class name of the model that was produced.
with open("algorithm.txt", "a+") as f:
    f.write("xgboost.core.Booster\n")

# NOTE(review): the file name says "insurance" although this model is trained
# on the loan data; kept as-is in case consumers look the file up by name.
model_path = "{0}/multiclass_classification_insurance_softprob_xgboost.core.Booster.sav".format(save_dir)
with open(model_path, "wb") as f:
    pickle.dump(bst, f)

[0]	eval-mlogloss:1.01658	train-mlogloss:1.01468
[1]	eval-mlogloss:0.98784	train-mlogloss:0.98607
[2]	eval-mlogloss:0.97057	train-mlogloss:0.96665
[3]	eval-mlogloss:0.96594	train-mlogloss:0.95502
[4]	eval-mlogloss:0.95925	train-mlogloss:0.94334
[5]	eval-mlogloss:0.95362	train-mlogloss:0.93342
[6]	eval-mlogloss:0.95234	train-mlogloss:0.92574
[7]	eval-mlogloss:0.94932	train-mlogloss:0.92016
[8]	eval-mlogloss:0.94830	train-mlogloss:0.91428
[9]	eval-mlogloss:0.94736	train-mlogloss:0.90989
type: <class 'xgboost.core.Booster'>


In [15]:
# Fit the scikit-learn style XGBClassifier wrapper (default hyperparameters)
# on the loan subsample.
classifier = xgboost.XGBClassifier().fit(multiclass_X_train, multiclass_y_train)
predictions = classifier.predict(multiclass_X_test)
print("type: {0}".format(type(classifier)))
print("score: {0}".format(classifier.score(multiclass_X_test, multiclass_y_test)))

# Record the fully qualified class name of the model that was produced.
with open("algorithm.txt", "a+") as f:
    f.write("xgboost.sklearn.XGBClassifier\n")

# NOTE(review): the file name says "insurance" although this model is trained
# on the loan data; kept as-is in case consumers look the file up by name.
model_path = "{0}/multiclass_classification_insurance_xgboost.sklearn.XGBClassifier.sav".format(save_dir)
with open(model_path, "wb") as f:
    pickle.dump(classifier, f)

type: <class 'xgboost.sklearn.XGBClassifier'>
score: 0.501


In [16]:
# Fit the scikit-learn style XGBRFClassifier wrapper (default hyperparameters)
# on the loan subsample.
fclassifier = xgboost.XGBRFClassifier().fit(multiclass_X_train, multiclass_y_train)
predictions = fclassifier.predict(multiclass_X_test)
print("type: {0}".format(type(fclassifier)))
print("score: {0}".format(fclassifier.score(multiclass_X_test, multiclass_y_test)))

# Record the fully qualified class name of the model that was produced.
with open("algorithm.txt", "a+") as f:
    f.write("xgboost.sklearn.XGBRFClassifier\n")

# NOTE(review): the file name says "compas" although this model is trained on
# the loan data; kept as-is in case consumers look the file up by name.
model_path = "{0}/multiclass_classification_compas_xgboost.sklearn.XGBRFClassifier.sav".format(save_dir)
with open(model_path, "wb") as f:
    pickle.dump(fclassifier, f)

type: <class 'xgboost.sklearn.XGBRFClassifier'>
score: 0.518


## Regressor with reference to sklearn

In [18]:
# Load the pre-split insurance tables.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open("./data/pickle_pandas_tabular_insurance_training.sav", "rb") as f:
    insurance_training = pickle.load(f)
with open("./data/pickle_pandas_tabular_insurance_testing.sav", "rb") as f:
    insurance_testing = pickle.load(f)

# Split each table into features and the "charges" regression target.
# The target is selected with a list so it stays a single-column frame.
regression_X_train = insurance_training.drop("charges", axis=1)
regression_y_train = insurance_training[["charges"]]
regression_X_test = insurance_testing.drop("charges", axis=1)
regression_y_test = insurance_testing[["charges"]]

In [20]:
# Train a native Booster on the insurance split with the absolute-error objective.
param = {
    "max_depth": 2,
    "eta": 1,
    "objective": "reg:absoluteerror"
}

# Rebuilt in this cell so it does not depend on what dtrain/dtest currently hold.
dtrain = xgboost.DMatrix(regression_X_train, regression_y_train)
dtest = xgboost.DMatrix(regression_X_test, regression_y_test)

watchlist = [(dtest, "eval"), (dtrain, "train")]
bst = xgboost.train(param, dtrain, evals=watchlist)
print("type: {0}".format(type(bst)))

# Context manager guarantees the pickle is flushed and the handle closed.
model_path = "{0}/regression_insurance_absoluteerror_xgboost.core.Booster.sav".format(save_dir)
with open(model_path, "wb") as f:
    pickle.dump(bst, f)

[0]	eval-mae:8326.12205	train-mae:8356.67071
[1]	eval-mae:8325.44141	train-mae:8355.92917
[2]	eval-mae:8324.76047	train-mae:8355.18728
[3]	eval-mae:8324.07983	train-mae:8354.44574
[4]	eval-mae:8323.39890	train-mae:8353.70386
[5]	eval-mae:8322.71826	train-mae:8352.96231
[6]	eval-mae:8322.03732	train-mae:8352.22043
[7]	eval-mae:8321.35668	train-mae:8351.47888
[8]	eval-mae:8320.67575	train-mae:8350.73700
[9]	eval-mae:8319.99510	train-mae:8349.99545
type: <class 'xgboost.core.Booster'>


In [22]:
# Train a native Booster on the insurance split with the squared-error objective.
param = {
    "max_depth": 2,
    "eta": 1,
    "objective": "reg:squarederror"
}

# Rebuilt in this cell so it does not depend on what dtrain/dtest currently hold.
dtrain = xgboost.DMatrix(regression_X_train, regression_y_train)
dtest = xgboost.DMatrix(regression_X_test, regression_y_test)

watchlist = [(dtest, "eval"), (dtrain, "train")]
bst = xgboost.train(param, dtrain, evals=watchlist)
print("type: {0}".format(type(bst)))

# Context manager guarantees the pickle is flushed and the handle closed.
model_path = "{0}/regression_insurance_squarederror_xgboost.core.Booster.sav".format(save_dir)
with open(model_path, "wb") as f:
    pickle.dump(bst, f)

[0]	eval-rmse:5888.70176	train-rmse:4831.93904
[1]	eval-rmse:5728.07701	train-rmse:4486.40488
[2]	eval-rmse:5619.48775	train-rmse:4356.84327
[3]	eval-rmse:5468.14877	train-rmse:4248.49912
[4]	eval-rmse:5464.81089	train-rmse:4183.63028
[5]	eval-rmse:5372.62076	train-rmse:4133.60689
[6]	eval-rmse:5348.71832	train-rmse:4114.79173
[7]	eval-rmse:5288.35878	train-rmse:4095.70893
[8]	eval-rmse:5325.66886	train-rmse:4071.27483
[9]	eval-rmse:5314.62754	train-rmse:4053.73999
type: <class 'xgboost.core.Booster'>


In [23]:
# Train a native Booster on the insurance split with the pseudo-Huber objective.
# NOTE(review): in the captured run the mphe metric is identical for all ten
# rounds, so the saved booster may not have learned anything -- confirm
# whether the hyperparameters need adjusting for this objective.
param = {
    "max_depth": 2,
    "eta": 1,
    "objective": "reg:pseudohubererror"
}

# Rebuilt in this cell so it does not depend on what dtrain/dtest currently hold.
dtrain = xgboost.DMatrix(regression_X_train, regression_y_train)
dtest = xgboost.DMatrix(regression_X_test, regression_y_test)

watchlist = [(dtest, "eval"), (dtrain, "train")]
bst = xgboost.train(param, dtrain, evals=watchlist)
print("type: {0}".format(type(bst)))

# Context manager guarantees the pickle is flushed and the handle closed.
model_path = "{0}/regression_insurance_pseudohubererror_xgboost.core.Booster.sav".format(save_dir)
with open(model_path, "wb") as f:
    pickle.dump(bst, f)

[0]	eval-mphe:13166.37782	train-mphe:13294.60636
[1]	eval-mphe:13166.37782	train-mphe:13294.60636
[2]	eval-mphe:13166.37782	train-mphe:13294.60636
[3]	eval-mphe:13166.37782	train-mphe:13294.60636
[4]	eval-mphe:13166.37782	train-mphe:13294.60636
[5]	eval-mphe:13166.37782	train-mphe:13294.60636
[6]	eval-mphe:13166.37782	train-mphe:13294.60636
[7]	eval-mphe:13166.37782	train-mphe:13294.60636
[8]	eval-mphe:13166.37782	train-mphe:13294.60636
[9]	eval-mphe:13166.37782	train-mphe:13294.60636
type: <class 'xgboost.core.Booster'>


In [24]:
# Train a native Booster on the insurance split with the gamma objective.
param = {
    "max_depth": 2,
    "eta": 1,
    "objective": "reg:gamma"
}

# Rebuilt in this cell so it does not depend on what dtrain/dtest currently hold.
dtrain = xgboost.DMatrix(regression_X_train, regression_y_train)
dtest = xgboost.DMatrix(regression_X_test, regression_y_test)

watchlist = [(dtest, "eval"), (dtrain, "train")]
bst = xgboost.train(param, dtrain, evals=watchlist)
print("type: {0}".format(type(bst)))

# The file name carries the objective ("gamma"), following the same
# regression_insurance_<objective>_... pattern as the other regression
# boosters; previously the objective slot was left empty.
model_path = "{0}/regression_insurance_gamma_xgboost.core.Booster.sav".format(save_dir)
with open(model_path, "wb") as f:
    pickle.dump(bst, f)

[0]	eval-gamma-nloglik:9689.05343	train-gamma-nloglik:9783.40224
[1]	eval-gamma-nloglik:3565.96291	train-gamma-nloglik:3600.67547
[2]	eval-gamma-nloglik:1314.03492	train-gamma-nloglik:1326.80851
[3]	eval-gamma-nloglik:486.22844	train-gamma-nloglik:490.93113
[4]	eval-gamma-nloglik:182.32602	train-gamma-nloglik:184.05960
[5]	eval-gamma-nloglik:71.18666	train-gamma-nloglik:71.79382
[6]	eval-gamma-nloglik:30.91453	train-gamma-nloglik:31.10788
[7]	eval-gamma-nloglik:16.67969	train-gamma-nloglik:16.72585
[8]	eval-gamma-nloglik:11.95898	train-gamma-nloglik:11.95158
[9]	eval-gamma-nloglik:10.59402	train-gamma-nloglik:10.57332
type: <class 'xgboost.core.Booster'>


In [25]:
# Fit the scikit-learn style XGBRegressor wrapper (default hyperparameters)
# on the insurance split.
regressor = xgboost.XGBRegressor().fit(regression_X_train, regression_y_train)
predictions = regressor.predict(regression_X_test)
print("type: {0}".format(type(regressor)))
print("score: {0}".format(regressor.score(regression_X_test, regression_y_test)))

# Record the fully qualified class name of the model that was produced.
with open("algorithm.txt", "a+") as f:
    f.write("xgboost.sklearn.XGBRegressor\n")

# Context manager guarantees the pickle is flushed and the handle closed.
model_path = "{0}/regression_insurance_xgboost.sklearn.XGBRegressor.sav".format(save_dir)
with open(model_path, "wb") as f:
    pickle.dump(regressor, f)

type: <class 'xgboost.sklearn.XGBRegressor'>
score: 0.7735655792746694


In [26]:
# Fit the scikit-learn style XGBRFRegressor wrapper (default hyperparameters)
# on the insurance split.
regressor = xgboost.XGBRFRegressor().fit(regression_X_train, regression_y_train)
predictions = regressor.predict(regression_X_test)
print("type: {0}".format(type(regressor)))
print("score: {0}".format(regressor.score(regression_X_test, regression_y_test)))

# Record the fully qualified class name of the model that was produced.
with open("algorithm.txt", "a+") as f:
    f.write("xgboost.sklearn.XGBRFRegressor\n")

# Context manager guarantees the pickle is flushed and the handle closed.
model_path = "{0}/regression_insurance_xgboost.sklearn.XGBRFRegressor.sav".format(save_dir)
with open(model_path, "wb") as f:
    pickle.dump(regressor, f)

type: <class 'xgboost.sklearn.XGBRFRegressor'>
score: 0.8109099634953153
