# Extreme Gradient Boosting

In [2]:
import pandas as pd
import numpy as np
from sklearn import datasets
# Load the Iris dataset bundled with scikit-learn; later cells index `iris`
# by the keys 'data', 'target' and 'feature_names'.
iris = datasets.load_iris()

In [3]:
# Assemble one frame: the feature columns followed by the class label column.
iris_columns = [*iris['feature_names'], 'target']
iris_values = np.column_stack((iris['data'], iris['target']))
df = pd.DataFrame(iris_values, columns=iris_columns)

In [4]:
# Preview the first five rows of the assembled Iris frame.
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0


In [13]:
# Distinct class labels in the target column. They show up as floats because
# the labels were stacked into one numeric array together with the features.
df['target'].unique()

array([0., 1., 2.])

In [1]:
# Prerequisite (run once if xgboost is missing from the kernel environment):
# %pip install xgboost

In [5]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [6]:
# Hold out part of the Iris samples for evaluation; the fixed seed makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    random_state=2,
)

In [8]:
# Gradient-boosted tree classifier fitted on the Iris training split.
xgb = XGBClassifier(
    booster='gbtree',
    objective='multi:softprob',
    max_depth=6,
    learning_rate=0.1,
    n_estimators=100,
    random_state=2,
    n_jobs=-1,
)
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)

# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy happens to
# be symmetric, so the number is unchanged, but the correct order keeps this
# line safe to copy for metrics that are not symmetric.
score = accuracy_score(y_test, y_pred)
print('Score: ' + str(score))



Score: 0.9736842105263158


Brief descriptions of the preceding hyperparameters are as follows:

* booster = 'gbtree': The booster is the base learner, i.e. the machine learning model that is constructed during every round of boosting. The default base learner is a gradient boosted tree ('gbtree'); however, it is possible to work with other base learners.
* objective = 'multi:softprob': Chooses the objective function to be optimized.
* max_depth: Depth of the base tree
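* learning_rate: Shrinkage factor applied to the contribution of each new tree; smaller values make each boosting round more conservative.
* n_estimators: Number of boosting rounds, i.e. the number of trees that are fit.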

In [5]:
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor

# Diabetes regression data as plain (features, target) arrays.
X, y = datasets.load_diabetes(return_X_y=True)

xgb = XGBRegressor(
    booster='gbtree',
    objective='reg:squarederror',
    learning_rate=0.1,
    n_estimators=100,
    random_state=2,
    n_jobs=-1,
)

# 5-fold cross-validation; this scorer reports the mean squared error negated.
scores = cross_val_score(xgb, X, y, cv=5, scoring='neg_mean_squared_error')

# Flip the sign back and take the square root to get one RMSE per fold.
rmse = np.sqrt(-scores)

# Per-fold RMSE, rounded to three decimals.
print('RMSE:', rmse.round(3))

# Average RMSE across the folds.
print('RMSE mean: %0.3f' % rmse.mean())

RMSE: [63.033 59.689 64.538 63.699 64.661]
RMSE mean: 63.124


In [8]:
# Summary statistics (count, mean, std, quartiles, min/max) of the target `y`.
pd.DataFrame(y).describe()

Unnamed: 0,0
count,442.0
mean,152.133484
std,77.093005
min,25.0
25%,87.0
50%,140.5
75%,211.5
max,346.0


In [3]:
# Read the first 250000 rows of the gzip-compressed Higgs challenge CSV.
# Note: this rebinds `df`, which held the Iris frame earlier in the notebook.
df = pd.read_csv(
    'atlas-higgs-challenge-2014-v2.csv.gz',
    compression='gzip',
    nrows=250000,
)
df.head()

Unnamed: 0,EventId,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,DER_pt_tot,...,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt,Weight,Label,KaggleSet,KaggleWeight
0,100000,138.47,51.655,97.827,27.98,0.91,124.711,2.666,3.064,41.928,...,2.15,0.444,46.062,1.24,-2.475,113.497,0.000814,s,t,0.002653
1,100001,160.937,68.768,103.235,48.146,-999.0,-999.0,-999.0,3.473,2.078,...,0.725,1.158,-999.0,-999.0,-999.0,46.226,0.681042,b,t,2.233584
2,100002,-999.0,162.172,125.953,35.635,-999.0,-999.0,-999.0,3.148,9.336,...,2.053,-2.028,-999.0,-999.0,-999.0,44.251,0.715742,b,t,2.347389
3,100003,143.905,81.417,80.943,0.414,-999.0,-999.0,-999.0,3.31,0.414,...,-999.0,-999.0,-999.0,-999.0,-999.0,-0.0,1.660654,b,t,5.446378
4,100004,175.864,16.915,134.805,16.405,-999.0,-999.0,-999.0,3.891,16.405,...,-999.0,-999.0,-999.0,-999.0,-999.0,0.0,1.904263,b,t,6.245333


In [6]:
# Drop the original `Weight` and `KaggleSet` columns and expose `KaggleWeight`
# under the name `Weight`.
# The guard makes the cell safe to re-run: without it, a second run would
# delete the already-renamed `Weight` column and then fail on `KaggleSet`.
if 'KaggleWeight' in df.columns:
    df = (
        df.drop(columns=['Weight', 'KaggleSet'])
          .rename(columns={'KaggleWeight': 'Weight'})
    )

# Move `Label` to the last column: pop removes it, assignment appends it.
df['Label'] = df.pop('Label')
df.head()

Unnamed: 0,EventId,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,DER_pt_tot,...,PRI_jet_num,PRI_jet_leading_pt,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt,Weight,Label
0,100000,138.47,51.655,97.827,27.98,0.91,124.711,2.666,3.064,41.928,...,2,67.435,2.15,0.444,46.062,1.24,-2.475,113.497,0.002653,s
1,100001,160.937,68.768,103.235,48.146,-999.0,-999.0,-999.0,3.473,2.078,...,1,46.226,0.725,1.158,-999.0,-999.0,-999.0,46.226,2.233584,b
2,100002,-999.0,162.172,125.953,35.635,-999.0,-999.0,-999.0,3.148,9.336,...,1,44.251,2.053,-2.028,-999.0,-999.0,-999.0,44.251,2.347389,b
3,100003,143.905,81.417,80.943,0.414,-999.0,-999.0,-999.0,3.31,0.414,...,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-0.0,5.446378,b
4,100004,175.864,16.915,134.805,16.405,-999.0,-999.0,-999.0,3.891,16.405,...,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,0.0,6.245333,b


In [7]:
# Encode the class label as an integer: 's' -> 1, 'b' -> 0.
# The result is assigned back to the column instead of calling
# `replace(..., inplace=True)` on `df['Label']`: that chained in-place form
# acts on an intermediate object, triggers a FutureWarning in pandas 2.2 and no
# longer updates `df` once Copy-on-Write is enabled.
# Values other than 's'/'b' pass through unchanged, so re-running the cell on
# already-encoded labels is harmless.
label_codes = {'s': 1, 'b': 0}
df['Label'] = df['Label'].map(lambda value: label_codes.get(value, value))

In [8]:
# Feature matrix: the 30 columns at positions 1..30 (position 0 is EventId).
X = df.iloc[:, 1:31]

# Select the target by name rather than with `iloc[:, -1]`: a later cell
# appends `test_Weight` to `df`, so on a re-run the last column would no
# longer be `Label` and the weights would silently be used as the target.
y = df['Label']

In [9]:
# Rescale every row weight by 550000 / (number of rows in `y`).
# NOTE(review): 550000 is presumably the size of the test set the weights are
# normalised to, as in the XGBoost Higgs demo — confirm.
rows_loaded = len(y)
df['test_Weight'] = df['Weight'].mul(550000).div(rows_loaded)

In [10]:
# Total rescaled weight of the rows labelled 1 (`s`) and of those labelled 0 (`b`).
is_positive = df['Label'] == 1
is_negative = df['Label'] == 0
s = df.loc[is_positive, 'test_Weight'].sum()
b = df.loc[is_negative, 'test_Weight'].sum()

In [11]:
# Train a boosted-tree model on the Higgs data with xgboost's native API.
# Note: this import rebinds `xgb`, which earlier cells used for sklearn-style models.
import xgboost as xgb

# Wrap features, labels and per-row weights in a DMatrix; entries equal to
# -999.0 are treated as missing values.
xgb_clf = xgb.DMatrix(X, label=y, weight=df['test_Weight'], missing=-999.0)

# Booster parameters. Key order is kept so the reported metric order is unchanged.
param = {
    # logistic loss; predictions stay as raw scores before the logistic
    # transformation, since only the ranking is needed
    'objective': 'binary:logitraw',
    # scale the weight of positive examples by b / s
    'scale_pos_weight': b / s,
    'eta': 0.1,
    'max_depth': 6,
    'eval_metric': 'auc',
}

# A list of pairs allows a second 'eval_metric' entry next to 'auc', so both
# metrics are watched during training.
plst = [*param.items(), ('eval_metric', 'ams@0.15')]

# Metrics are evaluated on the training matrix itself after every round.
watchlist = [(xgb_clf, 'train')]

# Number of boosting rounds (trees).
num_round = 120

print('loading data end, start to boost trees')
bst = xgb.train(plst, xgb_clf, num_boost_round=num_round, evals=watchlist)
bst.save_model('higgs.model')
print('finish training')

loading data end, start to boost trees
[0]	train-auc:0.91088	train-ams@0.15:3.69846
[1]	train-auc:0.91532	train-ams@0.15:3.99148
[2]	train-auc:0.91797	train-ams@0.15:4.09947
[3]	train-auc:0.91950	train-ams@0.15:4.25035
[4]	train-auc:0.92047	train-ams@0.15:4.24004
[5]	train-auc:0.92130	train-ams@0.15:4.24742
[6]	train-auc:0.92223	train-ams@0.15:4.33231
[7]	train-auc:0.92335	train-ams@0.15:4.35929
[8]	train-auc:0.92386	train-ams@0.15:4.37476
[9]	train-auc:0.92424	train-ams@0.15:4.36366
[10]	train-auc:0.92481	train-ams@0.15:4.36513
[11]	train-auc:0.92540	train-ams@0.15:4.40817
[12]	train-auc:0.92581	train-ams@0.15:4.41476
[13]	train-auc:0.92641	train-ams@0.15:4.46194
[14]	train-auc:0.92684	train-ams@0.15:4.43823
[15]	train-auc:0.92734	train-ams@0.15:4.47224
[16]	train-auc:0.92801	train-ams@0.15:4.52076
[17]	train-auc:0.92844	train-ams@0.15:4.57117
[18]	train-auc:0.92898	train-ams@0.15:4.60248
[19]	train-auc:0.92933	train-ams@0.15:4.65436
[20]	train-auc:0.92984	train-ams@0.15:4.69182
[21]	