In [13]:
# Python Import
import numpy as np
import pandas as pd

import xgboost as xgb

from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score, precision_score, recall_score

# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in 0.20.
# Prefer `sklearn.model_selection`; fall back to the legacy module on old installs only.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split




In [14]:
# Generate an artificial binary classification dataset.
# All generator settings are collected in one place so they are easy to read and tune:
# 200 samples, 5 features (3 of them informative), requested class proportions of
# 0.9 / 0.1, and a fixed random_state so the same data is produced on every run.

dataset_options = dict(
    n_samples=200,
    n_features=5,
    n_informative=3,
    n_classes=2,
    weights=[.9, .1],
    shuffle=True,
    random_state=123,
)

X, y = make_classification(**dataset_options)

In [33]:
# Check imbalance: report how many samples carry each class label.
# np.count_nonzero counts the True entries of the boolean mask directly, instead of
# copying the whole array into a Python list and comparing each element to True.

print('Positive: {}'.format(np.count_nonzero(y == 1)))
print('Negative: {}'.format(np.count_nonzero(y == 0)))


Positive: 20
Negative: 180


In [35]:
# Generate the Train/Validation split.
# 33% of the samples are held out for evaluation; `stratify=y` is passed so the split
# is made with respect to the class labels, and `random_state` pins the shuffle.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    stratify=y,
    random_state=123
)

In [49]:
# XGBoost training configuration (independent of the data structures built below)

num_rounds = 15  # number of boosting rounds handed to xgb.train

params = {
    'objective': 'binary:logistic',
    'max_depth': 1,
    # NOTE(review): newer XGBoost releases appear to use 'verbosity' instead of
    # 'silent' -- confirm against the installed version before changing it.
    'silent': 1,
    'eta': 1,
}

# XGBoost data structures: the training matrix carries the labels,
# the test matrix holds features only (it is used for prediction).

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test)


In [50]:
# XGBoost training (baseline, no class re-weighting)

bst = xgb.train(params, dtrain, num_rounds)

# The model outputs one score per test row; scores above 0.5 are labelled 1, the rest 0.
baseline_scores = bst.predict(dtest)
y_test_preds = (baseline_scores > 0.5).astype('int')

In [51]:
# Generate the confusion matrix: actual labels as rows, predicted labels as columns.
# `margins=True` appends the "All" row/column with the totals.

pd.crosstab(
    index=pd.Series(y_test, name='Actual'),
    columns=pd.Series(y_test_preds, name='Predicted'),
    margins=True,
)

Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,58,1,59
1,5,2,7
All,63,3,66


In [52]:
# Calculate the score using different metrics.
# Each output template is paired with the scikit-learn scorer that fills it in.

for template, scorer in (
    ('Accuracy: {0:.2f}', accuracy_score),
    ('Precision: {0:.2f}', precision_score),
    ('Recall: {0:.2f}', recall_score),
):
    print(template.format(scorer(y_test, y_test_preds)))



Accuracy: 0.91
Precision: 0.67
Recall: 0.29


In [53]:
# Manage the imbalance by giving the positive class 5 times more importance than the
# negative class. Per-sample weight: 5 for label 1, 1 for label 0, 0 for anything else.

weights = np.where(y_train == 1, 5.0, np.where(y_train == 0, 1.0, 0.0))

# Remake the XGBoost structures, this time attaching the weights to the training matrix

dtest = xgb.DMatrix(X_test)
dtrain = xgb.DMatrix(X_train, label=y_train, weight=weights)

# Rerun the training with the same parameters, then apply the same 0.5 cut-off

bst = xgb.train(params, dtrain, num_rounds)
weighted_scores = bst.predict(dtest)
y_test_preds = (weighted_scores > 0.5).astype('int')


In [54]:
# Check the confusion matrix again, now for the model trained with manual weights.

actual_labels = pd.Series(y_test, name='Actual')
predicted_labels = pd.Series(y_test_preds, name='Predicted')

pd.crosstab(actual_labels, predicted_labels, margins=True)


Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,57,2,59
1,4,3,7
All,61,5,66


In [55]:
# Calculate the score using different metrics (model trained with manual weights).
# The scores are computed first and formatted afterwards.

accuracy = accuracy_score(y_test, y_test_preds)
precision = precision_score(y_test, y_test_preds)
recall = recall_score(y_test, y_test_preds)

print('Accuracy: {0:.2f}'.format(accuracy))
print('Precision: {0:.2f}'.format(precision))
print('Recall: {0:.2f}'.format(recall))



Accuracy: 0.91
Precision: 0.60
Recall: 0.43


In [56]:
# The weighting can also be automated through an XGBoost parameter.

# First regenerate clean XGBoost structures (no per-sample weights attached)

dtest = xgb.DMatrix(X_test)
dtrain = xgb.DMatrix(X_train, label=y_train)

# Then calculate the ratio between both classes: negatives divided by positives

train_labels = dtrain.get_label()
negative_count = np.sum(train_labels == 0)
positive_count = np.sum(train_labels == 1)
ratio = float(negative_count) / positive_count

# And pass it to XGBoost as a new hyperparameter.
# Note: this modifies the shared `params` dict in place, so every later
# xgb.train(params, ...) call picks up `scale_pos_weight` as well.
params['scale_pos_weight'] = ratio


In [57]:
# Redo training and evaluation with the updated parameters

bst = xgb.train(params, dtrain, num_rounds)

# Same decision rule as before: a score above 0.5 becomes label 1, otherwise 0
rebalanced_scores = bst.predict(dtest)
y_test_preds = (rebalanced_scores > 0.5).astype('int')

In [58]:
# Check the confusion matrix again, now for the model trained with `scale_pos_weight`.
# Rows are the actual labels, columns the predicted ones, plus "All" totals.

crosstab_inputs = [
    pd.Series(y_test, name='Actual'),
    pd.Series(y_test_preds, name='Predicted'),
]

pd.crosstab(*crosstab_inputs, margins=True)


Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,56,3,59
1,4,3,7
All,60,6,66


In [59]:
# Calculate the score using different metrics (model trained with `scale_pos_weight`).
# The (template, scorer) pairs are listed once and printed in order.

report_lines = [
    ('Accuracy: {0:.2f}', accuracy_score),
    ('Precision: {0:.2f}', precision_score),
    ('Recall: {0:.2f}', recall_score),
]

for line_template, score_fn in report_lines:
    print(line_template.format(score_fn(y_test, y_test_preds)))


Accuracy: 0.89
Precision: 0.50
Recall: 0.43
