In [1]:
import itertools
import pickle
import warnings

# Silence warnings before the third-party imports so import-time warnings stay hidden too.
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import xgboost as xgb
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

xgb.set_config(verbosity=1) #turn down info

## Training, validation, and test sets

In [2]:
# Load the three splits. `nrows` stops parsing after N data rows, so only the
# rows that are actually used are read into memory (slicing after read_csv
# parses the entire file first).
# NOTE(review): the frame contains an 'Unnamed: 0' column (looks like a saved
# index) and an EMPL_ID column (looks like an identifier); only TARGET is
# dropped before training, so both are fed to the model — confirm intended.
train_set = pd.read_csv('train_set_V.csv', nrows=1000000)
valid_set = pd.read_csv('val_set_V.csv', nrows=10000)
test_set = pd.read_csv('test_set_V.csv', nrows=10000)

In [3]:
# Show the final 20 rows of the training frame as the cell's output.
train_set.iloc[-20:]

Unnamed: 0,EMPL_ID,AGE,HAS_AE,HAS_AW,HAS_IP,LOGICAL_FACTOR_1,LOGICAL_FACTOR_2,SEX_K,SEX_M,SEX_N,...,PKD_GROUP_90,PKD_GROUP_91,PKD_GROUP_93,PKD_GROUP_95,PKD_GROUP_96,PKD_GROUP_97,PKD_GROUP_98,PKD_GROUP_99,PKD_GROUP_na,TARGET
999980,232995,-0.595146,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
999981,26460,1.065559,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
999982,260425,-0.580397,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
999983,242311,0.643426,1,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
999984,243664,0.589164,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
999985,37909,-0.744019,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
999986,149169,-0.23284,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
999987,92415,-1.017,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
999988,239358,-1.15363,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
999989,239798,0.822631,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
def _features_and_labels(frame):
    """Return ``(features, labels)`` NumPy arrays for a frame with a 'TARGET' column.

    ``labels`` is the 'TARGET' column; ``features`` is every other column,
    in the frame's column order.
    """
    labels = np.array(frame['TARGET'])
    features = np.array(frame.drop(['TARGET'], axis=1))
    return features, labels


x_train, y_train = _features_and_labels(train_set)
x_val, y_val = _features_and_labels(valid_set)
x_test, y_test = _features_and_labels(test_set)

In [None]:
# Sanity check: one line per split (train, validation, test) with the
# feature-array shape followed by the label-array shape.
for split_features, split_labels in (
    (x_train, y_train),
    (x_val, y_val),
    (x_test, y_test),
):
    print(split_features.shape, split_labels.shape)

# XGBClassifier using xgboost library

In [None]:
n_estimators = 60
max_depth = 40

# All hyper-parameters in one mapping so they can be read (and tuned) at a glance.
# NOTE(review): learning_rate=1e-5 combined with only 60 boosting rounds is
# unusually small — confirm this is intended.
xgb_params = {
    "n_estimators": n_estimators,
    "max_depth": max_depth,
    "min_child_weight": 0.1,
    "reg_alpha": 1e-2,
    "learning_rate": 1e-5,
}
classifier = xgb.XGBClassifier(**xgb_params)

# Fit on the training split only; the validation split is not used here.
classifier.fit(x_train, y_train)

In [None]:
# Accuracy = share of test rows whose predicted label equals the true label.
output = classifier.predict(x_test)
target = y_test
n_correct = int(np.sum(output == target))
accuracy = n_correct / len(target)
print("\nAccuracy on testing set: {:.4f} %".format(accuracy*100))

In [None]:
# Row-normalised confusion matrix for the test split, drawn as a heat map.
classes = [0,1]
y_true = y_test
y_pred = classifier.predict(x_test)
# labels=classes pins the matrix to len(classes) x len(classes) in the same
# order as the tick labels below, even if a class is absent from y_true/y_pred.
# normalize='true' divides each row by its true-label count.
cm = confusion_matrix(y_true, y_pred, labels=classes, normalize ='true')

plt.imshow(cm, interpolation='nearest', cmap='Blues')
plt.title("Confusion Matrix for XGBClassifier")
plt.colorbar()
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)

# Annotate every cell; white text on dark (high-value) cells keeps it readable.
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, round(cm[i, j],3), horizontalalignment="center", color="white" if cm[i, j] > 0.5 else "black")

plt.ylabel('True label')
plt.xlabel('Predicted label')
# tight_layout goes last so the layout also accounts for the axis labels.
plt.tight_layout()

## Feature importance

In [None]:
# Table of 'weight' importance scores, highest first, with readable feature names.
feature_important = classifier.get_booster().get_score(importance_type='weight')
keys = list(feature_important.keys())
values = list(feature_important.values())

# The model was fitted on a plain array, so the booster's keys are parsed below
# as 'f<column index>'. Index into the same column list used to build x_train
# (every column except TARGET).
features = list(train_set.drop(['TARGET'], axis=1).columns)
name_features = [features[int(k[1:])] for k in keys]

# Attach the names BEFORE sorting: assigning them after sort_values would pair
# the unsorted name list with the sorted rows and mislabel the scores.
df_feature_important = pd.DataFrame(
    {"score": values, "name_features": name_features},
    index=keys,
).sort_values(by = "score", ascending=False)
df_feature_important.head(15)

In [None]:
# Bar chart of the top 10 features ranked by 'weight' importance.
xgb.plot_importance(
    classifier,
    importance_type='weight',
    max_num_features=10,
)

In [None]:
# Table of 'gain' importance scores, highest first, with readable feature names.
feature_important = classifier.get_booster().get_score(importance_type='gain')
keys = list(feature_important.keys())
values = list(feature_important.values())

# The model was fitted on a plain array, so the booster's keys are parsed below
# as 'f<column index>'. Index into the same column list used to build x_train
# (every column except TARGET).
features = list(train_set.drop(['TARGET'], axis=1).columns)
name_features = [features[int(k[1:])] for k in keys]

# Attach the names BEFORE sorting: assigning them after sort_values would pair
# the unsorted name list with the sorted rows and mislabel the scores.
df_feature_important = pd.DataFrame(
    {"score": values, "name_features": name_features},
    index=keys,
).sort_values(by = "score", ascending=False)
df_feature_important.head(15)

In [None]:
# Bar chart of the top 10 features ranked by 'gain' importance.
xgb.plot_importance(
    classifier,
    importance_type='gain',
    max_num_features=10,
)

## Save and reload the model

In [None]:
file_name = "xgb_model.pkl"

# save — the context manager flushes and closes the file even if dump() raises
with open(file_name, "wb") as model_file:
    pickle.dump(classifier, model_file)

# load — round-trip check of the file just written.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(file_name, "rb") as model_file:
    xgb_model_loaded = pickle.load(model_file)

# FINAL EVALUATION OF THE MODEL: XGBoost

In [None]:
# Reload the persisted model and report its accuracy on every split.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(file_name, "rb") as model_file:
    xgb_model_loaded = pickle.load(model_file)

# Each message names the split it actually reports. The test split is evaluated
# last, so `output`, `target` and `accuracy` end up holding the test-set values.
for split_name, split_x, split_y in (
    ("training", x_train, y_train),
    ("validation", x_val, y_val),
    ("testing", x_test, y_test),
):
    output = xgb_model_loaded.predict(split_x)
    target = split_y
    accuracy = sum(output == target)/ len(target)
    print("\nAccuracy on {} set: {:.4f} %".format(split_name, accuracy*100))

In [None]:
# Accuracy, precision and recall of the reloaded model on the test split.
y_pred = xgb_model_loaded.predict(x_test)
y_true = y_test

accuracy = sum(y_pred == y_true)/ len(y_pred)
# Store the results under their own names: assigning them to `recall_score` /
# `precision_score` would overwrite the functions imported from sklearn.metrics
# and break any later call to them.
test_recall = recall_score(y_true, y_pred)
test_precision = precision_score(y_true, y_pred)

print("\nAccuracy on testing set: {:.4f} %".format(accuracy*100))
print("Precision on testing set:", test_precision)
print("Recall on testing set:", test_recall)