In [1]:
# Importing the necessary dependency libraries
import xgboost as xgb
from xgboost import XGBClassifier 
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import roc_curve, auc, roc_auc_score
import imblearn
from imblearn.over_sampling import SMOTE

In [2]:
# Define model performance test functions
def performance_xgboost(xtrainup,ytrainup,xtrain,ytrain):
    clf.fit(xtrainup, ytrainup)
    # AUC
    auc_scores = roc_auc_score(ytrain, clf.predict_proba(xtrain)[:,1])
    print('auc = ', "%.3f"%auc_scores)
    # accuracy
    from sklearn.metrics import accuracy_score
    accuracy = accuracy_score(ytrain, clf.predict(xtrain))*100
    print('accuracy = ', "%.1f"%accuracy)
    print('------------------------')
    # 混淆矩阵
    import sklearn.metrics as sm
    from sklearn.metrics import confusion_matrix
    matrix = confusion_matrix(ytrain, clf.predict(xtrain), labels=None, sample_weight=None)
    print('混淆矩阵为：')
    print(matrix)
    (tn,fp,fn,tp) = matrix.ravel()
    print('tn=',tn)
    print('fp=',fp)
    print('fn=',fn)
    print('tp=',tp)
    print('------------------------')
    sensitivity = (tp/(tp+fn))*100
    specificity = (tn/(fp+tn))*100
    PPV=tp/(tp+fp)*100
    NPV=tn/(fn+tn)*100
    print(f'PPV = {"%.1f"%PPV}\n({tp}/{(tp+fp)})')
    print(f'NPV = {"%.1f"%NPV}\n({tn}/{(fn+tn)})')
    print(f'sensitivity = {"%.1f"%sensitivity}\n({tp}/{(tp+fn)})')
    print(f'specificity = {"%.1f"%specificity}\n({tn}/{(fp+tn)})')

In [3]:
# Reading data
train = pd.read_csv('F:/Onedrive/JIMMY/Python/Notebook/data_set/inuse/train.csv',encoding='gbk')
test = pd.read_csv('F:/Onedrive/JIMMY/Python/Notebook/data_set/inuse/test.csv',encoding='gbk')
validation = pd.read_csv('F:/Onedrive/JIMMY/Python/Notebook/data_set/inuse/validation_new04.csv',encoding='gbk')

xtrain = train.iloc[:,1:14]
xtest = test.iloc[:,1:14]
xv = validation.iloc[:,1:14]
ytrain = train.iloc[:,-1]
ytest = test.iloc[:,-1]
yv = validation.iloc[:,-1]

# Up-sampling processing
sm = SMOTE(sampling_strategy={1: 2200},random_state=100) 
xtrainup,ytrainup = sm.fit_resample(xtrain,ytrain)



In [4]:
# Model Instantiation
clf = XGBClassifier(use_label_encoder=False
                   ,learning_rate =0.01  
                   ,n_estimators=250
                   ,max_depth=3
                   ,min_child_weight=4
                   ,gamma=1
                   ,subsample=0.8
                   ,colsample_bytree=0.9
                   ,objective= 'binary:logistic'
                   ,scale_pos_weight=0.9
                   ,nthread=4
                   ,reg_alpha=0.17
                   ,tree_method='auto'
                   ,seed=27
                   ,n_jobs=-1) 
clf.fit(xtrainup, ytrainup)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.9, gamma=1, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.01, max_delta_step=0, max_depth=3,
              min_child_weight=4, missing=nan, monotone_constraints='()',
              n_estimators=250, n_jobs=-1, nthread=4, num_parallel_tree=1,
              random_state=27, reg_alpha=0.17, reg_lambda=1,
              scale_pos_weight=0.9, seed=27, subsample=0.8, tree_method='auto',
              use_label_encoder=False, validate_parameters=1, verbosity=None)

In [5]:
# Testing the model on the training set
performance_xgboost(xtrainup,ytrainup,xtrain,ytrain)

auc =  0.935
accuracy =  83.3
------------------------
混淆矩阵为：
[[1261  321]
 [  42  544]]
tn= 1261
fp= 321
fn= 42
tp= 544
------------------------
PPV = 62.9
(544/865)
NPV = 96.8
(1261/1303)
sensitivity = 92.8
(544/586)
specificity = 79.7
(1261/1582)


In [6]:
# Testing the model on the internal validation set
performance_xgboost(xtrainup,ytrainup,xtest,ytest)

auc =  0.908
accuracy =  81.8
------------------------
混淆矩阵为：
[[522 144]
 [ 25 239]]
tn= 522
fp= 144
fn= 25
tp= 239
------------------------
PPV = 62.4
(239/383)
NPV = 95.4
(522/547)
sensitivity = 90.5
(239/264)
specificity = 78.4
(522/666)


In [7]:
# Testing the model on the external validation set
performance_xgboost(xtrainup,ytrainup,xv,yv)

auc =  0.863
accuracy =  75.8
------------------------
混淆矩阵为：
[[466 166]
 [ 44 191]]
tn= 466
fp= 166
fn= 44
tp= 191
------------------------
PPV = 53.5
(191/357)
NPV = 91.4
(466/510)
sensitivity = 81.3
(191/235)
specificity = 73.7
(466/632)
