In [1]:
# Importing the necessary dependency libraries
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import roc_curve, auc, roc_auc_score
import shap
import imblearn
from imblearn.over_sampling import SMOTE

In [2]:
# Define model performance test functions
def performance_svm(xtrainup,ytrainup,xtrain,ytrain,model=None):
    """Fit a classifier and print its binary-classification metrics.

    The estimator is (re)fitted on ``xtrainup``/``ytrainup`` on every call and
    then evaluated on ``xtrain``/``ytrain``. Despite their names, the last two
    positional parameters are simply the evaluation set; any split can be
    passed there.

    Parameters
    ----------
    xtrainup, ytrainup
        Features and labels used to fit the estimator.
    xtrain, ytrain
        Features and true labels of the set to evaluate on. The confusion
        matrix is unpacked into four cells, so the task must be binary.
    model : optional
        Estimator exposing ``fit``, ``decision_function`` and ``predict``.
        When omitted, the notebook-level ``clf`` is used, so existing
        four-argument calls behave as before.

    Returns
    -------
    None
        Everything is printed: AUC, accuracy (%), the confusion matrix and
        its four cells, then PPV, NPV, sensitivity and specificity (%), each
        followed by the fraction it was computed from.
    """
    # Function-local import keeps this cell runnable without touching the
    # notebook's import cell.
    from sklearn.metrics import accuracy_score, confusion_matrix

    def _percent(numerator, denominator):
        # An empty denominator (e.g. no positive predictions) gives NaN
        # directly instead of triggering a divide-by-zero warning.
        if denominator == 0:
            return float('nan')
        return numerator/denominator*100

    # Fall back to the global estimator only when none is passed explicitly.
    estimator = clf if model is None else model
    estimator.fit(xtrainup,ytrainup)

    # Score and predict once; the predictions feed both the accuracy and
    # the confusion matrix.
    decision_scores = estimator.decision_function(xtrain)
    predicted = estimator.predict(xtrain)

    # AUC
    auc_scores = roc_auc_score(ytrain, decision_scores)
    print('auc = ', "%.3f"%auc_scores)
    # accuracy
    accuracy = accuracy_score(ytrain, predicted)*100
    print('accuracy = ', "%.1f"%accuracy)
    print('------------------------')
    # Confusion matrix
    matrix = confusion_matrix(ytrain, predicted)
    print('混淆矩阵为：')
    print(matrix)
    # ravel() flattens the 2x2 matrix row by row: tn, fp, fn, tp.
    (tn,fp,fn,tp) = matrix.ravel()
    print('tn=',tn)
    print('fp=',fp)
    print('fn=',fn)
    print('tp=',tp)
    print('------------------------')
    sensitivity = _percent(tp, tp+fn)
    specificity = _percent(tn, fp+tn)
    PPV = _percent(tp, tp+fp)
    NPV = _percent(tn, fn+tn)
    print(f'PPV = {"%.1f"%PPV}\n({tp}/{(tp+fp)})')
    print(f'NPV = {"%.1f"%NPV}\n({tn}/{(fn+tn)})')
    print(f'sensitivity = {"%.1f"%sensitivity}\n({tp}/{(tp+fn)})')
    print(f'specificity = {"%.1f"%specificity}\n({tn}/{(fp+tn)})')

In [3]:
# Reading data
# Single place to repoint the notebook at another copy of the data set.
DATA_DIR = 'F:/Onedrive/JIMMY/Python/Notebook/data_set/inuse'
CSV_ENCODING = 'gbk'

train = pd.read_csv(f'{DATA_DIR}/train.csv',encoding=CSV_ENCODING)
test = pd.read_csv(f'{DATA_DIR}/test.csv',encoding=CSV_ENCODING)
validation = pd.read_csv(f'{DATA_DIR}/validation_new04.csv',encoding=CSV_ENCODING)

# The same positional layout is applied to all three files:
# predictors are columns 9-13 (0-based), the label is the last column.
FEATURE_COLS = slice(9, 14)
LABEL_COL = -1

xtrain = train.iloc[:,FEATURE_COLS]
xtest = test.iloc[:,FEATURE_COLS]
xv = validation.iloc[:,FEATURE_COLS]
ytrain = train.iloc[:,LABEL_COL]
ytest = test.iloc[:,LABEL_COL]
yv = validation.iloc[:,LABEL_COL]

# Up-sampling processing
# Only the training split is resampled; test and validation stay untouched.
# NOTE(review): the requested 3000 samples for class 1 exceeds the class-0
# count reported in this cell's warning (1582), so class 1 becomes the
# majority after resampling - confirm this is intended.
sm = SMOTE(sampling_strategy={1: 3000},random_state=100)
xtrainup,ytrainup = sm.fit_resample(xtrain,ytrain)

After over-sampling, the number of samples (3000) in class 1 will be larger than the number of samples in the majority class (class #0 -> 1582)


In [4]:
# Model instantiation: RBF-kernel support vector classifier.
# The estimator is only configured here; fitting happens inside performance_svm.
clf = SVC(
    kernel='rbf',
    gamma=0.01,
    C=2,
    cache_size=10000,  # kernel cache size; scikit-learn documents the unit as MB
)

In [5]:
# Testing the model on the training set
# Fit on the SMOTE-resampled data, evaluate on the original (non-resampled) training split.
performance_svm(
    xtrainup=xtrainup,
    ytrainup=ytrainup,
    xtrain=xtrain,
    ytrain=ytrain,
)

auc =  0.889
accuracy =  82.1
------------------------
混淆矩阵为：
[[1254  328]
 [  59  527]]
tn= 1254
fp= 328
fn= 59
tp= 527
------------------------
PPV = 61.6
(527/855)
NPV = 95.5
(1254/1313)
sensitivity = 89.9
(527/586)
specificity = 79.3
(1254/1582)


In [6]:
# Testing the model on the internal validation set (the split loaded from test.csv)
# The xtrain/ytrain parameters of performance_svm receive the evaluation data.
performance_svm(
    xtrainup=xtrainup,
    ytrainup=ytrainup,
    xtrain=xtest,
    ytrain=ytest,
)

auc =  0.873
accuracy =  82.7
------------------------
混淆矩阵为：
[[529 137]
 [ 24 240]]
tn= 529
fp= 137
fn= 24
tp= 240
------------------------
PPV = 63.7
(240/377)
NPV = 95.7
(529/553)
sensitivity = 90.9
(240/264)
specificity = 79.4
(529/666)


In [7]:
# Testing the model on the external validation set (the split loaded from validation_new04.csv)
# The xtrain/ytrain parameters of performance_svm receive the evaluation data.
performance_svm(
    xtrainup=xtrainup,
    ytrainup=ytrainup,
    xtrain=xv,
    ytrain=yv,
)

auc =  0.878
accuracy =  70.6
------------------------
混淆矩阵为：
[[403 229]
 [ 26 209]]
tn= 403
fp= 229
fn= 26
tp= 209
------------------------
PPV = 47.7
(209/438)
NPV = 93.9
(403/429)
sensitivity = 88.9
(209/235)
specificity = 63.8
(403/632)
