In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn import tree

In [25]:
def calculate_fpr_tpr_tnr_f1score_accuracy(y_true, y_pred):
    """Return the overall accuracy of ``y_pred`` measured against ``y_true``.

    Despite its name, this function only computes the accuracy (the fraction
    of positions where both inputs hold the same value); the name is kept so
    existing calls keep working.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels. Any shape; flattened before comparison.
    y_pred : array-like
        Predicted labels. Must contain as many elements as ``y_true``.

    Returns
    -------
    float
        Number of matching elements divided by the total number of elements.

    Raises
    ------
    ValueError
        If the inputs are empty or do not contain the same number of elements.
    """
    # np.asarray lets plain lists/tuples through as well as ndarrays.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    if y_true.size != y_pred.size:
        # Comparing only a prefix would silently produce a wrong score.
        raise ValueError(
            "y_true and y_pred must have the same number of elements "
            "(got %d and %d)" % (y_true.size, y_pred.size)
        )
    if y_true.size == 0:
        raise ValueError("accuracy is undefined for empty input")

    true_num = int(np.count_nonzero(y_true == y_pred))
    accuracy_value = true_num / y_true.size
    return accuracy_value

In [26]:
# Location of the SERS workbook, relative to the notebook's working directory.
# Forward slashes are used on purpose: the former backslash spelling relied on
# invalid escape sequences ("\.", "\d", "\S") and only resolved on Windows.
file_path = "./../data/SERS.xlsx"


def read_excel(path):
    """Load the first three worksheets of an Excel workbook as NumPy arrays.

    The workbook is opened once and closed again when reading is done. Every
    sheet is parsed with ``header=None``, so the first spreadsheet row is kept
    as data instead of being used for column names.

    Parameters
    ----------
    path : str or path-like
        Location of the workbook.

    Returns
    -------
    tuple of numpy.ndarray
        ``(data_1, data_2, data_3)``: the cell values of the first, second and
        third sheet, in workbook order.

    Raises
    ------
    IndexError
        If the workbook contains fewer than three sheets.
    """
    # The context manager releases the file handle even if parsing fails.
    with pd.ExcelFile(path) as workbook:
        sheet_names = workbook.sheet_names
        data_1 = workbook.parse(str(sheet_names[0]), header=None).values
        data_2 = workbook.parse(str(sheet_names[1]), header=None).values
        data_3 = workbook.parse(str(sheet_names[2]), header=None).values

    return data_1, data_2, data_3

In [27]:
def X_data_process(data):
    """Turn one raw sheet into a feature matrix and report its row count.

    The input is transposed and the first row of the transposed array (that
    is, the first column of ``data``) is discarded.
    NOTE(review): presumably that column is the shared measurement axis and
    every remaining column is one sample - confirm against the workbook.

    Parameters
    ----------
    data : numpy.ndarray
        Raw values of one worksheet.

    Returns
    -------
    tuple
        ``(feature_matrix, sample_num)`` where ``sample_num`` is the number
        of rows in ``feature_matrix``. ``sample_num`` is also printed.
    """
    transposed = np.transpose(data)
    feature_matrix = np.delete(transposed, obj=0, axis=0)
    sample_num = feature_matrix.shape[0]
    print(sample_num)
    return feature_matrix, sample_num

In [28]:
# Read the three worksheets and convert each into a feature matrix
# (one row per sample after X_data_process).
data_health_raw, data_nose_raw, data_stomac_raw = read_excel(file_path)
data_health_raw_fea, data_health_raw_num = X_data_process(data_health_raw)
data_nose_raw_fea, data_nose_raw_num = X_data_process(data_nose_raw)
data_stomac_raw_fea, data_stomac_raw_num = X_data_process(data_stomac_raw)

# Stack the groups row-wise in sheet order: first, second, third sheet.
data = np.concatenate(
    [data_health_raw_fea, data_nose_raw_fea, data_stomac_raw_fea], axis=0
)

# Labels follow the stacking order: 1 = first sheet, 2 = second, 3 = third.
# Float labels are kept on purpose (same dtype as the former np.zeros array).
total_num = data_health_raw_num + data_nose_raw_num + data_stomac_raw_num
label = np.repeat(
    [1.0, 2.0, 3.0],
    [data_health_raw_num, data_nose_raw_num, data_stomac_raw_num],
)

X = data
Y = label

# Shuffle samples and labels with the same permutation. The seed is fixed so
# that the (unshuffled) StratifiedKFold splits used further down, and hence
# every reported metric, are reproducible after a kernel restart.
SHUFFLE_SEED = 0
length = len(Y)
index = np.random.default_rng(SHUFFLE_SEED).permutation(length)
X = X[index, :]
Y = Y[index]

X = pd.DataFrame(X)

33
49
32


In [29]:
"""random forest"""
# Random forest evaluated with stratified 10-fold cross-validation.
from sklearn.ensemble import RandomForestClassifier


rf = RandomForestClassifier(random_state=0)
clf = rf

skf = StratifiedKFold(n_splits=10, random_state=None, shuffle=False)

# Out-of-fold predictions: row `no` receives the prediction made for that
# sample while it was part of the held-out fold.
tmp = pd.DataFrame(columns=['no', 'pred'])
tmp['no'] = list(range(len(X)))
for trn_idx, val_idx in skf.split(X, Y):
    trn_x, trn_y = X.iloc[trn_idx].reset_index(drop=True), Y[trn_idx]
    val_x = X.iloc[val_idx].reset_index(drop=True)
    clf = clf.fit(trn_x, trn_y)
    pred_y = clf.predict(val_x)
    # The boolean mask selects rows in ascending order, so this relies on
    # val_idx being sorted for pred_y to line up with the selected rows.
    tmp.loc[tmp['no'].isin(val_idx), 'pred'] = pred_y

y_true = np.array(Y)
y_pred = np.array(tmp['pred'].tolist())

# Rows of the confusion matrix are true classes, columns are predicted ones.
Mix_matrix = confusion_matrix(y_true, y_pred)
diag = Mix_matrix.diagonal()

# diagonal / column sum is the per-class PRECISION. The variable names are
# kept for backward compatibility, but the printed labels now say what the
# numbers really are (they were previously reported as "accuracy").
accuracy_class_1, accuracy_class_2, accuracy_class_3 = diag / Mix_matrix.sum(axis=0)

# diagonal / row sum is the per-class recall.
recall_1, recall_2, recall_3 = diag / Mix_matrix.sum(axis=1)

print("RandomForest")

print(Mix_matrix)
print("precision_class1:", accuracy_class_1)
print("precision_class2:", accuracy_class_2)
print("precision_class3:", accuracy_class_3)

print("recall_class1:", recall_1)
print("recall_class2:", recall_2)
print("recall_class3:", recall_3)

f1_macro = f1_score(y_true, y_pred, average='macro')
f1_micro = f1_score(y_true, y_pred, average='micro')

print("F1_macro:", f1_macro)
print("F1_micro:", f1_micro)

# Overall accuracy across all out-of-fold predictions.
accuracy = calculate_fpr_tpr_tnr_f1score_accuracy(y_true, y_pred)
print("accuracy:", accuracy)

RandomForest
[[32  1  0]
 [ 0 49  0]
 [ 7  2 23]]
accuracy_calss1: 0.8205128205128205
accuracy_calss2: 0.9423076923076923
accuracy_calss3: 1.0
recall_calss1: 0.9696969696969697
recall_calss2: 1.0
recall_calss3: 0.71875
F1_macro: 0.8985165183184985
F1_micro: 0.9122807017543859
accuracy: 0.9122807017543859


In [30]:
"""ExtraTrees"""
# Extra-trees ensemble evaluated with stratified 10-fold cross-validation.
from sklearn.ensemble import ExtraTreesClassifier


rf = ExtraTreesClassifier(n_estimators=100, random_state=0)
clf = rf

skf = StratifiedKFold(n_splits=10, random_state=None, shuffle=False)

# Out-of-fold predictions: row `no` receives the prediction made for that
# sample while it was part of the held-out fold.
tmp = pd.DataFrame(columns=['no', 'pred'])
tmp['no'] = list(range(len(X)))
for trn_idx, val_idx in skf.split(X, Y):
    trn_x, trn_y = X.iloc[trn_idx].reset_index(drop=True), Y[trn_idx]
    val_x = X.iloc[val_idx].reset_index(drop=True)
    clf = clf.fit(trn_x, trn_y)
    pred_y = clf.predict(val_x)
    # The boolean mask selects rows in ascending order, so this relies on
    # val_idx being sorted for pred_y to line up with the selected rows.
    tmp.loc[tmp['no'].isin(val_idx), 'pred'] = pred_y

y_true = np.array(Y)
y_pred = np.array(tmp['pred'].tolist())

# Rows of the confusion matrix are true classes, columns are predicted ones.
Mix_matrix = confusion_matrix(y_true, y_pred)
diag = Mix_matrix.diagonal()

# diagonal / column sum is the per-class PRECISION. The variable names are
# kept for backward compatibility, but the printed labels now say what the
# numbers really are (they were previously reported as "accuracy").
accuracy_class_1, accuracy_class_2, accuracy_class_3 = diag / Mix_matrix.sum(axis=0)

# diagonal / row sum is the per-class recall.
recall_1, recall_2, recall_3 = diag / Mix_matrix.sum(axis=1)

print("ExtraTrees")

print(Mix_matrix)
print("precision_class1:", accuracy_class_1)
print("precision_class2:", accuracy_class_2)
print("precision_class3:", accuracy_class_3)

print("recall_class1:", recall_1)
print("recall_class2:", recall_2)
print("recall_class3:", recall_3)

f1_macro = f1_score(y_true, y_pred, average='macro')
f1_micro = f1_score(y_true, y_pred, average='micro')

print("F1_macro:", f1_macro)
print("F1_micro:", f1_micro)

# Overall accuracy across all out-of-fold predictions.
accuracy = calculate_fpr_tpr_tnr_f1score_accuracy(y_true, y_pred)
print("accuracy:", accuracy)

ExtraTrees
[[31  0  2]
 [ 0 49  0]
 [ 3  2 27]]
accuracy_calss1: 0.9117647058823529
accuracy_calss2: 0.9607843137254902
accuracy_calss3: 0.9310344827586207
recall_calss1: 0.9393939393939394
recall_calss2: 1.0
recall_calss3: 0.84375
F1_macro: 0.9302063453225675
F1_micro: 0.9385964912280702
accuracy: 0.9385964912280702


In [37]:
"""XGBoost"""
# Single decision tree evaluated with stratified 10-fold cross-validation.
# NOTE: the "XGBoost" title string that precedes this cell is a leftover;
# the model trained here is a DecisionTreeClassifier.
from sklearn.tree import DecisionTreeClassifier
import warnings
# NOTE(review): this silences every warning for the rest of the kernel
# session, not just for this cell - consider narrowing or removing it.
warnings.filterwarnings("ignore")

rf = DecisionTreeClassifier(random_state=0)
clf = rf

skf = StratifiedKFold(n_splits=10, random_state=None, shuffle=False)

# Out-of-fold predictions: row `no` receives the prediction made for that
# sample while it was part of the held-out fold.
tmp = pd.DataFrame(columns=['no', 'pred'])
tmp['no'] = list(range(len(X)))
for trn_idx, val_idx in skf.split(X, Y):
    trn_x, trn_y = X.iloc[trn_idx].reset_index(drop=True), Y[trn_idx]
    val_x = X.iloc[val_idx].reset_index(drop=True)
    clf = clf.fit(trn_x, trn_y)
    pred_y = clf.predict(val_x)
    # The boolean mask selects rows in ascending order, so this relies on
    # val_idx being sorted for pred_y to line up with the selected rows.
    tmp.loc[tmp['no'].isin(val_idx), 'pred'] = pred_y

y_true = np.array(Y)
y_pred = np.array(tmp['pred'].tolist())

# Rows of the confusion matrix are true classes, columns are predicted ones.
Mix_matrix = confusion_matrix(y_true, y_pred)
diag = Mix_matrix.diagonal()

# diagonal / column sum is the per-class PRECISION. The variable names are
# kept for backward compatibility, but the printed labels now say what the
# numbers really are (they were previously reported as "accuracy").
accuracy_class_1, accuracy_class_2, accuracy_class_3 = diag / Mix_matrix.sum(axis=0)

# diagonal / row sum is the per-class recall.
recall_1, recall_2, recall_3 = diag / Mix_matrix.sum(axis=1)

print("DecisionTree")

print(Mix_matrix)
print("precision_class1:", accuracy_class_1)
print("precision_class2:", accuracy_class_2)
print("precision_class3:", accuracy_class_3)

print("recall_class1:", recall_1)
print("recall_class2:", recall_2)
print("recall_class3:", recall_3)

f1_macro = f1_score(y_true, y_pred, average='macro')
f1_micro = f1_score(y_true, y_pred, average='micro')

print("F1_macro:", f1_macro)
print("F1_micro:", f1_micro)

# Overall accuracy across all out-of-fold predictions.
accuracy = calculate_fpr_tpr_tnr_f1score_accuracy(y_true, y_pred)
print("accuracy:", accuracy)

DecisionTree
[[25  3  5]
 [ 2 43  4]
 [ 3  2 27]]
accuracy_calss1: 0.8333333333333334
accuracy_calss2: 0.8958333333333334
accuracy_calss3: 0.75
recall_calss1: 0.7575757575757576
recall_calss2: 0.8775510204081632
recall_calss3: 0.84375
F1_macro: 0.8247887929513157
F1_micro: 0.8333333333333334
accuracy: 0.8333333333333334


In [38]:
# XGBoost evaluated with stratified 10-fold cross-validation.
from xgboost.sklearn import XGBClassifier

rf = XGBClassifier(n_jobs=-1)
clf = rf

# Recent XGBoost releases reject class labels that are not 0..n_classes-1,
# while Y holds 1.0/2.0/3.0. Train on the encoded labels and map the
# predictions back, so the metrics below are still expressed in terms of Y.
classes, Y_encoded = np.unique(Y, return_inverse=True)

skf = StratifiedKFold(n_splits=10, random_state=None, shuffle=False)

# Out-of-fold predictions: row `no` receives the prediction made for that
# sample while it was part of the held-out fold.
tmp = pd.DataFrame(columns=['no', 'pred'])
tmp['no'] = list(range(len(X)))
for trn_idx, val_idx in skf.split(X, Y):
    trn_x, trn_y = X.iloc[trn_idx].reset_index(drop=True), Y_encoded[trn_idx]
    val_x = X.iloc[val_idx].reset_index(drop=True)
    clf = clf.fit(trn_x, trn_y)
    # Decode 0..n-1 predictions back to the original label values.
    pred_y = classes[np.asarray(clf.predict(val_x)).astype(int)]
    # The boolean mask selects rows in ascending order, so this relies on
    # val_idx being sorted for pred_y to line up with the selected rows.
    tmp.loc[tmp['no'].isin(val_idx), 'pred'] = pred_y

y_true = np.array(Y)
y_pred = np.array(tmp['pred'].tolist())

# Rows of the confusion matrix are true classes, columns are predicted ones.
Mix_matrix = confusion_matrix(y_true, y_pred)
diag = Mix_matrix.diagonal()

# diagonal / column sum is the per-class PRECISION. The variable names are
# kept for backward compatibility, but the printed labels now say what the
# numbers really are (they were previously reported as "accuracy").
accuracy_class_1, accuracy_class_2, accuracy_class_3 = diag / Mix_matrix.sum(axis=0)

# diagonal / row sum is the per-class recall.
recall_1, recall_2, recall_3 = diag / Mix_matrix.sum(axis=1)

print("XGBoost")

print(Mix_matrix)
print("precision_class1:", accuracy_class_1)
print("precision_class2:", accuracy_class_2)
print("precision_class3:", accuracy_class_3)

print("recall_class1:", recall_1)
print("recall_class2:", recall_2)
print("recall_class3:", recall_3)

f1_macro = f1_score(y_true, y_pred, average='macro')
f1_micro = f1_score(y_true, y_pred, average='micro')

print("F1_macro:", f1_macro)
print("F1_micro:", f1_micro)

# Overall accuracy across all out-of-fold predictions.
accuracy = calculate_fpr_tpr_tnr_f1score_accuracy(y_true, y_pred)
print("accuracy:", accuracy)

XGBoost
[[26  0  7]
 [ 1 48  0]
 [ 5  1 26]]
accuracy_calss1: 0.8125
accuracy_calss2: 0.9795918367346939
accuracy_calss3: 0.7878787878787878
recall_calss1: 0.7878787878787878
recall_calss2: 0.9795918367346939
recall_calss3: 0.8125
F1_macro: 0.8598639455782312
F1_micro: 0.8771929824561403
accuracy: 0.8771929824561403


In [39]:
"""GBDT"""
# Gradient-boosted trees evaluated with stratified 10-fold cross-validation.
from sklearn.ensemble import GradientBoostingClassifier

rf = GradientBoostingClassifier(random_state=100)
clf = rf

skf = StratifiedKFold(n_splits=10, random_state=None, shuffle=False)

# Out-of-fold predictions: row `no` receives the prediction made for that
# sample while it was part of the held-out fold.
tmp = pd.DataFrame(columns=['no', 'pred'])
tmp['no'] = list(range(len(X)))
for trn_idx, val_idx in skf.split(X, Y):
    trn_x, trn_y = X.iloc[trn_idx].reset_index(drop=True), Y[trn_idx]
    val_x = X.iloc[val_idx].reset_index(drop=True)
    clf = clf.fit(trn_x, trn_y)
    pred_y = clf.predict(val_x)
    # The boolean mask selects rows in ascending order, so this relies on
    # val_idx being sorted for pred_y to line up with the selected rows.
    tmp.loc[tmp['no'].isin(val_idx), 'pred'] = pred_y

y_true = np.array(Y)
y_pred = np.array(tmp['pred'].tolist())

# Rows of the confusion matrix are true classes, columns are predicted ones.
Mix_matrix = confusion_matrix(y_true, y_pred)
diag = Mix_matrix.diagonal()

# diagonal / column sum is the per-class PRECISION. The variable names are
# kept for backward compatibility, but the printed labels now say what the
# numbers really are (they were previously reported as "accuracy").
accuracy_class_1, accuracy_class_2, accuracy_class_3 = diag / Mix_matrix.sum(axis=0)

# diagonal / row sum is the per-class recall.
recall_1, recall_2, recall_3 = diag / Mix_matrix.sum(axis=1)

print("GBDT")

print(Mix_matrix)
print("precision_class1:", accuracy_class_1)
print("precision_class2:", accuracy_class_2)
print("precision_class3:", accuracy_class_3)

print("recall_class1:", recall_1)
print("recall_class2:", recall_2)
print("recall_class3:", recall_3)

f1_macro = f1_score(y_true, y_pred, average='macro')
f1_micro = f1_score(y_true, y_pred, average='micro')

print("F1_macro:", f1_macro)
print("F1_micro:", f1_micro)

# Overall accuracy across all out-of-fold predictions.
accuracy = calculate_fpr_tpr_tnr_f1score_accuracy(y_true, y_pred)
print("accuracy:", accuracy)

GBDT
[[27  2  4]
 [ 1 48  0]
 [ 5  0 27]]
accuracy_calss1: 0.8181818181818182
accuracy_calss2: 0.96
accuracy_calss3: 0.8709677419354839
recall_calss1: 0.8181818181818182
recall_calss2: 0.9795918367346939
recall_calss3: 0.84375
F1_macro: 0.8816738816738817
F1_micro: 0.8947368421052632
accuracy: 0.8947368421052632


In [46]:
"""LGB"""
# LightGBM evaluated with stratified 10-fold cross-validation.
from lightgbm import LGBMClassifier

rf = LGBMClassifier()
clf = rf

skf = StratifiedKFold(n_splits=10, random_state=None, shuffle=False)

# Out-of-fold predictions: row `no` receives the prediction made for that
# sample while it was part of the held-out fold.
tmp = pd.DataFrame(columns=['no', 'pred'])
tmp['no'] = list(range(len(X)))
for trn_idx, val_idx in skf.split(X, Y):
    trn_x, trn_y = X.iloc[trn_idx].reset_index(drop=True), Y[trn_idx]
    val_x = X.iloc[val_idx].reset_index(drop=True)
    clf = clf.fit(trn_x, trn_y)
    pred_y = clf.predict(val_x)
    # The boolean mask selects rows in ascending order, so this relies on
    # val_idx being sorted for pred_y to line up with the selected rows.
    tmp.loc[tmp['no'].isin(val_idx), 'pred'] = pred_y

y_true = np.array(Y)
y_pred = np.array(tmp['pred'].tolist())

# Rows of the confusion matrix are true classes, columns are predicted ones.
Mix_matrix = confusion_matrix(y_true, y_pred)
diag = Mix_matrix.diagonal()

# diagonal / column sum is the per-class PRECISION. The variable names are
# kept for backward compatibility, but the printed labels now say what the
# numbers really are (they were previously reported as "accuracy").
accuracy_class_1, accuracy_class_2, accuracy_class_3 = diag / Mix_matrix.sum(axis=0)

# diagonal / row sum is the per-class recall.
recall_1, recall_2, recall_3 = diag / Mix_matrix.sum(axis=1)

print("LGB")

print(Mix_matrix)
print("precision_class1:", accuracy_class_1)
print("precision_class2:", accuracy_class_2)
print("precision_class3:", accuracy_class_3)

print("recall_class1:", recall_1)
print("recall_class2:", recall_2)
print("recall_class3:", recall_3)

f1_macro = f1_score(y_true, y_pred, average='macro')
f1_micro = f1_score(y_true, y_pred, average='micro')

print("F1_macro:", f1_macro)
print("F1_micro:", f1_micro)

# Overall accuracy across all out-of-fold predictions.
accuracy = calculate_fpr_tpr_tnr_f1score_accuracy(y_true, y_pred)
print("accuracy:", accuracy)

LGB
[[28  0  5]
 [ 0 49  0]
 [ 4  1 27]]
accuracy_calss1: 0.875
accuracy_calss2: 0.98
accuracy_calss3: 0.84375
recall_calss1: 0.8484848484848485
recall_calss2: 1.0
recall_calss3: 0.84375
F1_macro: 0.8983958171458172
F1_micro: 0.9122807017543859
accuracy: 0.9122807017543859
