In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn import tree

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [4]:
def calculate_fpr_tpr_tnr_f1score_accuracy(y_true, y_pred):
    """Return the overall accuracy of `y_pred` against `y_true`.

    Despite the function name, only accuracy is computed: the fraction of
    positions at which the two label sequences are equal.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels. Any shape is accepted; both inputs are
        flattened before being compared element by element.

    Returns
    -------
    float
        Number of matching positions divided by the number of labels.

    Raises
    ------
    ValueError
        If the inputs are empty or do not contain the same number of labels.
    """
    # np.asarray lets plain lists through; the original required ndarray inputs.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    if y_true.size != y_pred.size:
        # Previously extra predictions were silently ignored and missing ones
        # surfaced as an IndexError in the middle of the loop.
        raise ValueError(
            "y_true and y_pred must have the same number of elements "
            "(got %d and %d)" % (y_true.size, y_pred.size)
        )
    if y_true.size == 0:
        raise ValueError("accuracy is undefined for empty input")

    true_num = int(np.count_nonzero(y_true == y_pred))
    accuracy_value = true_num / y_true.size
    return accuracy_value

In [5]:
# Location of the workbook, relative to the notebook's working directory.
# Forward slashes are used because they work on Windows as well as POSIX, and
# because the backslash form contained invalid escape sequences ("\.", "\d",
# "\S") that Python warns about.
file_path = "./../data/SERS.xlsx"


def read_excel(path):
    """Load the first three sheets of an Excel workbook as raw arrays.

    Parameters
    ----------
    path : str or path-like
        Location of the workbook.

    Returns
    -------
    tuple of numpy.ndarray
        ``(data_1, data_2, data_3)``: the cell values of the first, second and
        third sheet, read without a header row (``header=None``).

    Raises
    ------
    ValueError
        If the workbook contains fewer than three sheets.
    """
    # Open the workbook once and close it deterministically; the original left
    # the ExcelFile handle open and re-opened the file for every sheet.
    with pd.ExcelFile(path) as workbook:
        sheet_names = workbook.sheet_names
        if len(sheet_names) < 3:
            raise ValueError(
                "expected at least 3 sheets in %r, found %d"
                % (path, len(sheet_names))
            )
        data_1, data_2, data_3 = (
            workbook.parse(str(sheet_name), header=None).values
            for sheet_name in sheet_names[:3]
        )

    return data_1, data_2, data_3

In [6]:
def X_data_process(data):
    """Transpose a raw sheet and drop its first transposed row.

    After transposing, row 0 (the sheet's first column) is removed; every
    remaining row is counted as one sample. The sample count is printed as a
    side effect.

    Returns
    -------
    tuple
        ``(feature_matrix, sample_num)``: the remaining rows and how many of
        them there are.
    """
    transposed = np.transpose(data)
    # Remove the first row of the transposed sheet (presumably an axis/index
    # column in the workbook -- TODO confirm against the data file).
    feature_matrix = np.delete(transposed, 0, axis=0)
    sample_num = feature_matrix.shape[0]
    print(sample_num)
    return feature_matrix, sample_num

In [7]:
# Load the three sheets and turn each into a (samples x features) matrix.
data_health_raw, data_nose_raw, data_stomac_raw = read_excel(file_path)
data_health_raw_fea, data_health_raw_num = X_data_process(data_health_raw)
data_nose_raw_fea, data_nose_raw_num = X_data_process(data_nose_raw)
data_stomac_raw_fea, data_stomac_raw_num = X_data_process(data_stomac_raw)

# Stack the three groups row-wise, in sheet order.
data = np.concatenate(
    [data_health_raw_fea, data_nose_raw_fea, data_stomac_raw_fea], axis=0
)

# Class labels follow the stacking order: 1 for the first sheet, 2 for the
# second, 3 for the third (float dtype, as before).
total_num = data_health_raw_num + data_nose_raw_num + data_stomac_raw_num
label = np.repeat(
    [1.0, 2.0, 3.0],
    [data_health_raw_num, data_nose_raw_num, data_stomac_raw_num],
)

X = data
Y = label

# Shuffle samples and labels with the same permutation. A dedicated, seeded
# generator is used so that the sample order (and therefore every unshuffled
# cross-validation split downstream) is identical after Restart & Run All;
# the original shuffle drew from the unseeded global NumPy state.
length = len(Y)
index = np.random.RandomState(0).permutation(length)
X = X[index, :]
Y = Y[index]

X = pd.DataFrame(X)

33
49
32


In [47]:
"""random forest"""
from sklearn.ensemble import RandomForestClassifier


rf = RandomForestClassifier(random_state=0)
clf = rf

skf = StratifiedKFold(n_splits=10,random_state=None, shuffle=True)
tmp=pd.DataFrame(columns=['no','pred'])
tmp['no']=[i for i in range(len(X))]
for i, (trn_idx, val_idx) in enumerate(skf.split(X,Y)):
    trn_x, trn_y = X.iloc[trn_idx].reset_index(drop=True), Y[trn_idx]
    val_x, val_y = X.iloc[val_idx].reset_index(drop=True), Y[val_idx]
    clf = clf.fit(trn_x,trn_y)
    pred_y=clf.predict(val_x)
    tmp.loc[tmp['no'].isin(val_idx),'pred']=pred_y

Mix_matrix = confusion_matrix(np.array(Y), np.array(tmp['pred'].tolist()))
accuracy_class_1 = Mix_matrix[0][0] / (Mix_matrix[0][0] + Mix_matrix[1][0] + Mix_matrix[2][0])
accuracy_class_2 = Mix_matrix[1][1] / (Mix_matrix[0][1] + Mix_matrix[1][1] + Mix_matrix[2][1]) 
accuracy_class_3 = Mix_matrix[2][2] / (Mix_matrix[0][2] + Mix_matrix[1][2] + Mix_matrix[2][2])



recall_1 = Mix_matrix[0][0] / Mix_matrix[0].sum()
recall_2 = Mix_matrix[1][1] / Mix_matrix[1].sum()
recall_3 = Mix_matrix[2][2] / Mix_matrix[2].sum()

print("RandomForest")

print(Mix_matrix)
print("accuracy_calss1:", accuracy_class_1)
print("accuracy_calss2:", accuracy_class_2)
print("accuracy_calss3:", accuracy_class_3)


print("recall_calss1:", recall_1)
print("recall_calss2:", recall_2)
print("recall_calss3:", recall_3)


f1_macro = f1_score(np.array(Y), np.array(tmp['pred'].tolist()), average='macro')
f1_micro = f1_score(np.array(Y), np.array(tmp['pred'].tolist()), average='micro')

print("F1_macro:", f1_macro)
print("F1_micro:", f1_micro)


# (fpr, tpr, tnr, f1Score, accuracy) = calculate_fpr_tpr_tnr_f1score_accuracy(np.array(Y), np.array(tmp['pred'].tolist()))
accuracy = calculate_fpr_tpr_tnr_f1score_accuracy(np.array(Y), np.array(tmp['pred'].tolist()))
#print(clf)
print("accuracy:", accuracy)

RandomForest
[[28  0  5]
 [ 0 49  0]
 [ 6  3 23]]
accuracy_calss1: 0.8235294117647058
accuracy_calss2: 0.9423076923076923
accuracy_calss3: 0.8214285714285714
recall_calss1: 0.8484848484848485
recall_calss2: 1.0
recall_calss3: 0.71875
F1_macro: 0.8575948639640084
F1_micro: 0.8771929824561403
accuracy: 0.8771929824561403


In [8]:
"""random forest + randomizedsearchcv"""
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import ShuffleSplit

model_params={
    'random_forest':{
        'model':RandomForestClassifier(),
        'params':{
            'n_estimators':[100,200,300,400,500,600,700],
            'criterion' : ['gini', 'entropy'],
            'max_features' : ['auto', 'sqrt','log2'],
            'max_depth' : [10,50,100,150,200,250,300,350,400,450,500]
        }
    }
}

scores=[]
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=10)
for model_name, mp in model_params.items():
    random_cv=RandomizedSearchCV(mp['model'],mp['params'],cv=cv, return_train_score=False)
    random_cv.fit(X,Y)
    scores.append({
        'model':model_name,
        'best_score':random_cv.best_score_,
        'best param':random_cv.best_params_,
        'best estimator':random_cv.best_estimator_
    })

ds=pd.DataFrame(scores,columns=['model','best param'])
ds

def display_text_max_col_width(df, width):
    with pd.option_context('display.max_colwidth', width):
        print(df)

display_text_max_col_width(ds['best param'], 800)

0    {'n_estimators': 700, 'max_features': 'log2', 'max_depth': 50, 'criterion': 'gini'}
Name: best param, dtype: object


In [13]:
"""random forest + randomizedsearchcv"""
from sklearn.ensemble import RandomForestClassifier


rf_random=RandomForestClassifier(n_estimators=700,criterion='gini',max_features= 'log2', max_depth= 50)
clf = rf_random

skf = StratifiedKFold(n_splits=10,random_state=None, shuffle=True)
tmp=pd.DataFrame(columns=['no','pred'])
tmp['no']=[i for i in range(len(X))]
for i, (trn_idx, val_idx) in enumerate(skf.split(X,Y)):
    trn_x, trn_y = X.iloc[trn_idx].reset_index(drop=True), Y[trn_idx]
    val_x, val_y = X.iloc[val_idx].reset_index(drop=True), Y[val_idx]
    clf = clf.fit(trn_x,trn_y)
    pred_y=clf.predict(val_x)
    tmp.loc[tmp['no'].isin(val_idx),'pred']=pred_y

Mix_matrix = confusion_matrix(np.array(Y), np.array(tmp['pred'].tolist()))
accuracy_class_1 = Mix_matrix[0][0] / (Mix_matrix[0][0] + Mix_matrix[1][0] + Mix_matrix[2][0])
accuracy_class_2 = Mix_matrix[1][1] / (Mix_matrix[0][1] + Mix_matrix[1][1] + Mix_matrix[2][1]) 
accuracy_class_3 = Mix_matrix[2][2] / (Mix_matrix[0][2] + Mix_matrix[1][2] + Mix_matrix[2][2])



recall_1 = Mix_matrix[0][0] / Mix_matrix[0].sum()
recall_2 = Mix_matrix[1][1] / Mix_matrix[1].sum()
recall_3 = Mix_matrix[2][2] / Mix_matrix[2].sum()

print("random forest + randomizedsearchcv")

print(Mix_matrix)
print("accuracy_calss1:", accuracy_class_1)
print("accuracy_calss2:", accuracy_class_2)
print("accuracy_calss3:", accuracy_class_3)


print("recall_calss1:", recall_1)
print("recall_calss2:", recall_2)
print("recall_calss3:", recall_3)


f1_macro = f1_score(np.array(Y), np.array(tmp['pred'].tolist()), average='macro')
f1_micro = f1_score(np.array(Y), np.array(tmp['pred'].tolist()), average='micro')

print("F1_macro:", f1_macro)
print("F1_micro:", f1_micro)


# (fpr, tpr, tnr, f1Score, accuracy) = calculate_fpr_tpr_tnr_f1score_accuracy(np.array(Y), np.array(tmp['pred'].tolist()))
accuracy = calculate_fpr_tpr_tnr_f1score_accuracy(np.array(Y), np.array(tmp['pred'].tolist()))
#print(clf)
print("accuracy:", accuracy)

random forest + randomizedsearchcv
[[29  0  4]
 [ 0 49  0]
 [ 5  2 25]]
accuracy_calss1: 0.8529411764705882
accuracy_calss2: 0.9607843137254902
accuracy_calss3: 0.8620689655172413
recall_calss1: 0.8787878787878788
recall_calss2: 1.0
recall_calss3: 0.78125
F1_macro: 0.8884479243128619
F1_micro: 0.9035087719298246
accuracy: 0.9035087719298246


In [11]:
"""random forest + gridsearchCV"""
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import ShuffleSplit


model_params1={
    'random_forest1':{
        'model1':RandomForestClassifier(),
        'params1':{
            'criterion' : [random_cv.best_params_['criterion']],
            'max_features' : [random_cv.best_params_['max_features']],
            'max_depth' : [random_cv.best_params_['max_depth']-20,
                           random_cv.best_params_['max_depth']-10,
                           random_cv.best_params_['max_depth'],
                           random_cv.best_params_['max_depth']+10,
                           random_cv.best_params_['max_depth']+20],
            'n_estimators':[random_cv.best_params_['n_estimators']-75,
                            random_cv.best_params_['n_estimators']-50,
                            random_cv.best_params_['n_estimators']-25,
                            random_cv.best_params_['n_estimators'],
                            random_cv.best_params_['n_estimators']+25,
                            random_cv.best_params_['n_estimators']+50,
                            random_cv.best_params_['n_estimators']+75]
        }
    }
}



from sklearn.model_selection import GridSearchCV
scores1=[]
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=10)
for model_name, mp in model_params1.items():
    grid_cv=GridSearchCV(mp['model1'],mp['params1'],cv=cv, return_train_score=False)
    grid_cv.fit(X,Y)
    scores1.append({
        'model1':model_name,
        'best_score1':grid_cv.best_score_,
        'best param1':grid_cv.best_params_,
        'best estimator1':grid_cv.best_estimator_
    })

ds1=pd.DataFrame(scores1,columns=['model1','best param1'])
ds1

def display_text_max_col_width(df, width):
    with pd.option_context('display.max_colwidth', width):
        print(df)

display_text_max_col_width(ds1['best param1'], 800)

0    {'criterion': 'gini', 'max_depth': 40, 'max_features': 'log2', 'n_estimators': 725}
Name: best param1, dtype: object


In [12]:
"""random forest + Gridsearchcv"""
from sklearn.ensemble import RandomForestClassifier


rf_random=RandomForestClassifier(n_estimators=725,criterion='gini',max_features= 'log2', max_depth= 40)
clf = rf_random

skf = StratifiedKFold(n_splits=10,random_state=9, shuffle=True)
tmp=pd.DataFrame(columns=['no','pred'])
tmp['no']=[i for i in range(len(X))]
for i, (trn_idx, val_idx) in enumerate(skf.split(X,Y)):
    trn_x, trn_y = X.iloc[trn_idx].reset_index(drop=True), Y[trn_idx]
    val_x, val_y = X.iloc[val_idx].reset_index(drop=True), Y[val_idx]
    clf = clf.fit(trn_x,trn_y)
    pred_y=clf.predict(val_x)
    tmp.loc[tmp['no'].isin(val_idx),'pred']=pred_y

Mix_matrix = confusion_matrix(np.array(Y), np.array(tmp['pred'].tolist()))
accuracy_class_1 = Mix_matrix[0][0] / (Mix_matrix[0][0] + Mix_matrix[1][0] + Mix_matrix[2][0])
accuracy_class_2 = Mix_matrix[1][1] / (Mix_matrix[0][1] + Mix_matrix[1][1] + Mix_matrix[2][1]) 
accuracy_class_3 = Mix_matrix[2][2] / (Mix_matrix[0][2] + Mix_matrix[1][2] + Mix_matrix[2][2])



recall_1 = Mix_matrix[0][0] / Mix_matrix[0].sum()
recall_2 = Mix_matrix[1][1] / Mix_matrix[1].sum()
recall_3 = Mix_matrix[2][2] / Mix_matrix[2].sum()

print("random forest + Gridsearchcv")

print(Mix_matrix)
print("accuracy_calss1:", accuracy_class_1)
print("accuracy_calss2:", accuracy_class_2)
print("accuracy_calss3:", accuracy_class_3)


print("recall_calss1:", recall_1)
print("recall_calss2:", recall_2)
print("recall_calss3:", recall_3)


f1_macro = f1_score(np.array(Y), np.array(tmp['pred'].tolist()), average='macro')
f1_micro = f1_score(np.array(Y), np.array(tmp['pred'].tolist()), average='micro')

print("F1_macro:", f1_macro)
print("F1_micro:", f1_micro)


# (fpr, tpr, tnr, f1Score, accuracy) = calculate_fpr_tpr_tnr_f1score_accuracy(np.array(Y), np.array(tmp['pred'].tolist()))
accuracy = calculate_fpr_tpr_tnr_f1score_accuracy(np.array(Y), np.array(tmp['pred'].tolist()))
#print(clf)
print("accuracy:", accuracy)

random forest + Gridsearchcv
[[30  0  3]
 [ 0 49  0]
 [ 4  1 27]]
accuracy_calss1: 0.8823529411764706
accuracy_calss2: 0.98
accuracy_calss3: 0.9
recall_calss1: 0.9090909090909091
recall_calss2: 1.0
recall_calss3: 0.84375
F1_macro: 0.9187963732980585
F1_micro: 0.9298245614035088
accuracy: 0.9298245614035088


In [30]:
"""ExtraTrees"""
from sklearn.ensemble import ExtraTreesClassifier


rf = ExtraTreesClassifier(n_estimators=100, random_state=0)
clf = rf

skf = StratifiedKFold(n_splits=10,random_state=None, shuffle=False)
tmp=pd.DataFrame(columns=['no','pred'])
tmp['no']=[i for i in range(len(X))]
for i, (trn_idx, val_idx) in enumerate(skf.split(X,Y)):
    trn_x, trn_y = X.iloc[trn_idx].reset_index(drop=True), Y[trn_idx]
    val_x, val_y = X.iloc[val_idx].reset_index(drop=True), Y[val_idx]
    clf = clf.fit(trn_x,trn_y)
    pred_y=clf.predict(val_x)
    tmp.loc[tmp['no'].isin(val_idx),'pred']=pred_y

Mix_matrix = confusion_matrix(np.array(Y), np.array(tmp['pred'].tolist()))
accuracy_class_1 = Mix_matrix[0][0] / (Mix_matrix[0][0] + Mix_matrix[1][0] + Mix_matrix[2][0])
accuracy_class_2 = Mix_matrix[1][1] / (Mix_matrix[0][1] + Mix_matrix[1][1] + Mix_matrix[2][1]) 
accuracy_class_3 = Mix_matrix[2][2] / (Mix_matrix[0][2] + Mix_matrix[1][2] + Mix_matrix[2][2])



recall_1 = Mix_matrix[0][0] / Mix_matrix[0].sum()
recall_2 = Mix_matrix[1][1] / Mix_matrix[1].sum()
recall_3 = Mix_matrix[2][2] / Mix_matrix[2].sum()

print("ExtraTrees")

print(Mix_matrix)
print("accuracy_calss1:", accuracy_class_1)
print("accuracy_calss2:", accuracy_class_2)
print("accuracy_calss3:", accuracy_class_3)


print("recall_calss1:", recall_1)
print("recall_calss2:", recall_2)
print("recall_calss3:", recall_3)


f1_macro = f1_score(np.array(Y), np.array(tmp['pred'].tolist()), average='macro')
f1_micro = f1_score(np.array(Y), np.array(tmp['pred'].tolist()), average='micro')

print("F1_macro:", f1_macro)
print("F1_micro:", f1_micro)


# (fpr, tpr, tnr, f1Score, accuracy) = calculate_fpr_tpr_tnr_f1score_accuracy(np.array(Y), np.array(tmp['pred'].tolist()))
accuracy = calculate_fpr_tpr_tnr_f1score_accuracy(np.array(Y), np.array(tmp['pred'].tolist()))
#print(clf)
print("accuracy:", accuracy)

ExtraTrees
[[31  0  2]
 [ 0 49  0]
 [ 3  2 27]]
accuracy_calss1: 0.9117647058823529
accuracy_calss2: 0.9607843137254902
accuracy_calss3: 0.9310344827586207
recall_calss1: 0.9393939393939394
recall_calss2: 1.0
recall_calss3: 0.84375
F1_macro: 0.9302063453225675
F1_micro: 0.9385964912280702
accuracy: 0.9385964912280702


In [37]:
"""XGBoost"""
from sklearn.tree import DecisionTreeClassifier
import warnings
warnings.filterwarnings("ignore")

rf = DecisionTreeClassifier(random_state=0)
clf = rf

skf = StratifiedKFold(n_splits=10,random_state=None, shuffle=False)
tmp=pd.DataFrame(columns=['no','pred'])
tmp['no']=[i for i in range(len(X))]
for i, (trn_idx, val_idx) in enumerate(skf.split(X,Y)):
    trn_x, trn_y = X.iloc[trn_idx].reset_index(drop=True), Y[trn_idx]
    val_x, val_y = X.iloc[val_idx].reset_index(drop=True), Y[val_idx]
    clf = clf.fit(trn_x,trn_y)
    pred_y=clf.predict(val_x)
    tmp.loc[tmp['no'].isin(val_idx),'pred']=pred_y

Mix_matrix = confusion_matrix(np.array(Y), np.array(tmp['pred'].tolist()))
accuracy_class_1 = Mix_matrix[0][0] / (Mix_matrix[0][0] + Mix_matrix[1][0] + Mix_matrix[2][0])
accuracy_class_2 = Mix_matrix[1][1] / (Mix_matrix[0][1] + Mix_matrix[1][1] + Mix_matrix[2][1]) 
accuracy_class_3 = Mix_matrix[2][2] / (Mix_matrix[0][2] + Mix_matrix[1][2] + Mix_matrix[2][2])



recall_1 = Mix_matrix[0][0] / Mix_matrix[0].sum()
recall_2 = Mix_matrix[1][1] / Mix_matrix[1].sum()
recall_3 = Mix_matrix[2][2] / Mix_matrix[2].sum()

print("DecisionTree")

print(Mix_matrix)
print("accuracy_calss1:", accuracy_class_1)
print("accuracy_calss2:", accuracy_class_2)
print("accuracy_calss3:", accuracy_class_3)


print("recall_calss1:", recall_1)
print("recall_calss2:", recall_2)
print("recall_calss3:", recall_3)


f1_macro = f1_score(np.array(Y), np.array(tmp['pred'].tolist()), average='macro')
f1_micro = f1_score(np.array(Y), np.array(tmp['pred'].tolist()), average='micro')

print("F1_macro:", f1_macro)
print("F1_micro:", f1_micro)


# (fpr, tpr, tnr, f1Score, accuracy) = calculate_fpr_tpr_tnr_f1score_accuracy(np.array(Y), np.array(tmp['pred'].tolist()))
accuracy = calculate_fpr_tpr_tnr_f1score_accuracy(np.array(Y), np.array(tmp['pred'].tolist()))
#print(clf)
print("accuracy:", accuracy)

DecisionTree
[[25  3  5]
 [ 2 43  4]
 [ 3  2 27]]
accuracy_calss1: 0.8333333333333334
accuracy_calss2: 0.8958333333333334
accuracy_calss3: 0.75
recall_calss1: 0.7575757575757576
recall_calss2: 0.8775510204081632
recall_calss3: 0.84375
F1_macro: 0.8247887929513157
F1_micro: 0.8333333333333334
accuracy: 0.8333333333333334


In [38]:
"""XGBoost"""
from sklearn.preprocessing import LabelEncoder
from xgboost.sklearn import XGBClassifier

# Gradient-boosted trees with library defaults, using all available cores.
rf = XGBClassifier(n_jobs=-1)
clf = rf

# The notebook's labels are 1.0 / 2.0 / 3.0, but newer xgboost releases only
# accept class labels 0..n_classes-1 and raise on anything else. Train on
# encoded labels and map the predictions back, so the metrics below are still
# expressed in the original label values.
label_encoder = LabelEncoder()
Y_encoded = label_encoder.fit_transform(Y)

# Stratified 10-fold CV, folds taken in the current sample order (no shuffle).
skf = StratifiedKFold(n_splits=10,random_state=None, shuffle=False)

# Out-of-fold predictions (encoded): every sample is predicted once, by the
# model fitted on the folds that do not contain it.
oof_encoded = np.empty(len(Y_encoded), dtype=int)
for trn_idx, val_idx in skf.split(X, Y_encoded):
    trn_x, trn_y = X.iloc[trn_idx].reset_index(drop=True), Y_encoded[trn_idx]
    val_x = X.iloc[val_idx].reset_index(drop=True)
    clf = clf.fit(trn_x, trn_y)
    oof_encoded[val_idx] = clf.predict(val_x)

# Back to the original label values.
oof_pred = label_encoder.inverse_transform(oof_encoded)

# Kept for compatibility with the rest of the notebook: sample position next
# to its out-of-fold prediction.
tmp = pd.DataFrame({'no': np.arange(len(X)), 'pred': oof_pred})

y_true = np.array(Y)
Mix_matrix = confusion_matrix(y_true, oof_pred)

# NOTE: although named and printed as "accuracy", these divide the diagonal by
# the column total (all samples predicted as that class), i.e. they are the
# per-class precision.
accuracy_class_1 = Mix_matrix[0][0] / Mix_matrix[:, 0].sum()
accuracy_class_2 = Mix_matrix[1][1] / Mix_matrix[:, 1].sum()
accuracy_class_3 = Mix_matrix[2][2] / Mix_matrix[:, 2].sum()

# Per-class recall: diagonal divided by the row total (all true samples).
recall_1 = Mix_matrix[0][0] / Mix_matrix[0].sum()
recall_2 = Mix_matrix[1][1] / Mix_matrix[1].sum()
recall_3 = Mix_matrix[2][2] / Mix_matrix[2].sum()

print("XGBoost")

print(Mix_matrix)
print("accuracy_calss1:", accuracy_class_1)
print("accuracy_calss2:", accuracy_class_2)
print("accuracy_calss3:", accuracy_class_3)

print("recall_calss1:", recall_1)
print("recall_calss2:", recall_2)
print("recall_calss3:", recall_3)

f1_macro = f1_score(y_true, oof_pred, average='macro')
f1_micro = f1_score(y_true, oof_pred, average='micro')

print("F1_macro:", f1_macro)
print("F1_micro:", f1_micro)

accuracy = calculate_fpr_tpr_tnr_f1score_accuracy(y_true, oof_pred)
print("accuracy:", accuracy)

XGBoost
[[26  0  7]
 [ 1 48  0]
 [ 5  1 26]]
accuracy_calss1: 0.8125
accuracy_calss2: 0.9795918367346939
accuracy_calss3: 0.7878787878787878
recall_calss1: 0.7878787878787878
recall_calss2: 0.9795918367346939
recall_calss3: 0.8125
F1_macro: 0.8598639455782312
F1_micro: 0.8771929824561403
accuracy: 0.8771929824561403


In [39]:
"""GBDT"""
from sklearn.ensemble import GradientBoostingClassifier

rf = GradientBoostingClassifier(random_state=100)
clf = rf

skf = StratifiedKFold(n_splits=10,random_state=None, shuffle=False)
tmp=pd.DataFrame(columns=['no','pred'])
tmp['no']=[i for i in range(len(X))]
for i, (trn_idx, val_idx) in enumerate(skf.split(X,Y)):
    trn_x, trn_y = X.iloc[trn_idx].reset_index(drop=True), Y[trn_idx]
    val_x, val_y = X.iloc[val_idx].reset_index(drop=True), Y[val_idx]
    clf = clf.fit(trn_x,trn_y)
    pred_y=clf.predict(val_x)
    tmp.loc[tmp['no'].isin(val_idx),'pred']=pred_y

Mix_matrix = confusion_matrix(np.array(Y), np.array(tmp['pred'].tolist()))
accuracy_class_1 = Mix_matrix[0][0] / (Mix_matrix[0][0] + Mix_matrix[1][0] + Mix_matrix[2][0])
accuracy_class_2 = Mix_matrix[1][1] / (Mix_matrix[0][1] + Mix_matrix[1][1] + Mix_matrix[2][1]) 
accuracy_class_3 = Mix_matrix[2][2] / (Mix_matrix[0][2] + Mix_matrix[1][2] + Mix_matrix[2][2])



recall_1 = Mix_matrix[0][0] / Mix_matrix[0].sum()
recall_2 = Mix_matrix[1][1] / Mix_matrix[1].sum()
recall_3 = Mix_matrix[2][2] / Mix_matrix[2].sum()

print("GBDT")

print(Mix_matrix)
print("accuracy_calss1:", accuracy_class_1)
print("accuracy_calss2:", accuracy_class_2)
print("accuracy_calss3:", accuracy_class_3)


print("recall_calss1:", recall_1)
print("recall_calss2:", recall_2)
print("recall_calss3:", recall_3)


f1_macro = f1_score(np.array(Y), np.array(tmp['pred'].tolist()), average='macro')
f1_micro = f1_score(np.array(Y), np.array(tmp['pred'].tolist()), average='micro')

print("F1_macro:", f1_macro)
print("F1_micro:", f1_micro)


# (fpr, tpr, tnr, f1Score, accuracy) = calculate_fpr_tpr_tnr_f1score_accuracy(np.array(Y), np.array(tmp['pred'].tolist()))
accuracy = calculate_fpr_tpr_tnr_f1score_accuracy(np.array(Y), np.array(tmp['pred'].tolist()))
#print(clf)
print("accuracy:", accuracy)

GBDT
[[27  2  4]
 [ 1 48  0]
 [ 5  0 27]]
accuracy_calss1: 0.8181818181818182
accuracy_calss2: 0.96
accuracy_calss3: 0.8709677419354839
recall_calss1: 0.8181818181818182
recall_calss2: 0.9795918367346939
recall_calss3: 0.84375
F1_macro: 0.8816738816738817
F1_micro: 0.8947368421052632
accuracy: 0.8947368421052632


In [46]:
"""LGB"""
from lightgbm import LGBMClassifier

rf = LGBMClassifier()
clf = rf

skf = StratifiedKFold(n_splits=10,random_state=None, shuffle=False)
tmp=pd.DataFrame(columns=['no','pred'])
tmp['no']=[i for i in range(len(X))]
for i, (trn_idx, val_idx) in enumerate(skf.split(X,Y)):
    trn_x, trn_y = X.iloc[trn_idx].reset_index(drop=True), Y[trn_idx]
    val_x, val_y = X.iloc[val_idx].reset_index(drop=True), Y[val_idx]
    clf = clf.fit(trn_x,trn_y)
    pred_y=clf.predict(val_x)
    tmp.loc[tmp['no'].isin(val_idx),'pred']=pred_y

Mix_matrix = confusion_matrix(np.array(Y), np.array(tmp['pred'].tolist()))
accuracy_class_1 = Mix_matrix[0][0] / (Mix_matrix[0][0] + Mix_matrix[1][0] + Mix_matrix[2][0])
accuracy_class_2 = Mix_matrix[1][1] / (Mix_matrix[0][1] + Mix_matrix[1][1] + Mix_matrix[2][1]) 
accuracy_class_3 = Mix_matrix[2][2] / (Mix_matrix[0][2] + Mix_matrix[1][2] + Mix_matrix[2][2])



recall_1 = Mix_matrix[0][0] / Mix_matrix[0].sum()
recall_2 = Mix_matrix[1][1] / Mix_matrix[1].sum()
recall_3 = Mix_matrix[2][2] / Mix_matrix[2].sum()

print("LGB")

print(Mix_matrix)
print("accuracy_calss1:", accuracy_class_1)
print("accuracy_calss2:", accuracy_class_2)
print("accuracy_calss3:", accuracy_class_3)


print("recall_calss1:", recall_1)
print("recall_calss2:", recall_2)
print("recall_calss3:", recall_3)


f1_macro = f1_score(np.array(Y), np.array(tmp['pred'].tolist()), average='macro')
f1_micro = f1_score(np.array(Y), np.array(tmp['pred'].tolist()), average='micro')

print("F1_macro:", f1_macro)
print("F1_micro:", f1_micro)


# (fpr, tpr, tnr, f1Score, accuracy) = calculate_fpr_tpr_tnr_f1score_accuracy(np.array(Y), np.array(tmp['pred'].tolist()))
accuracy = calculate_fpr_tpr_tnr_f1score_accuracy(np.array(Y), np.array(tmp['pred'].tolist()))
#print(clf)
print("accuracy:", accuracy)

LGB
[[28  0  5]
 [ 0 49  0]
 [ 4  1 27]]
accuracy_calss1: 0.875
accuracy_calss2: 0.98
accuracy_calss3: 0.84375
recall_calss1: 0.8484848484848485
recall_calss2: 1.0
recall_calss3: 0.84375
F1_macro: 0.8983958171458172
F1_micro: 0.9122807017543859
accuracy: 0.9122807017543859
