In [1]:
import pandas as pd
import math
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
import lightgbm as lgb
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import label_binarize
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
from tqdm import tqdm_notebook
import joblib
import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv("../../2-数据转换/data_nor.csv")
data_columns = pd.read_csv("../../3-特征选取/data_columns.csv")
columns = data_columns.columns.values
columns = np.append(columns, "risk")
data = data[columns]
print(data)

          city   latitude   longitude                 attacktype1_txt  \
0        Cairo  37.005105  -89.176269                   Armed Assault   
1      Oakland  37.791927 -122.225906               Bombing/Explosion   
2      Madison  43.076592  -89.412488  Facility/Infrastructure Attack   
3      Madison  43.072950  -89.386694  Facility/Infrastructure Attack   
4       Denver  39.758968 -104.876305  Facility/Infrastructure Attack   
...        ...        ...         ...                             ...   
72545     Aden  12.849085   45.037275               Bombing/Explosion   
72546    Bheri  28.709444   82.163611  Facility/Infrastructure Attack   
72547    Sabaa  15.305307   43.019490               Bombing/Explosion   
72548    Kabul  34.523842   69.140304                   Armed Assault   
72549  Wichita  37.688889  -97.336111                   Assassination   

                                        targsubtype1_txt  \
0        Police Building (headquarters, station, school)   
1  

In [3]:
# LabelEncoder
encoder = LabelEncoder()
encoder.fit(list(data["city"].values))
data["city"] = encoder.transform(list(data["city"].values))

# one-hot
category_columns = [ col for col in data.columns if data[col].dtype == 'object' ]
data = pd.get_dummies(data, columns=category_columns)
print(data)

        city   latitude   longitude  date  risk  \
0       4035  37.005105  -89.176269     4     1   
1      15269  37.791927 -122.225906     5     1   
2      12390  43.076592  -89.412488     5     1   
3      12390  43.072950  -89.386694     6     1   
4       5528  39.758968 -104.876305     2     1   
...      ...        ...         ...   ...   ...   
72545    219  12.849085   45.037275     4     1   
72546   3154  28.709444   82.163611     4     1   
72547  17467  15.305307   43.019490     4     1   
72548   9328  34.523842   69.140304     4     2   
72549  21608  37.688889  -97.336111     4     2   

       attacktype1_txt_Armed Assault  attacktype1_txt_Assassination  \
0                                  1                              0   
1                                  0                              0   
2                                  0                              0   
3                                  0                              0   
4                               

In [4]:
data.to_csv('data_stand.csv', index=False, encoding='utf_8_sig')

In [5]:
data = pd.read_csv("data_stand.csv")
data.shape
X = data.drop(columns=['risk'], axis=1)
y = data['risk']
kf = KFold(n_splits=10, shuffle=True, random_state=0)
splits = kf.split(X, y)
next(iter(splits))

(array([    0,     1,     2, ..., 72547, 72548, 72549]),
 array([    3,    14,    18, ..., 72525, 72542, 72544]))

In [6]:
#LR
from sklearn.linear_model import LogisticRegression

data = pd.read_csv("data_stand.csv")
data.shape
X = data.drop(columns=['risk'], axis=1)
y = data['risk']
kf = KFold(n_splits=5)
splits = kf.split(X, y)
# next(iter(splits))

lr_roc_scores = []
lr_precision_scores = []
lr_recall_scores = []
lr_f1_scores = []
lr_acc_scores = []
lr_feature_importances = pd.DataFrame(index=None)
lr_feature_importances['features'] = data.drop(['risk'], axis=1).columns

for k, (train_indices, test_indices) in enumerate(splits):
    print("第 %d 折\n" % (k + 1))
    X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
    y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]
    
    labels = [0, 1, 2, 3]
    y_one_hot = label_binarize(y_test, classes=labels)
    LR = LogisticRegression(random_state=0)
    LR.fit(X_train, y_train)
#     lr_feature_importances[f'fold_{k+1}'] = LR.feature_importance
    y_pred_prob =LR.predict_proba(X_test)
    y_pred=LR.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_one_hot, y_pred_prob, multi_class="ovo", average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
#     G_mean = math.sqrt(recall * specificity)
    print(f" Fold {k + 1} | " )
    print(f" AUC_ROC: { roc_auc * 100}%" )
    print(f" ACC: { acc * 100}%" )
    print(f" F1: { f1 * 100}%" )
    print(f" RECALL: { recall * 100}%" )
    print(f" PRECISION: { precision * 100}%" )
    lr_f1_scores.append(f1)
    lr_roc_scores.append(roc_auc)
    lr_acc_scores.append(acc)
    lr_recall_scores.append(recall)
    lr_precision_scores.append(precision)
    
print(f'average roc score: {np.mean(lr_roc_scores)}')
print(f'average acc_score: {np.mean(lr_acc_scores)}')
print(f'average f1_score: {np.mean(lr_f1_scores)}')
print(f'average recall_score: {np.mean(lr_recall_scores)}')
print(f'average precision_score: {np.mean(lr_precision_scores)}')

第 1 折

 Fold 1 | 
 AUC_ROC: 56.30530110602686%
 ACC: 33.81805651274983%
 F1: 29.32667689594957%
 RECALL: 33.81805651274983%
 PRECISION: 42.16514144653311%
第 2 折

 Fold 2 | 
 AUC_ROC: 49.90674326246032%
 ACC: 35.665058580289454%
 F1: 30.275885812472996%
 RECALL: 35.665058580289454%
 PRECISION: 34.07116656450804%
第 3 折

 Fold 3 | 
 AUC_ROC: 51.30378192680912%
 ACC: 32.68090971743625%
 F1: 26.887686585437475%
 RECALL: 32.68090971743625%
 PRECISION: 30.65551707121469%
第 4 折

 Fold 4 | 
 AUC_ROC: 50.85659548279696%
 ACC: 29.745003445899382%
 F1: 24.734479163674386%
 RECALL: 29.745003445899382%
 PRECISION: 30.403370364203447%
第 5 折

 Fold 5 | 
 AUC_ROC: 53.92775840721469%
 ACC: 33.13576843556168%
 F1: 29.550401400855904%
 RECALL: 33.13576843556168%
 PRECISION: 29.37039351352086%
average roc score: 0.5246003603706159
average acc_score: 0.33008959338387317
average f1_score: 0.28155025971678066
average recall_score: 0.33008959338387317
average precision_score: 0.33333117791996025


In [7]:
data = pd.read_csv("data_stand.csv")
data.shape
X = data.drop(columns=['risk'], axis=1)
y = data['risk']
kf = KFold(n_splits=10, shuffle=True, random_state=0)
splits = kf.split(X, y)
next(iter(splits))

(array([    0,     1,     2, ..., 72547, 72548, 72549]),
 array([    3,    14,    18, ..., 72525, 72542, 72544]))

In [8]:
# adboost
from sklearn.ensemble import AdaBoostClassifier as ad

Ad_roc_scores = []
Ad_acc_scores = []
Ad_f1_scores = []
Ad_recall_scores = []
Ad_precision_scores = []
Ad_feature_importances = pd.DataFrame(index=None)
Ad_feature_importances['features'] = data.drop(['risk'], axis=1).columns

for k, (train_indices, test_indices) in enumerate(splits):
    print("第 %d 折\n" % (k + 1))
    X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
    y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]
    
    labels = [0, 1, 2, 3]
    y_one_hot = label_binarize(y_test, classes=labels)
    Ad = ad(random_state=0)
    Ad.fit(X_train, y_train)
    Ad_feature_importances[f'fold_{k+1}'] = Ad.feature_importances_
    y_pred_prob = Ad.predict_proba(X_test)
    y_pred = Ad.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_one_hot, y_pred_prob, multi_class="ovo", average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
#     G_mean = math.sqrt(recall * specificity)
    print(f" Fold {k + 1} | " )
    print(f" AUC_ROC: { roc_auc * 100}%" )
    print(f" ACC: { acc * 100}%" )
    print(f" F1: { f1 * 100}%" )
    print(f" RECALL: { recall * 100}%" )
    print(f" PRECISION: { precision * 100}%" )
    Ad_f1_scores.append(f1)
    Ad_roc_scores.append(roc_auc)
    Ad_acc_scores.append(acc)
    Ad_recall_scores.append(recall)
    Ad_precision_scores.append(precision)
    
print(f'average roc score: {np.mean(Ad_roc_scores)}')
print(f'average acc_score: {np.mean(Ad_acc_scores)}')
print(f'average f1_score: {np.mean(Ad_f1_scores)}')
print(f'average recall_score: {np.mean(Ad_recall_scores)}')
print(f'average precision_score: {np.mean(Ad_precision_scores)}')

第 1 折

 Fold 1 | 
 AUC_ROC: 78.79704970184193%
 ACC: 59.324603721571336%
 F1: 58.00040153660405%
 RECALL: 59.324603721571336%
 PRECISION: 58.1297283093992%
第 2 折

 Fold 2 | 
 AUC_ROC: 79.48432745145978%
 ACC: 60.56512749827705%
 F1: 58.95260613328297%
 RECALL: 60.56512749827705%
 PRECISION: 59.17356959235962%
第 3 折

 Fold 3 | 
 AUC_ROC: 79.18129689332255%
 ACC: 59.324603721571336%
 F1: 57.6935778843539%
 RECALL: 59.324603721571336%
 PRECISION: 58.41444236062445%
第 4 折

 Fold 4 | 
 AUC_ROC: 80.01608408016342%
 ACC: 61.29565816678153%
 F1: 59.582415582196546%
 RECALL: 61.29565816678153%
 PRECISION: 59.617362081664396%
第 5 折

 Fold 5 | 
 AUC_ROC: 79.49392396301458%
 ACC: 60.248104755341146%
 F1: 58.62316043020183%
 RECALL: 60.248104755341146%
 PRECISION: 58.67051641806753%
第 6 折

 Fold 6 | 
 AUC_ROC: 79.78412960086612%
 ACC: 60.57891109579601%
 F1: 59.42941147466908%
 RECALL: 60.57891109579601%
 PRECISION: 59.804366795429345%
第 7 折

 Fold 7 | 
 AUC_ROC: 79.48743926029978%
 ACC: 60.7443142

In [9]:
data = pd.read_csv("data_stand.csv")
data.shape
X = data.drop(columns=['risk'], axis=1)
y = data['risk']
kf = KFold(n_splits=10, shuffle=True, random_state=0)
splits = kf.split(X, y)
next(iter(splits))

(array([    0,     1,     2, ..., 72547, 72548, 72549]),
 array([    3,    14,    18, ..., 72525, 72542, 72544]))

In [10]:
# DT

DT_roc_scores = []
DT_acc_scores = []
DT_f1_scores = []
DT_recall_scores = []
DT_precision_scores = []
DT_feature_importances = pd.DataFrame(index=None)
DT_feature_importances['features'] = data.drop(['risk'], axis=1).columns

for k, (train_indices, test_indices) in enumerate(splits):
    print("第 %d 折\n" % (k + 1))
    X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
    y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]
    
    labels = [0, 1, 2, 3]
    y_one_hot = label_binarize(y_test, classes=labels)
    DT = DecisionTreeClassifier(random_state=0)
    DT.fit(X_train, y_train)
    DT_feature_importances[f'fold_{k+1}'] = DT.feature_importances_
    y_pred_prob = DT.predict_proba(X_test)
    y_pred = DT.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_one_hot, y_pred_prob, multi_class="ovo", average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
#     G_mean = math.sqrt(recall * specificity)
    print(f" Fold {k + 1} | " )
    print(f" AUC_ROC: { roc_auc * 100}%" )
    print(f" ACC: { acc * 100}%" )
    print(f" F1: { f1 * 100}%" )
    print(f" RECALL: { recall * 100}%" )
    print(f" PRECISION: { precision * 100}%" )
    DT_f1_scores.append(f1)
    DT_roc_scores.append(roc_auc)
    DT_acc_scores.append(acc)
    DT_recall_scores.append(recall)
    DT_precision_scores.append(precision)
    
print(f'average roc score: {np.mean(DT_roc_scores)}')
print(f'average acc_score: {np.mean(DT_acc_scores)}')
print(f'average f1_score: {np.mean(DT_f1_scores)}')
print(f'average recall_score: {np.mean(DT_recall_scores)}')
print(f'average precision_score: {np.mean(DT_precision_scores)}')

第 1 折

 Fold 1 | 
 AUC_ROC: 72.05370042730459%
 ACC: 58.704341833218464%
 F1: 58.60231257232618%
 RECALL: 58.704341833218464%
 PRECISION: 58.52835277199318%
第 2 折

 Fold 2 | 
 AUC_ROC: 71.76346213981692%
 ACC: 58.3184011026878%
 F1: 58.22021449618611%
 RECALL: 58.3184011026878%
 PRECISION: 58.15672021252757%
第 3 折

 Fold 3 | 
 AUC_ROC: 72.36951584354604%
 ACC: 59.04893177119228%
 F1: 59.03929558369472%
 RECALL: 59.04893177119228%
 PRECISION: 59.033199581417136%
第 4 折

 Fold 4 | 
 AUC_ROC: 72.77215970412084%
 ACC: 59.51757408683667%
 F1: 59.55112241682703%
 RECALL: 59.51757408683667%
 PRECISION: 59.60165007117353%
第 5 折

 Fold 5 | 
 AUC_ROC: 72.22313878527605%
 ACC: 58.74569262577533%
 F1: 58.76036876685674%
 RECALL: 58.74569262577533%
 PRECISION: 58.77639937629466%
第 6 折

 Fold 6 | 
 AUC_ROC: 71.82800637469367%
 ACC: 58.22191592005513%
 F1: 58.260072060954336%
 RECALL: 58.22191592005513%
 PRECISION: 58.34378792019955%
第 7 折

 Fold 7 | 
 AUC_ROC: 71.42280846567174%
 ACC: 57.904893177119

In [11]:
data = pd.read_csv("data_stand.csv")
data.shape
X = data.drop(columns=['risk'], axis=1)
y = data['risk']
kf = KFold(n_splits=10, shuffle=True, random_state=0)
splits = kf.split(X, y)
next(iter(splits))

(array([    0,     1,     2, ..., 72547, 72548, 72549]),
 array([    3,    14,    18, ..., 72525, 72542, 72544]))

In [12]:
# RF
from sklearn.ensemble import RandomForestClassifier

RF_roc_scores = []
RF_acc_scores = []
RF_f1_scores = []
RF_recall_scores = []
RF_precision_scores = []
RFC_feature_importances = pd.DataFrame(index=None)
RFC_feature_importances['features'] = data.drop(['risk'], axis=1).columns

for k, (train_indices, test_indices) in enumerate(splits):
    print("第 %d 折\n" % (k + 1))
    X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
    y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]
    
    labels = [0, 1, 2, 3]
    y_one_hot = label_binarize(y_test, classes=labels)
    RFC = RandomForestClassifier(random_state=0)
    RFC.fit(X_train, y_train)
    RFC_feature_importances[f'fold_{k+1}'] = RFC.feature_importances_
    y_pred_prob = RFC.predict_proba(X_test)
    y_pred = RFC.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_one_hot, y_pred_prob, multi_class="ovo", average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
#     G_mean = math.sqrt(recall * specificity)
    print(f" Fold {k + 1} | " )
    print(f" AUC_ROC: { roc_auc * 100}%" )
    print(f" ACC: { acc * 100}%" )
    print(f" F1: { f1 * 100}%" )
    print(f" RECALL: { recall * 100}%" )
    print(f" PRECISION: { precision * 100}%" )
    RF_f1_scores.append(f1)
    RF_roc_scores.append(roc_auc)
    RF_acc_scores.append(acc)
    RF_recall_scores.append(recall)
    RF_precision_scores.append(precision)
    
print(f'average roc score: {np.mean(RF_roc_scores)}')
print(f'average acc_score: {np.mean(RF_acc_scores)}')
print(f'average f1_score: {np.mean(RF_f1_scores)}')
print(f'average recall_score: {np.mean(RF_recall_scores)}')
print(f'average precision_score: {np.mean(RF_precision_scores)}')

第 1 折

 Fold 1 | 
 AUC_ROC: 85.50083590166722%
 ACC: 65.16884906960718%
 F1: 64.88282428016086%
 RECALL: 65.16884906960718%
 PRECISION: 64.80790058448807%
第 2 折

 Fold 2 | 
 AUC_ROC: 85.97019107454449%
 ACC: 65.9131633356306%
 F1: 65.5602754251751%
 RECALL: 65.9131633356306%
 PRECISION: 65.5214988397905%
第 3 折

 Fold 3 | 
 AUC_ROC: 85.94752153102387%
 ACC: 65.44452101998621%
 F1: 65.1275215878504%
 RECALL: 65.44452101998621%
 PRECISION: 65.07079283400452%
第 4 折

 Fold 4 | 
 AUC_ROC: 86.29536185491679%
 ACC: 66.4920744314266%
 F1: 66.12336079999432%
 RECALL: 66.4920744314266%
 PRECISION: 66.0300765081067%
第 5 折

 Fold 5 | 
 AUC_ROC: 86.17982465655038%
 ACC: 65.6237077877326%
 F1: 65.24097288067239%
 RECALL: 65.6237077877326%
 PRECISION: 65.12944282515201%
第 6 折

 Fold 6 | 
 AUC_ROC: 86.16447210728472%
 ACC: 66.07856650585802%
 F1: 65.82504173636148%
 RECALL: 66.07856650585802%
 PRECISION: 65.78731174832608%
第 7 折

 Fold 7 | 
 AUC_ROC: 85.5454895501368%
 ACC: 65.15506547208821%
 F1: 64.8

In [13]:
data = pd.read_csv("data_stand.csv")
data.shape
X = data.drop(columns=['risk'], axis=1)
y = data['risk']
kf = KFold(n_splits=10, shuffle=True, random_state=0)
splits = kf.split(X, y)
next(iter(splits))

(array([    0,     1,     2, ..., 72547, 72548, 72549]),
 array([    3,    14,    18, ..., 72525, 72542, 72544]))

In [14]:
# xgboost
import xgboost as xgb

XGBR_roc_scores = []
XGBR_acc_scores = []
XGBR_f1_scores = []
XGBR_recall_scores = []
XGBR_precision_scores = []
xg_feature_importances = pd.DataFrame(index=None)
xg_feature_importances['features'] = data.drop(['risk'], axis=1).columns

for k, (train_indices, test_indices) in enumerate(splits):
    print("第 %d 折\n" % (k + 1))
    X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
    y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]
    
    labels = [0, 1, 2, 3]
    y_one_hot = label_binarize(y_test, classes=labels)
    XGBR = xgb.XGBClassifier(random_state=0)
    XGBR.fit(X_train, y_train)
    xg_feature_importances[f'fold_{k+1}'] = XGBR.feature_importances_
    y_pred_prob = XGBR.predict_proba(X_test)
    y_pred = XGBR.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_one_hot, y_pred_prob, multi_class="ovo", average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
#     G_mean = math.sqrt(recall * specificity)
    print(f" Fold {k + 1} | " )
    print(f" AUC_ROC: { roc_auc * 100}%" )
    print(f" ACC: { acc * 100}%" )
    print(f" F1: { f1 * 100}%" )
    print(f" RECALL: { recall * 100}%" )
    print(f" PRECISION: { precision * 100}%" )
    XGBR_f1_scores.append(f1)
    XGBR_roc_scores.append(roc_auc)
    XGBR_acc_scores.append(acc)
    XGBR_recall_scores.append(recall)
    XGBR_precision_scores.append(precision)
    
print(f'average roc score: {np.mean(XGBR_roc_scores)}')
print(f'average acc_score: {np.mean(XGBR_acc_scores)}')
print(f'average f1_score: {np.mean(XGBR_f1_scores)}')
print(f'average recall_score: {np.mean(XGBR_recall_scores)}')
print(f'average precision_score: {np.mean(XGBR_precision_scores)}')

第 1 折

 Fold 1 | 
 AUC_ROC: 85.9660034851821%
 ACC: 65.6512749827705%
 F1: 65.05533148854914%
 RECALL: 65.6512749827705%
 PRECISION: 65.10362801664586%
第 2 折

 Fold 2 | 
 AUC_ROC: 86.41981614153251%
 ACC: 66.2301860785665%
 F1: 65.58960133870426%
 RECALL: 66.2301860785665%
 PRECISION: 65.77062849487295%
第 3 折

 Fold 3 | 
 AUC_ROC: 86.4968880574082%
 ACC: 65.9407305306685%
 F1: 65.32715827923974%
 RECALL: 65.9407305306685%
 PRECISION: 65.38689687150413%
第 4 折

 Fold 4 | 
 AUC_ROC: 86.63986433532818%
 ACC: 66.11991729841489%
 F1: 65.3653511896952%
 RECALL: 66.11991729841489%
 PRECISION: 65.42158023386597%
第 5 折

 Fold 5 | 
 AUC_ROC: 86.34351049184012%
 ACC: 66.39558924879394%
 F1: 65.7908122703084%
 RECALL: 66.39558924879394%
 PRECISION: 65.93852618404676%
第 6 折

 Fold 6 | 
 AUC_ROC: 86.57632838678055%
 ACC: 66.2026188835286%
 F1: 65.77699795215615%
 RECALL: 66.2026188835286%
 PRECISION: 65.94043649724505%
第 7 折

 Fold 7 | 
 AUC_ROC: 86.17676780309257%
 ACC: 65.8855961405927%
 F1: 65.323

In [15]:
data = pd.read_csv("data_stand.csv")
import re
data = data.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
data.to_csv('data_stand_lgb.csv', index=False, encoding='utf_8_sig')

In [16]:
X = data.drop(columns=['risk'], axis=1)
y = data['risk']
kf = KFold(n_splits=10, shuffle=True, random_state=0)
splits = kf.split(X, y)
next(iter(splits))

(array([    0,     1,     2, ..., 72547, 72548, 72549]),
 array([    3,    14,    18, ..., 72525, 72542, 72544]))

In [17]:
#lightGbm
import lightgbm as lgb

lgb_roc_scores = []
lgb_acc_scores = []
lgb_f1_scores = []
lgb_recall_scores = []
lgb_precision_scores = []
lgb_feature_importances = pd.DataFrame(index=None)
lgb_feature_importances['features'] = data.drop(['risk'], axis=1).columns

for k, (train_indices, test_indices) in enumerate(splits):
    print("第 %d 折\n" % (k + 1))
    X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
    y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]
    
    labels = [0, 1, 2, 3]
    y_one_hot = label_binarize(y_test, classes=labels)
    
    LGB = lgb.LGBMClassifier(random_state=0)
    LGB.fit(X_train, y_train)
    lgb_feature_importances[f'fold_{k+1}'] = LGB.feature_importances_
    y_pred_prob = LGB.predict_proba(X_test)
    y_pred = LGB.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_one_hot, y_pred_prob, multi_class="ovo", average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
#     G_mean = math.sqrt(recall * specificity)
    print(f" Fold {k + 1} | " )
    print(f" AUC_ROC: { roc_auc * 100}%" )
    print(f" ACC: { acc * 100}%" )
    print(f" F1: { f1 * 100}%" )
    print(f" RECALL: { recall * 100}%" )
    print(f" PRECISION: { precision * 100}%" )
    lgb_f1_scores.append(f1)
    lgb_roc_scores.append(roc_auc)
    lgb_acc_scores.append(acc)
    lgb_recall_scores.append(recall)
    lgb_precision_scores.append(precision)
    
print(f'average roc score: {np.mean(lgb_roc_scores)}')
print(f'average acc_score: {np.mean(lgb_acc_scores)}')
print(f'average f1_score: {np.mean(lgb_f1_scores)}')
print(f'average recall_score: {np.mean(lgb_recall_scores)}')
print(f'average precision_score: {np.mean(lgb_precision_scores)}')

第 1 折

 Fold 1 | 
 AUC_ROC: 86.04046597281364%
 ACC: 65.3342522398346%
 F1: 64.6943047685103%
 RECALL: 65.3342522398346%
 PRECISION: 64.7973148370249%
第 2 折

 Fold 2 | 
 AUC_ROC: 86.39618750993998%
 ACC: 66.00964851826326%
 F1: 65.36810495791438%
 RECALL: 66.00964851826326%
 PRECISION: 65.56799062795456%
第 3 折

 Fold 3 | 
 AUC_ROC: 86.6177790907049%
 ACC: 65.85802894555479%
 F1: 65.19661069485818%
 RECALL: 65.85802894555479%
 PRECISION: 65.30204830699488%
第 4 折

 Fold 4 | 
 AUC_ROC: 86.71055542055142%
 ACC: 66.42315644383184%
 F1: 65.70326120094444%
 RECALL: 66.42315644383184%
 PRECISION: 65.78259990165755%
第 5 折

 Fold 5 | 
 AUC_ROC: 86.25894688514244%
 ACC: 66.34045485871812%
 F1: 65.68386359326443%
 RECALL: 66.34045485871812%
 PRECISION: 65.74301775746511%
第 6 折

 Fold 6 | 
 AUC_ROC: 86.55564465231276%
 ACC: 66.13370089593383%
 F1: 65.6272555571673%
 RECALL: 66.13370089593383%
 PRECISION: 65.8050720749131%
第 7 折

 Fold 7 | 
 AUC_ROC: 86.18580796631066%
 ACC: 66.03721571330117%
 F1: 