In [2]:
import pandas as pd
import math
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
import lightgbm as lgb
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import label_binarize
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn import svm
from tqdm import tqdm_notebook
import joblib
import warnings
warnings.filterwarnings('ignore')

In [3]:
data = pd.read_csv("../../2-数据转换/data_nor.csv")
data_columns = pd.read_csv("../../3-特征选取/data_columns.csv")
columns = data_columns.columns.values
columns = np.append(columns, "risk")
data = data[columns]
print(data)
data.to_csv('data_stand.csv', index=False, encoding='utf_8_sig')

          city   latitude   longitude                 attacktype1_txt  \
0        Cairo  37.005105  -89.176269                   Armed Assault   
1      Oakland  37.791927 -122.225906               Bombing/Explosion   
2      Madison  43.076592  -89.412488  Facility/Infrastructure Attack   
3      Madison  43.072950  -89.386694  Facility/Infrastructure Attack   
4       Denver  39.758968 -104.876305  Facility/Infrastructure Attack   
...        ...        ...         ...                             ...   
72545     Aden  12.849085   45.037275               Bombing/Explosion   
72546    Bheri  28.709444   82.163611  Facility/Infrastructure Attack   
72547    Sabaa  15.305307   43.019490               Bombing/Explosion   
72548    Kabul  34.523842   69.140304                   Armed Assault   
72549  Wichita  37.688889  -97.336111                   Assassination   

                                        targsubtype1_txt  \
0        Police Building (headquarters, station, school)   
1  

In [4]:
category_columns = [ col for col in data.columns if data[col].dtype == 'object' ]
print(len(category_columns))

4


In [5]:
# LabelEncoder
for col in tqdm_notebook(data[category_columns]):
    encoder = LabelEncoder()
    encoder.fit(list(data[col].values))
    data[col] = encoder.transform(list(data[col].values))
print(data)

  0%|          | 0/72550 [00:00<?, ?it/s]

        city   latitude   longitude  attacktype1_txt  targsubtype1_txt  \
0       4035  37.005105  -89.176269                0                73   
1      15269  37.791927 -122.225906                2                18   
2      12390  43.076592  -89.412488                3                50   
3      12390  43.072950  -89.386694                3                27   
4       5528  39.758968 -104.876305                3                50   
...      ...        ...         ...              ...               ...   
72545    219  12.849085   45.037275                2                35   
72546   3154  28.709444   82.163611                3                28   
72547  17467  15.305307   43.019490                2                32   
72548   9328  34.523842   69.140304                0                83   
72549  21608  37.688889  -97.336111                1                71   

       weapsubtype1_txt  date  risk  
0                    27     4     1  
1                    26     5     1

In [6]:
data.to_csv('data_stand.csv', index=False, encoding='utf_8_sig')

In [7]:
data = pd.read_csv("data_stand.csv")
data.shape
X = data.drop(columns=['risk'], axis=1)
y = data['risk']
kf = KFold(n_splits=10, shuffle=True, random_state=0)
splits = kf.split(X, y)
next(iter(splits))

(array([    0,     1,     2, ..., 72547, 72548, 72549]),
 array([    3,    14,    18, ..., 72525, 72542, 72544]))

In [8]:
#LR
from sklearn.linear_model import LogisticRegression

lr_roc_scores = []
lr_precision_scores = []
lr_recall_scores = []
lr_f1_scores = []
lr_acc_scores = []
lr_feature_importances = pd.DataFrame(index=None)
lr_feature_importances['features'] = data.drop(['risk'], axis=1).columns

for k, (train_indices, test_indices) in enumerate(splits):
    print("第 %d 折\n" % (k + 1))
    X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
    y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]
    
    labels = [0, 1, 2, 3]
    y_one_hot = label_binarize(y_test, classes=labels)
    LR = LogisticRegression(random_state=0)
    LR.fit(X_train, y_train)
#     lr_feature_importances[f'fold_{k+1}'] = LR.feature_importance
    y_pred_prob =LR.predict_proba(X_test)
    y_pred=LR.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_one_hot, y_pred_prob, multi_class="ovo", average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
#     G_mean = math.sqrt(recall * specificity)
    print(f" Fold {k + 1} | " )
    print(f" AUC_ROC: { roc_auc * 100}%" )
    print(f" ACC: { acc * 100}%" )
    print(f" F1: { f1 * 100}%" )
    print(f" RECALL: { recall * 100}%" )
    print(f" PRECISION: { precision * 100}%" )
    lr_f1_scores.append(f1)
    lr_roc_scores.append(roc_auc)
    lr_acc_scores.append(acc)
    lr_recall_scores.append(recall)
    lr_precision_scores.append(precision)
    
print(f'average roc score: {np.mean(lr_roc_scores)}')
print(f'average acc_score: {np.mean(lr_acc_scores)}')
print(f'average f1_score: {np.mean(lr_f1_scores)}')
print(f'average recall_score: {np.mean(lr_recall_scores)}')
print(f'average precision_score: {np.mean(lr_precision_scores)}')

第 1 折

 Fold 1 | 
 AUC_ROC: 59.495720690765395%
 ACC: 38.0289455547898%
 F1: 35.04669789064763%
 RECALL: 38.0289455547898%
 PRECISION: 34.14692110373875%
第 2 折

 Fold 2 | 
 AUC_ROC: 60.422399491902155%
 ACC: 38.621640248104754%
 F1: 35.47330261607613%
 RECALL: 38.621640248104754%
 PRECISION: 33.73158867822167%
第 3 折

 Fold 3 | 
 AUC_ROC: 60.31843278788863%
 ACC: 38.69055823569952%
 F1: 35.39898773250248%
 RECALL: 38.69055823569952%
 PRECISION: 33.94853954083309%
第 4 折

 Fold 4 | 
 AUC_ROC: 59.09486047034961%
 ACC: 38.18056512749828%
 F1: 34.87340789051736%
 RECALL: 38.18056512749828%
 PRECISION: 33.50864891200811%
第 5 折

 Fold 5 | 
 AUC_ROC: 59.388964721836636%
 ACC: 38.52515506547209%
 F1: 35.416369731790866%
 RECALL: 38.52515506547209%
 PRECISION: 34.19639225937714%
第 6 折

 Fold 6 | 
 AUC_ROC: 60.160897975554185%
 ACC: 39.14541695382495%
 F1: 36.102964699107325%
 RECALL: 39.14541695382495%
 PRECISION: 48.38935110660188%
第 7 折

 Fold 7 | 
 AUC_ROC: 59.654997293161095%
 ACC: 37.9048931

In [9]:
data = pd.read_csv("data_stand.csv")
data.shape
X = data.drop(columns=['risk'], axis=1)
y = data['risk']
kf = KFold(n_splits=10, shuffle=True, random_state=0)
splits = kf.split(X, y)
next(iter(splits))

(array([    0,     1,     2, ..., 72547, 72548, 72549]),
 array([    3,    14,    18, ..., 72525, 72542, 72544]))

In [10]:
# adboost
from sklearn.ensemble import AdaBoostClassifier as ad

Ad_roc_scores = []
Ad_acc_scores = []
Ad_f1_scores = []
Ad_recall_scores = []
Ad_precision_scores = []
Ad_feature_importances = pd.DataFrame(index=None)
Ad_feature_importances['features'] = data.drop(['risk'], axis=1).columns

for k, (train_indices, test_indices) in enumerate(splits):
    print("第 %d 折\n" % (k + 1))
    X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
    y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]
    
    labels = [0, 1, 2, 3]
    y_one_hot = label_binarize(y_test, classes=labels)
    Ad = ad(random_state=0)
    Ad.fit(X_train, y_train)
    Ad_feature_importances[f'fold_{k+1}'] = Ad.feature_importances_
    y_pred_prob = Ad.predict_proba(X_test)
    y_pred = Ad.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_one_hot, y_pred_prob, multi_class="ovo", average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
#     G_mean = math.sqrt(recall * specificity)
    print(f" Fold {k + 1} | " )
    print(f" AUC_ROC: { roc_auc * 100}%" )
    print(f" ACC: { acc * 100}%" )
    print(f" F1: { f1 * 100}%" )
    print(f" RECALL: { recall * 100}%" )
    print(f" PRECISION: { precision * 100}%" )
    Ad_f1_scores.append(f1)
    Ad_roc_scores.append(roc_auc)
    Ad_acc_scores.append(acc)
    Ad_recall_scores.append(recall)
    Ad_precision_scores.append(precision)
    
print(f'average roc score: {np.mean(Ad_roc_scores)}')
print(f'average acc_score: {np.mean(Ad_acc_scores)}')
print(f'average f1_score: {np.mean(Ad_f1_scores)}')
print(f'average recall_score: {np.mean(Ad_recall_scores)}')
print(f'average precision_score: {np.mean(Ad_precision_scores)}')

第 1 折

 Fold 1 | 
 AUC_ROC: 77.89414467155228%
 ACC: 56.884906960716755%
 F1: 54.795443244186345%
 RECALL: 56.884906960716755%
 PRECISION: 55.06083700761432%
第 2 折

 Fold 2 | 
 AUC_ROC: 78.58840266817676%
 ACC: 58.27705031013094%
 F1: 56.55834650329956%
 RECALL: 58.27705031013094%
 PRECISION: 56.74218896404817%
第 3 折

 Fold 3 | 
 AUC_ROC: 78.40608377363885%
 ACC: 57.780840799448654%
 F1: 55.850852622988626%
 RECALL: 57.780840799448654%
 PRECISION: 56.19652176867235%
第 4 折

 Fold 4 | 
 AUC_ROC: 78.68929084793358%
 ACC: 57.86354238456237%
 F1: 55.83227042594901%
 RECALL: 57.86354238456237%
 PRECISION: 56.396160255494976%
第 5 折

 Fold 5 | 
 AUC_ROC: 78.4492727533115%
 ACC: 57.808407994486565%
 F1: 55.76411890939663%
 RECALL: 57.808407994486565%
 PRECISION: 55.74652955310974%
第 6 折

 Fold 6 | 
 AUC_ROC: 78.87511830946784%
 ACC: 58.056512749827704%
 F1: 56.56125847668094%
 RECALL: 58.056512749827704%
 PRECISION: 56.68342185405455%
第 7 折

 Fold 7 | 
 AUC_ROC: 78.3376291757779%
 ACC: 57.47760

In [11]:
data = pd.read_csv("data_stand.csv")
data.shape
X = data.drop(columns=['risk'], axis=1)
y = data['risk']
kf = KFold(n_splits=10, shuffle=True, random_state=0)
splits = kf.split(X, y)
next(iter(splits))

(array([    0,     1,     2, ..., 72547, 72548, 72549]),
 array([    3,    14,    18, ..., 72525, 72542, 72544]))

In [12]:
# DT

DT_roc_scores = []
DT_acc_scores = []
DT_f1_scores = []
DT_recall_scores = []
DT_precision_scores = []
DT_feature_importances = pd.DataFrame(index=None)
DT_feature_importances['features'] = data.drop(['risk'], axis=1).columns

for k, (train_indices, test_indices) in enumerate(splits):
    print("第 %d 折\n" % (k + 1))
    X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
    y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]
    
    labels = [0, 1, 2, 3]
    y_one_hot = label_binarize(y_test, classes=labels)
    DT = DecisionTreeClassifier(random_state=0)
    DT.fit(X_train, y_train)
    DT_feature_importances[f'fold_{k+1}'] = DT.feature_importances_
    y_pred_prob = DT.predict_proba(X_test)
    y_pred = DT.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_one_hot, y_pred_prob, multi_class="ovo", average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
#     G_mean = math.sqrt(recall * specificity)
    print(f" Fold {k + 1} | " )
    print(f" AUC_ROC: { roc_auc * 100}%" )
    print(f" ACC: { acc * 100}%" )
    print(f" F1: { f1 * 100}%" )
    print(f" RECALL: { recall * 100}%" )
    print(f" PRECISION: { precision * 100}%" )
    DT_f1_scores.append(f1)
    DT_roc_scores.append(roc_auc)
    DT_acc_scores.append(acc)
    DT_recall_scores.append(recall)
    DT_precision_scores.append(precision)
    
print(f'average roc score: {np.mean(DT_roc_scores)}')
print(f'average acc_score: {np.mean(DT_acc_scores)}')
print(f'average f1_score: {np.mean(DT_f1_scores)}')
print(f'average recall_score: {np.mean(DT_recall_scores)}')
print(f'average precision_score: {np.mean(DT_precision_scores)}')

第 1 折

 Fold 1 | 
 AUC_ROC: 71.44132483183432%
 ACC: 57.65678842177808%
 F1: 57.60825253341763%
 RECALL: 57.65678842177808%
 PRECISION: 57.59186837499336%
第 2 折

 Fold 2 | 
 AUC_ROC: 71.39584163919285%
 ACC: 57.62922122674018%
 F1: 57.61027798411932%
 RECALL: 57.62922122674018%
 PRECISION: 57.67540108960642%
第 3 折

 Fold 3 | 
 AUC_ROC: 71.60871250236418%
 ACC: 57.794624396967606%
 F1: 57.81206028528364%
 RECALL: 57.794624396967606%
 PRECISION: 57.845345321329674%
第 4 折

 Fold 4 | 
 AUC_ROC: 71.56026160351338%
 ACC: 57.794624396967606%
 F1: 57.783698602011846%
 RECALL: 57.794624396967606%
 PRECISION: 57.81421276378457%
第 5 折

 Fold 5 | 
 AUC_ROC: 71.8782839920713%
 ACC: 58.18056512749828%
 F1: 58.207617272230436%
 RECALL: 58.18056512749828%
 PRECISION: 58.23625525082801%
第 6 折

 Fold 6 | 
 AUC_ROC: 71.38724939891567%
 ACC: 57.684355616815985%
 F1: 57.74655340785255%
 RECALL: 57.684355616815985%
 PRECISION: 57.850272292573635%
第 7 折

 Fold 7 | 
 AUC_ROC: 70.93656591194998%
 ACC: 57.24328

In [13]:
data = pd.read_csv("data_stand.csv")
data.shape
X = data.drop(columns=['risk'], axis=1)
y = data['risk']
kf = KFold(n_splits=10, shuffle=True, random_state=0)
splits = kf.split(X, y)
next(iter(splits))

(array([    0,     1,     2, ..., 72547, 72548, 72549]),
 array([    3,    14,    18, ..., 72525, 72542, 72544]))

In [14]:
# RF
from sklearn.ensemble import RandomForestClassifier

RF_roc_scores = []
RF_acc_scores = []
RF_f1_scores = []
RF_recall_scores = []
RF_precision_scores = []
RFC_feature_importances = pd.DataFrame(index=None)
RFC_feature_importances['features'] = data.drop(['risk'], axis=1).columns

for k, (train_indices, test_indices) in enumerate(splits):
    print("第 %d 折\n" % (k + 1))
    X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
    y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]
    
    labels = [0, 1, 2, 3]
    y_one_hot = label_binarize(y_test, classes=labels)
    RFC = RandomForestClassifier(random_state=0)
    RFC.fit(X_train, y_train)
    RFC_feature_importances[f'fold_{k+1}'] = RFC.feature_importances_
    y_pred_prob = RFC.predict_proba(X_test)
    y_pred = RFC.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_one_hot, y_pred_prob, multi_class="ovo", average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
#     G_mean = math.sqrt(recall * specificity)
    print(f" Fold {k + 1} | " )
    print(f" AUC_ROC: { roc_auc * 100}%" )
    print(f" ACC: { acc * 100}%" )
    print(f" F1: { f1 * 100}%" )
    print(f" RECALL: { recall * 100}%" )
    print(f" PRECISION: { precision * 100}%" )
    RF_f1_scores.append(f1)
    RF_roc_scores.append(roc_auc)
    RF_acc_scores.append(acc)
    RF_recall_scores.append(recall)
    RF_precision_scores.append(precision)
    
print(f'average roc score: {np.mean(RF_roc_scores)}')
print(f'average acc_score: {np.mean(RF_acc_scores)}')
print(f'average f1_score: {np.mean(RF_f1_scores)}')
print(f'average recall_score: {np.mean(RF_recall_scores)}')
print(f'average precision_score: {np.mean(RF_precision_scores)}')

第 1 折

 Fold 1 | 
 AUC_ROC: 84.66296917462253%
 ACC: 64.02481047553411%
 F1: 63.693272078574395%
 RECALL: 64.02481047553411%
 PRECISION: 63.62375037236222%
第 2 折

 Fold 2 | 
 AUC_ROC: 85.25494523686513%
 ACC: 65.32046864231565%
 F1: 64.94726947220771%
 RECALL: 65.32046864231565%
 PRECISION: 64.8727263544183%
第 3 折

 Fold 3 | 
 AUC_ROC: 85.28019385640386%
 ACC: 64.98966230186079%
 F1: 64.6946717972005%
 RECALL: 64.98966230186079%
 PRECISION: 64.67994425531754%
第 4 折

 Fold 4 | 
 AUC_ROC: 85.43184974623318%
 ACC: 65.51343900758097%
 F1: 65.06849815733709%
 RECALL: 65.51343900758097%
 PRECISION: 65.02139033108408%
第 5 折

 Fold 5 | 
 AUC_ROC: 85.4171212422545%
 ACC: 65.3893866299104%
 F1: 65.009078117473%
 RECALL: 65.3893866299104%
 PRECISION: 64.98833709318305%
第 6 折

 Fold 6 | 
 AUC_ROC: 85.62753082724791%
 ACC: 65.74776016540316%
 F1: 65.50561564114335%
 RECALL: 65.74776016540316%
 PRECISION: 65.51434277714124%
第 7 折

 Fold 7 | 
 AUC_ROC: 84.66230713431379%
 ACC: 63.818056512749834%
 F1

In [15]:
data = pd.read_csv("data_stand.csv")
data.shape
X = data.drop(columns=['risk'], axis=1)
y = data['risk']
kf = KFold(n_splits=10, shuffle=True, random_state=0)
splits = kf.split(X, y)
next(iter(splits))

(array([    0,     1,     2, ..., 72547, 72548, 72549]),
 array([    3,    14,    18, ..., 72525, 72542, 72544]))

In [16]:
# xgboost
import xgboost as xgb

XGBR_roc_scores = []
XGBR_acc_scores = []
XGBR_f1_scores = []
XGBR_recall_scores = []
XGBR_precision_scores = []
xg_feature_importances = pd.DataFrame(index=None)
xg_feature_importances['features'] = data.drop(['risk'], axis=1).columns

for k, (train_indices, test_indices) in enumerate(splits):
    print("第 %d 折\n" % (k + 1))
    X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
    y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]
    
    labels = [0, 1, 2, 3]
    y_one_hot = label_binarize(y_test, classes=labels)
    XGBR = xgb.XGBClassifier(random_state=0)
    XGBR.fit(X_train, y_train)
    xg_feature_importances[f'fold_{k+1}'] = XGBR.feature_importances_
    y_pred_prob = XGBR.predict_proba(X_test)
    y_pred = XGBR.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_one_hot, y_pred_prob, multi_class="ovo", average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
#     G_mean = math.sqrt(recall * specificity)
    print(f" Fold {k + 1} | " )
    print(f" AUC_ROC: { roc_auc * 100}%" )
    print(f" ACC: { acc * 100}%" )
    print(f" F1: { f1 * 100}%" )
    print(f" RECALL: { recall * 100}%" )
    print(f" PRECISION: { precision * 100}%" )
    XGBR_f1_scores.append(f1)
    XGBR_roc_scores.append(roc_auc)
    XGBR_acc_scores.append(acc)
    XGBR_recall_scores.append(recall)
    XGBR_precision_scores.append(precision)
    
print(f'average roc score: {np.mean(XGBR_roc_scores)}')
print(f'average acc_score: {np.mean(XGBR_acc_scores)}')
print(f'average f1_score: {np.mean(XGBR_f1_scores)}')
print(f'average recall_score: {np.mean(XGBR_recall_scores)}')
print(f'average precision_score: {np.mean(XGBR_precision_scores)}')

第 1 折

 Fold 1 | 
 AUC_ROC: 86.21155205816662%
 ACC: 65.58235699517574%
 F1: 64.9568512525852%
 RECALL: 65.58235699517574%
 PRECISION: 65.01598618826502%
第 2 折

 Fold 2 | 
 AUC_ROC: 86.61890467198347%
 ACC: 66.60234321157822%
 F1: 66.09829294472986%
 RECALL: 66.60234321157822%
 PRECISION: 66.20797932981401%
第 3 折

 Fold 3 | 
 AUC_ROC: 86.60464306484634%
 ACC: 66.32667126119918%
 F1: 65.79207206684619%
 RECALL: 66.32667126119918%
 PRECISION: 65.85808972814662%
第 4 折

 Fold 4 | 
 AUC_ROC: 86.80263810122085%
 ACC: 66.68504479669194%
 F1: 66.03402092611542%
 RECALL: 66.68504479669194%
 PRECISION: 66.03540501376753%
第 5 折

 Fold 5 | 
 AUC_ROC: 86.4234078525145%
 ACC: 66.58855961405928%
 F1: 66.07894632069517%
 RECALL: 66.58855961405928%
 PRECISION: 66.18344987563587%
第 6 折

 Fold 6 | 
 AUC_ROC: 86.77614040192404%
 ACC: 66.68504479669194%
 F1: 66.26554726048252%
 RECALL: 66.68504479669194%
 PRECISION: 66.38311709007755%
第 7 折

 Fold 7 | 
 AUC_ROC: 86.25481680192861%
 ACC: 66.2301860785665%
 

In [17]:
data = pd.read_csv("data_stand.csv")
import re
data = data.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
data.to_csv('data_stand_lgb.csv', index=False, encoding='utf_8_sig')

In [18]:
X = data.drop(columns=['risk'], axis=1)
y = data['risk']
kf = KFold(n_splits=10, shuffle=True, random_state=0)
splits = kf.split(X, y)
next(iter(splits))

(array([    0,     1,     2, ..., 72547, 72548, 72549]),
 array([    3,    14,    18, ..., 72525, 72542, 72544]))

In [19]:
#lightGbm
import lightgbm as lgb

lgb_roc_scores = []
lgb_acc_scores = []
lgb_f1_scores = []
lgb_recall_scores = []
lgb_precision_scores = []
lgb_feature_importances = pd.DataFrame(index=None)
lgb_feature_importances['features'] = data.drop(['risk'], axis=1).columns

for k, (train_indices, test_indices) in enumerate(splits):
    print("第 %d 折\n" % (k + 1))
    X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
    y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]
    
    labels = [0, 1, 2, 3]
    y_one_hot = label_binarize(y_test, classes=labels)
    
    LGB = lgb.LGBMClassifier(random_state=0)
    LGB.fit(X_train, y_train)
    lgb_feature_importances[f'fold_{k+1}'] = LGB.feature_importances_
    y_pred_prob = LGB.predict_proba(X_test)
    y_pred = LGB.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_one_hot, y_pred_prob, multi_class="ovo", average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
#     G_mean = math.sqrt(recall * specificity)
    print(f" Fold {k + 1} | " )
    print(f" AUC_ROC: { roc_auc * 100}%" )
    print(f" ACC: { acc * 100}%" )
    print(f" F1: { f1 * 100}%" )
    print(f" RECALL: { recall * 100}%" )
    print(f" PRECISION: { precision * 100}%" )
    lgb_f1_scores.append(f1)
    lgb_roc_scores.append(roc_auc)
    lgb_acc_scores.append(acc)
    lgb_recall_scores.append(recall)
    lgb_precision_scores.append(precision)
    
print(f'average roc score: {np.mean(lgb_roc_scores)}')
print(f'average acc_score: {np.mean(lgb_acc_scores)}')
print(f'average f1_score: {np.mean(lgb_f1_scores)}')
print(f'average recall_score: {np.mean(lgb_recall_scores)}')
print(f'average precision_score: {np.mean(lgb_precision_scores)}')

第 1 折

 Fold 1 | 
 AUC_ROC: 85.85809759088481%
 ACC: 65.0172294968987%
 F1: 64.36160155340579%
 RECALL: 65.0172294968987%
 PRECISION: 64.46476286898091%
第 2 折

 Fold 2 | 
 AUC_ROC: 86.39187779981141%
 ACC: 65.77532736044107%
 F1: 65.21255040086903%
 RECALL: 65.77532736044107%
 PRECISION: 65.39327879493487%
第 3 折

 Fold 3 | 
 AUC_ROC: 86.3827201498445%
 ACC: 65.85802894555479%
 F1: 65.34010733999519%
 RECALL: 65.85802894555479%
 PRECISION: 65.4679752339074%
第 4 折

 Fold 4 | 
 AUC_ROC: 86.5801876822008%
 ACC: 66.07856650585802%
 F1: 65.3717878147786%
 RECALL: 66.07856650585802%
 PRECISION: 65.41437522404117%
第 5 折

 Fold 5 | 
 AUC_ROC: 86.28189452583294%
 ACC: 66.25775327360441%
 F1: 65.70012295636057%
 RECALL: 66.25775327360441%
 PRECISION: 65.7644584806688%
第 6 折

 Fold 6 | 
 AUC_ROC: 86.63072072165295%
 ACC: 66.7539627842867%
 F1: 66.25352087508291%
 RECALL: 66.7539627842867%
 PRECISION: 66.38938668740424%
第 7 折

 Fold 7 | 
 AUC_ROC: 85.87288265106751%
 ACC: 65.52722260509994%
 F1: 64