In [1]:
import pandas as pd
import math
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
import lightgbm as lgb
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import label_binarize
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
from tqdm import tqdm_notebook
import joblib
import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv("../../2-数据转换/data_nor.csv")
data_columns = pd.read_csv("../../3-特征选取/data_columns.csv")
columns = data_columns.columns.values
columns = np.append(columns, "risk")
data = data[columns]
print(data)

          city   latitude   longitude                 attacktype1_txt  \
0        Cairo  37.005105  -89.176269                   Armed Assault   
1      Oakland  37.791927 -122.225906               Bombing/Explosion   
2      Madison  43.076592  -89.412488  Facility/Infrastructure Attack   
3      Madison  43.072950  -89.386694  Facility/Infrastructure Attack   
4       Denver  39.758968 -104.876305  Facility/Infrastructure Attack   
...        ...        ...         ...                             ...   
72545     Aden  12.849085   45.037275               Bombing/Explosion   
72546    Bheri  28.709444   82.163611  Facility/Infrastructure Attack   
72547    Sabaa  15.305307   43.019490               Bombing/Explosion   
72548    Kabul  34.523842   69.140304                   Armed Assault   
72549  Wichita  37.688889  -97.336111                   Assassination   

                                        targsubtype1_txt  \
0        Police Building (headquarters, station, school)   
1  

In [3]:
# LabelEncoder
encoder = LabelEncoder()
encoder.fit(list(data["city"].values))
data["city"] = encoder.transform(list(data["city"].values))

number_columns = [ col for col in data.columns if data[col].dtype != 'object' ]
number_columns.remove("risk")
#min-max
for col in number_columns:
    data[col] = (data[col] - data[col].min()) / (data[col].max() - data[col].min())

# one-hot
category_columns = [ col for col in data.columns if data[col].dtype == 'object' ]
data = pd.get_dummies(data, columns=category_columns)
print(data)

           city  latitude  longitude      date  risk  \
0      0.181904  0.681564   0.204755  0.500000     1   
1      0.688351  0.688223   0.106546  0.666667     1   
2      0.558561  0.732945   0.204053  0.666667     1   
3      0.558561  0.732914   0.204130  0.833333     1   
4      0.249211  0.704869   0.158102  0.166667     1   
...         ...       ...        ...       ...   ...   
72545  0.009873  0.477141   0.603580  0.500000     1   
72546  0.142187  0.611361   0.713903  0.500000     1   
72547  0.787440  0.497927   0.597584  0.500000     1   
72548  0.420521  0.660566   0.675203  0.500000     2   
72549  0.974123  0.687351   0.180508  0.500000     2   

       attacktype1_txt_Armed Assault  attacktype1_txt_Assassination  \
0                                  1                              0   
1                                  0                              0   
2                                  0                              0   
3                                  0       

In [4]:
data.to_csv('data_stand.csv', index=False, encoding='utf_8_sig')

In [5]:
data = pd.read_csv("data_stand.csv")
data.shape
X = data.drop(columns=['risk'], axis=1)
y = data['risk']
kf = KFold(n_splits=10, shuffle=True, random_state=0)
splits = kf.split(X, y)
next(iter(splits))

(array([    0,     1,     2, ..., 72547, 72548, 72549]),
 array([    3,    14,    18, ..., 72525, 72542, 72544]))

In [6]:
#LR
from sklearn.linear_model import LogisticRegression

data = pd.read_csv("data_stand.csv")
data.shape
X = data.drop(columns=['risk'], axis=1)
y = data['risk']
kf = KFold(n_splits=5)
splits = kf.split(X, y)
# next(iter(splits))

lr_roc_scores = []
lr_precision_scores = []
lr_recall_scores = []
lr_f1_scores = []
lr_acc_scores = []
lr_feature_importances = pd.DataFrame(index=None)
lr_feature_importances['features'] = data.drop(['risk'], axis=1).columns

for k, (train_indices, test_indices) in enumerate(splits):
    print("第 %d 折\n" % (k + 1))
    X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
    y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]
    
    labels = [0, 1, 2, 3]
    y_one_hot = label_binarize(y_test, classes=labels)
    LR = LogisticRegression(random_state=0)
    LR.fit(X_train, y_train)
#     lr_feature_importances[f'fold_{k+1}'] = LR.feature_importance
    y_pred_prob =LR.predict_proba(X_test)
    y_pred=LR.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_one_hot, y_pred_prob, multi_class="ovo", average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
#     G_mean = math.sqrt(recall * specificity)
    print(f" Fold {k + 1} | " )
    print(f" AUC_ROC: { roc_auc * 100}%" )
    print(f" ACC: { acc * 100}%" )
    print(f" F1: { f1 * 100}%" )
    print(f" RECALL: { recall * 100}%" )
    print(f" PRECISION: { precision * 100}%" )
    lr_f1_scores.append(f1)
    lr_roc_scores.append(roc_auc)
    lr_acc_scores.append(acc)
    lr_recall_scores.append(recall)
    lr_precision_scores.append(precision)
    
print(f'average roc score: {np.mean(lr_roc_scores)}')
print(f'average acc_score: {np.mean(lr_acc_scores)}')
print(f'average f1_score: {np.mean(lr_f1_scores)}')
print(f'average recall_score: {np.mean(lr_recall_scores)}')
print(f'average precision_score: {np.mean(lr_precision_scores)}')

第 1 折

 Fold 1 | 
 AUC_ROC: 82.49604323134217%
 ACC: 59.441764300482426%
 F1: 59.09594602697749%
 RECALL: 59.441764300482426%
 PRECISION: 60.333972751802435%
第 2 折

 Fold 2 | 
 AUC_ROC: 78.16852291174453%
 ACC: 56.03032391454169%
 F1: 54.617106251795164%
 RECALL: 56.03032391454169%
 PRECISION: 55.50435849297429%
第 3 折

 Fold 3 | 
 AUC_ROC: 81.04962286424488%
 ACC: 59.29014472777395%
 F1: 58.249261580255066%
 RECALL: 59.29014472777395%
 PRECISION: 58.66007527768353%
第 4 折

 Fold 4 | 
 AUC_ROC: 84.5960822741468%
 ACC: 65.03790489317713%
 F1: 63.15675628449565%
 RECALL: 65.03790489317713%
 PRECISION: 64.49626792868331%
第 5 折

 Fold 5 | 
 AUC_ROC: 83.80813451245825%
 ACC: 63.383873190902825%
 F1: 62.084705213926014%
 RECALL: 63.383873190902825%
 PRECISION: 62.073906838550094%
average roc score: 0.8202368115878731
average acc_score: 0.6063680220537561
average f1_score: 0.5944075507148987
average recall_score: 0.6063680220537561
average precision_score: 0.6021371625793874


In [7]:
data = pd.read_csv("data_stand.csv")
data.shape
X = data.drop(columns=['risk'], axis=1)
y = data['risk']
kf = KFold(n_splits=10, shuffle=True, random_state=0)
splits = kf.split(X, y)
next(iter(splits))

(array([    0,     1,     2, ..., 72547, 72548, 72549]),
 array([    3,    14,    18, ..., 72525, 72542, 72544]))

In [8]:
# adboost
from sklearn.ensemble import AdaBoostClassifier as ad

Ad_roc_scores = []
Ad_acc_scores = []
Ad_f1_scores = []
Ad_recall_scores = []
Ad_precision_scores = []
Ad_feature_importances = pd.DataFrame(index=None)
Ad_feature_importances['features'] = data.drop(['risk'], axis=1).columns

for k, (train_indices, test_indices) in enumerate(splits):
    print("第 %d 折\n" % (k + 1))
    X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
    y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]
    
    labels = [0, 1, 2, 3]
    y_one_hot = label_binarize(y_test, classes=labels)
    Ad = ad(random_state=0)
    Ad.fit(X_train, y_train)
    Ad_feature_importances[f'fold_{k+1}'] = Ad.feature_importances_
    y_pred_prob = Ad.predict_proba(X_test)
    y_pred = Ad.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_one_hot, y_pred_prob, multi_class="ovo", average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
#     G_mean = math.sqrt(recall * specificity)
    print(f" Fold {k + 1} | " )
    print(f" AUC_ROC: { roc_auc * 100}%" )
    print(f" ACC: { acc * 100}%" )
    print(f" F1: { f1 * 100}%" )
    print(f" RECALL: { recall * 100}%" )
    print(f" PRECISION: { precision * 100}%" )
    Ad_f1_scores.append(f1)
    Ad_roc_scores.append(roc_auc)
    Ad_acc_scores.append(acc)
    Ad_recall_scores.append(recall)
    Ad_precision_scores.append(precision)
    
print(f'average roc score: {np.mean(Ad_roc_scores)}')
print(f'average acc_score: {np.mean(Ad_acc_scores)}')
print(f'average f1_score: {np.mean(Ad_f1_scores)}')
print(f'average recall_score: {np.mean(Ad_recall_scores)}')
print(f'average precision_score: {np.mean(Ad_precision_scores)}')

第 1 折

 Fold 1 | 
 AUC_ROC: 78.79635307243944%
 ACC: 59.324603721571336%
 F1: 58.00040153660405%
 RECALL: 59.324603721571336%
 PRECISION: 58.1297283093992%
第 2 折

 Fold 2 | 
 AUC_ROC: 79.48432745145978%
 ACC: 60.56512749827705%
 F1: 58.95260613328297%
 RECALL: 60.56512749827705%
 PRECISION: 59.17356959235962%
第 3 折

 Fold 3 | 
 AUC_ROC: 79.42628434528743%
 ACC: 60.44107512060648%
 F1: 58.70423708407946%
 RECALL: 60.44107512060648%
 PRECISION: 59.038399310101966%
第 4 折

 Fold 4 | 
 AUC_ROC: 80.01608408016342%
 ACC: 61.29565816678153%
 F1: 59.582415582196546%
 RECALL: 61.29565816678153%
 PRECISION: 59.617362081664396%
第 5 折

 Fold 5 | 
 AUC_ROC: 79.49392396301458%
 ACC: 60.248104755341146%
 F1: 58.62316043020183%
 RECALL: 60.248104755341146%
 PRECISION: 58.67051641806753%
第 6 折

 Fold 6 | 
 AUC_ROC: 79.786010737791%
 ACC: 60.57891109579601%
 F1: 59.42941147466908%
 RECALL: 60.57891109579601%
 PRECISION: 59.804366795429345%
第 7 折

 Fold 7 | 
 AUC_ROC: 79.48743926029978%
 ACC: 60.744314266

In [9]:
data = pd.read_csv("data_stand.csv")
data.shape
X = data.drop(columns=['risk'], axis=1)
y = data['risk']
kf = KFold(n_splits=10, shuffle=True, random_state=0)
splits = kf.split(X, y)
next(iter(splits))

(array([    0,     1,     2, ..., 72547, 72548, 72549]),
 array([    3,    14,    18, ..., 72525, 72542, 72544]))

In [10]:
# DT

DT_roc_scores = []
DT_acc_scores = []
DT_f1_scores = []
DT_recall_scores = []
DT_precision_scores = []
DT_feature_importances = pd.DataFrame(index=None)
DT_feature_importances['features'] = data.drop(['risk'], axis=1).columns

for k, (train_indices, test_indices) in enumerate(splits):
    print("第 %d 折\n" % (k + 1))
    X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
    y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]
    
    labels = [0, 1, 2, 3]
    y_one_hot = label_binarize(y_test, classes=labels)
    DT = DecisionTreeClassifier(random_state=0)
    DT.fit(X_train, y_train)
    DT_feature_importances[f'fold_{k+1}'] = DT.feature_importances_
    y_pred_prob = DT.predict_proba(X_test)
    y_pred = DT.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_one_hot, y_pred_prob, multi_class="ovo", average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
#     G_mean = math.sqrt(recall * specificity)
    print(f" Fold {k + 1} | " )
    print(f" AUC_ROC: { roc_auc * 100}%" )
    print(f" ACC: { acc * 100}%" )
    print(f" F1: { f1 * 100}%" )
    print(f" RECALL: { recall * 100}%" )
    print(f" PRECISION: { precision * 100}%" )
    DT_f1_scores.append(f1)
    DT_roc_scores.append(roc_auc)
    DT_acc_scores.append(acc)
    DT_recall_scores.append(recall)
    DT_precision_scores.append(precision)
    
print(f'average roc score: {np.mean(DT_roc_scores)}')
print(f'average acc_score: {np.mean(DT_acc_scores)}')
print(f'average f1_score: {np.mean(DT_f1_scores)}')
print(f'average recall_score: {np.mean(DT_recall_scores)}')
print(f'average precision_score: {np.mean(DT_precision_scores)}')

第 1 折

 Fold 1 | 
 AUC_ROC: 72.13917992199204%
 ACC: 58.78704341833219%
 F1: 58.69278257189928%
 RECALL: 58.78704341833219%
 PRECISION: 58.62208040598962%
第 2 折

 Fold 2 | 
 AUC_ROC: 72.01417454210402%
 ACC: 58.63542384562371%
 F1: 58.54700966756985%
 RECALL: 58.63542384562371%
 PRECISION: 58.49525794132116%
第 3 折

 Fold 3 | 
 AUC_ROC: 72.62716488565995%
 ACC: 59.4486560992419%
 F1: 59.439018010382625%
 RECALL: 59.4486560992419%
 PRECISION: 59.43093059833847%
第 4 折

 Fold 4 | 
 AUC_ROC: 72.82420732153649%
 ACC: 59.54514128187457%
 F1: 59.609357468690185%
 RECALL: 59.54514128187457%
 PRECISION: 59.69044697624839%
第 5 折

 Fold 5 | 
 AUC_ROC: 72.06785755607744%
 ACC: 58.552722260509995%
 F1: 58.563547136009085%
 RECALL: 58.552722260509995%
 PRECISION: 58.57596868074133%
第 6 折

 Fold 6 | 
 AUC_ROC: 71.86460142216212%
 ACC: 58.33218470020676%
 F1: 58.37673497437347%
 RECALL: 58.33218470020676%
 PRECISION: 58.45222750819171%
第 7 折

 Fold 7 | 
 AUC_ROC: 71.59296954440413%
 ACC: 58.22191592005

In [11]:
data = pd.read_csv("data_stand.csv")
data.shape
X = data.drop(columns=['risk'], axis=1)
y = data['risk']
kf = KFold(n_splits=10, shuffle=True, random_state=0)
splits = kf.split(X, y)
next(iter(splits))

(array([    0,     1,     2, ..., 72547, 72548, 72549]),
 array([    3,    14,    18, ..., 72525, 72542, 72544]))

In [12]:
# RF
from sklearn.ensemble import RandomForestClassifier

RF_roc_scores = []
RF_acc_scores = []
RF_f1_scores = []
RF_recall_scores = []
RF_precision_scores = []
RFC_feature_importances = pd.DataFrame(index=None)
RFC_feature_importances['features'] = data.drop(['risk'], axis=1).columns

for k, (train_indices, test_indices) in enumerate(splits):
    print("第 %d 折\n" % (k + 1))
    X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
    y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]
    
    labels = [0, 1, 2, 3]
    y_one_hot = label_binarize(y_test, classes=labels)
    RFC = RandomForestClassifier(random_state=0)
    RFC.fit(X_train, y_train)
    RFC_feature_importances[f'fold_{k+1}'] = RFC.feature_importances_
    y_pred_prob = RFC.predict_proba(X_test)
    y_pred = RFC.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_one_hot, y_pred_prob, multi_class="ovo", average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
#     G_mean = math.sqrt(recall * specificity)
    print(f" Fold {k + 1} | " )
    print(f" AUC_ROC: { roc_auc * 100}%" )
    print(f" ACC: { acc * 100}%" )
    print(f" F1: { f1 * 100}%" )
    print(f" RECALL: { recall * 100}%" )
    print(f" PRECISION: { precision * 100}%" )
    RF_f1_scores.append(f1)
    RF_roc_scores.append(roc_auc)
    RF_acc_scores.append(acc)
    RF_recall_scores.append(recall)
    RF_precision_scores.append(precision)
    
print(f'average roc score: {np.mean(RF_roc_scores)}')
print(f'average acc_score: {np.mean(RF_acc_scores)}')
print(f'average f1_score: {np.mean(RF_f1_scores)}')
print(f'average recall_score: {np.mean(RF_recall_scores)}')
print(f'average precision_score: {np.mean(RF_precision_scores)}')

第 1 折

 Fold 1 | 
 AUC_ROC: 85.54233230664742%
 ACC: 65.41695382494831%
 F1: 65.11871854706473%
 RECALL: 65.41695382494831%
 PRECISION: 65.03216986929989%
第 2 折

 Fold 2 | 
 AUC_ROC: 85.91389061725218%
 ACC: 65.56857339765679%
 F1: 65.20639012210154%
 RECALL: 65.56857339765679%
 PRECISION: 65.13198411997652%
第 3 折

 Fold 3 | 
 AUC_ROC: 85.82973640467654%
 ACC: 65.73397656788423%
 F1: 65.43391452035343%
 RECALL: 65.73397656788423%
 PRECISION: 65.40413953828443%
第 4 折

 Fold 4 | 
 AUC_ROC: 86.29465447392774%
 ACC: 66.4369400413508%
 F1: 66.06845052194431%
 RECALL: 66.4369400413508%
 PRECISION: 65.9786713157959%
第 5 折

 Fold 5 | 
 AUC_ROC: 86.19378195693862%
 ACC: 66.21640248104755%
 F1: 65.84881550169248%
 RECALL: 66.21640248104755%
 PRECISION: 65.7568837754219%
第 6 折

 Fold 6 | 
 AUC_ROC: 86.06706095064331%
 ACC: 66.05099931082012%
 F1: 65.79759117319558%
 RECALL: 66.05099931082012%
 PRECISION: 65.7700592100558%
第 7 折

 Fold 7 | 
 AUC_ROC: 85.53590170152646%
 ACC: 65.3618194348725%
 F1:

In [13]:
data = pd.read_csv("data_stand.csv")
data.shape
X = data.drop(columns=['risk'], axis=1)
y = data['risk']
kf = KFold(n_splits=10, shuffle=True, random_state=0)
splits = kf.split(X, y)
next(iter(splits))

(array([    0,     1,     2, ..., 72547, 72548, 72549]),
 array([    3,    14,    18, ..., 72525, 72542, 72544]))

In [14]:
# xgboost
import xgboost as xgb

XGBR_roc_scores = []
XGBR_acc_scores = []
XGBR_f1_scores = []
XGBR_recall_scores = []
XGBR_precision_scores = []
xg_feature_importances = pd.DataFrame(index=None)
xg_feature_importances['features'] = data.drop(['risk'], axis=1).columns

for k, (train_indices, test_indices) in enumerate(splits):
    print("第 %d 折\n" % (k + 1))
    X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
    y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]
    
    labels = [0, 1, 2, 3]
    y_one_hot = label_binarize(y_test, classes=labels)
    XGBR = xgb.XGBClassifier(random_state=0)
    XGBR.fit(X_train, y_train)
    xg_feature_importances[f'fold_{k+1}'] = XGBR.feature_importances_
    y_pred_prob = XGBR.predict_proba(X_test)
    y_pred = XGBR.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_one_hot, y_pred_prob, multi_class="ovo", average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
#     G_mean = math.sqrt(recall * specificity)
    print(f" Fold {k + 1} | " )
    print(f" AUC_ROC: { roc_auc * 100}%" )
    print(f" ACC: { acc * 100}%" )
    print(f" F1: { f1 * 100}%" )
    print(f" RECALL: { recall * 100}%" )
    print(f" PRECISION: { precision * 100}%" )
    XGBR_f1_scores.append(f1)
    XGBR_roc_scores.append(roc_auc)
    XGBR_acc_scores.append(acc)
    XGBR_recall_scores.append(recall)
    XGBR_precision_scores.append(precision)
    
print(f'average roc score: {np.mean(XGBR_roc_scores)}')
print(f'average acc_score: {np.mean(XGBR_acc_scores)}')
print(f'average f1_score: {np.mean(XGBR_f1_scores)}')
print(f'average recall_score: {np.mean(XGBR_recall_scores)}')
print(f'average precision_score: {np.mean(XGBR_precision_scores)}')

第 1 折

 Fold 1 | 
 AUC_ROC: 85.97520071535637%
 ACC: 65.73397656788423%
 F1: 65.14907277217245%
 RECALL: 65.73397656788423%
 PRECISION: 65.2219019990898%
第 2 折

 Fold 2 | 
 AUC_ROC: 86.36657801127483%
 ACC: 66.31288766368021%
 F1: 65.71648741038888%
 RECALL: 66.31288766368021%
 PRECISION: 65.91107999072831%
第 3 折

 Fold 3 | 
 AUC_ROC: 86.55242282427733%
 ACC: 65.95451412818745%
 F1: 65.33690000232603%
 RECALL: 65.95451412818745%
 PRECISION: 65.42044697035986%
第 4 折

 Fold 4 | 
 AUC_ROC: 86.73855772085352%
 ACC: 66.65747760165404%
 F1: 65.93890163824125%
 RECALL: 66.65747760165404%
 PRECISION: 65.99464633720639%
第 5 折

 Fold 5 | 
 AUC_ROC: 86.34719230901895%
 ACC: 66.7539627842867%
 F1: 66.20111712621201%
 RECALL: 66.7539627842867%
 PRECISION: 66.29108414247048%
第 6 折

 Fold 6 | 
 AUC_ROC: 86.62860719712238%
 ACC: 66.29910406616126%
 F1: 65.86095606366852%
 RECALL: 66.29910406616126%
 PRECISION: 66.00193884765106%
第 7 折

 Fold 7 | 
 AUC_ROC: 86.11360671168973%
 ACC: 65.69262577532736%
 

In [15]:
data = pd.read_csv("data_stand.csv")
import re
data = data.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
data.to_csv('data_stand_lgb.csv', index=False, encoding='utf_8_sig')

In [16]:
X = data.drop(columns=['risk'], axis=1)
y = data['risk']
kf = KFold(n_splits=10, shuffle=True, random_state=0)
splits = kf.split(X, y)
next(iter(splits))

(array([    0,     1,     2, ..., 72547, 72548, 72549]),
 array([    3,    14,    18, ..., 72525, 72542, 72544]))

In [17]:
#lightGbm
import lightgbm as lgb

lgb_roc_scores = []
lgb_acc_scores = []
lgb_f1_scores = []
lgb_recall_scores = []
lgb_precision_scores = []
lgb_feature_importances = pd.DataFrame(index=None)
lgb_feature_importances['features'] = data.drop(['risk'], axis=1).columns

for k, (train_indices, test_indices) in enumerate(splits):
    print("第 %d 折\n" % (k + 1))
    X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
    y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]
    
    labels = [0, 1, 2, 3]
    y_one_hot = label_binarize(y_test, classes=labels)
    
    LGB = lgb.LGBMClassifier(random_state=0)
    LGB.fit(X_train, y_train)
    lgb_feature_importances[f'fold_{k+1}'] = LGB.feature_importances_
    y_pred_prob = LGB.predict_proba(X_test)
    y_pred = LGB.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_one_hot, y_pred_prob, multi_class="ovo", average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
#     G_mean = math.sqrt(recall * specificity)
    print(f" Fold {k + 1} | " )
    print(f" AUC_ROC: { roc_auc * 100}%" )
    print(f" ACC: { acc * 100}%" )
    print(f" F1: { f1 * 100}%" )
    print(f" RECALL: { recall * 100}%" )
    print(f" PRECISION: { precision * 100}%" )
    lgb_f1_scores.append(f1)
    lgb_roc_scores.append(roc_auc)
    lgb_acc_scores.append(acc)
    lgb_recall_scores.append(recall)
    lgb_precision_scores.append(precision)
    
print(f'average roc score: {np.mean(lgb_roc_scores)}')
print(f'average acc_score: {np.mean(lgb_acc_scores)}')
print(f'average f1_score: {np.mean(lgb_f1_scores)}')
print(f'average recall_score: {np.mean(lgb_recall_scores)}')
print(f'average precision_score: {np.mean(lgb_precision_scores)}')

第 1 折

 Fold 1 | 
 AUC_ROC: 85.97613750882238%
 ACC: 65.66505858028945%
 F1: 65.03095282403936%
 RECALL: 65.66505858028945%
 PRECISION: 65.11245361804399%
第 2 折

 Fold 2 | 
 AUC_ROC: 86.38685370849292%
 ACC: 65.58235699517574%
 F1: 64.9565557080106%
 RECALL: 65.58235699517574%
 PRECISION: 65.12414029014823%
第 3 折

 Fold 3 | 
 AUC_ROC: 86.5793931169096%
 ACC: 65.9131633356306%
 F1: 65.26310277180221%
 RECALL: 65.9131633356306%
 PRECISION: 65.36781568641574%
第 4 折

 Fold 4 | 
 AUC_ROC: 86.71914264783017%
 ACC: 66.56099241902137%
 F1: 65.84455393795358%
 RECALL: 66.56099241902137%
 PRECISION: 65.94293554487005%
第 5 折

 Fold 5 | 
 AUC_ROC: 86.26815135399791%
 ACC: 66.18883528600965%
 F1: 65.53447277657348%
 RECALL: 66.18883528600965%
 PRECISION: 65.63601055803275%
第 6 折

 Fold 6 | 
 AUC_ROC: 86.58632841384896%
 ACC: 66.21640248104755%
 F1: 65.69887259162218%
 RECALL: 66.21640248104755%
 PRECISION: 65.88151598931444%
第 7 折

 Fold 7 | 
 AUC_ROC: 86.1802615586895%
 ACC: 65.77532736044107%
 F1