In [1]:
import pandas as pd
import math
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
import lightgbm as lgb
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import label_binarize
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
from tqdm import tqdm_notebook
import joblib
import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv("../../2-数据转换/data_nor.csv")
data_columns = pd.read_csv("../../3-特征选取/data_columns.csv")
columns = data_columns.columns.values
columns = np.append(columns, "risk")
data = data[columns]
print(data)

          city   latitude   longitude                 attacktype1_txt  \
0        Cairo  37.005105  -89.176269                   Armed Assault   
1      Oakland  37.791927 -122.225906               Bombing/Explosion   
2      Madison  43.076592  -89.412488  Facility/Infrastructure Attack   
3      Madison  43.072950  -89.386694  Facility/Infrastructure Attack   
4       Denver  39.758968 -104.876305  Facility/Infrastructure Attack   
...        ...        ...         ...                             ...   
72545     Aden  12.849085   45.037275               Bombing/Explosion   
72546    Bheri  28.709444   82.163611  Facility/Infrastructure Attack   
72547    Sabaa  15.305307   43.019490               Bombing/Explosion   
72548    Kabul  34.523842   69.140304                   Armed Assault   
72549  Wichita  37.688889  -97.336111                   Assassination   

                                        targsubtype1_txt  \
0        Police Building (headquarters, station, school)   
1  

In [3]:
number_columns = [ col for col in data.columns if data[col].dtype != 'object' ]
number_columns.remove("risk")
#min-max
for col in number_columns:
    data[col] = (data[col] - data[col].min()) / (data[col].max() - data[col].min())
print(data)

          city  latitude  longitude                 attacktype1_txt  \
0        Cairo  0.681564   0.204755                   Armed Assault   
1      Oakland  0.688223   0.106546               Bombing/Explosion   
2      Madison  0.732945   0.204053  Facility/Infrastructure Attack   
3      Madison  0.732914   0.204130  Facility/Infrastructure Attack   
4       Denver  0.704869   0.158102  Facility/Infrastructure Attack   
...        ...       ...        ...                             ...   
72545     Aden  0.477141   0.603580               Bombing/Explosion   
72546    Bheri  0.611361   0.713903  Facility/Infrastructure Attack   
72547    Sabaa  0.497927   0.597584               Bombing/Explosion   
72548    Kabul  0.660566   0.675203                   Armed Assault   
72549  Wichita  0.687351   0.180508                   Assassination   

                                        targsubtype1_txt  \
0        Police Building (headquarters, station, school)   
1                          

In [4]:
data.to_csv('data_stand.csv', index=False, encoding='utf_8_sig')

In [5]:
data = pd.read_csv("data_stand.csv")
category_col = ['attacktype1_txt', 'targsubtype1_txt', 'weapsubtype1_txt', 'city']
data[category_col] = data[category_col].astype('category')
X = data.drop(columns=['risk'], axis=1)
y = data['risk']
kf = KFold(n_splits=10, shuffle=True, random_state=0)
splits = kf.split(X, y)
next(iter(splits))

(array([    0,     1,     2, ..., 72547, 72548, 72549]),
 array([    3,    14,    18, ..., 72525, 72542, 72544]))

In [6]:
#lightGbm
import lightgbm as lgb

lgb_roc_scores = []
lgb_acc_scores = []
lgb_f1_scores = []
lgb_recall_scores = []
lgb_precision_scores = []
lgb_feature_importances = pd.DataFrame(index=None)
lgb_feature_importances['features'] = data.drop(['risk'], axis=1).columns

for k, (train_indices, test_indices) in enumerate(splits):
    print("第 %d 折\n" % (k + 1))
    X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
    y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]
    
    labels = [0, 1, 2, 3]
    y_one_hot = label_binarize(y_test, classes=labels)
    
    LGB = lgb.LGBMClassifier(random_state=0)
    LGB.fit(X_train, y_train, categorical_feature=category_col)
    lgb_feature_importances[f'fold_{k+1}'] = LGB.feature_importances_
    y_pred_prob = LGB.predict_proba(X_test)
    y_pred = LGB.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_one_hot, y_pred_prob, multi_class="ovo", average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
#     G_mean = math.sqrt(recall * specificity)
    print(f" Fold {k + 1} | " )
    print(f" AUC_ROC: { roc_auc * 100}%" )
    print(f" ACC: { acc * 100}%" )
    print(f" F1: { f1 * 100}%" )
    print(f" RECALL: { recall * 100}%" )
    print(f" PRECISION: { precision * 100}%" )
    lgb_f1_scores.append(f1)
    lgb_roc_scores.append(roc_auc)
    lgb_acc_scores.append(acc)
    lgb_recall_scores.append(recall)
    lgb_precision_scores.append(precision)
    
print(f'average roc score: {np.mean(lgb_roc_scores)}')
print(f'average acc_score: {np.mean(lgb_acc_scores)}')
print(f'average f1_score: {np.mean(lgb_f1_scores)}')
print(f'average recall_score: {np.mean(lgb_recall_scores)}')
print(f'average precision_score: {np.mean(lgb_precision_scores)}')

第 1 折

 Fold 1 | 
 AUC_ROC: 86.42818504369288%
 ACC: 66.28532046864231%
 F1: 65.7951748251071%
 RECALL: 66.28532046864231%
 PRECISION: 65.83722757859618%
第 2 折

 Fold 2 | 
 AUC_ROC: 86.47952736592082%
 ACC: 66.1474844934528%
 F1: 65.72740190901843%
 RECALL: 66.1474844934528%
 PRECISION: 65.7545612809392%
第 3 折

 Fold 3 | 
 AUC_ROC: 86.6492288323521%
 ACC: 66.1474844934528%
 F1: 65.70565673970654%
 RECALL: 66.1474844934528%
 PRECISION: 65.68223586661016%
第 4 折

 Fold 4 | 
 AUC_ROC: 87.20226645302733%
 ACC: 66.74017918676775%
 F1: 66.12678951716553%
 RECALL: 66.74017918676775%
 PRECISION: 66.04349365816323%
第 5 折

 Fold 5 | 
 AUC_ROC: 86.52840656872715%
 ACC: 66.4369400413508%
 F1: 65.99978612709194%
 RECALL: 66.4369400413508%
 PRECISION: 65.98509425812966%
第 6 折

 Fold 6 | 
 AUC_ROC: 86.82200372233049%
 ACC: 66.79531357684355%
 F1: 66.43941097544366%
 RECALL: 66.79531357684355%
 PRECISION: 66.44951351078194%
第 7 折

 Fold 7 | 
 AUC_ROC: 86.32005232513141%
 ACC: 66.03721571330117%
 F1: 65