In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import re
import nltk
from nltk.book import *
from nltk.corpus import stopwords
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
import lightgbm as lgb
from sklearn.preprocessing import label_binarize
from sklearn.tree import DecisionTreeRegressor
from tqdm import tqdm_notebook
import joblib
from collections import Counter
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from collections import defaultdict
from gensim import corpora
from gensim import models
import warnings
warnings.filterwarnings('ignore')

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [8]:
# Load the incident table, then narrow it to the columns named in the header of
# data_columns.csv plus the target ("risk") and the free-text "summary" column.
# NOTE(review): data_columns.csv presumably holds the output of the
# feature-selection step — confirm against the notebook that writes it.
data = pd.read_csv("../../2-数据转换/data_nor.csv")
data_columns = pd.read_csv("../../3-特征选取/data_columns.csv")
columns = np.concatenate((data_columns.columns.values, ["risk", "summary"]))
data = data.loc[:, columns]
print(data)

          city   latitude   longitude                 attacktype1_txt  \
0        Cairo  37.005105  -89.176269                   Armed Assault   
1      Oakland  37.791927 -122.225906               Bombing/Explosion   
2      Madison  43.076592  -89.412488  Facility/Infrastructure Attack   
3      Madison  43.072950  -89.386694  Facility/Infrastructure Attack   
4       Denver  39.758968 -104.876305  Facility/Infrastructure Attack   
...        ...        ...         ...                             ...   
72545     Aden  12.849085   45.037275               Bombing/Explosion   
72546    Bheri  28.709444   82.163611  Facility/Infrastructure Attack   
72547    Sabaa  15.305307   43.019490               Bombing/Explosion   
72548    Kabul  34.523842   69.140304                   Armed Assault   
72549  Wichita  37.688889  -97.336111                   Assassination   

                                        targsubtype1_txt  \
0        Police Building (headquarters, station, school)   
1  

In [9]:
# Label-encode the city names into integer codes so the column becomes numeric.
encoder = LabelEncoder()
data["city"] = encoder.fit_transform(data["city"].values)

# Every non-object column except the target is rescaled; this includes the
# freshly encoded "city" codes.
number_columns = [col for col in data.columns if data[col].dtype != 'object']
number_columns.remove("risk")

# Min-max scaling to [0, 1].
for col in number_columns:
    col_min = data[col].min()
    col_range = data[col].max() - col_min
    if col_range == 0:
        # A constant column would otherwise become 0/0 = NaN everywhere;
        # map it to a constant 0.0 instead.
        data[col] = 0.0
    else:
        data[col] = (data[col] - col_min) / col_range
print(data)

           city  latitude  longitude                 attacktype1_txt  \
0      0.181904  0.681564   0.204755                   Armed Assault   
1      0.688351  0.688223   0.106546               Bombing/Explosion   
2      0.558561  0.732945   0.204053  Facility/Infrastructure Attack   
3      0.558561  0.732914   0.204130  Facility/Infrastructure Attack   
4      0.249211  0.704869   0.158102  Facility/Infrastructure Attack   
...         ...       ...        ...                             ...   
72545  0.009873  0.477141   0.603580               Bombing/Explosion   
72546  0.142187  0.611361   0.713903  Facility/Infrastructure Attack   
72547  0.787440  0.497927   0.597584               Bombing/Explosion   
72548  0.420521  0.660566   0.675203                   Armed Assault   
72549  0.974123  0.687351   0.180508                   Assassination   

                                        targsubtype1_txt  \
0        Police Building (headquarters, station, school)   
1              

In [10]:
# Snapshot the scaled table, still including the raw "summary" text, to disk.
# utf_8_sig writes a BOM so the file opens correctly in spreadsheet tools.
data.to_csv(path_or_buf='data_txt.csv', encoding='utf_8_sig', index=False)

In [11]:
# Bag
# Cache for the stop-word set so the NLTK corpus is read only once per kernel
# instead of once per document.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them on first use."""
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE["english"]


def review_to_words( raw_review ):
    """Normalise one raw text (in this notebook, an incident summary) for bag-of-words.

    Steps:
      1. Strip HTML markup.
      2. Replace every character that is not an ASCII letter with a space.
      3. Lower-case and split on whitespace.
      4. Drop English stop words.

    Parameters
    ----------
    raw_review : str
        Raw text, possibly containing HTML.

    Returns
    -------
    str
        The remaining words joined by single spaces (empty string if none remain).
    """
    # The parser is named explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed, so results can differ between machines.
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()

    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    words = letters_only.lower().split()

    # Set membership is O(1); the set itself is built once and reused.
    stops = _english_stopwords()
    meaningful_words = [w for w in words if w not in stops]

    return " ".join(meaningful_words)

In [12]:
# Lower-case every summary in one vectorised pass.
# The previous per-row `data.summary[row] = ...` was chained assignment: it is
# slow, looks rows up by label rather than position, and is not guaranteed to
# write back into `data` (it is a no-op under pandas copy-on-write).
# Missing values stay missing instead of raising.
data["summary"] = data["summary"].str.lower()
print(data)

  0%|          | 0/72550 [00:00<?, ?it/s]

           city  latitude  longitude                 attacktype1_txt  \
0      0.181904  0.681564   0.204755                   Armed Assault   
1      0.688351  0.688223   0.106546               Bombing/Explosion   
2      0.558561  0.732945   0.204053  Facility/Infrastructure Attack   
3      0.558561  0.732914   0.204130  Facility/Infrastructure Attack   
4      0.249211  0.704869   0.158102  Facility/Infrastructure Attack   
...         ...       ...        ...                             ...   
72545  0.009873  0.477141   0.603580               Bombing/Explosion   
72546  0.142187  0.611361   0.713903  Facility/Infrastructure Attack   
72547  0.787440  0.497927   0.597584               Bombing/Explosion   
72548  0.420521  0.660566   0.675203                   Armed Assault   
72549  0.974123  0.687351   0.180508                   Assassination   

                                        targsubtype1_txt  \
0        Police Building (headquarters, station, school)   
1              

In [13]:
# Run every summary through review_to_words and collect the cleaned strings
# in row order; a progress line is printed at every 1000th summary.
num_reviews = data["summary"].size
print("Cleaning and parsing the training set reviews...\n")
clean_train_reviews = []
for position in range(1, num_reviews + 1):
    # `position` is 1-based so it can be printed directly.
    if position % 1000 == 0:
        print("Review %d of %d" % ( position, num_reviews ))
    cleaned = review_to_words( data["summary"][position - 1] )
    clean_train_reviews.append( cleaned )

Cleaning and parsing the training set reviews...

Review 1000 of 72550
Review 2000 of 72550
Review 3000 of 72550
Review 4000 of 72550
Review 5000 of 72550
Review 6000 of 72550
Review 7000 of 72550
Review 8000 of 72550
Review 9000 of 72550
Review 10000 of 72550
Review 11000 of 72550
Review 12000 of 72550
Review 13000 of 72550
Review 14000 of 72550
Review 15000 of 72550
Review 16000 of 72550
Review 17000 of 72550
Review 18000 of 72550
Review 19000 of 72550
Review 20000 of 72550
Review 21000 of 72550
Review 22000 of 72550
Review 23000 of 72550
Review 24000 of 72550
Review 25000 of 72550
Review 26000 of 72550
Review 27000 of 72550
Review 28000 of 72550
Review 29000 of 72550
Review 30000 of 72550
Review 31000 of 72550
Review 32000 of 72550
Review 33000 of 72550
Review 34000 of 72550
Review 35000 of 72550
Review 36000 of 72550
Review 37000 of 72550
Review 38000 of 72550
Review 39000 of 72550
Review 40000 of 72550
Review 41000 of 72550
Review 42000 of 72550
Review 43000 of 72550
Review 44000 

In [14]:
print( "Creating the bag of words...\n")
from sklearn.feature_extraction.text import CountVectorizer

# For each vocabulary size (100, 200, ..., 1400) fit a bag-of-words model on
# the cleaned summaries, append the counts to the tabular features as columns
# summary_0 .. summary_{n-1}, drop the raw text column and write one CSV per size.
for i in tqdm_notebook(range(100, 1500, 100)):
    vectorizer = CountVectorizer(
        analyzer="word",
        tokenizer=None,
        preprocessor=None,
        stop_words=None,
        max_features=i,
    )
    train_data_features = vectorizer.fit_transform(clean_train_reviews)

    # Dense array so the counts can be wrapped in a DataFrame.
    train_data_featuress = train_data_features.toarray()
    column_summary = [
        "summary_" + str(j) for j in range(train_data_featuress.shape[1])
    ]
    data_summary = pd.DataFrame(train_data_featuress, columns=column_summary)

    # Column-wise concatenation aligns the two frames on their row index.
    data_concat = pd.concat(
        [data, data_summary], axis=1, ignore_index=False
    ).drop(columns=['summary'])

    name = "data_bow_nt_" + str(i) + '.csv'
    data_concat.to_csv(name, index=False, encoding='utf_8_sig')
    print(name)

Creating the bag of words...



  0%|          | 0/14 [00:00<?, ?it/s]

data_bow_nt_100.csv
data_bow_nt_200.csv
data_bow_nt_300.csv
data_bow_nt_400.csv
data_bow_nt_500.csv
data_bow_nt_600.csv
data_bow_nt_700.csv
data_bow_nt_800.csv
data_bow_nt_900.csv
data_bow_nt_1000.csv
data_bow_nt_1100.csv
data_bow_nt_1200.csv
data_bow_nt_1300.csv
data_bow_nt_1400.csv


In [15]:
def fitness_func(solution, solution_idx):
    """Fitness callback for the genetic algorithm: score one hyper-parameter vector.

    The seven genes are, in order: learning_rate, min_child_samples, max_depth,
    num_leaves, colsample_bytree, reg_alpha, reg_lambda. The integer-valued
    ones are truncated with int().

    Reads the module-level X_train, y_train, X_test, y_test and category_col
    that the calling cell sets for the current fold.

    Parameters
    ----------
    solution : sequence of float
        Candidate gene vector (at least seven entries).
    solution_idx : int
        Index of the candidate in the population; not used.

    Returns
    -------
    float
        Weighted recall of the fitted classifier on X_test / y_test.
    """
    # Decode the gene vector into LightGBM keyword arguments.
    hyperparams = {
        'learning_rate': solution[0],
        'min_child_samples': int(solution[1]),
        'max_depth': int(solution[2]),
        'num_leaves': int(solution[3]),
        'colsample_bytree': solution[4],
        'reg_alpha': solution[5],
        'reg_lambda': solution[6],
    }
    # Log the candidate being evaluated (values only, space separated).
    print(*hyperparams.values())

    # Fixed seed so the same candidate always gets the same score.
    classifier = lgb.LGBMClassifier(random_state=0, **hyperparams)
    classifier.fit(X_train, y_train, categorical_feature=category_col)

    predictions = classifier.predict(X_test)
    return recall_score(y_test, predictions, average='weighted')

In [16]:
# Baseline dataset with no text features: the saved snapshot minus the raw
# "summary" column, stored under the "0 bag-of-words features" file name.
data = pd.read_csv("data_txt.csv").drop(columns=["summary"])
print(data)
data.to_csv(path_or_buf='data_bow_nt_0.csv', encoding='utf_8_sig', index=False)

           city  latitude  longitude                 attacktype1_txt  \
0      0.181904  0.681564   0.204755                   Armed Assault   
1      0.688351  0.688223   0.106546               Bombing/Explosion   
2      0.558561  0.732945   0.204053  Facility/Infrastructure Attack   
3      0.558561  0.732914   0.204130  Facility/Infrastructure Attack   
4      0.249211  0.704869   0.158102  Facility/Infrastructure Attack   
...         ...       ...        ...                             ...   
72545  0.009873  0.477141   0.603580               Bombing/Explosion   
72546  0.142187  0.611361   0.713903  Facility/Infrastructure Attack   
72547  0.787440  0.497927   0.597584               Bombing/Explosion   
72548  0.420521  0.660566   0.675203                   Armed Assault   
72549  0.974123  0.687351   0.180508                   Assassination   

                                        targsubtype1_txt  \
0        Police Building (headquarters, station, school)   
1              

In [17]:
import pygad

# One entry per dataset (bag-of-words size 0, 100, ..., 1400): the best fitness
# seen across the folds and the hyper-parameters that produced it.
max__ = []
lr__ = []
mcs__ = []
md__ = []
nl__ = []
cb__ = []
ra__ = []
rl__ = []

# --- Loop invariants (hoisted out of the dataset / fold loops) ---------------
# Columns LightGBM should treat as categorical; also read by fitness_func.
category_col = ['attacktype1_txt', 'targsubtype1_txt', 'weapsubtype1_txt']
# Class labels of the "risk" target, used for the one-hot encoding below.
labels = [0, 1, 2, 3]
# Search range of each gene, in the order fitness_func decodes them.
param_list = [
    {'low': 0.01, 'high': 1},   # learning_rate
    {'low': 5, 'high': 50},     # min_child_samples
    {'low': 1, 'high': 15},     # max_depth
    {'low': 5, 'high': 100},    # num_leaves
    {'low': 0.1, 'high': 1},    # colsample_bytree
    {'low': 0, 'high': 100},    # reg_alpha
    {'low': 0, 'high': 100},    # reg_lambda
]

for i in tqdm_notebook(range(0, 1500, 100)):
    name = "data_bow_nt_" + str(i) + '.csv'
    data = pd.read_csv(name)

    # Best result over all folds of this dataset.
    max_ = {'max': 0,
            'learning_rate': 0,
            'min_child_samples': 0,
            'max_depth': 0,
            'num_leaves': 0,
            'colsample_bytree': 0,
            'reg_alpha': 0,
            'reg_lambda': 0}

    data[category_col] = data[category_col].astype('category')
    X = data.drop(columns=['risk'])
    y = data['risk']
    kf = KFold(n_splits=10, shuffle=True, random_state=0)

    for k, (train_indices, test_indices) in enumerate(kf.split(X, y)):
        print("第 %d 折\n" % (k + 1))
        # fitness_func reads these four names as globals.
        X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
        y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]

        # Only needed by the (currently disabled) ROC-AUC metric.
        y_one_hot = label_binarize(y_test, classes=labels)

        # Configure the genetic algorithm.
        ga_instance = pygad.GA(num_generations=4,            # generations to evolve
                               num_parents_mating=2,         # parents selected per generation
                               fitness_func=fitness_func,    # scores one candidate
                               sol_per_pop=3,                # candidates per population
                               num_genes=len(param_list),    # one gene per hyper-parameter
                               gene_type=float,
                               gene_space=param_list,        # allowed range of each gene
                               parent_selection_type='rws',  # roulette-wheel selection
                               keep_parents=1,
                               crossover_type='uniform',
                               crossover_probability=0.6,
                               mutation_type='random',
                               mutation_probability=0.01,
                               random_seed=0
                              )
        # Run the search.
        ga_instance.run()

        # Reuse the fitness values run() already computed for the final
        # population; calling best_solution() without them re-evaluates the
        # population, i.e. retrains LightGBM models for nothing.
        best_solution, best_fitness, best_solution_idx = ga_instance.best_solution(
            pop_fitness=ga_instance.last_generation_fitness
        )
        best_learning_rate = best_solution[0]
        best_min_child_samples = int(best_solution[1])
        best_max_depth = int(best_solution[2])
        best_num_leaves = int(best_solution[3])
        best_colsample_bytree = best_solution[4]
        best_reg_alpha = best_solution[5]
        best_reg_lambda = best_solution[6]

        # Report the best candidate of this fold.
        print('Best solution is {solution} with fitness value {fitness}'.format(solution=best_solution, fitness=best_fitness))
        print('Best learning rate is {lr}'.format(lr=best_learning_rate))
        print('Best min child samples is {mcs}'.format(mcs=best_min_child_samples))
        print('Best max depth is {md}'.format(md=best_max_depth))
        print('Best num leaves is {nl}'.format(nl=best_num_leaves))
        print('Best colsample bytree is {cb}'.format(cb=best_colsample_bytree))
        print('Best reg_alpha is {al}'.format(al=best_reg_alpha))
        print('Best reg_lambda is {la}'.format(la=best_reg_lambda))

        # NOTE(review): the fitness is measured on the same fold that is used to
        # pick the winner, and the maximum over folds is kept, so max__ is an
        # optimistic estimate rather than an unbiased cross-validated score.
        if best_fitness > max_['max']:
            max_.update({
                'max': best_fitness,
                'learning_rate': best_learning_rate,
                'min_child_samples': best_min_child_samples,
                'max_depth': best_max_depth,
                'num_leaves': best_num_leaves,
                'colsample_bytree': best_colsample_bytree,
                'reg_alpha': best_reg_alpha,
                'reg_lambda': best_reg_lambda,
            })

    max__.append(max_['max'])
    lr__.append(max_['learning_rate'])
    mcs__.append(max_['min_child_samples'])
    md__.append(max_['max_depth'])
    nl__.append(max_['num_leaves'])
    cb__.append(max_['colsample_bytree'])
    ra__.append(max_['reg_alpha'])
    rl__.append(max_['reg_lambda'])

print(max__, lr__, mcs__, md__, nl__, cb__, ra__, rl__)

  0%|          | 0/15 [00:00<?, ?it/s]

第 1 折

0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.08032569761590806 8 1 84 0.8003410758548655 87.00121482468191 97.8618342232764
0.5533253688880515 48 9 80 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.5533253688880515 37 9 80 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.5533253688880515 37 9 80 0.7616746199103354 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
Best solution is [ 0.55332537 37.18352149  9.43868727 80.21387862  0.76167462 64.58941131
 43.75872113] with fitness value 0.6610613370089593
Best learning rate is 0.5533253688880515
B

0.892855270774259 37 6 56 0.5760054277776141 56.80445610939323 92.5596638292661
0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 56 0.5760054277776141 56.80445610939323 92.5596638292661
0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 56 0.7616746199103354 56.80445610939323 92.5596638292661
0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
Best solution is [ 0.89285527 48.36482422  6.36818126 56.76390238  0.76167462 56.80445611
 92.55966383] with fitness value 0.6526533425223984
Best learning rate is 0.892855270774259
Best min child samples is 48
Best max depth is 6
Best num leaves is 56
Best colsample bytree is 0.7616746199103354
Best reg_alpha is 56.80445610939323
Best reg_lambda is 92.5596638292661
第 9 折

0.5533253688880515 37 9 56 0.4812893194050143 64.5894113

0.08032569761590806 8 1 84 0.8003410758548655 87.00121482468191 97.8618342232764
0.5533253688880515 48 9 80 0.4812893194050143 64.58941130666561 43.75872112626925
0.5533253688880515 37 9 80 0.5760054277776141 64.58941130666561 92.5596638292661
0.892855270774259 48 6 80 0.7616746199103354 56.80445610939323 92.5596638292661
Best solution is [ 0.89285527 48.36482422  6.36818126 80.21387862  0.57600543 56.80445611
 92.55966383] with fitness value 0.7230875258442453
Best learning rate is 0.892855270774259
Best min child samples is 48
Best max depth is 6
Best num leaves is 80
Best colsample bytree is 0.5760054277776141
Best reg_alpha is 56.80445610939323
Best reg_lambda is 92.5596638292661
第 6 折

0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.08032569761590806 8 1 84 0.8003410758548655 87.00121482468191 97.8618342232764
0.5533253688880515 48 9 80 0.4812893194050143 64.58941130

0.892855270774259 48 6 80 0.7616746199103354 56.80445610939323 92.5596638292661
Best solution is [ 0.89285527 48.36482422  6.36818126 80.21387862  0.76167462 56.80445611
 92.55966383] with fitness value 0.7385251550654721
Best learning rate is 0.892855270774259
Best min child samples is 48
Best max depth is 6
Best num leaves is 80
Best colsample bytree is 0.7616746199103354
Best reg_alpha is 56.80445610939323
Best reg_lambda is 92.5596638292661
第 3 折

0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.08032569761590806 8 1 84 0.8003410758548655 87.00121482468191 97.8618342232764
0.5533253688880515 48 9 80 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.5533253688880515 37 9 80 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.804456109

0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.08032569761590806 8 1 84 0.8003410758548655 87.00121482468191 97.8618342232764
0.5533253688880515 48 9 80 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.5533253688880515 37 9 80 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.5533253688880515 37 9 80 0.7616746199103354 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
Best solution is [ 0.89285527 48.36482422  6.36818126 80.21387862  0.57600543 56.80445611
 92.55966383] with fitness value 0.7328738800827016
Best learning rate is 0.892855270774259
Best min child samples is 48
Best max depth is 6
Best num leaves is 80
Best colsample bytre

0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 37 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 37 6 80 0.7616746199103354 56.80445610939323 92.5596638292661
0.892855270774259 37 6 56 0.5760054277776141 64.58941130666561 92.5596638292661
0.892855270774259 37 6 56 0.5760054277776141 64.58941130666561 92.5596638292661
Best solution is [ 0.55332537 37.18352149  9.43868727 56.76390238  0.48128932 64.58941131
 43.75872113] with fitness value 0.7419710544452102
Best learning rate is 0.5533253688880515
Best min child samples is 37
Best max depth is 9
Best num leaves is 56
Best colsample bytree is 0.4812893194050143
Best reg_alpha is 64.58941130666561
Best reg_lambda is 43.75872112626925
第 8 折

0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.80445610

0.892855270774259 37 6 56 0.5760054277776141 56.80445610939323 92.5596638292661
0.892855270774259 48 6 56 0.4812893194050143 56.80445610939323 43.75872112626925
0.5533253688880515 37 9 56 0.7616746199103354 64.58941130666561 43.75872112626925
Best solution is [ 0.55332537 37.18352149  9.43868727 56.76390238  0.48128932 64.58941131
 43.75872113] with fitness value 0.7396278428669882
Best learning rate is 0.5533253688880515
Best min child samples is 37
Best max depth is 9
Best num leaves is 56
Best colsample bytree is 0.4812893194050143
Best reg_alpha is 64.58941130666561
Best reg_lambda is 43.75872112626925
第 5 折

0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.08032569761590806 8 1 84 0.8003410758548655 87.00121482468191 97.8618342232764
0.5533253688880515 48 9 80 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.804456

Best solution is [ 0.89285527 48.36482422  6.36818126 56.76390238  0.76167462 56.80445611
 92.55966383] with fitness value 0.7364576154376292
Best learning rate is 0.892855270774259
Best min child samples is 48
Best max depth is 6
Best num leaves is 56
Best colsample bytree is 0.7616746199103354
Best reg_alpha is 56.80445610939323
Best reg_lambda is 92.5596638292661
第 2 折

0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.08032569761590806 8 1 84 0.8003410758548655 87.00121482468191 97.8618342232764
0.5533253688880515 48 9 80 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.5533253688880515 37 9 80 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.5533253688880515 37 9 80 0.7616746199103354 64.58941130

0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.08032569761590806 8 1 84 0.8003410758548655 87.00121482468191 97.8618342232764
0.5533253688880515 48 9 80 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.5533253688880515 37 9 80 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.5533253688880515 37 9 80 0.7616746199103354 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
Best solution is [ 0.55332537 37.18352149  9.43868727 80.21387862  0.76167462 64.58941131
 43.75872113] with fitness value 0.7441764300482426
Best learning rate is 0.5533253688880515
Best min child samples is 37
Best max depth is 9
Best num leaves is 80
Best colsample bytr

0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.08032569761590806 8 1 84 0.8003410758548655 87.00121482468191 97.8618342232764
0.5533253688880515 48 9 80 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.5533253688880515 48 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.5533253688880515 48 9 56 0.7616746199103354 64.58941130666561 43.75872112626925
0.5533253688880515 48 9 80 0.4812893194050143 56.80445610939323 43.75872112626925
0.5533253688880515 48 9 80 0.4812893194050143 56.80445610939323 43.75872112626925
Best solution is [ 0.55332537 48.36482422  9.43868727 80.21387862  0.48128932 56.80445611
 43.75872113] with fitness value 0.7346657477601654
Best learning rate is 0.5533253688880515
Best min child samples is 48
Best max depth is 9
Best num leaves is 80
Best colsample 

0.892855270774259 48 6 56 0.5760054277776141 64.58941130666561 92.5596638292661
Best solution is [ 0.55332537 37.18352149  9.43868727 56.76390238  0.48128932 64.58941131
 43.75872113] with fitness value 0.7349414197105445
Best learning rate is 0.5533253688880515
Best min child samples is 37
Best max depth is 9
Best num leaves is 56
Best colsample bytree is 0.4812893194050143
Best reg_alpha is 64.58941130666561
Best reg_lambda is 43.75872112626925
第 4 折

0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.08032569761590806 8 1 84 0.8003410758548655 87.00121482468191 97.8618342232764
0.5533253688880515 48 9 80 0.4812893194050143 64.58941130666561 43.75872112626925
0.5533253688880515 48 9 80 0.5760054277776141 64.58941130666561 92.5596638292661
0.892855270774259 48 6 80 0.7616746199103354 56.80445610939323 92.5596638292661
Best solution is [ 0.89285527 48.36482422  6.36818126 80

0.08032569761590806 8 1 84 0.8003410758548655 87.00121482468191 97.8618342232764
0.892855270774259 37 6 56 0.5760054277776141 56.80445610939323 92.5596638292661
0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 56 0.5760054277776141 56.80445610939323 92.5596638292661
0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 56 0.7616746199103354 56.80445610939323 92.5596638292661
0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
Best solution is [ 0.89285527 48.36482422  6.36818126 56.76390238  0.76167462 56.80445611
 92.55966383] with fitness value 0.7385251550654721
Best learning rate is 0.892855270774259
Best min child samples is 48
Best max depth is 6
Best num leaves is 56
Best colsample bytree is 0.7616746199103354
Best reg_alpha is 56.80445610939323
Best reg_lambda is

0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.08032569761590806 8 1 84 0.8003410758548655 87.00121482468191 97.8618342232764
0.5533253688880515 48 9 80 0.4812893194050143 64.58941130666561 43.75872112626925
0.5533253688880515 37 9 80 0.5760054277776141 64.58941130666561 92.5596638292661
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.5533253688880515 37 9 80 0.7616746199103354 64.58941130666561 92.5596638292661
0.5533253688880515 37 9 80 0.7616746199103354 56.80445610939323 92.5596638292661
0.5533253688880515 37 9 80 0.7616746199103354 56.80445610939323 92.5596638292661
Best solution is [ 0.55332537 37.18352149  9.43868727 80.21387862  0.76167462 56.80445611
 92.55966383] with fitness value 0.7415575465196417
Best learning rate is 0.5533253688880515
Best min child samples is 37
Best max depth is 9
Best num leaves is 80
Best colsample bytree is 0.7616746199103354
Best reg_alpha is 56.80445610939323
Best reg_lambda is 

0.5533253688880515 37 9 80 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.5533253688880515 37 9 80 0.7616746199103354 64.58941130666561 43.75872112626925
0.5533253688880515 37 9 80 0.4812893194050143 56.80445610939323 43.75872112626925
0.5533253688880515 37 9 80 0.4812893194050143 56.80445610939323 43.75872112626925
Best solution is [ 0.55332537 37.18352149  9.43868727 80.21387862  0.48128932 56.80445611
 43.75872113] with fitness value 0.7458304617505169
Best learning rate is 0.5533253688880515
Best min child samples is 37
Best max depth is 9
Best num leaves is 80
Best colsample bytree is 0.4812893194050143
Best reg_alpha is 56.80445610939323
Best reg_lambda is 43.75872112626925
第 6 折

0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.08032569761590806 8 1 84 0.8003410758548655 87.001

0.5533253688880515 37 9 80 0.4812893194050143 56.80445610939323 43.75872112626925
Best solution is [ 0.55332537 37.18352149  9.43868727 80.21387862  0.48128932 56.80445611
 43.75872113] with fitness value 0.7421088904203997
Best learning rate is 0.5533253688880515
Best min child samples is 37
Best max depth is 9
Best num leaves is 80
Best colsample bytree is 0.4812893194050143
Best reg_alpha is 56.80445610939323
Best reg_lambda is 43.75872112626925
第 3 折

0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.08032569761590806 8 1 84 0.8003410758548655 87.00121482468191 97.8618342232764
0.892855270774259 37 6 56 0.5760054277776141 56.80445610939323 92.5596638292661
0.892855270774259 48 6 56 0.4812893194050143 56.80445610939323 43.75872112626925
0.5533253688880515 37 9 56 0.7616746199103354 64.58941130666561 43.75872112626925
Best solution is [ 0.55332537 37.18352149  9.43868727 

0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.08032569761590806 8 1 84 0.8003410758548655 87.00121482468191 97.8618342232764
0.892855270774259 37 6 56 0.5760054277776141 56.80445610939323 92.5596638292661
0.892855270774259 48 6 56 0.4812893194050143 56.80445610939323 43.75872112626925
0.5533253688880515 37 9 56 0.7616746199103354 64.58941130666561 43.75872112626925
Best solution is [ 0.55332537 37.18352149  9.43868727 56.76390238  0.76167462 64.58941131
 43.75872113] with fitness value 0.7345279117849759
Best learning rate is 0.5533253688880515
Best min child samples is 37
Best max depth is 9
Best num leaves is 56
Best colsample bytree is 0.7616746199103354
Best reg_alpha is 64.58941130666561
Best reg_lambda is 43.75872112626925
第 1 折

0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.804456

0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 56 0.5760054277776141 56.80445610939323 92.5596638292661
0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 56 0.7616746199103354 56.80445610939323 92.5596638292661
0.892855270774259 48 6 56 0.5760054277776141 64.58941130666561 92.5596638292661
0.892855270774259 48 6 56 0.5760054277776141 64.58941130666561 92.5596638292661
Best solution is [ 0.55332537 37.18352149  9.43868727 56.76390238  0.48128932 64.58941131
 43.75872113] with fitness value 0.746106133700896
Best learning rate is 0.5533253688880515
Best min child samples is 37
Best max depth is 9
Best num leaves is 56
Best colsample bytree is 0.4812893194050143
Best reg_alpha is 64.58941130666561
Best reg_lambda is 43.75872112626925
第 8 折

0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.804456109

0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
Best solution is [ 0.55332537 48.36482422  9.43868727 56.76390238  0.76167462 64.58941131
 43.75872113] with fitness value 0.7454169538249483
Best learning rate is 0.5533253688880515
Best min child samples is 48
Best max depth is 9
Best num leaves is 56
Best colsample bytree is 0.7616746199103354
Best reg_alpha is 64.58941130666561
Best reg_lambda is 43.75872112626925
第 5 折

0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.08032569761590806 8 1 84 0.8003410758548655 87.00121482468191 97.8618342232764
0.5533253688880515 48 9 80 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.5533253688880515 48 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.8044561

0.5533253688880515 48 9 80 0.4812893194050143 56.80445610939323 43.75872112626925
Best solution is [ 0.55332537 48.36482422  9.43868727 80.21387862  0.48128932 56.80445611
 43.75872113] with fitness value 0.7393521709166092
Best learning rate is 0.5533253688880515
Best min child samples is 48
Best max depth is 9
Best num leaves is 80
Best colsample bytree is 0.4812893194050143
Best reg_alpha is 56.80445610939323
Best reg_lambda is 43.75872112626925
第 2 折

0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.08032569761590806 8 1 84 0.8003410758548655 87.00121482468191 97.8618342232764
0.892855270774259 37 6 56 0.5760054277776141 56.80445610939323 92.5596638292661
0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 56 0.5760054277776141 56.80445610939323 92.5596638292661
0.5533253688880515 37 9 56 0.4812893194050143 64.589411

0.5533253688880515 37 9 80 0.7616746199103354 56.80445610939323 43.75872112626925
0.5533253688880515 37 9 80 0.7616746199103354 56.80445610939323 43.75872112626925
Best solution is [ 0.89285527 48.36482422  6.36818126 80.21387862  0.57600543 56.80445611
 92.55966383] with fitness value 0.7474844934527912
Best learning rate is 0.892855270774259
Best min child samples is 48
Best max depth is 6
Best num leaves is 80
Best colsample bytree is 0.5760054277776141
Best reg_alpha is 56.80445610939323
Best reg_lambda is 92.5596638292661
第 9 折

0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.08032569761590806 8 1 84 0.8003410758548655 87.00121482468191 97.8618342232764
0.5533253688880515 48 9 80 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.5533253688880515 37 9 80 0.4812893194050143 64.589411

0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.08032569761590806 8 1 84 0.8003410758548655 87.00121482468191 97.8618342232764
0.5533253688880515 48 9 80 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.5533253688880515 37 9 80 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.5533253688880515 37 9 80 0.7616746199103354 64.58941130666561 43.75872112626925
0.5533253688880515 37 9 80 0.4812893194050143 56.80445610939323 43.75872112626925
0.5533253688880515 37 9 80 0.4812893194050143 56.80445610939323 43.75872112626925
Best solution is [ 0.89285527 48.36482422  6.36818126 80.21387862  0.57600543 56.80445611
 92.55966383] with fitness value 0.7401791867677464
Best learning rate is 0.892855270774259
Best 

In [18]:
# Print the list of best fitness values held in max__ (one entry per dataset).
# NOTE(review): in file order max__ is assigned by the hard-coded results cell
# that follows this one; presumably it was also filled by the search cell
# above. Confirm this cell still works after "Restart Kernel -> Run All".
print(max__)

[0.6657477601654032, 0.7291523087525844, 0.743762922122674, 0.7455547898001378, 0.7478980013783597, 0.7441764300482426, 0.7456926257753274, 0.749276361130255, 0.7474844934527912, 0.7485871812543073, 0.749276361130255, 0.7502412129565816, 0.7494141971054445, 0.7498277050310131, 0.7507925568573398]


In [9]:
# Hard-coded tuning results, one entry per bag-of-words dataset. The evaluation
# cell reads position k of every list for the k-th dataset
# (data_bow_nt_0.csv, data_bow_nt_100.csv, ..., data_bow_nt_1400.csv).
# NOTE(review): the values appear to come from the hyper-parameter search whose
# log is printed above -- confirm before re-running that search.

# Best fitness value per dataset; identical to the list printed by the
# `print(max__)` cell. Not used by the evaluation loop itself.
max__ = [0.6657477601654032, 0.7291523087525844, 0.743762922122674, 0.7455547898001378, 0.7478980013783597, 0.7441764300482426, 0.7456926257753274, 0.749276361130255, 0.7474844934527912, 0.7485871812543073, 0.749276361130255, 0.7502412129565816, 0.7494141971054445, 0.7498277050310131, 0.7507925568573398] 
# LightGBM `learning_rate` per dataset.
lr__ = [0.5533253688880515, 0.892855270774259, 0.5533253688880515, 0.892855270774259, 0.892855270774259, 0.5533253688880515, 0.5533253688880515, 0.892855270774259, 0.5533253688880515, 0.892855270774259, 0.892855270774259, 0.5533253688880515, 0.892855270774259, 0.892855270774259, 0.5533253688880515] 
# LightGBM `min_child_samples` per dataset.
mcs__ = [37, 48, 37, 48, 48, 37, 48, 48, 37, 48, 48, 37, 37, 48, 37] 
# LightGBM `max_depth` per dataset.
md__ = [9, 6, 9, 6, 6, 9, 9, 6, 9, 6, 6, 9, 6, 6, 9] 
# LightGBM `num_leaves` per dataset.
nl__ = [80, 80, 80, 80, 80, 80, 56, 80, 80, 80, 80, 56, 80, 80, 80] 
# LightGBM `colsample_bytree` per dataset.
cb__ = [0.7616746199103354, 0.5760054277776141, 0.7616746199103354, 0.5760054277776141, 0.5760054277776141, 0.7616746199103354, 0.7616746199103354, 0.5760054277776141, 0.7616746199103354, 0.5760054277776141, 0.5760054277776141, 0.4812893194050143, 0.7616746199103354, 0.5760054277776141, 0.4812893194050143] 
# LightGBM `reg_alpha` per dataset.
ra__ = [64.58941130666561, 56.80445610939323, 56.80445610939323, 56.80445610939323, 56.80445610939323, 64.58941130666561, 64.58941130666561, 56.80445610939323, 64.58941130666561, 56.80445610939323, 56.80445610939323, 64.58941130666561, 56.80445610939323, 56.80445610939323, 56.80445610939323] 
# LightGBM `reg_lambda` per dataset.
rl__ = [43.75872112626925, 92.5596638292661, 43.75872112626925, 92.5596638292661, 92.5596638292661, 43.75872112626925, 43.75872112626925, 92.5596638292661, 43.75872112626925, 92.5596638292661, 92.5596638292661, 43.75872112626925, 92.5596638292661, 92.5596638292661, 43.75872112626925]

In [10]:
# Cross-validate LightGBM on every bag-of-words dataset.
#
# For each file "data_bow_nt_<i>.csv" (i = 0, 100, ..., 1400) a 10-fold CV is
# run with the hyper-parameters stored at the matching position of lr__, mcs__,
# md__, nl__, cb__, ra__ and rl__. The fold-averaged metrics are appended to
# roc_, acc_, f1_, recall_ and precision_; ii records the file suffixes in the
# same order, so position k of every result list belongs to ii[k].
roc_ = []
acc_ = []
f1_ = []
recall_ = []
precision_ = []
ii = []

dims = range(0, 1500, 100)
category_col = ['attacktype1_txt', 'targsubtype1_txt', 'weapsubtype1_txt']
labels = [0, 1, 2, 3]

# LGBMClassifier keyword -> list holding one tuned value per dataset.
tuned = {
    'learning_rate': lr__,
    'min_child_samples': mcs__,
    'max_depth': md__,
    'num_leaves': nl__,
    'colsample_bytree': cb__,
    'reg_alpha': ra__,
    'reg_lambda': rl__,
}

# Fail before any training starts if a list cannot cover every dataset,
# instead of hitting an IndexError after the earlier datasets have been fitted.
too_short = {key: len(values) for key, values in tuned.items() if len(values) < len(dims)}
if too_short:
    raise ValueError(
        f"need {len(dims)} values per hyper-parameter, but got only {too_short}"
    )

# One keyword dictionary per dataset, in dataset order.
param_grid = [dict(zip(tuned, values)) for values in zip(*tuned.values())]

for i, params in zip(tqdm_notebook(dims), param_grid):
    name = "data_bow_nt_" + str(i) + '.csv'
    # NOTE: `data` is rebound on every iteration and therefore replaces any
    # frame of the same name loaded by an earlier cell.
    data = pd.read_csv(name)

    data[category_col] = data[category_col].astype('category')
    X = data.drop(columns=['risk'])
    y = data['risk']
    kf = KFold(n_splits=10, shuffle=True, random_state=0)
    splits = kf.split(X, y)

    # Per-fold LightGBM results for the current dataset.
    lgb_roc_scores = []
    lgb_acc_scores = []
    lgb_f1_scores = []
    lgb_recall_scores = []
    lgb_precision_scores = []
    lgb_feature_importances = pd.DataFrame(index=None)
    lgb_feature_importances['features'] = X.columns

    for k, (train_indices, test_indices) in enumerate(splits):
        print("第 %d 折\n" % (k + 1))
        X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
        y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]

        # Indicator-matrix form of the test labels, used for the AUC below.
        y_one_hot = label_binarize(y_test, classes=labels)

        LGB = lgb.LGBMClassifier(random_state=0, **params)
        LGB.fit(X_train, y_train, categorical_feature=category_col)
        lgb_feature_importances[f'fold_{k+1}'] = LGB.feature_importances_
        y_pred_prob = LGB.predict_proba(X_test)
        y_pred = LGB.predict(X_test)

        acc = accuracy_score(y_test, y_pred)
        # NOTE(review): y_one_hot is a label-indicator matrix, and scikit-learn
        # documents `multi_class` as applying to multiclass targets only, so
        # the value below is presumably a support-weighted per-class
        # (one-vs-rest) AUC rather than a one-vs-one AUC -- confirm this is the
        # intended metric.
        roc_auc = roc_auc_score(y_one_hot, y_pred_prob, multi_class="ovo", average='weighted')
        precision = precision_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')
        f1 = f1_score(y_test, y_pred, average='weighted')

        print(f" Fold {k + 1} | " )
        print(f" AUC_ROC: { roc_auc * 100}%" )
        print(f" ACC: { acc * 100}%" )
        print(f" F1: { f1 * 100}%" )
        print(f" RECALL: { recall * 100}%" )
        print(f" PRECISION: { precision * 100}%" )
        lgb_f1_scores.append(f1)
        lgb_roc_scores.append(roc_auc)
        lgb_acc_scores.append(acc)
        lgb_recall_scores.append(recall)
        lgb_precision_scores.append(precision)

    # Summarise the folds of this dataset.
    ii.append(i)
    roc_.append(np.mean(lgb_roc_scores))
    acc_.append(np.mean(lgb_acc_scores))
    f1_.append(np.mean(lgb_f1_scores))
    recall_.append(np.mean(lgb_recall_scores))
    precision_.append(np.mean(lgb_precision_scores))

print(ii)
print(f'average roc score: {roc_}')
print(f'average acc_score: {acc_}')
print(f'average f1_score: {f1_}')
print(f'average recall_score: {recall_}')
print(f'average precision_score: {precision_}')

  0%|          | 0/15 [00:00<?, ?it/s]

第 1 折

 Fold 1 | 
 AUC_ROC: 85.8203385658022%
 ACC: 66.10613370089592%
 F1: 65.38838699531145%
 RECALL: 66.10613370089592%
 PRECISION: 65.2823793900078%
第 2 折

 Fold 2 | 
 AUC_ROC: 85.68169589008116%
 ACC: 65.16884906960718%
 F1: 64.51924163171235%
 RECALL: 65.16884906960718%
 PRECISION: 64.56169792549115%
第 3 折

 Fold 3 | 
 AUC_ROC: 86.02516852805526%
 ACC: 65.26533425223984%
 F1: 64.62498723706126%
 RECALL: 65.26533425223984%
 PRECISION: 64.74274909751783%
第 4 折

 Fold 4 | 
 AUC_ROC: 86.14994448721146%
 ACC: 65.73397656788423%
 F1: 65.12605096813078%
 RECALL: 65.73397656788423%
 PRECISION: 65.2392289353695%
第 5 折

 Fold 5 | 
 AUC_ROC: 86.4614149902275%
 ACC: 65.70640937284631%
 F1: 64.8898762391123%
 RECALL: 65.70640937284631%
 PRECISION: 64.8517698610826%
第 6 折

 Fold 6 | 
 AUC_ROC: 86.00557159858087%
 ACC: 65.48587181254307%
 F1: 64.81125495871167%
 RECALL: 65.48587181254307%
 PRECISION: 64.79430502842466%
第 7 折

 Fold 7 | 
 AUC_ROC: 86.22343578054782%
 ACC: 65.78911095796003%
 F1:

 Fold 5 | 
 AUC_ROC: 91.3584455840734%
 ACC: 74.3762922122674%
 F1: 74.22737908812236%
 RECALL: 74.3762922122674%
 PRECISION: 74.44893176293228%
第 6 折

 Fold 6 | 
 AUC_ROC: 90.87956230673221%
 ACC: 73.63197794624396%
 F1: 73.52204748048162%
 RECALL: 73.63197794624396%
 PRECISION: 73.79384733782442%
第 7 折

 Fold 7 | 
 AUC_ROC: 91.18482103387068%
 ACC: 74.14197105444521%
 F1: 74.04271062976868%
 RECALL: 74.14197105444521%
 PRECISION: 74.15412180342372%
第 8 折

 Fold 8 | 
 AUC_ROC: 90.78365355500469%
 ACC: 73.81116471399035%
 F1: 73.70533198502949%
 RECALL: 73.81116471399035%
 PRECISION: 73.98318955394679%
第 9 折

 Fold 9 | 
 AUC_ROC: 91.10731271740596%
 ACC: 74.41764300482426%
 F1: 74.32500583719265%
 RECALL: 74.41764300482426%
 PRECISION: 74.49408850546993%
第 10 折

 Fold 10 | 
 AUC_ROC: 90.8512531018529%
 ACC: 73.0117160578911%
 F1: 72.90458258545497%
 RECALL: 73.0117160578911%
 PRECISION: 73.17609852108772%
第 1 折

 Fold 1 | 
 AUC_ROC: 90.64748372181587%
 ACC: 73.31495520330806%
 F1: 73.2

 Fold 9 | 
 AUC_ROC: 91.26856793097178%
 ACC: 74.6381805651275%
 F1: 74.54976834048304%
 RECALL: 74.6381805651275%
 PRECISION: 74.72515934847567%
第 10 折

 Fold 10 | 
 AUC_ROC: 90.79404942187132%
 ACC: 73.20468642315645%
 F1: 73.079145525863%
 RECALL: 73.20468642315645%
 PRECISION: 73.32396462487421%
第 1 折

 Fold 1 | 
 AUC_ROC: 90.73903824578521%
 ACC: 73.65954514128188%
 F1: 73.54774305165137%
 RECALL: 73.65954514128188%
 PRECISION: 73.69903428325557%
第 2 折

 Fold 2 | 
 AUC_ROC: 90.93196991696868%
 ACC: 74.0868366643694%
 F1: 73.99087538854741%
 RECALL: 74.0868366643694%
 PRECISION: 74.19219898654369%
第 3 折

 Fold 3 | 
 AUC_ROC: 90.87837464784327%
 ACC: 73.10820124052377%
 F1: 72.9762034399065%
 RECALL: 73.10820124052377%
 PRECISION: 73.22037151859907%
第 4 折

 Fold 4 | 
 AUC_ROC: 91.2411354025897%
 ACC: 74.15575465196417%
 F1: 74.07707562315721%
 RECALL: 74.15575465196417%
 PRECISION: 74.24850922983683%
第 5 折

 Fold 5 | 
 AUC_ROC: 91.35262220400854%
 ACC: 74.16953824948311%
 F1: 73.968

In [12]:
# Dump the per-dataset mean metrics, one list per line, in the order
# AUC, accuracy, F1, recall, precision.
print(roc_, acc_, f1_, recall_, precision_, sep="\n")

[0.8601453118149568, 0.8999842308420616, 0.9092704164768263, 0.9090892616338792, 0.9090050852010017, 0.9099081151095723, 0.9102302534054925, 0.9101410294217785, 0.910043345061529, 0.9102562313363987, 0.9106728696836928, 0.9102607249616751, 0.9104563986968831, 0.9111645955505934, 0.9118680083086895]
[0.6567470709855272, 0.7219297036526534, 0.7380427291523087, 0.7382081323225361, 0.7387594762232942, 0.7381116471399036, 0.7381943487250172, 0.7389937973811165, 0.7387181254307373, 0.7408545830461751, 0.7406202618883528, 0.7404824259131633, 0.7411440385940731, 0.7421364576154377, 0.7422880771881462]
[0.6501488484228044, 0.7202689503282997, 0.73687462273076, 0.7368811125741722, 0.7375806415903579, 0.7370210692039466, 0.7370737907086828, 0.7377599631878574, 0.7376009247513489, 0.7397021180819152, 0.7393878293406534, 0.7393829281732772, 0.7401266858378854, 0.7409628991144748, 0.7411000976158341]
[0.6567470709855272, 0.7219297036526534, 0.7380427291523087, 0.7382081323225361, 0.7387594762232942,

In [13]:
# Print each list of per-dataset mean metrics on its own line
# (AUC, accuracy, F1, recall, precision).
for metric_values in (roc_, acc_, f1_, recall_, precision_):
    print(metric_values)

[0.8601453118149568, 0.8999842308420616, 0.9092704164768263, 0.9090892616338792, 0.9090050852010017, 0.9099081151095723, 0.9102302534054925, 0.9101410294217785, 0.910043345061529, 0.9102562313363987, 0.9106728696836928, 0.9102607249616751, 0.9104563986968831, 0.9111645955505934, 0.9118680083086895]
[0.6567470709855272, 0.7219297036526534, 0.7380427291523087, 0.7382081323225361, 0.7387594762232942, 0.7381116471399036, 0.7381943487250172, 0.7389937973811165, 0.7387181254307373, 0.7408545830461751, 0.7406202618883528, 0.7404824259131633, 0.7411440385940731, 0.7421364576154377, 0.7422880771881462]
[0.6501488484228044, 0.7202689503282997, 0.73687462273076, 0.7368811125741722, 0.7375806415903579, 0.7370210692039466, 0.7370737907086828, 0.7377599631878574, 0.7376009247513489, 0.7397021180819152, 0.7393878293406534, 0.7393829281732772, 0.7401266858378854, 0.7409628991144748, 0.7411000976158341]
[0.6567470709855272, 0.7219297036526534, 0.7380427291523087, 0.7382081323225361, 0.7387594762232942,

In [14]:
#导入库
import matplotlib.pyplot as plt
%matplotlib
#设定画布。dpi越大图越清晰，绘图时间越久
fig=plt.figure(figsize=(10, 4), dpi=200)
#导入数据
x = range(0, 1500, 100)
x_ = range(0, 1500, 200)
y_1 = np.arange(0.86, 0.91, 0.01)
y_2 = np.arange(0.65, 0.76, 0.01)
y1 = roc_
y2 = acc_
y3 = f1_
y4 = recall_
y5 = precision_
#绘图命令
plt.subplot(1,2,1) # 子图的行、列、索引
plt.plot(x, y1, lw=1, ls='-', c='b', alpha=0.5, label='AUC')

plt.legend()  
plt.xticks(x_)
plt.yticks(y_1)
plt.title("Bag of word")
plt.xlabel("Feature dimension") 
plt.ylabel("Performance")

plt.subplot(1,2,2) # 子图的行、列、索引
plt.plot(x, y2, lw=1, ls='-', c='r', alpha=0.5, label='Accuracy')
plt.plot(x, y3, lw=1, ls='-', c='g', alpha=0.5, label='F1-score')
plt.plot(x, y4, lw=1, ls='-', c='k', alpha=0.5, label='Sensitivity')
plt.plot(x, y5, lw=1, ls='-', c='m', alpha=0.5, label='Precision')

plt.legend()  
plt.xticks(x_)
plt.yticks(y_2)
plt.title("Bag of word")
plt.xlabel("Feature dimension") 
plt.ylabel("Performance")

plt.show()

Using matplotlib backend: TkAgg


In [16]:
# Report the best (largest) per-dataset mean of every metric, in the order
# AUC, accuracy, F1, recall, precision.
for metric_values in (roc_, acc_, f1_, recall_, precision_):
    print(max(metric_values))

0.9118680083086895
0.7422880771881462
0.7411000976158341
0.7422880771881462
0.7432867824614292
