In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import re
import nltk
from nltk.book import *
from nltk.corpus import stopwords
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
import lightgbm as lgb
from sklearn.preprocessing import label_binarize
from sklearn.tree import DecisionTreeRegressor
from tqdm import tqdm_notebook
import joblib
from collections import Counter
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from collections import defaultdict
from gensim import corpora
from gensim import models
import warnings
warnings.filterwarnings('ignore')

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [4]:
def _english_stopwords():
    """Return NLTK's English stop-word list as a frozenset, built once and reused."""
    cached = getattr(_english_stopwords, "_cache", None)
    if cached is None:
        # Building the set on first use avoids re-reading the corpus file on
        # every call of review_to_wordlist.
        cached = frozenset(stopwords.words("english"))
        _english_stopwords._cache = cached
    return cached


def review_to_wordlist( raw_review, remove_stopwords=False ):
    """Convert one raw text document into a list of lower-case word tokens.

    Parameters
    ----------
    raw_review : str
        The raw text; any HTML markup in it is stripped.
    remove_stopwords : bool, default False
        When True, tokens found in NLTK's English stop-word list are dropped.

    Returns
    -------
    list of str
        The tokens in their original order (a list, not a joined string).
    """
    # 1. Remove HTML. The parser is named explicitly so the result does not
    #    depend on which optional parsers happen to be installed.
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()
    #
    # 2. Replace every character that is not an ASCII letter with a space
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    #
    # 3. Convert to lower case and split on whitespace into individual words
    words = letters_only.lower().split()
    #
    # 4. Optionally drop stop words
    if remove_stopwords:
        stops = _english_stopwords()
        words = [w for w in words if w not in stops]

    # 5. Return the list of words
    return words

# Define a function to split a review into parsed sentences
def review_to_sentences( review, tokenizer, remove_stopwords=False ):
    """Split a review into sentences and tokenize each sentence into words.

    Parameters
    ----------
    review : str
        The text to split; surrounding whitespace is stripped first.
    tokenizer : object
        Anything exposing ``tokenize(text)`` that returns the sentence strings.
    remove_stopwords : bool, default False
        Forwarded unchanged to ``review_to_wordlist``.

    Returns
    -------
    list of list
        One word list per non-empty sentence, in sentence order.
    """
    # Sentence boundaries come from the supplied tokenizer
    sentence_strings = tokenizer.tokenize(review.strip())
    # Empty sentence strings are skipped; every other one becomes a word list
    return [
        review_to_wordlist(sentence_text, remove_stopwords)
        for sentence_text in sentence_strings
        if len(sentence_text) > 0
    ]

In [5]:
# Load the raw table and lower-case the free-text `summary` column.
data = pd.read_csv("../Bag of Word/data_txt.csv")
# Vectorised lower-casing replaces the former per-row `data.summary[row] = ...`
# loop. That was a chained assignment: slow for tens of thousands of rows,
# a source of SettingWithCopyWarning, and a silent no-op when pandas
# copy-on-write is enabled. Missing values stay missing instead of raising.
data["summary"] = data["summary"].str.lower()
print(data)

  0%|          | 0/72550 [00:00<?, ?it/s]

           city  latitude  longitude                 attacktype1_txt  \
0      0.181904  0.681564   0.204755                   Armed Assault   
1      0.688351  0.688223   0.106546               Bombing/Explosion   
2      0.558561  0.732945   0.204053  Facility/Infrastructure Attack   
3      0.558561  0.732914   0.204130  Facility/Infrastructure Attack   
4      0.249211  0.704869   0.158102  Facility/Infrastructure Attack   
...         ...       ...        ...                             ...   
72545  0.009873  0.477141   0.603580               Bombing/Explosion   
72546  0.142187  0.611361   0.713903  Facility/Infrastructure Attack   
72547  0.787440  0.497927   0.597584               Bombing/Explosion   
72548  0.420521  0.660566   0.675203                   Armed Assault   
72549  0.974123  0.687351   0.180508                   Assassination   

                                        targsubtype1_txt  \
0        Police Building (headquarters, station, school)   
1              

In [6]:
# Write the lower-cased table to the working directory (no index column,
# UTF-8 with BOM) so later cells can reload it from 'data_txt.csv'.
data.to_csv(
    'data_txt.csv',
    index=False,
    encoding='utf_8_sig',
)

In [7]:
# Punkt sentence splitter for English, loaded from NLTK's data files
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

print("Parsing sentences from training set")

# Flat list of tokenized sentences gathered from every summary; each entry is
# one sentence represented as a list of words.
sentences = []
for review in data["summary"]:
    sentences.extend(review_to_sentences(review, tokenizer))

print(len(sentences))

Parsing sentences from training set
185471


In [8]:
import logging
from gensim.models import word2vec

print( "Creating the word2vec of words...\n")

# Route gensim's INFO-level progress messages into the cell output
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',level=logging.INFO)

# Hyper-parameter placeholders. None of them is passed to Word2Vec below, so
# only `vector_size` differs from the library defaults during training.
num_workers = 4       # Number of threads to run in parallel
context = 5           # Context window size
downsampling = 1e-3   # Downsample setting for frequent words

# Train and save one model per embedding width: 100, 200, ..., 1400.
# `name` collects the file names the models were saved under.
name = []
for i in tqdm_notebook(range(100, 1500, 100)):
    model = word2vec.Word2Vec(sentences, vector_size=i)
    model_name = str(i) + "features_word2vec"
    model.save(model_name)
    name.append(model_name)

print(name)

Creating the word2vec of words...



  0%|          | 0/14 [00:00<?, ?it/s]

2023-09-17 09:52:38,699 : INFO : collecting all words and their counts
2023-09-17 09:52:38,700 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2023-09-17 09:52:38,735 : INFO : PROGRESS: at sentence #10000, processed 177801 words, keeping 12644 word types
2023-09-17 09:52:38,770 : INFO : PROGRESS: at sentence #20000, processed 353986 words, keeping 18738 word types
2023-09-17 09:52:38,809 : INFO : PROGRESS: at sentence #30000, processed 540343 words, keeping 24165 word types
2023-09-17 09:52:38,838 : INFO : PROGRESS: at sentence #40000, processed 680196 words, keeping 27078 word types
2023-09-17 09:52:38,865 : INFO : PROGRESS: at sentence #50000, processed 809119 words, keeping 29534 word types
2023-09-17 09:52:38,891 : INFO : PROGRESS: at sentence #60000, processed 934692 words, keeping 31772 word types
2023-09-17 09:52:38,917 : INFO : PROGRESS: at sentence #70000, processed 1060909 words, keeping 33895 word types
2023-09-17 09:52:38,944 : INFO : PROGRESS: at

2023-09-17 09:52:44,722 : INFO : PROGRESS: at sentence #150000, processed 2006321 words, keeping 50381 word types
2023-09-17 09:52:44,745 : INFO : PROGRESS: at sentence #160000, processed 2123258 words, keeping 52895 word types
2023-09-17 09:52:44,768 : INFO : PROGRESS: at sentence #170000, processed 2238039 words, keeping 55368 word types
2023-09-17 09:52:44,787 : INFO : PROGRESS: at sentence #180000, processed 2353367 words, keeping 57813 word types
2023-09-17 09:52:44,800 : INFO : collected 58963 word types from a corpus of 2416409 raw words and 185471 sentences
2023-09-17 09:52:44,801 : INFO : Creating a fresh vocabulary
2023-09-17 09:52:44,858 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 11477 unique words (19.46% of original 58963, drops 47486)', 'datetime': '2023-09-17T09:52:44.858990', 'gensim': '4.2.0', 'python': '3.7.13 (default, Mar 28 2022, 08:03:21) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'prepare_vocab'}


2023-09-17 09:52:50,885 : INFO : deleting the raw counts dictionary of 58963 items
2023-09-17 09:52:50,887 : INFO : sample=0.001 downsamples 49 most-common words
2023-09-17 09:52:50,888 : INFO : Word2Vec lifecycle event {'msg': 'downsampling leaves estimated 1454179.8130924501 word corpus (62.0%% of prior 2346601)', 'datetime': '2023-09-17T09:52:50.888481', 'gensim': '4.2.0', 'python': '3.7.13 (default, Mar 28 2022, 08:03:21) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'prepare_vocab'}
2023-09-17 09:52:50,999 : INFO : estimated required memory for 11477 words and 300 dimensions: 33283300 bytes
2023-09-17 09:52:50,999 : INFO : resetting layer weights
2023-09-17 09:52:51,015 : INFO : Word2Vec lifecycle event {'update': False, 'trim_rule': 'None', 'datetime': '2023-09-17T09:52:51.015169', 'gensim': '4.2.0', 'python': '3.7.13 (default, Mar 28 2022, 08:03:21) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'build_vocab'}
20

2023-09-17 09:52:59,162 : INFO : EPOCH 0 - PROGRESS: at 59.96% examples, 964390 words/s, in_qsize 5, out_qsize 0
2023-09-17 09:52:59,622 : INFO : EPOCH 0: training on 2416409 raw words (1454441 effective words) took 1.5s, 993800 effective words/s
2023-09-17 09:53:00,639 : INFO : EPOCH 1 - PROGRESS: at 64.01% examples, 1016709 words/s, in_qsize 5, out_qsize 0
2023-09-17 09:53:01,035 : INFO : EPOCH 1: training on 2416409 raw words (1454989 effective words) took 1.4s, 1039452 effective words/s
2023-09-17 09:53:02,042 : INFO : EPOCH 2 - PROGRESS: at 70.68% examples, 1097944 words/s, in_qsize 5, out_qsize 0
2023-09-17 09:53:02,354 : INFO : EPOCH 2: training on 2416409 raw words (1454452 effective words) took 1.3s, 1107581 effective words/s
2023-09-17 09:53:03,361 : INFO : EPOCH 3 - PROGRESS: at 55.82% examples, 914539 words/s, in_qsize 5, out_qsize 0
2023-09-17 09:53:03,968 : INFO : EPOCH 3: training on 2416409 raw words (1454151 effective words) took 1.6s, 904046 effective words/s
2023-09-

2023-09-17 09:53:14,930 : INFO : Word2Vec lifecycle event {'params': 'Word2Vec<vocab=11477, vector_size=500, alpha=0.025>', 'datetime': '2023-09-17T09:53:14.930824', 'gensim': '4.2.0', 'python': '3.7.13 (default, Mar 28 2022, 08:03:21) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'created'}
2023-09-17 09:53:14,934 : INFO : Word2Vec lifecycle event {'fname_or_handle': '500features_word2vec', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2023-09-17T09:53:14.934812', 'gensim': '4.2.0', 'python': '3.7.13 (default, Mar 28 2022, 08:03:21) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'saving'}
2023-09-17 09:53:14,934 : INFO : not storing attribute cum_table
2023-09-17 09:53:15,023 : INFO : saved 500features_word2vec
2023-09-17 09:53:15,026 : INFO : collecting all words and their counts
2023-09-17 09:53:15,027 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2023-0

2023-09-17 09:53:25,510 : INFO : PROGRESS: at sentence #40000, processed 680196 words, keeping 27078 word types
2023-09-17 09:53:25,536 : INFO : PROGRESS: at sentence #50000, processed 809119 words, keeping 29534 word types
2023-09-17 09:53:25,563 : INFO : PROGRESS: at sentence #60000, processed 934692 words, keeping 31772 word types
2023-09-17 09:53:25,588 : INFO : PROGRESS: at sentence #70000, processed 1060909 words, keeping 33895 word types
2023-09-17 09:53:25,613 : INFO : PROGRESS: at sentence #80000, processed 1185015 words, keeping 35949 word types
2023-09-17 09:53:25,635 : INFO : PROGRESS: at sentence #90000, processed 1305579 words, keeping 37785 word types
2023-09-17 09:53:25,657 : INFO : PROGRESS: at sentence #100000, processed 1426624 words, keeping 39754 word types
2023-09-17 09:53:25,678 : INFO : PROGRESS: at sentence #110000, processed 1544737 words, keeping 41751 word types
2023-09-17 09:53:25,699 : INFO : PROGRESS: at sentence #120000, processed 1663639 words, keeping 

2023-09-17 09:53:36,843 : INFO : PROGRESS: at sentence #150000, processed 2006321 words, keeping 50381 word types
2023-09-17 09:53:36,865 : INFO : PROGRESS: at sentence #160000, processed 2123258 words, keeping 52895 word types
2023-09-17 09:53:36,885 : INFO : PROGRESS: at sentence #170000, processed 2238039 words, keeping 55368 word types
2023-09-17 09:53:36,908 : INFO : PROGRESS: at sentence #180000, processed 2353367 words, keeping 57813 word types
2023-09-17 09:53:36,920 : INFO : collected 58963 word types from a corpus of 2416409 raw words and 185471 sentences
2023-09-17 09:53:36,922 : INFO : Creating a fresh vocabulary
2023-09-17 09:53:36,981 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 11477 unique words (19.46% of original 58963, drops 47486)', 'datetime': '2023-09-17T09:53:36.981485', 'gensim': '4.2.0', 'python': '3.7.13 (default, Mar 28 2022, 08:03:21) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'prepare_vocab'}


2023-09-17 09:53:50,647 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 leaves 2346601 word corpus (97.11% of original 2416409, drops 69808)', 'datetime': '2023-09-17T09:53:50.647909', 'gensim': '4.2.0', 'python': '3.7.13 (default, Mar 28 2022, 08:03:21) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'prepare_vocab'}
2023-09-17 09:53:50,715 : INFO : deleting the raw counts dictionary of 58963 items
2023-09-17 09:53:50,717 : INFO : sample=0.001 downsamples 49 most-common words
2023-09-17 09:53:50,718 : INFO : Word2Vec lifecycle event {'msg': 'downsampling leaves estimated 1454179.8130924501 word corpus (62.0%% of prior 2346601)', 'datetime': '2023-09-17T09:53:50.718720', 'gensim': '4.2.0', 'python': '3.7.13 (default, Mar 28 2022, 08:03:21) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'prepare_vocab'}
2023-09-17 09:53:50,839 : INFO : estimated required memory for 11477 words and 900 dimensions: 88372900 

2023-09-17 09:54:06,883 : INFO : estimated required memory for 11477 words and 1000 dimensions: 97554500 bytes
2023-09-17 09:54:06,884 : INFO : resetting layer weights
2023-09-17 09:54:06,936 : INFO : Word2Vec lifecycle event {'update': False, 'trim_rule': 'None', 'datetime': '2023-09-17T09:54:06.936592', 'gensim': '4.2.0', 'python': '3.7.13 (default, Mar 28 2022, 08:03:21) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'build_vocab'}
2023-09-17 09:54:06,937 : INFO : Word2Vec lifecycle event {'msg': 'training model with 3 workers on 11477 vocabulary and 1000 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2023-09-17T09:54:06.937602', 'gensim': '4.2.0', 'python': '3.7.13 (default, Mar 28 2022, 08:03:21) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'train'}
2023-09-17 09:54:07,951 : INFO : EPOCH 0 - PROGRESS: at 21.94% examples, 454832 words/s, in_qsize 5, out_qsize 0
2023-09

2023-09-17 09:54:24,492 : INFO : EPOCH 0 - PROGRESS: at 19.81% examples, 420448 words/s, in_qsize 5, out_qsize 0
2023-09-17 09:54:25,501 : INFO : EPOCH 0 - PROGRESS: at 56.28% examples, 454372 words/s, in_qsize 5, out_qsize 0
2023-09-17 09:54:26,505 : INFO : EPOCH 0 - PROGRESS: at 95.01% examples, 459405 words/s, in_qsize 5, out_qsize 0
2023-09-17 09:54:26,623 : INFO : EPOCH 0: training on 2416409 raw words (1454397 effective words) took 3.2s, 461668 effective words/s
2023-09-17 09:54:27,629 : INFO : EPOCH 1 - PROGRESS: at 20.24% examples, 434755 words/s, in_qsize 5, out_qsize 0
2023-09-17 09:54:28,636 : INFO : EPOCH 1 - PROGRESS: at 55.82% examples, 456240 words/s, in_qsize 5, out_qsize 0
2023-09-17 09:54:29,652 : INFO : EPOCH 1 - PROGRESS: at 96.34% examples, 466436 words/s, in_qsize 5, out_qsize 0
2023-09-17 09:54:29,727 : INFO : EPOCH 1: training on 2416409 raw words (1454606 effective words) took 3.1s, 469267 effective words/s
2023-09-17 09:54:30,741 : INFO : EPOCH 2 - PROGRESS: a

2023-09-17 09:54:42,239 : INFO : EPOCH 0 - PROGRESS: at 20.24% examples, 433631 words/s, in_qsize 6, out_qsize 0
2023-09-17 09:54:43,245 : INFO : EPOCH 0 - PROGRESS: at 54.92% examples, 450111 words/s, in_qsize 5, out_qsize 0
2023-09-17 09:54:44,247 : INFO : EPOCH 0 - PROGRESS: at 93.58% examples, 457048 words/s, in_qsize 5, out_qsize 0
2023-09-17 09:54:44,405 : INFO : EPOCH 0: training on 2416409 raw words (1453628 effective words) took 3.2s, 458709 effective words/s
2023-09-17 09:54:45,415 : INFO : EPOCH 1 - PROGRESS: at 21.10% examples, 445162 words/s, in_qsize 5, out_qsize 0
2023-09-17 09:54:46,441 : INFO : EPOCH 1 - PROGRESS: at 56.74% examples, 456482 words/s, in_qsize 4, out_qsize 1
2023-09-17 09:54:47,444 : INFO : EPOCH 1 - PROGRESS: at 95.89% examples, 463036 words/s, in_qsize 5, out_qsize 0
2023-09-17 09:54:47,537 : INFO : EPOCH 1: training on 2416409 raw words (1453834 effective words) took 3.1s, 464865 effective words/s
2023-09-17 09:54:48,557 : INFO : EPOCH 2 - PROGRESS: a

2023-09-17 09:54:59,586 : INFO : EPOCH 0 - PROGRESS: at 17.37% examples, 385820 words/s, in_qsize 5, out_qsize 0
2023-09-17 09:55:00,588 : INFO : EPOCH 0 - PROGRESS: at 46.42% examples, 394205 words/s, in_qsize 5, out_qsize 0
2023-09-17 09:55:01,616 : INFO : EPOCH 0 - PROGRESS: at 78.19% examples, 391138 words/s, in_qsize 5, out_qsize 0
2023-09-17 09:55:02,293 : INFO : EPOCH 0: training on 2416409 raw words (1454435 effective words) took 3.7s, 391113 effective words/s
2023-09-17 09:55:03,333 : INFO : EPOCH 1 - PROGRESS: at 13.01% examples, 282012 words/s, in_qsize 5, out_qsize 0
2023-09-17 09:55:04,342 : INFO : EPOCH 1 - PROGRESS: at 34.25% examples, 309351 words/s, in_qsize 5, out_qsize 0
2023-09-17 09:55:05,343 : INFO : EPOCH 1 - PROGRESS: at 63.57% examples, 333367 words/s, in_qsize 5, out_qsize 0
2023-09-17 09:55:06,351 : INFO : EPOCH 1 - PROGRESS: at 96.34% examples, 348343 words/s, in_qsize 5, out_qsize 0
2023-09-17 09:55:06,461 : INFO : EPOCH 1: training on 2416409 raw words (14

2023-09-17 09:55:21,765 : INFO : EPOCH 0 - PROGRESS: at 16.46% examples, 360602 words/s, in_qsize 5, out_qsize 0
2023-09-17 09:55:22,786 : INFO : EPOCH 0 - PROGRESS: at 45.09% examples, 378973 words/s, in_qsize 5, out_qsize 0
2023-09-17 09:55:23,794 : INFO : EPOCH 0 - PROGRESS: at 78.19% examples, 389252 words/s, in_qsize 5, out_qsize 0
2023-09-17 09:55:24,448 : INFO : EPOCH 0: training on 2416409 raw words (1453630 effective words) took 3.7s, 391769 effective words/s
2023-09-17 09:55:25,463 : INFO : EPOCH 1 - PROGRESS: at 16.75% examples, 374391 words/s, in_qsize 5, out_qsize 0
2023-09-17 09:55:26,474 : INFO : EPOCH 1 - PROGRESS: at 45.53% examples, 387105 words/s, in_qsize 5, out_qsize 0
2023-09-17 09:55:27,489 : INFO : EPOCH 1 - PROGRESS: at 76.32% examples, 384119 words/s, in_qsize 5, out_qsize 0
2023-09-17 09:55:28,245 : INFO : EPOCH 1: training on 2416409 raw words (1453829 effective words) took 3.8s, 383505 effective words/s
2023-09-17 09:55:29,257 : INFO : EPOCH 2 - PROGRESS: a

['100features_word2vec', '200features_word2vec', '300features_word2vec', '400features_word2vec', '500features_word2vec', '600features_word2vec', '700features_word2vec', '800features_word2vec', '900features_word2vec', '1000features_word2vec', '1100features_word2vec', '1200features_word2vec', '1300features_word2vec', '1400features_word2vec']


In [9]:
def makeFeatureVec(words, model, num_features):
    """Average the word vectors of every in-vocabulary word in ``words``.

    Parameters
    ----------
    words : iterable of str
        Tokens of one document.
    model : object
        A trained model whose ``wv`` attribute offers ``key_to_index`` for
        membership tests and ``wv[word]`` for vector lookup.
    num_features : int
        Length of the vectors stored in ``model``.

    Returns
    -------
    numpy.ndarray
        The mean vector of the known words. When no word is in the
        vocabulary (or ``words`` is empty) a zero vector is returned instead
        of dividing by zero, which previously produced an all-NaN row.
    """
    # Pre-initialize the accumulator
    featureVec = np.zeros((num_features,),dtype="float32")
    #
    nwords = 0.
    #
    # key_to_index maps each vocabulary word to its row, so membership is a
    # dict lookup; no per-call set of the whole vocabulary has to be built.
    vocabulary = model.wv.key_to_index
    #
    # Loop over each word in the review and, if it is in the model's
    # vocabulary, add its feature vector to the total
    for word in words:
        if word in vocabulary:
            nwords = nwords + 1.
            featureVec = np.add(featureVec, model.wv[word])
    #
    # Nothing to average: hand back the zero vector rather than NaNs
    if nwords == 0:
        return featureVec
    #
    # Divide the total by the number of words to get the average
    return np.divide(featureVec,nwords)

def getAvgFeatureVecs(reviews, model, num_features):
    """Build a matrix with one averaged word vector per review.

    Parameters
    ----------
    reviews : sequence
        Tokenized reviews; each item is passed to ``makeFeatureVec``.
    model : object
        The embedding model forwarded to ``makeFeatureVec``.
    num_features : int
        Number of columns in the result.

    Returns
    -------
    numpy.ndarray
        float32 array of shape ``(len(reviews), num_features)``; row ``k``
        holds the vector computed for ``reviews[k]``.
    """
    total = len(reviews)
    reviewFeatureVecs = np.zeros((total, num_features), dtype="float32")
    for position, tokens in enumerate(reviews):
        # Progress line every 1000 reviews
        if position % 1000 == 0:
            print("Review %d of %d" % (position, total))
        reviewFeatureVecs[position] = makeFeatureVec(tokens, model, num_features)
    return reviewFeatureVecs

In [10]:
from gensim.models import Word2Vec

# The input table and its cleaned token lists do not depend on the embedding
# width, so read and tokenize them once rather than once per model.
base_data = pd.read_csv("data_txt.csv")
clean_train_reviews = [
    review_to_wordlist( review, remove_stopwords=True )
    for review in base_data["summary"]
]
# Every exported table keeps all original columns except the raw text
base_features = base_data.drop(columns=['summary'])

# `ii` collects the names of the CSV files written, one per embedding width
ii = []
for i in tqdm_notebook(range(100, 1500, 100)):
    # Load the model saved under this width and average its vectors per review
    model_n = str(i) + "features_word2vec"
    model = Word2Vec.load(model_n)
    trainDataVecs = getAvgFeatureVecs( clean_train_reviews, model, i )

    # One column per embedding dimension: summary_0 ... summary_{i-1}
    column_summary = ["summary_" + str(j) for j in range(trainDataVecs.shape[1])]
    data_summary = pd.DataFrame(trainDataVecs, columns=column_summary)

    # Non-text columns first, then the embedding columns, written to disk
    data = pd.concat([base_features, data_summary], axis=1, ignore_index=False)
    data_n = "data_w2v_" + str(i) + ".csv"
    data.to_csv(data_n, index=False, encoding='utf_8_sig')
    ii.append(data_n)

print(ii)

  0%|          | 0/14 [00:00<?, ?it/s]

2023-09-17 09:55:40,658 : INFO : loading Word2Vec object from 100features_word2vec
2023-09-17 09:55:40,677 : INFO : loading wv recursively from 100features_word2vec.wv.* with mmap=None
2023-09-17 09:55:40,680 : INFO : setting ignored attribute cum_table to None
2023-09-17 09:55:40,790 : INFO : Word2Vec lifecycle event {'fname': '100features_word2vec', 'datetime': '2023-09-17T09:55:40.790981', 'gensim': '4.2.0', 'python': '3.7.13 (default, Mar 28 2022, 08:03:21) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'loaded'}


Review 0 of 72550
Review 1000 of 72550
Review 2000 of 72550
Review 3000 of 72550
Review 4000 of 72550
Review 5000 of 72550
Review 6000 of 72550
Review 7000 of 72550
Review 8000 of 72550
Review 9000 of 72550
Review 10000 of 72550
Review 11000 of 72550
Review 12000 of 72550
Review 13000 of 72550
Review 14000 of 72550
Review 15000 of 72550
Review 16000 of 72550
Review 17000 of 72550
Review 18000 of 72550
Review 19000 of 72550
Review 20000 of 72550
Review 21000 of 72550
Review 22000 of 72550
Review 23000 of 72550
Review 24000 of 72550
Review 25000 of 72550
Review 26000 of 72550
Review 27000 of 72550
Review 28000 of 72550
Review 29000 of 72550
Review 30000 of 72550
Review 31000 of 72550
Review 32000 of 72550
Review 33000 of 72550
Review 34000 of 72550
Review 35000 of 72550
Review 36000 of 72550
Review 37000 of 72550
Review 38000 of 72550
Review 39000 of 72550
Review 40000 of 72550
Review 41000 of 72550
Review 42000 of 72550
Review 43000 of 72550
Review 44000 of 72550
Review 45000 of 72550
R

2023-09-17 09:56:55,052 : INFO : loading Word2Vec object from 200features_word2vec
2023-09-17 09:56:55,072 : INFO : loading wv recursively from 200features_word2vec.wv.* with mmap=None
2023-09-17 09:56:55,074 : INFO : setting ignored attribute cum_table to None
2023-09-17 09:56:55,182 : INFO : Word2Vec lifecycle event {'fname': '200features_word2vec', 'datetime': '2023-09-17T09:56:55.182286', 'gensim': '4.2.0', 'python': '3.7.13 (default, Mar 28 2022, 08:03:21) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'loaded'}


Review 0 of 72550
Review 1000 of 72550
Review 2000 of 72550
Review 3000 of 72550
Review 4000 of 72550
Review 5000 of 72550
Review 6000 of 72550
Review 7000 of 72550
Review 8000 of 72550
Review 9000 of 72550
Review 10000 of 72550
Review 11000 of 72550
Review 12000 of 72550
Review 13000 of 72550
Review 14000 of 72550
Review 15000 of 72550
Review 16000 of 72550
Review 17000 of 72550
Review 18000 of 72550
Review 19000 of 72550
Review 20000 of 72550
Review 21000 of 72550
Review 22000 of 72550
Review 23000 of 72550
Review 24000 of 72550
Review 25000 of 72550
Review 26000 of 72550
Review 27000 of 72550
Review 28000 of 72550
Review 29000 of 72550
Review 30000 of 72550
Review 31000 of 72550
Review 32000 of 72550
Review 33000 of 72550
Review 34000 of 72550
Review 35000 of 72550
Review 36000 of 72550
Review 37000 of 72550
Review 38000 of 72550
Review 39000 of 72550
Review 40000 of 72550
Review 41000 of 72550
Review 42000 of 72550
Review 43000 of 72550
Review 44000 of 72550
Review 45000 of 72550
R

2023-09-17 09:58:15,237 : INFO : loading Word2Vec object from 300features_word2vec
2023-09-17 09:58:15,263 : INFO : loading wv recursively from 300features_word2vec.wv.* with mmap=None
2023-09-17 09:58:15,264 : INFO : setting ignored attribute cum_table to None
2023-09-17 09:58:15,372 : INFO : Word2Vec lifecycle event {'fname': '300features_word2vec', 'datetime': '2023-09-17T09:58:15.372835', 'gensim': '4.2.0', 'python': '3.7.13 (default, Mar 28 2022, 08:03:21) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'loaded'}


Review 0 of 72550
Review 1000 of 72550
Review 2000 of 72550
Review 3000 of 72550
Review 4000 of 72550
Review 5000 of 72550
Review 6000 of 72550
Review 7000 of 72550
Review 8000 of 72550
Review 9000 of 72550
Review 10000 of 72550
Review 11000 of 72550
Review 12000 of 72550
Review 13000 of 72550
Review 14000 of 72550
Review 15000 of 72550
Review 16000 of 72550
Review 17000 of 72550
Review 18000 of 72550
Review 19000 of 72550
Review 20000 of 72550
Review 21000 of 72550
Review 22000 of 72550
Review 23000 of 72550
Review 24000 of 72550
Review 25000 of 72550
Review 26000 of 72550
Review 27000 of 72550
Review 28000 of 72550
Review 29000 of 72550
Review 30000 of 72550
Review 31000 of 72550
Review 32000 of 72550
Review 33000 of 72550
Review 34000 of 72550
Review 35000 of 72550
Review 36000 of 72550
Review 37000 of 72550
Review 38000 of 72550
Review 39000 of 72550
Review 40000 of 72550
Review 41000 of 72550
Review 42000 of 72550
Review 43000 of 72550
Review 44000 of 72550
Review 45000 of 72550
R

2023-09-17 09:59:41,942 : INFO : loading Word2Vec object from 400features_word2vec
2023-09-17 09:59:42,033 : INFO : loading wv recursively from 400features_word2vec.wv.* with mmap=None
2023-09-17 09:59:42,034 : INFO : setting ignored attribute cum_table to None
2023-09-17 09:59:42,148 : INFO : Word2Vec lifecycle event {'fname': '400features_word2vec', 'datetime': '2023-09-17T09:59:42.148157', 'gensim': '4.2.0', 'python': '3.7.13 (default, Mar 28 2022, 08:03:21) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'loaded'}


Review 0 of 72550
Review 1000 of 72550
Review 2000 of 72550
Review 3000 of 72550
Review 4000 of 72550
Review 5000 of 72550
Review 6000 of 72550
Review 7000 of 72550
Review 8000 of 72550
Review 9000 of 72550
Review 10000 of 72550
Review 11000 of 72550
Review 12000 of 72550
Review 13000 of 72550
Review 14000 of 72550
Review 15000 of 72550
Review 16000 of 72550
Review 17000 of 72550
Review 18000 of 72550
Review 19000 of 72550
Review 20000 of 72550
Review 21000 of 72550
Review 22000 of 72550
Review 23000 of 72550
Review 24000 of 72550
Review 25000 of 72550
Review 26000 of 72550
Review 27000 of 72550
Review 28000 of 72550
Review 29000 of 72550
Review 30000 of 72550
Review 31000 of 72550
Review 32000 of 72550
Review 33000 of 72550
Review 34000 of 72550
Review 35000 of 72550
Review 36000 of 72550
Review 37000 of 72550
Review 38000 of 72550
Review 39000 of 72550
Review 40000 of 72550
Review 41000 of 72550
Review 42000 of 72550
Review 43000 of 72550
Review 44000 of 72550
Review 45000 of 72550
R

2023-09-17 10:01:21,178 : INFO : loading Word2Vec object from 500features_word2vec
2023-09-17 10:01:21,216 : INFO : loading wv recursively from 500features_word2vec.wv.* with mmap=None
2023-09-17 10:01:21,218 : INFO : setting ignored attribute cum_table to None
2023-09-17 10:01:21,332 : INFO : Word2Vec lifecycle event {'fname': '500features_word2vec', 'datetime': '2023-09-17T10:01:21.332758', 'gensim': '4.2.0', 'python': '3.7.13 (default, Mar 28 2022, 08:03:21) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'loaded'}


Review 0 of 72550
Review 1000 of 72550
Review 2000 of 72550
Review 3000 of 72550
Review 4000 of 72550
Review 5000 of 72550
Review 6000 of 72550
Review 7000 of 72550
Review 8000 of 72550
Review 9000 of 72550
Review 10000 of 72550
Review 11000 of 72550
Review 12000 of 72550
Review 13000 of 72550
Review 14000 of 72550
Review 15000 of 72550
Review 16000 of 72550
Review 17000 of 72550
Review 18000 of 72550
Review 19000 of 72550
Review 20000 of 72550
Review 21000 of 72550
Review 22000 of 72550
Review 23000 of 72550
Review 24000 of 72550
Review 25000 of 72550
Review 26000 of 72550
Review 27000 of 72550
Review 28000 of 72550
Review 29000 of 72550
Review 30000 of 72550
Review 31000 of 72550
Review 32000 of 72550
Review 33000 of 72550
Review 34000 of 72550
Review 35000 of 72550
Review 36000 of 72550
Review 37000 of 72550
Review 38000 of 72550
Review 39000 of 72550
Review 40000 of 72550
Review 41000 of 72550
Review 42000 of 72550
Review 43000 of 72550
Review 44000 of 72550
Review 45000 of 72550
R

2023-09-17 10:03:07,206 : INFO : loading Word2Vec object from 600features_word2vec
2023-09-17 10:03:07,253 : INFO : loading wv recursively from 600features_word2vec.wv.* with mmap=None
2023-09-17 10:03:07,254 : INFO : setting ignored attribute cum_table to None
2023-09-17 10:03:07,365 : INFO : Word2Vec lifecycle event {'fname': '600features_word2vec', 'datetime': '2023-09-17T10:03:07.365381', 'gensim': '4.2.0', 'python': '3.7.13 (default, Mar 28 2022, 08:03:21) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'loaded'}


Review 0 of 72550
Review 1000 of 72550
Review 2000 of 72550
Review 3000 of 72550
Review 4000 of 72550
Review 5000 of 72550
Review 6000 of 72550
Review 7000 of 72550
Review 8000 of 72550
Review 9000 of 72550
Review 10000 of 72550
Review 11000 of 72550
Review 12000 of 72550
Review 13000 of 72550
Review 14000 of 72550
Review 15000 of 72550
Review 16000 of 72550
Review 17000 of 72550
Review 18000 of 72550
Review 19000 of 72550
Review 20000 of 72550
Review 21000 of 72550
Review 22000 of 72550
Review 23000 of 72550
Review 24000 of 72550
Review 25000 of 72550
Review 26000 of 72550
Review 27000 of 72550
Review 28000 of 72550
Review 29000 of 72550
Review 30000 of 72550
Review 31000 of 72550
Review 32000 of 72550
Review 33000 of 72550
Review 34000 of 72550
Review 35000 of 72550
Review 36000 of 72550
Review 37000 of 72550
Review 38000 of 72550
Review 39000 of 72550
Review 40000 of 72550
Review 41000 of 72550
Review 42000 of 72550
Review 43000 of 72550
Review 44000 of 72550
Review 45000 of 72550
R

2023-09-17 10:05:02,295 : INFO : loading Word2Vec object from 700features_word2vec
2023-09-17 10:05:02,344 : INFO : loading wv recursively from 700features_word2vec.wv.* with mmap=None
2023-09-17 10:05:02,345 : INFO : setting ignored attribute cum_table to None
2023-09-17 10:05:02,459 : INFO : Word2Vec lifecycle event {'fname': '700features_word2vec', 'datetime': '2023-09-17T10:05:02.459303', 'gensim': '4.2.0', 'python': '3.7.13 (default, Mar 28 2022, 08:03:21) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'loaded'}


Review 0 of 72550
Review 1000 of 72550
Review 2000 of 72550
Review 3000 of 72550
Review 4000 of 72550
Review 5000 of 72550
Review 6000 of 72550
Review 7000 of 72550
Review 8000 of 72550
Review 9000 of 72550
Review 10000 of 72550
Review 11000 of 72550
Review 12000 of 72550
Review 13000 of 72550
Review 14000 of 72550
Review 15000 of 72550
Review 16000 of 72550
Review 17000 of 72550
Review 18000 of 72550
Review 19000 of 72550
Review 20000 of 72550
Review 21000 of 72550
Review 22000 of 72550
Review 23000 of 72550
Review 24000 of 72550
Review 25000 of 72550
Review 26000 of 72550
Review 27000 of 72550
Review 28000 of 72550
Review 29000 of 72550
Review 30000 of 72550
Review 31000 of 72550
Review 32000 of 72550
Review 33000 of 72550
Review 34000 of 72550
Review 35000 of 72550
Review 36000 of 72550
Review 37000 of 72550
Review 38000 of 72550
Review 39000 of 72550
Review 40000 of 72550
Review 41000 of 72550
Review 42000 of 72550
Review 43000 of 72550
Review 44000 of 72550
Review 45000 of 72550
R

2023-09-17 10:07:05,779 : INFO : loading Word2Vec object from 800features_word2vec
2023-09-17 10:07:05,839 : INFO : loading wv recursively from 800features_word2vec.wv.* with mmap=None
2023-09-17 10:07:05,840 : INFO : setting ignored attribute cum_table to None
2023-09-17 10:07:05,978 : INFO : Word2Vec lifecycle event {'fname': '800features_word2vec', 'datetime': '2023-09-17T10:07:05.978864', 'gensim': '4.2.0', 'python': '3.7.13 (default, Mar 28 2022, 08:03:21) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'loaded'}


Review 0 of 72550
Review 1000 of 72550
Review 2000 of 72550
Review 3000 of 72550
Review 4000 of 72550
Review 5000 of 72550
Review 6000 of 72550
Review 7000 of 72550
Review 8000 of 72550
Review 9000 of 72550
Review 10000 of 72550
Review 11000 of 72550
Review 12000 of 72550
Review 13000 of 72550
Review 14000 of 72550
Review 15000 of 72550
Review 16000 of 72550
Review 17000 of 72550
Review 18000 of 72550
Review 19000 of 72550
Review 20000 of 72550
Review 21000 of 72550
Review 22000 of 72550
Review 23000 of 72550
Review 24000 of 72550
Review 25000 of 72550
Review 26000 of 72550
Review 27000 of 72550
Review 28000 of 72550
Review 29000 of 72550
Review 30000 of 72550
Review 31000 of 72550
Review 32000 of 72550
Review 33000 of 72550
Review 34000 of 72550
Review 35000 of 72550
Review 36000 of 72550
Review 37000 of 72550
Review 38000 of 72550
Review 39000 of 72550
Review 40000 of 72550
Review 41000 of 72550
Review 42000 of 72550
Review 43000 of 72550
Review 44000 of 72550
Review 45000 of 72550
R

2023-09-17 10:09:17,889 : INFO : loading Word2Vec object from 900features_word2vec
2023-09-17 10:09:17,958 : INFO : loading wv recursively from 900features_word2vec.wv.* with mmap=None
2023-09-17 10:09:17,960 : INFO : setting ignored attribute cum_table to None
2023-09-17 10:09:18,079 : INFO : Word2Vec lifecycle event {'fname': '900features_word2vec', 'datetime': '2023-09-17T10:09:18.079642', 'gensim': '4.2.0', 'python': '3.7.13 (default, Mar 28 2022, 08:03:21) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'loaded'}


Review 0 of 72550
Review 1000 of 72550
Review 2000 of 72550
Review 3000 of 72550
Review 4000 of 72550
Review 5000 of 72550
Review 6000 of 72550
Review 7000 of 72550
Review 8000 of 72550
Review 9000 of 72550
Review 10000 of 72550
Review 11000 of 72550
Review 12000 of 72550
Review 13000 of 72550
Review 14000 of 72550
Review 15000 of 72550
Review 16000 of 72550
Review 17000 of 72550
Review 18000 of 72550
Review 19000 of 72550
Review 20000 of 72550
Review 21000 of 72550
Review 22000 of 72550
Review 23000 of 72550
Review 24000 of 72550
Review 25000 of 72550
Review 26000 of 72550
Review 27000 of 72550
Review 28000 of 72550
Review 29000 of 72550
Review 30000 of 72550
Review 31000 of 72550
Review 32000 of 72550
Review 33000 of 72550
Review 34000 of 72550
Review 35000 of 72550
Review 36000 of 72550
Review 37000 of 72550
Review 38000 of 72550
Review 39000 of 72550
Review 40000 of 72550
Review 41000 of 72550
Review 42000 of 72550
Review 43000 of 72550
Review 44000 of 72550
Review 45000 of 72550
R

2023-09-17 10:11:39,930 : INFO : loading Word2Vec object from 1000features_word2vec
2023-09-17 10:11:39,948 : INFO : loading wv recursively from 1000features_word2vec.wv.* with mmap=None
2023-09-17 10:11:39,950 : INFO : loading vectors from 1000features_word2vec.wv.vectors.npy with mmap=None
2023-09-17 10:11:40,050 : INFO : loading syn1neg from 1000features_word2vec.syn1neg.npy with mmap=None
2023-09-17 10:11:40,073 : INFO : setting ignored attribute cum_table to None
2023-09-17 10:11:40,187 : INFO : Word2Vec lifecycle event {'fname': '1000features_word2vec', 'datetime': '2023-09-17T10:11:40.187266', 'gensim': '4.2.0', 'python': '3.7.13 (default, Mar 28 2022, 08:03:21) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'loaded'}


Review 0 of 72550
Review 1000 of 72550
Review 2000 of 72550
Review 3000 of 72550
Review 4000 of 72550
Review 5000 of 72550
Review 6000 of 72550
Review 7000 of 72550
Review 8000 of 72550
Review 9000 of 72550
Review 10000 of 72550
Review 11000 of 72550
Review 12000 of 72550
Review 13000 of 72550
Review 14000 of 72550
Review 15000 of 72550
Review 16000 of 72550
Review 17000 of 72550
Review 18000 of 72550
Review 19000 of 72550
Review 20000 of 72550
Review 21000 of 72550
Review 22000 of 72550
Review 23000 of 72550
Review 24000 of 72550
Review 25000 of 72550
Review 26000 of 72550
Review 27000 of 72550
Review 28000 of 72550
Review 29000 of 72550
Review 30000 of 72550
Review 31000 of 72550
Review 32000 of 72550
Review 33000 of 72550
Review 34000 of 72550
Review 35000 of 72550
Review 36000 of 72550
Review 37000 of 72550
Review 38000 of 72550
Review 39000 of 72550
Review 40000 of 72550
Review 41000 of 72550
Review 42000 of 72550
Review 43000 of 72550
Review 44000 of 72550
Review 45000 of 72550
R

2023-09-17 10:14:15,645 : INFO : loading Word2Vec object from 1100features_word2vec
2023-09-17 10:14:15,659 : INFO : loading wv recursively from 1100features_word2vec.wv.* with mmap=None
2023-09-17 10:14:15,660 : INFO : loading vectors from 1100features_word2vec.wv.vectors.npy with mmap=None
2023-09-17 10:14:15,686 : INFO : loading syn1neg from 1100features_word2vec.syn1neg.npy with mmap=None
2023-09-17 10:14:15,710 : INFO : setting ignored attribute cum_table to None
2023-09-17 10:14:15,830 : INFO : Word2Vec lifecycle event {'fname': '1100features_word2vec', 'datetime': '2023-09-17T10:14:15.830515', 'gensim': '4.2.0', 'python': '3.7.13 (default, Mar 28 2022, 08:03:21) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'loaded'}


Review 0 of 72550
Review 1000 of 72550
Review 2000 of 72550
Review 3000 of 72550
Review 4000 of 72550
Review 5000 of 72550
Review 6000 of 72550
Review 7000 of 72550
Review 8000 of 72550
Review 9000 of 72550
Review 10000 of 72550
Review 11000 of 72550
Review 12000 of 72550
Review 13000 of 72550
Review 14000 of 72550
Review 15000 of 72550
Review 16000 of 72550
Review 17000 of 72550
Review 18000 of 72550
Review 19000 of 72550
Review 20000 of 72550
Review 21000 of 72550
Review 22000 of 72550
Review 23000 of 72550
Review 24000 of 72550
Review 25000 of 72550
Review 26000 of 72550
Review 27000 of 72550
Review 28000 of 72550
Review 29000 of 72550
Review 30000 of 72550
Review 31000 of 72550
Review 32000 of 72550
Review 33000 of 72550
Review 34000 of 72550
Review 35000 of 72550
Review 36000 of 72550
Review 37000 of 72550
Review 38000 of 72550
Review 39000 of 72550
Review 40000 of 72550
Review 41000 of 72550
Review 42000 of 72550
Review 43000 of 72550
Review 44000 of 72550
Review 45000 of 72550
R

2023-09-17 10:16:54,548 : INFO : loading Word2Vec object from 1200features_word2vec
2023-09-17 10:16:54,562 : INFO : loading wv recursively from 1200features_word2vec.wv.* with mmap=None
2023-09-17 10:16:54,563 : INFO : loading vectors from 1200features_word2vec.wv.vectors.npy with mmap=None
2023-09-17 10:16:54,604 : INFO : loading syn1neg from 1200features_word2vec.syn1neg.npy with mmap=None
2023-09-17 10:16:54,628 : INFO : setting ignored attribute cum_table to None
2023-09-17 10:16:54,742 : INFO : Word2Vec lifecycle event {'fname': '1200features_word2vec', 'datetime': '2023-09-17T10:16:54.742724', 'gensim': '4.2.0', 'python': '3.7.13 (default, Mar 28 2022, 08:03:21) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'loaded'}


Review 0 of 72550
Review 1000 of 72550
Review 2000 of 72550
Review 3000 of 72550
Review 4000 of 72550
Review 5000 of 72550
Review 6000 of 72550
Review 7000 of 72550
Review 8000 of 72550
Review 9000 of 72550
Review 10000 of 72550
Review 11000 of 72550
Review 12000 of 72550
Review 13000 of 72550
Review 14000 of 72550
Review 15000 of 72550
Review 16000 of 72550
Review 17000 of 72550
Review 18000 of 72550
Review 19000 of 72550
Review 20000 of 72550
Review 21000 of 72550
Review 22000 of 72550
Review 23000 of 72550
Review 24000 of 72550
Review 25000 of 72550
Review 26000 of 72550
Review 27000 of 72550
Review 28000 of 72550
Review 29000 of 72550
Review 30000 of 72550
Review 31000 of 72550
Review 32000 of 72550
Review 33000 of 72550
Review 34000 of 72550
Review 35000 of 72550
Review 36000 of 72550
Review 37000 of 72550
Review 38000 of 72550
Review 39000 of 72550
Review 40000 of 72550
Review 41000 of 72550
Review 42000 of 72550
Review 43000 of 72550
Review 44000 of 72550
Review 45000 of 72550
R

2023-09-17 10:19:41,882 : INFO : loading Word2Vec object from 1300features_word2vec
2023-09-17 10:19:42,103 : INFO : loading wv recursively from 1300features_word2vec.wv.* with mmap=None
2023-09-17 10:19:42,104 : INFO : loading vectors from 1300features_word2vec.wv.vectors.npy with mmap=None
2023-09-17 10:19:42,133 : INFO : loading syn1neg from 1300features_word2vec.syn1neg.npy with mmap=None
2023-09-17 10:19:42,161 : INFO : setting ignored attribute cum_table to None
2023-09-17 10:19:42,277 : INFO : Word2Vec lifecycle event {'fname': '1300features_word2vec', 'datetime': '2023-09-17T10:19:42.277489', 'gensim': '4.2.0', 'python': '3.7.13 (default, Mar 28 2022, 08:03:21) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'loaded'}


Review 0 of 72550
Review 1000 of 72550
Review 2000 of 72550
Review 3000 of 72550
Review 4000 of 72550
Review 5000 of 72550
Review 6000 of 72550
Review 7000 of 72550
Review 8000 of 72550
Review 9000 of 72550
Review 10000 of 72550
Review 11000 of 72550
Review 12000 of 72550
Review 13000 of 72550
Review 14000 of 72550
Review 15000 of 72550
Review 16000 of 72550
Review 17000 of 72550
Review 18000 of 72550
Review 19000 of 72550
Review 20000 of 72550
Review 21000 of 72550
Review 22000 of 72550
Review 23000 of 72550
Review 24000 of 72550
Review 25000 of 72550
Review 26000 of 72550
Review 27000 of 72550
Review 28000 of 72550
Review 29000 of 72550
Review 30000 of 72550
Review 31000 of 72550
Review 32000 of 72550
Review 33000 of 72550
Review 34000 of 72550
Review 35000 of 72550
Review 36000 of 72550
Review 37000 of 72550
Review 38000 of 72550
Review 39000 of 72550
Review 40000 of 72550
Review 41000 of 72550
Review 42000 of 72550
Review 43000 of 72550
Review 44000 of 72550
Review 45000 of 72550
R

2023-09-17 10:22:33,916 : INFO : loading Word2Vec object from 1400features_word2vec
2023-09-17 10:22:33,965 : INFO : loading wv recursively from 1400features_word2vec.wv.* with mmap=None
2023-09-17 10:22:33,966 : INFO : loading vectors from 1400features_word2vec.wv.vectors.npy with mmap=None
2023-09-17 10:22:34,524 : INFO : loading syn1neg from 1400features_word2vec.syn1neg.npy with mmap=None
2023-09-17 10:22:34,933 : INFO : setting ignored attribute cum_table to None
2023-09-17 10:22:35,050 : INFO : Word2Vec lifecycle event {'fname': '1400features_word2vec', 'datetime': '2023-09-17T10:22:35.050348', 'gensim': '4.2.0', 'python': '3.7.13 (default, Mar 28 2022, 08:03:21) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'loaded'}


Review 0 of 72550
Review 1000 of 72550
Review 2000 of 72550
Review 3000 of 72550
Review 4000 of 72550
Review 5000 of 72550
Review 6000 of 72550
Review 7000 of 72550
Review 8000 of 72550
Review 9000 of 72550
Review 10000 of 72550
Review 11000 of 72550
Review 12000 of 72550
Review 13000 of 72550
Review 14000 of 72550
Review 15000 of 72550
Review 16000 of 72550
Review 17000 of 72550
Review 18000 of 72550
Review 19000 of 72550
Review 20000 of 72550
Review 21000 of 72550
Review 22000 of 72550
Review 23000 of 72550
Review 24000 of 72550
Review 25000 of 72550
Review 26000 of 72550
Review 27000 of 72550
Review 28000 of 72550
Review 29000 of 72550
Review 30000 of 72550
Review 31000 of 72550
Review 32000 of 72550
Review 33000 of 72550
Review 34000 of 72550
Review 35000 of 72550
Review 36000 of 72550
Review 37000 of 72550
Review 38000 of 72550
Review 39000 of 72550
Review 40000 of 72550
Review 41000 of 72550
Review 42000 of 72550
Review 43000 of 72550
Review 44000 of 72550
Review 45000 of 72550
R

In [11]:
# Load data_txt.csv without its "summary" column and store the result as
# data_w2v_0.csv, the first file the later `data_w2v_<n>.csv` loop reads (n = 0).
data = pd.read_csv("data_txt.csv").drop(columns=["summary"])
print(data)
data.to_csv('data_w2v_0.csv', index=False, encoding='utf_8_sig')

           city  latitude  longitude                 attacktype1_txt  \
0      0.181904  0.681564   0.204755                   Armed Assault   
1      0.688351  0.688223   0.106546               Bombing/Explosion   
2      0.558561  0.732945   0.204053  Facility/Infrastructure Attack   
3      0.558561  0.732914   0.204130  Facility/Infrastructure Attack   
4      0.249211  0.704869   0.158102  Facility/Infrastructure Attack   
...         ...       ...        ...                             ...   
72545  0.009873  0.477141   0.603580               Bombing/Explosion   
72546  0.142187  0.611361   0.713903  Facility/Infrastructure Attack   
72547  0.787440  0.497927   0.597584               Bombing/Explosion   
72548  0.420521  0.660566   0.675203                   Armed Assault   
72549  0.974123  0.687351   0.180508                   Assassination   

                                        targsubtype1_txt  \
0        Police Building (headquarters, station, school)   
1              

In [12]:
def fitness_func(solution, solution_idx):
    """Score one GA candidate: train LightGBM with its genes, return weighted recall.

    Parameters
    ----------
    solution : indexable of at least 7 numbers
        Genes in the order learning_rate, min_child_samples, max_depth,
        num_leaves, colsample_bytree, reg_alpha, reg_lambda. The three
        tree-shape genes (indices 1-3) are truncated to int before use.
    solution_idx :
        Position of the candidate in the population; not used here.

    Returns
    -------
    The weighted-average recall of the fitted classifier on X_test / y_test.

    Notes
    -----
    X_train, y_train, X_test, y_test and category_col are not parameters; they
    are read from module scope and must be assigned before the GA runs.
    """
    hyperparams = {
        'learning_rate': solution[0],
        'min_child_samples': int(solution[1]),
        'max_depth': int(solution[2]),
        'num_leaves': int(solution[3]),
        'colsample_bytree': solution[4],
        'reg_alpha': solution[5],
        'reg_lambda': solution[6],
    }

    # Trace every evaluated candidate (values appear in the order listed above).
    print(*hyperparams.values())

    # Fixed random_state so the same genes always yield the same score.
    classifier = lgb.LGBMClassifier(random_state=0, **hyperparams)
    classifier.fit(X_train, y_train, categorical_feature=category_col)

    predictions = classifier.predict(X_test)
    return recall_score(y_test, predictions, average='weighted')

In [13]:
import pygad

# Genetic-algorithm hyperparameter search for LightGBM, repeated for every
# data_w2v_<n>.csv file (n = 0, 100, ..., 1400). For each file, a 10-fold split is
# made, one GA is run per fold, and the best-scoring fold's hyperparameters are
# recorded. This stays a flat script on purpose: fitness_func reads X_train,
# X_test, y_train, y_test and category_col as module-level globals.
#
# NOTE(review): fitness_func scores candidates on X_test / y_test, i.e. the same
# fold that is used to pick the winner, so the recorded fitness is likely
# optimistic — confirm whether a separate validation split is wanted.

# One entry per data file, appended after all folds of that file have run.
max__ = []   # best fitness (value returned by fitness_func)
lr__ = []    # learning_rate of the best solution
mcs__ = []   # min_child_samples
md__ = []    # max_depth
nl__ = []    # num_leaves
cb__ = []    # colsample_bytree
ra__ = []    # reg_alpha
rl__ = []    # reg_lambda

# Columns cast to pandas 'category'; fitness_func also passes this list to
# LightGBM as categorical_feature.
category_col = ['attacktype1_txt', 'targsubtype1_txt', 'weapsubtype1_txt']

# Search range of each gene, in the order fitness_func indexes them.
# Loop-invariant, so it is built once instead of once per fold.
param_list = [
    {'low': 0.01, 'high': 1},   # learning_rate
    {'low': 5, 'high': 50},     # min_child_samples
    {'low': 1, 'high': 15},     # max_depth
    {'low': 5, 'high': 100},    # num_leaves
    {'low': 0.1, 'high': 1},    # colsample_bytree
    {'low': 0, 'high': 100},    # reg_alpha
    {'low': 0, 'high': 100},    # reg_lambda
]

for i in tqdm_notebook(range(0, 1500, 100)):
    name = "data_w2v_" + str(i) + '.csv'
    data = pd.read_csv(name)

    # Best result seen so far across the folds of this file.
    max_ = {'max': 0,
            'learning_rate': 0,
            'min_child_samples': 0,
            'max_depth': 0,
            'num_leaves': 0,
            'colsample_bytree': 0,
            'reg_alpha': 0,
            'reg_lambda': 0}

    data[category_col] = data[category_col].astype('category')
    X = data.drop(columns=['risk'])
    y = data['risk']
    kf = KFold(n_splits=10, shuffle=True, random_state=0)
    splits = kf.split(X, y)

    for k, (train_indices, test_indices) in enumerate(splits):
        print("第 %d 折\n" % (k + 1))
        # Module-level on purpose: fitness_func picks these up as globals.
        X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
        y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]

        # Not consumed in this cell (only fitness_func's commented-out ROC-AUC
        # line mentions y_one_hot); kept so the module-level names still exist.
        labels = [0, 1, 2, 3]
        y_one_hot = label_binarize(y_test, classes=labels)

        # Configure the genetic algorithm.
        ga_instance = pygad.GA(num_generations=4,             # number of generations
                               num_parents_mating=2,          # parents selected for mating
                               fitness_func=fitness_func,     # candidate scoring function
                               sol_per_pop=3,                 # solutions per population
                               num_genes=len(param_list),     # one gene per hyperparameter
                               gene_type=float,               # genes are floats; ints are truncated later
                               gene_space=param_list,         # per-gene search range
                               parent_selection_type='rws',   # roulette-wheel selection
                               keep_parents=1,                # parents carried to next generation
                               crossover_type='uniform',
                               crossover_probability=0.6,
                               mutation_type='random',
                               mutation_probability=0.01,
                               random_seed=0
                               )
        # Run the genetic algorithm.
        ga_instance.run()

        # Fetch the best solution of the final population. Passing the fitness
        # values PyGAD already stored avoids re-evaluating the population (and
        # therefore re-training LightGBM) just to look up the winner.
        best_solution, best_fitness, best_solution_idx = ga_instance.best_solution(
            pop_fitness=ga_instance.last_generation_fitness)
        best_learning_rate = best_solution[0]
        best_min_child_samples = int(best_solution[1])
        best_max_depth = int(best_solution[2])
        best_num_leaves = int(best_solution[3])
        best_colsample_bytree = (best_solution[4])
        best_reg_alpha = best_solution[5]
        best_reg_lambda = best_solution[6]

        # Report the best solution and its fitness for this fold.
        print('Best solution is {solution} with fitness value {fitness}'.format(solution=best_solution, fitness=best_fitness))
        print('Best learning rate is {lr}'.format(lr=best_learning_rate))
        print('Best min child samples is {mcs}'.format(mcs=best_min_child_samples))
        print('Best max depth is {md}'.format(md=best_max_depth))
        print('Best num leaves is {nl}'.format(nl=best_num_leaves))
        print('Best colsample bytree is {cb}'.format(cb=best_colsample_bytree))
        print('Best reg_alpha is {al}'.format(al=best_reg_alpha))
        print('Best reg_lambda is {la}'.format(la=best_reg_lambda))

        # Keep this fold's result only if it beats every earlier fold of the file.
        if best_fitness > max_['max']:
            max_['max'] = best_fitness
            max_['learning_rate'] = best_learning_rate
            max_['min_child_samples'] = best_min_child_samples
            max_['max_depth'] = best_max_depth
            max_['num_leaves'] = best_num_leaves
            max_['colsample_bytree'] = best_colsample_bytree
            max_['reg_alpha'] = best_reg_alpha
            max_['reg_lambda'] = best_reg_lambda

    # Record the winning fold of this file.
    max__.append(max_['max'])
    lr__.append(max_['learning_rate'])
    mcs__.append(max_['min_child_samples'])
    md__.append(max_['max_depth'])
    nl__.append(max_['num_leaves'])
    cb__.append(max_['colsample_bytree'])
    ra__.append(max_['reg_alpha'])
    rl__.append(max_['reg_lambda'])

print(max__, lr__, mcs__, md__, nl__, cb__, ra__, rl__)

  0%|          | 0/15 [00:00<?, ?it/s]

第 1 折

0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.08032569761590806 8 1 84 0.8003410758548655 87.00121482468191 97.8618342232764
0.5533253688880515 48 9 80 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.5533253688880515 37 9 80 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.5533253688880515 37 9 80 0.7616746199103354 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
Best solution is [ 0.55332537 37.18352149  9.43868727 80.21387862  0.76167462 64.58941131
 43.75872113] with fitness value 0.6610613370089593
Best learning rate is 0.5533253688880515
B

0.892855270774259 37 6 56 0.5760054277776141 56.80445610939323 92.5596638292661
0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 56 0.5760054277776141 56.80445610939323 92.5596638292661
0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 56 0.7616746199103354 56.80445610939323 92.5596638292661
0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
Best solution is [ 0.89285527 48.36482422  6.36818126 56.76390238  0.76167462 56.80445611
 92.55966383] with fitness value 0.6526533425223984
Best learning rate is 0.892855270774259
Best min child samples is 48
Best max depth is 6
Best num leaves is 56
Best colsample bytree is 0.7616746199103354
Best reg_alpha is 56.80445610939323
Best reg_lambda is 92.5596638292661
第 9 折

0.5533253688880515 37 9 56 0.4812893194050143 64.5894113

0.08032569761590806 8 1 84 0.8003410758548655 87.00121482468191 97.8618342232764
0.892855270774259 37 6 56 0.5760054277776141 56.80445610939323 92.5596638292661
0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 56 0.5760054277776141 56.80445610939323 92.5596638292661
0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 56 0.7616746199103354 56.80445610939323 92.5596638292661
0.892855270774259 48 6 56 0.5760054277776141 64.58941130666561 92.5596638292661
0.892855270774259 48 6 56 0.5760054277776141 64.58941130666561 92.5596638292661
Best solution is [ 0.55332537 37.18352149  9.43868727 56.76390238  0.48128932 64.58941131
 43.75872113] with fitness value 0.7345279117849759
Best learning rate is 0.5533253688880515
Best min child samples is 37
Best max depth is 9
Best num leaves is 56
Best colsample bytree is 0.4812893194050143
Best reg_alpha is 64.58941130666561
Best reg_lambda is 43

0.892855270774259 37 6 56 0.5760054277776141 56.80445610939323 92.5596638292661
0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 37 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 37 6 80 0.7616746199103354 56.80445610939323 92.5596638292661
0.892855270774259 37 6 56 0.7616746199103354 64.58941130666561 92.5596638292661
0.892855270774259 37 6 56 0.7616746199103354 64.58941130666561 92.5596638292661
Best solution is [ 0.55332537 37.18352149  9.43868727 56.76390238  0.48128932 64.58941131
 43.75872113] with fitness value 0.7230875258442453
Best learning rate is 0.5533253688880515
Best min child samples is 37
Best max depth is 9
Best num leaves is 56
Best colsample bytree is 0.4812893194050143
Best reg_alpha is 64.58941130666561
Best reg_lambda is 43.75872112626925
第 3 折

0.5533253688880515 37 9 56 0.4812893194050143 64.589411306

0.5533253688880515 48 9 80 0.4812893194050143 64.58941130666561 43.75872112626925
0.5533253688880515 37 9 80 0.5760054277776141 64.58941130666561 92.5596638292661
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.5533253688880515 37 9 80 0.7616746199103354 64.58941130666561 92.5596638292661
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
Best solution is [ 0.55332537 37.18352149  9.43868727 80.21387862  0.76167462 64.58941131
 92.55966383] with fitness value 0.7376981392143349
Best learning rate is 0.5533253688880515
Best min child samples is 37
Best max depth is 9
Best num leaves is 80
Best colsample bytree is 0.7616746199103354
Best reg_alpha is 64.58941130666561
Best reg_lambda is 92.5596638292661
第 10 折

0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.80445610

0.892855270774259 48 6 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 56 0.4812893194050143 64.58941130666561 43.75872112626925
Best solution is [ 0.55332537 37.18352149  9.43868727 56.76390238  0.48128932 64.58941131
 43.75872113] with fitness value 0.7251550654720882
Best learning rate is 0.5533253688880515
Best min child samples is 37
Best max depth is 9
Best num leaves is 56
Best colsample bytree is 0.4812893194050143
Best reg_alpha is 64.58941130666561
Best reg_lambda is 43.75872112626925
第 7 折

0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.08032569761590806 8 1 84 0.8003410758548655 87.00121482468191 97.8618342232764
0.892855270774259 37 6 56 0.5760054277776141 56.80445610939323 92.5596638292661
0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 56 0.5760054277776141 56.8044561

0.892855270774259 37 6 80 0.7616746199103354 56.80445610939323 92.5596638292661
0.892855270774259 37 6 56 0.5760054277776141 64.58941130666561 92.5596638292661
0.892855270774259 37 6 56 0.5760054277776141 64.58941130666561 92.5596638292661
Best solution is [ 0.55332537 37.18352149  9.43868727 56.76390238  0.48128932 64.58941131
 43.75872113] with fitness value 0.7243280496209511
Best learning rate is 0.5533253688880515
Best min child samples is 37
Best max depth is 9
Best num leaves is 56
Best colsample bytree is 0.4812893194050143
Best reg_alpha is 64.58941130666561
Best reg_lambda is 43.75872112626925
第 4 折

0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.08032569761590806 8 1 84 0.8003410758548655 87.00121482468191 97.8618342232764
0.892855270774259 37 6 56 0.5760054277776141 56.80445610939323 92.5596638292661
0.5533253688880515 37 9 56 0.4812893194050143 64.5894113066

0.892855270774259 48 6 56 0.7616746199103354 56.80445610939323 92.5596638292661
0.892855270774259 48 6 56 0.5760054277776141 64.58941130666561 92.5596638292661
0.892855270774259 48 6 56 0.5760054277776141 64.58941130666561 92.5596638292661
Best solution is [ 0.55332537 37.18352149  9.43868727 56.76390238  0.48128932 64.58941131
 43.75872113] with fitness value 0.7277739490006891
Best learning rate is 0.5533253688880515
Best min child samples is 37
Best max depth is 9
Best num leaves is 56
Best colsample bytree is 0.4812893194050143
Best reg_alpha is 64.58941130666561
Best reg_lambda is 43.75872112626925
第 1 折

0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.08032569761590806 8 1 84 0.8003410758548655 87.00121482468191 97.8618342232764
0.892855270774259 37 6 56 0.5760054277776141 56.80445610939323 92.5596638292661
0.5533253688880515 37 9 56 0.4812893194050143 64.5894113066

Best solution is [ 0.55332537 37.18352149  9.43868727 56.76390238  0.48128932 64.58941131
 43.75872113] with fitness value 0.7381116471399035
Best learning rate is 0.5533253688880515
Best min child samples is 37
Best max depth is 9
Best num leaves is 56
Best colsample bytree is 0.4812893194050143
Best reg_alpha is 64.58941130666561
Best reg_lambda is 43.75872112626925
第 8 折

0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.08032569761590806 8 1 84 0.8003410758548655 87.00121482468191 97.8618342232764
0.892855270774259 37 6 56 0.5760054277776141 56.80445610939323 92.5596638292661
0.892855270774259 48 6 56 0.4812893194050143 56.80445610939323 43.75872112626925
0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 56 0.7616746199103354 56.80445610939323 43.75872112626925
0.892855270774259 48 6 56 0.4812893194050143 64.5894113

Best solution is [ 0.55332537 37.18352149  9.43868727 56.76390238  0.48128932 64.58941131
 43.75872113] with fitness value 0.7411440385940731
Best learning rate is 0.5533253688880515
Best min child samples is 37
Best max depth is 9
Best num leaves is 56
Best colsample bytree is 0.4812893194050143
Best reg_alpha is 64.58941130666561
Best reg_lambda is 43.75872112626925
第 5 折

0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.08032569761590806 8 1 84 0.8003410758548655 87.00121482468191 97.8618342232764
0.892855270774259 37 6 56 0.5760054277776141 56.80445610939323 92.5596638292661
0.892855270774259 48 6 56 0.4812893194050143 56.80445610939323 43.75872112626925
0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 56 0.7616746199103354 56.80445610939323 43.75872112626925
0.892855270774259 48 6 56 0.7616746199103354 64.5894113

0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.08032569761590806 8 1 84 0.8003410758548655 87.00121482468191 97.8618342232764
0.892855270774259 37 6 56 0.5760054277776141 56.80445610939323 92.5596638292661
0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 56 0.5760054277776141 56.80445610939323 92.5596638292661
0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 56 0.7616746199103354 56.80445610939323 92.5596638292661
0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
Best solution is [ 0.89285527 48.36482422  6.36818126 56.76390238  0.76167462 56.80445611
 92.55966383] with fitness value 0.7346657477601654
Best learning rate is 0.892855270774259
Best mi

0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.08032569761590806 8 1 84 0.8003410758548655 87.00121482468191 97.8618342232764
0.892855270774259 37 6 56 0.5760054277776141 56.80445610939323 92.5596638292661
0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 56 0.5760054277776141 56.80445610939323 92.5596638292661
0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 56 0.7616746199103354 56.80445610939323 92.5596638292661
0.892855270774259 48 6 56 0.7616746199103354 64.58941130666561 92.5596638292661
0.892855270774259 48 6 56 0.7616746199103354 64.58941130666561 92.5596638292661
Best solution is [ 0.55332537 37.18352149  9.43868727 56.76390238  0.48128932 64.58941131
 43.75872113] with fitness value 0.7341144038594073
Best learning rate is 0.5533253688880515
Best min c

0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 37 6 80 0.7616746199103354 56.80445610939323 92.5596638292661
0.892855270774259 37 6 56 0.7616746199103354 64.58941130666561 92.5596638292661
0.892855270774259 37 6 56 0.7616746199103354 64.58941130666561 92.5596638292661
Best solution is [ 0.55332537 37.18352149  9.43868727 56.76390238  0.48128932 64.58941131
 43.75872113] with fitness value 0.7342522398345969
Best learning rate is 0.5533253688880515
Best min child samples is 37
Best max depth is 9
Best num leaves is 56
Best colsample bytree is 0.4812893194050143
Best reg_alpha is 64.58941130666561
Best reg_lambda is 43.75872112626925
第 7 折

0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.08032569761590806 8 1 84 0.8003410758548655 87.00121482468191 97.8618342232764
0.892855270774259 37 6 56 0.5760054277776141 56.804456109

0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 56 0.7616746199103354 56.80445610939323 43.75872112626925
0.892855270774259 48 6 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 56 0.4812893194050143 64.58941130666561 43.75872112626925
Best solution is [ 0.55332537 37.18352149  9.43868727 56.76390238  0.48128932 64.58941131
 43.75872113] with fitness value 0.729014472777395
Best learning rate is 0.5533253688880515
Best min child samples is 37
Best max depth is 9
Best num leaves is 56
Best colsample bytree is 0.4812893194050143
Best reg_alpha is 64.58941130666561
Best reg_lambda is 43.75872112626925
第 4 折

0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.08032569761590806 8 1 84 0.8003410758548655 87.00121482468191 97.8618342232764
0.892855270774259 37 6 56 0.5760054277776141 56.8044561

0.892855270774259 48 6 56 0.7616746199103354 56.80445610939323 92.5596638292661
0.892855270774259 48 6 56 0.7616746199103354 64.58941130666561 92.5596638292661
0.892855270774259 48 6 56 0.7616746199103354 64.58941130666561 92.5596638292661
Best solution is [ 0.55332537 37.18352149  9.43868727 56.76390238  0.48128932 64.58941131
 43.75872113] with fitness value 0.7286009648518263
Best learning rate is 0.5533253688880515
Best min child samples is 37
Best max depth is 9
Best num leaves is 56
Best colsample bytree is 0.4812893194050143
Best reg_alpha is 64.58941130666561
Best reg_lambda is 43.75872112626925
第 1 折

0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.08032569761590806 8 1 84 0.8003410758548655 87.00121482468191 97.8618342232764
0.892855270774259 37 6 56 0.5760054277776141 56.80445610939323 92.5596638292661
0.5533253688880515 37 9 56 0.4812893194050143 64.5894113066

0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.08032569761590806 8 1 84 0.8003410758548655 87.00121482468191 97.8618342232764
0.5533253688880515 48 9 80 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.5533253688880515 48 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.5533253688880515 37 9 56 0.7616746199103354 64.58941130666561 43.75872112626925
0.5533253688880515 37 9 80 0.4812893194050143 56.80445610939323 43.75872112626925
0.5533253688880515 37 9 80 0.4812893194050143 56.80445610939323 43.75872112626925
Best solution is [ 0.89285527 48.36482422  6.36818126 80.21387862  0.57600543 56.80445611
 92.55966383] with fitness value 0.7363197794624396
Best learning rate is 0.892855270774259
Best 

0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.08032569761590806 8 1 84 0.8003410758548655 87.00121482468191 97.8618342232764
0.892855270774259 37 6 56 0.5760054277776141 56.80445610939323 92.5596638292661
0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 37 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 37 6 80 0.7616746199103354 56.80445610939323 92.5596638292661
0.892855270774259 37 6 56 0.7616746199103354 64.58941130666561 92.5596638292661
0.892855270774259 37 6 56 0.7616746199103354 64.58941130666561 92.5596638292661
Best solution is [ 0.55332537 37.18352149  9.43868727 56.76390238  0.48128932 64.58941131
 43.75872113] with fitness value 0.7400413507925568
Best learning rate is 0.5533253688880515
Best min c

0.892855270774259 37 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 37 6 80 0.7616746199103354 56.80445610939323 92.5596638292661
0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
Best solution is [ 0.89285527 37.18352149  6.36818126 80.21387862  0.76167462 56.80445611
 92.55966383] with fitness value 0.732460372157133
Best learning rate is 0.892855270774259
Best min child samples is 37
Best max depth is 6
Best num leaves is 80
Best colsample bytree is 0.7616746199103354
Best reg_alpha is 56.80445610939323
Best reg_lambda is 92.5596638292661
第 3 折

0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.08032569761590806 8 1 84 0.8003410758548655 87.00121482

0.892855270774259 48 6 56 0.5760054277776141 56.80445610939323 92.5596638292661
0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 56 0.7616746199103354 56.80445610939323 92.5596638292661
0.892855270774259 48 6 56 0.7616746199103354 64.58941130666561 92.5596638292661
0.892855270774259 48 6 56 0.7616746199103354 64.58941130666561 92.5596638292661
Best solution is [ 0.55332537 37.18352149  9.43868727 56.76390238  0.48128932 64.58941131
 43.75872113] with fitness value 0.7381116471399035
Best learning rate is 0.5533253688880515
Best min child samples is 37
Best max depth is 9
Best num leaves is 56
Best colsample bytree is 0.4812893194050143
Best reg_alpha is 64.58941130666561
Best reg_lambda is 43.75872112626925
第 10 折

0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.08032569761590806 8 1 84 0.8003410758548655 87.00121482

0.892855270774259 48 6 56 0.4812893194050143 56.80445610939323 43.75872112626925
0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 56 0.7616746199103354 56.80445610939323 43.75872112626925
0.892855270774259 48 6 56 0.7616746199103354 64.58941130666561 43.75872112626925
0.892855270774259 48 6 56 0.7616746199103354 64.58941130666561 43.75872112626925
Best solution is [ 0.55332537 37.18352149  9.43868727 56.76390238  0.48128932 64.58941131
 43.75872113] with fitness value 0.7319090282563749
Best learning rate is 0.5533253688880515
Best min child samples is 37
Best max depth is 9
Best num leaves is 56
Best colsample bytree is 0.4812893194050143
Best reg_alpha is 64.58941130666561
Best reg_lambda is 43.75872112626925
第 7 折

0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.08032569761590806 8 1 84 0.8003410758548655 87.00121

0.5533253688880515 37 9 80 0.4812893194050143 56.80445610939323 43.75872112626925
0.5533253688880515 37 9 80 0.4812893194050143 56.80445610939323 43.75872112626925
Best solution is [ 0.55332537 37.18352149  9.43868727 80.21387862  0.48128932 56.80445611
 43.75872113] with fitness value 0.7341144038594073
Best learning rate is 0.5533253688880515
Best min child samples is 37
Best max depth is 9
Best num leaves is 80
Best colsample bytree is 0.4812893194050143
Best reg_alpha is 56.80445610939323
Best reg_lambda is 43.75872112626925
第 4 折

0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 48 6 80 0.5760054277776141 56.80445610939323 92.5596638292661
0.08032569761590806 8 1 84 0.8003410758548655 87.00121482468191 97.8618342232764
0.892855270774259 37 6 56 0.5760054277776141 56.80445610939323 92.5596638292661
0.5533253688880515 37 9 56 0.4812893194050143 64.58941130666561 43.75872112626925
0.892855270774259 37 6 80 0.5760054277776141 56.80445

In [14]:
# Display max__, a list that is built in an earlier cell (not visible here).
# NOTE(review): presumably the best fitness value found by the hyper-parameter
# search for each dataset - confirm against the cell that fills it.
print(max__)

[0.6657477601654032, 0.7345279117849759, 0.7386629910406616, 0.7372846312887664, 0.7356305995864921, 0.7399035148173674, 0.7411440385940731, 0.7415575465196417, 0.7421088904203997, 0.7433494141971054, 0.7411440385940731, 0.7415575465196417, 0.7425223983459683, 0.7397656788421778, 0.7422467263955892]


In [15]:
# Cross-validated evaluation of the tuned LightGBM classifier on every
# "data_w2v_<i>.csv" dataset (i = 0, 100, ..., 1400).
# For each dataset a 10-fold CV is run with the matching entry of the
# hyper-parameter lists (lr__, mcs__, md__, nl__, cb__, ra__, rl__), and the
# fold-averaged metrics are appended to roc_, acc_, f1_, recall_, precision_.
# NOTE(review): the hyper-parameter lists are defined outside this cell;
# presumably they come from the preceding tuning run - confirm.
ii = []          # the values of i that were evaluated, in order
roc_ = []        # per-dataset mean of the fold AUCs
acc_ = []        # per-dataset mean of the fold accuracies
f1_ = []         # per-dataset mean of the fold weighted F1 scores
recall_ = []     # per-dataset mean of the fold weighted recalls
precision_ = []  # per-dataset mean of the fold weighted precisions

# Settings that do not change between datasets or folds.
category_col = ['attacktype1_txt', 'targsubtype1_txt', 'weapsubtype1_txt']
labels = [0, 1, 2, 3]
# With an integer random_state every kf.split() call reproduces the same
# shuffle, so a single splitter can be shared by all datasets.
kf = KFold(n_splits=10, shuffle=True, random_state=0)

# `count` is the position of the dataset; it indexes the hyper-parameter lists.
for count, i in enumerate(tqdm_notebook(range(0, 1500, 100))):
    data = pd.read_csv("data_w2v_" + str(i) + ".csv")
    data[category_col] = data[category_col].astype('category')
    X = data.drop(columns=['risk'])
    y = data['risk']

    # Per-fold results for the current dataset.
    lgb_roc_scores = []
    lgb_acc_scores = []
    lgb_f1_scores = []
    lgb_recall_scores = []
    lgb_precision_scores = []
    lgb_feature_importances = pd.DataFrame(index=None)
    lgb_feature_importances['features'] = X.columns

    for k, (train_indices, test_indices) in enumerate(kf.split(X, y)):
        print("第 %d 折\n" % (k + 1))
        X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
        y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]

        # One-hot view of the true labels, used only for the AUC below.
        y_one_hot = label_binarize(y_test, classes=labels)

        LGB = lgb.LGBMClassifier(
            random_state=0,
            learning_rate=lr__[count],
            min_child_samples=mcs__[count],
            max_depth=md__[count],
            num_leaves=nl__[count],
            colsample_bytree=cb__[count],
            reg_alpha=ra__[count],
            reg_lambda=rl__[count],
        )
        LGB.fit(X_train, y_train, categorical_feature=category_col)
        lgb_feature_importances[f'fold_{k+1}'] = LGB.feature_importances_
        y_pred_prob = LGB.predict_proba(X_test)
        y_pred = LGB.predict(X_test)

        acc = accuracy_score(y_test, y_pred)
        # NOTE(review): y_one_hot is a label-indicator matrix, so scikit-learn
        # appears to score each class column separately (one-vs-rest) and
        # average them by support; `multi_class="ovo"` looks to be ignored on
        # this path. The call is kept as-is so reported numbers do not change.
        roc_auc = roc_auc_score(y_one_hot, y_pred_prob, multi_class="ovo", average='weighted')
        precision = precision_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')
        f1 = f1_score(y_test, y_pred, average='weighted')

        print(f" Fold {k + 1} | " )
        print(f" AUC_ROC: { roc_auc * 100}%" )
        print(f" ACC: { acc * 100}%" )
        print(f" F1: { f1 * 100}%" )
        print(f" RECALL: { recall * 100}%" )
        print(f" PRECISION: { precision * 100}%" )

        lgb_f1_scores.append(f1)
        lgb_roc_scores.append(roc_auc)
        lgb_acc_scores.append(acc)
        lgb_recall_scores.append(recall)
        lgb_precision_scores.append(precision)

    # Summarise the ten folds of this dataset.
    ii.append(i)
    roc_.append(np.mean(lgb_roc_scores))
    acc_.append(np.mean(lgb_acc_scores))
    f1_.append(np.mean(lgb_f1_scores))
    recall_.append(np.mean(lgb_recall_scores))
    precision_.append(np.mean(lgb_precision_scores))

print(ii)
print(f'average roc score: {roc_}')
print(f'average acc_score: {acc_}')
print(f'average f1_score: {f1_}')
print(f'average recall_score: {recall_}')
print(f'average precision_score: {precision_}')

  0%|          | 0/15 [00:00<?, ?it/s]

第 1 折

 Fold 1 | 
 AUC_ROC: 85.8203385658022%
 ACC: 66.10613370089592%
 F1: 65.38838699531145%
 RECALL: 66.10613370089592%
 PRECISION: 65.2823793900078%
第 2 折

 Fold 2 | 
 AUC_ROC: 85.68169589008116%
 ACC: 65.16884906960718%
 F1: 64.51924163171235%
 RECALL: 65.16884906960718%
 PRECISION: 64.56169792549115%
第 3 折

 Fold 3 | 
 AUC_ROC: 86.02516852805526%
 ACC: 65.26533425223984%
 F1: 64.62498723706126%
 RECALL: 65.26533425223984%
 PRECISION: 64.74274909751783%
第 4 折

 Fold 4 | 
 AUC_ROC: 86.14994448721146%
 ACC: 65.73397656788423%
 F1: 65.12605096813078%
 RECALL: 65.73397656788423%
 PRECISION: 65.2392289353695%
第 5 折

 Fold 5 | 
 AUC_ROC: 86.4614149902275%
 ACC: 65.70640937284631%
 F1: 64.8898762391123%
 RECALL: 65.70640937284631%
 PRECISION: 64.8517698610826%
第 6 折

 Fold 6 | 
 AUC_ROC: 86.00557159858087%
 ACC: 65.48587181254307%
 F1: 64.81125495871167%
 RECALL: 65.48587181254307%
 PRECISION: 64.79430502842466%
第 7 折

 Fold 7 | 
 AUC_ROC: 86.22343578054782%
 ACC: 65.78911095796003%
 F1:

 Fold 4 | 
 AUC_ROC: 91.00702589021222%
 ACC: 73.43900758097863%
 F1: 73.28765063551253%
 RECALL: 73.43900758097863%
 PRECISION: 73.50139945028232%
第 5 折

 Fold 5 | 
 AUC_ROC: 91.28188150919668%
 ACC: 73.7698139214335%
 F1: 73.54051022099675%
 RECALL: 73.7698139214335%
 PRECISION: 73.73054121314289%
第 6 折

 Fold 6 | 
 AUC_ROC: 90.6015121165004%
 ACC: 72.83252929014473%
 F1: 72.6471404542872%
 RECALL: 72.83252929014473%
 PRECISION: 72.87422052251544%
第 7 折

 Fold 7 | 
 AUC_ROC: 91.03657676305012%
 ACC: 73.81116471399035%
 F1: 73.67294557532799%
 RECALL: 73.81116471399035%
 PRECISION: 73.8360840447231%
第 8 折

 Fold 8 | 
 AUC_ROC: 90.80552890648417%
 ACC: 73.43900758097863%
 F1: 73.349399573692%
 RECALL: 73.43900758097863%
 PRECISION: 73.5543193809164%
第 9 折

 Fold 9 | 
 AUC_ROC: 90.83720676649449%
 ACC: 73.99035148173674%
 F1: 73.89205680317056%
 RECALL: 73.99035148173674%
 PRECISION: 74.06184330186498%
第 10 折

 Fold 10 | 
 AUC_ROC: 90.6836130469816%
 ACC: 73.37008959338387%
 F1: 73.2060

 Fold 8 | 
 AUC_ROC: 90.79272282905609%
 ACC: 73.17711922811854%
 F1: 73.09198757852954%
 RECALL: 73.17711922811854%
 PRECISION: 73.29029979107459%
第 9 折

 Fold 9 | 
 AUC_ROC: 90.9883390786636%
 ACC: 73.92143349414198%
 F1: 73.82391754224363%
 RECALL: 73.92143349414198%
 PRECISION: 73.98056414971282%
第 10 折

 Fold 10 | 
 AUC_ROC: 90.60932171765657%
 ACC: 72.84631288766367%
 F1: 72.74515635303713%
 RECALL: 72.84631288766367%
 PRECISION: 72.9194932816738%
第 1 折

 Fold 1 | 
 AUC_ROC: 90.57451807504381%
 ACC: 73.32873880082703%
 F1: 73.23370028355993%
 RECALL: 73.32873880082703%
 PRECISION: 73.29715885187831%
第 2 折

 Fold 2 | 
 AUC_ROC: 90.63621984704449%
 ACC: 73.23225361819435%
 F1: 73.05014331616758%
 RECALL: 73.23225361819435%
 PRECISION: 73.22745398315121%
第 3 折

 Fold 3 | 
 AUC_ROC: 90.9284822874917%
 ACC: 73.12198483804274%
 F1: 72.97827444516778%
 RECALL: 73.12198483804274%
 PRECISION: 73.20288965564728%
第 4 折

 Fold 4 | 
 AUC_ROC: 91.0768035398007%
 ACC: 74.15575465196417%
 F1: 74

In [2]:
# Hard-coded metric values, 15 entries per list.
# NOTE(review): presumably pasted from the output of the cross-validation cell
# (one entry per data_w2v_<i>.csv dataset, i = 0, 100, ..., 1400) so the figure
# can be redrawn on a fresh kernel without retraining - confirm.
roc_ = [0.8601453118149568, 0.9036372590303688, 0.9028844543145015, 0.9059153770725032, 0.9060927651566617, 0.9077229124729762, 0.9073841282963929, 0.9076252938109131, 0.9076244764742624, 0.9072730739441435, 0.9085491927651018, 0.9076492354003749, 0.9079716861060863, 0.9084268464195251, 0.908650653036359]
acc_ = [0.6567470709855272, 0.727250172294969, 0.7262991040661613, 0.7314403859407304, 0.7305444521019986, 0.7329014472777395, 0.7345830461750518, 0.7331495520330806, 0.7329565816678153, 0.733039283252929, 0.7336044107512061, 0.732736044107512, 0.7351895244658856, 0.7345692625775329, 0.7364576154376292]
f1_ = [0.6501488484228044, 0.7257252377056265, 0.7250328796341726, 0.7300061605827395, 0.729133062262375, 0.7313210255316146, 0.7331956358334728, 0.7318455377113424, 0.731546423953336, 0.7316499289295509, 0.7321959774822207, 0.7314289685550757, 0.7338253535139541, 0.7331692307050912, 0.7352228559725098]
# recall_ holds exactly the same values as acc_, so the two series coincide
# wherever both are plotted.
recall_ = [0.6567470709855272, 0.727250172294969, 0.7262991040661613, 0.7314403859407304, 0.7305444521019986, 0.7329014472777395, 0.7345830461750518, 0.7331495520330806, 0.7329565816678153, 0.733039283252929, 0.7336044107512061, 0.732736044107512, 0.7351895244658856, 0.7345692625775329, 0.7364576154376292]
precision_ = [0.6506413968205838, 0.7275136009908985, 0.7260205714481961, 0.7315394731728115, 0.7306028502121369, 0.7332631492455459, 0.7349614230013063, 0.7332392783637702, 0.7332273591361792, 0.733106174993847, 0.733771236899483, 0.7330213207539312, 0.7354319819806909, 0.7347359627653037, 0.7368277028701786]

In [3]:
#导入库
import matplotlib.pyplot as plt
%matplotlib
#设定画布。dpi越大图越清晰，绘图时间越久
fig=plt.figure(figsize=(10, 4), dpi=200)
#导入数据
x = range(0, 1500, 100)
x_ = range(0, 1500, 200)
y_1 = [0.86, 0.87, 0.88, 0.89, 0.90]
y_2 = [0.65, 0.66, 0.67, 0.68, 0.69, 0.70, 0.71, 0.72, 0.73]
y1 = roc_
y2 = acc_
y3 = f1_
y4 = recall_
y5 = precision_
#绘图命令
plt.subplot(1,2,1) # 子图的行、列、索引
plt.plot(x, y1, lw=1, ls='-', c='b', alpha=0.5, label='AUC')

plt.legend()  
plt.xticks(x_)
plt.yticks(y_1)
plt.title("Word2vec")
plt.xlabel("Feature dimension") 
plt.ylabel("Performance")

plt.subplot(1,2,2) # 子图的行、列、索引
plt.plot(x, y2, lw=1, ls='-', c='r', alpha=0.5, label='Accuracy')
plt.plot(x, y3, lw=1, ls='-', c='g', alpha=0.5, label='F1-score')
plt.plot(x, y4, lw=1, ls='-', c='k', alpha=0.5, label='Sensitivity')
plt.plot(x, y5, lw=1, ls='-', c='m', alpha=0.5, label='Precision')

plt.legend()  
plt.xticks(x_)
plt.yticks(y_2)
plt.title("Word2vec")
plt.xlabel("Feature dimension") 
plt.ylabel("Performance")

plt.show()

Using matplotlib backend: TkAgg


In [4]:
# Print the highest value reached by each metric, one per line, in the order
# AUC, accuracy, F1, recall, precision.
for metric_values in (roc_, acc_, f1_, recall_, precision_):
    print(max(metric_values))

0.908650653036359
0.7364576154376292
0.7352228559725098
0.7364576154376292
0.7368277028701786
