In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import re
import nltk
from nltk.book import *
from nltk.corpus import stopwords
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
import lightgbm as lgb
from sklearn.preprocessing import label_binarize
from sklearn.tree import DecisionTreeRegressor
from tqdm import tqdm_notebook
import joblib
from collections import Counter
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from collections import defaultdict
from gensim import corpora
from gensim import models
import warnings
warnings.filterwarnings('ignore')

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [2]:
data_1 = pd.read_csv("globalterrorismdb_0522dist.csv")
print("data_1_shape: ",data_1.shape)

data_2 = pd.read_csv("gtd1993_0221dist.csv")
print("data_2_shape: ", data_2.shape)

data_1_shape:  (209706, 135)
data_2_shape:  (748, 135)


In [3]:
data = pd.concat([data_1, data_2], axis=0, ignore_index=True)
print("data_shape: ", data.shape)

data_shape:  (210454, 135)


In [4]:
data.to_csv('data_concat.csv', index=False, encoding='utf_8_sig')

In [5]:
data = pd.read_csv("data_concat.csv")
print("data_concat: ", data.shape)

data_concat:  (210454, 135)


In [6]:
null_columns = [ col for col in data.columns if data[col].isnull().sum() / data.shape[0] > 0.7 ]
print(null_columns)

['approxdate', 'resolution', 'alternative', 'alternative_txt', 'attacktype2', 'attacktype2_txt', 'attacktype3', 'attacktype3_txt', 'targtype2', 'targtype2_txt', 'targsubtype2', 'targsubtype2_txt', 'corp2', 'target2', 'natlty2', 'natlty2_txt', 'targtype3', 'targtype3_txt', 'targsubtype3', 'targsubtype3_txt', 'corp3', 'target3', 'natlty3', 'natlty3_txt', 'gsubname', 'gname2', 'gsubname2', 'gname3', 'gsubname3', 'motive', 'guncertain2', 'guncertain3', 'claimmode', 'claimmode_txt', 'claim2', 'claimmode2', 'claimmode2_txt', 'claim3', 'claimmode3', 'claimmode3_txt', 'compclaim', 'weaptype2', 'weaptype2_txt', 'weapsubtype2', 'weapsubtype2_txt', 'weaptype3', 'weaptype3_txt', 'weapsubtype3', 'weapsubtype3_txt', 'weaptype4', 'weaptype4_txt', 'weapsubtype4', 'weapsubtype4_txt', 'propvalue', 'nhostkid', 'nhostkidus', 'nhours', 'ndays', 'divert', 'kidhijcountry', 'ransomamt', 'ransomamtus', 'ransompaid', 'ransompaidus', 'ransomnote', 'hostkidoutcome', 'hostkidoutcome_txt', 'nreleased', 'addnotes', 

In [7]:
data.drop(columns=null_columns, axis=1, inplace=True)
data.shape

(210454, 64)

In [8]:
data.to_csv('data_0.7nulloff.csv', index=False, encoding='utf_8_sig')

In [9]:
data = pd.read_csv("data_0.7nulloff.csv")
print("data_0.7nulloff: ", data.shape)
print(data)

data_0.7nulloff:  (210454, 64)
             eventid  iyear  imonth  iday  extended  country  \
0       197000000001   1970       7     2         0       58   
1       197000000002   1970       0     0         0      130   
2       197001000001   1970       1     0         0      160   
3       197001000002   1970       1     0         0       78   
4       197001000003   1970       1     0         0      101   
...              ...    ...     ...   ...       ...      ...   
210449  199312280002   1993      12    28         0      159   
210450  199312300001   1993      12    30         0      603   
210451  199312300002   1993      12    30         0      603   
210452  199312300003   1993      12    30         0      603   
210453  199312300004   1993      12    30         0      183   

               country_txt  region                   region_txt  \
0       Dominican Republic       2  Central America & Caribbean   
1                   Mexico       1                North America   

In [10]:
drop_col = ["eventid", "country", "region", "attacktype1", "targtype1", "targsubtype1", 
            "natlty1", "scite1", "scite2", "dbsource", "weaptype1", "weapsubtype1"]

In [11]:
data.drop(columns=drop_col, axis=1, inplace=True)
data.shape

(210454, 52)

In [12]:
d_row = []
for row in tqdm_notebook(range(data.shape[0])):
    if (data.crit1[row] == 0) | (data.crit2[row] == 0) | (data.crit3[row] == 0):
        d_row.append(row)
data.drop(labels=d_row, axis=0, inplace=True)
data.reset_index(inplace=True, drop=True)
print(data)

  0%|          | 0/210454 [00:00<?, ?it/s]

        iyear  imonth  iday  extended         country_txt  \
0        1970       7     2         0  Dominican Republic   
1        1970       0     0         0              Mexico   
2        1970       1     0         0         Philippines   
3        1970       1     0         0              Greece   
4        1970       1     0         0               Japan   
...       ...     ...   ...       ...                 ...   
178437   1993      12    28         0                Peru   
178438   1993      12    30         0      United Kingdom   
178439   1993      12    30         0      United Kingdom   
178440   1993      12    30         0      United Kingdom   
178441   1993      12    30         0        South Africa   

                         region_txt                provstate           city  \
0       Central America & Caribbean                 National  Santo Domingo   
1                     North America                  Federal    Mexico city   
2                    Southeast

In [13]:
data.drop(columns=["crit1", "crit2", "crit3"], axis=1, inplace=True)
data.shape
data.to_csv('data_nocrit.csv', index=False, encoding='utf_8_sig')

In [2]:
data = pd.read_csv("data_nocrit.csv")
print("data_nocrit.csv: ", data.shape)
print(data)

data_nocrit.csv:  (178442, 49)
        iyear  imonth  iday  extended         country_txt  \
0        1970       7     2         0  Dominican Republic   
1        1970       0     0         0              Mexico   
2        1970       1     0         0         Philippines   
3        1970       1     0         0              Greece   
4        1970       1     0         0               Japan   
...       ...     ...   ...       ...                 ...   
178437   1993      12    28         0                Peru   
178438   1993      12    30         0      United Kingdom   
178439   1993      12    30         0      United Kingdom   
178440   1993      12    30         0      United Kingdom   
178441   1993      12    30         0        South Africa   

                         region_txt                provstate           city  \
0       Central America & Caribbean                 National  Santo Domingo   
1                     North America                  Federal    Mexico city   

In [3]:
i = j = k = l = 0
for row in tqdm_notebook(range(data.shape[0])):
    if (pd.isnull(data.nkill[row]) == True) and (pd.isnull(data.nkillter[row]) == False):
        data.nkill[row] = data.nkillter[row]
        i = i + 1
    if (pd.isnull(data.nwound[row]) == True) and (pd.isnull(data.nwoundte[row]) == False):
        data.nwound[row] = data.nwoundte[row]
        j = j + 1
    if (pd.isnull(data.nkill[row]) == False) and (pd.isnull(data.nkillter[row]) == True):
        data.nkillter[row] = 0
        k= k + 1
    if (pd.isnull(data.nwound[row]) == False) and (pd.isnull(data.nwoundte[row]) == True):
        data.nwoundte[row] = 0
        l = l + 1
data.shape
print(i, j , k, l)

  0%|          | 0/178442 [00:00<?, ?it/s]

3982 6956 50990 50174


In [4]:
d_row = []
for row in tqdm_notebook(range(data.shape[0])):
    if (pd.isnull(data.nkill[row]) == True) | (pd.isnull(data.nkillter[row]) == True) | (pd.isnull(data.nwound[row]) == True) | (pd.isnull(data.nwoundte[row]) == True) | (pd.isnull(data.property[row]) == True):
        d_row.append(row)
data.drop(labels=d_row, axis=0, inplace=True)
data.reset_index(inplace=True, drop=True)
print(data)

  0%|          | 0/178442 [00:00<?, ?it/s]

        iyear  imonth  iday  extended         country_txt  \
0        1970       7     2         0  Dominican Republic   
1        1970       0     0         0              Mexico   
2        1970       1     0         0         Philippines   
3        1970       1     1         0       United States   
4        1970       1     2         0             Uruguay   
...       ...     ...   ...       ...                 ...   
169846   1993      12    28         0                Peru   
169847   1993      12    30         0      United Kingdom   
169848   1993      12    30         0      United Kingdom   
169849   1993      12    30         0      United Kingdom   
169850   1993      12    30         0        South Africa   

                         region_txt                provstate           city  \
0       Central America & Caribbean                 National  Santo Domingo   
1                     North America                  Federal    Mexico city   
2                    Southeast

In [5]:
d_row = []
for row in tqdm_notebook(range(data.shape[0])):
    data.nkill[row] = data.nkill[row] - data.nkillter[row]
    data.nwound[row] = data.nwound[row] - data.nwoundte[row]

    if (data.nkill[row] < 0) | (data.nwound[row] < 0):
        d_row.append(row)
data.drop(labels=d_row, axis=0, inplace=True)
data.reset_index(inplace=True, drop=True)
    
data.shape

  0%|          | 0/169851 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
d_row = []
for row in tqdm_notebook(range(data.shape[0])):
    if (data.property[row] == -9):
        d_row.append(row)
data.drop(labels=d_row, axis=0, inplace=True)
data.reset_index(inplace=True, drop=True)
print(data)

In [None]:
data.insert(loc=len(data.columns), column="risk", value=None)
print(data)

In [20]:
n = m = l = z = 0
for row in tqdm_notebook(range(data.shape[0])):
    if (data.nkill[row] == 0) and (data.nwound[row] == 0) and (data.property[row] == 0):
        data.risk[row] = 0
        n = n + 1
    elif (data.nkill[row] == 0) and (data.nwound[row] == 0) and (data.property[row] == 1):
        data.risk[row] = 1
        m = m + 1
    elif (data.nkill[row] == 0) and (data.nwound[row] > 0) and (data.property[row] == 0):
        data.risk[row] = 2
        l = l + 1
    elif (data.nkill[row] > 0) and (data.nwound[row] == 0) and (data.property[row] == 0):
        data.risk[row] = 2
        l = l + 1
    elif (data.nkill[row] > 0) and (data.nwound[row] > 0) and (data.property[row] == 0):
        data.risk[row] = 2
        l = l + 1
    elif (data.nkill[row] > 0) and (data.nwound[row] > 0) and (data.property[row] > 0): 
        data.risk[row] = 3
        z = z + 1
    elif (data.nkill[row] == 0) and (data.nwound[row] > 0) and (data.property[row] > 0): 
        data.risk[row] = 3
        z = z + 1
    elif (data.nkill[row] > 0) and (data.nwound[row] == 0) and (data.property[row] > 0): 
        data.risk[row] = 3
        z = z + 1
print(n, m, l, z, n + m + l + z)

  0%|          | 0/147675 [00:00<?, ?it/s]

23588 44291 40027 39769 147675


In [21]:
data.drop(columns=["nkill", "nkillus", "nkillter", "nwound", "nwoundus", "nwoundte"], axis=1, inplace=True)
data.shape

(147675, 44)

In [22]:
data.to_csv('data_risk.csv', index=False, encoding='utf_8_sig')

In [23]:
data = pd.read_csv("data_risk.csv")
print("data_risk.csv: ", data.shape)
print(data)

data_risk.csv:  (147675, 44)
        iyear  imonth  iday  extended         country_txt  \
0        1970       7     2         0  Dominican Republic   
1        1970       0     0         0              Mexico   
2        1970       1     0         0         Philippines   
3        1970       1     1         0       United States   
4        1970       1     2         0             Uruguay   
...       ...     ...   ...       ...                 ...   
147670   1993      12    28         0                Peru   
147671   1993      12    30         0      United Kingdom   
147672   1993      12    30         0      United Kingdom   
147673   1993      12    30         0      United Kingdom   
147674   1993      12    30         0        South Africa   

                         region_txt                provstate           city  \
0       Central America & Caribbean                 National  Santo Domingo   
1                     North America                  Federal    Mexico city   
2

In [24]:
data.drop(columns=["property", "propextent", "propextent_txt"], axis=1, inplace=True)
data.shape

(147675, 41)

In [25]:
data.drop(columns=["INT_LOG", "INT_IDEO", "INT_MISC", "INT_ANY"], axis=1, inplace=True)
data.shape

(147675, 37)

In [26]:
data.drop(columns=["weapdetail", "corp1", "target1"], axis=1, inplace=True)
data.shape

(147675, 34)

In [27]:
d_row = []
for row in tqdm_notebook(range(data.shape[0])):
    if (pd.isnull(data.summary[row]) == True):
        d_row.append(row)
data.drop(labels=d_row, axis=0, inplace=True)
data.reset_index(inplace=True, drop=True)

  0%|          | 0/147675 [00:00<?, ?it/s]

In [28]:
d_row = []
for row in tqdm_notebook(range(data.shape[0])):
    if (pd.isnull(data.weapsubtype1_txt[row]) == True):
        d_row.append(row)
data.drop(labels=d_row, axis=0, inplace=True)
data.reset_index(inplace=True, drop=True)

  0%|          | 0/98213 [00:00<?, ?it/s]

In [29]:
d_row = []
for row in tqdm_notebook(range(data.shape[0])):
    if (pd.isnull(data.targsubtype1_txt[row]) == True):
        d_row.append(row)
data.drop(labels=d_row, axis=0, inplace=True)
data.reset_index(inplace=True, drop=True)

  0%|          | 0/90280 [00:00<?, ?it/s]

In [30]:
d_row = []
for row in tqdm_notebook(range(data.shape[0])):
    if (pd.isnull(data.latitude[row]) == True):
        d_row.append(row)
data.drop(labels=d_row, axis=0, inplace=True)
data.reset_index(inplace=True, drop=True)

  0%|          | 0/82531 [00:00<?, ?it/s]

In [31]:
d_row = []
for row in tqdm_notebook(range(data.shape[0])):
    if (pd.isnull(data.city[row]) == True):
        d_row.append(row)
data.drop(labels=d_row, axis=0, inplace=True)
data.reset_index(inplace=True, drop=True)

  0%|          | 0/82077 [00:00<?, ?it/s]

In [32]:
d_row = []
for row in tqdm_notebook(range(data.shape[0])):
    if (pd.isnull(data.natlty1_txt[row]) == True):
        d_row.append(row)
data.drop(labels=d_row, axis=0, inplace=True)
data.reset_index(inplace=True, drop=True)

  0%|          | 0/81853 [00:00<?, ?it/s]

In [33]:
d_row = []
for row in tqdm_notebook(range(data.shape[0])):
    if (pd.isnull(data.ishostkid[row]) == True):
        d_row.append(row)
data.drop(labels=d_row, axis=0, inplace=True)
data.reset_index(inplace=True, drop=True)

  0%|          | 0/81707 [00:00<?, ?it/s]

In [34]:
d_row = []
for row in tqdm_notebook(range(data.shape[0])):
    if (pd.isnull(data.ishostkid[row]) == True):
        d_row.append(row)
data.drop(labels=d_row, axis=0, inplace=True)
data.reset_index(inplace=True, drop=True)

  0%|          | 0/81577 [00:00<?, ?it/s]

In [35]:
data.isnull().sum().sort_values(ascending=False)

ransom              73719
location            41997
propcomment         39419
nperps               8382
nperpcap             1820
guncertain1           191
multiple                1
claimed                 1
targsubtype1_txt        0
natlty1_txt             0
gname                   0
iyear                   0
individual              0
attacktype1_txt         0
weaptype1_txt           0
weapsubtype1_txt        0
ishostkid               0
targtype1_txt           0
suicide                 0
imonth                  0
success                 0
doubtterr               0
summary                 0
vicinity                0
specificity             0
longitude               0
latitude                0
city                    0
provstate               0
region_txt              0
country_txt             0
extended                0
iday                    0
risk                    0
dtype: int64

In [36]:
data.drop(columns=["ransom", "location", "propcomment", "nperpcap", "guncertain1", "claimed"], axis=1, inplace=True)
data.shape

(81577, 28)

In [37]:
d_row = []
for row in tqdm_notebook(range(data.shape[0])):
    if (pd.isnull(data.nperps[row]) == True):
        d_row.append(row)
data.drop(labels=d_row, axis=0, inplace=True)
data.reset_index(inplace=True, drop=True)

  0%|          | 0/81577 [00:00<?, ?it/s]

In [38]:
d_row = []
for row in tqdm_notebook(range(data.shape[0])):
    if (pd.isnull(data.multiple[row]) == True):
        d_row.append(row)
data.drop(labels=d_row, axis=0, inplace=True)
data.reset_index(inplace=True, drop=True)

  0%|          | 0/73195 [00:00<?, ?it/s]

In [39]:
data.isnull().sum().sort_values(ascending=False)

iyear               0
imonth              0
ishostkid           0
weapsubtype1_txt    0
weaptype1_txt       0
nperps              0
individual          0
gname               0
natlty1_txt         0
targsubtype1_txt    0
targtype1_txt       0
attacktype1_txt     0
suicide             0
success             0
multiple            0
doubtterr           0
summary             0
vicinity            0
specificity         0
longitude           0
latitude            0
city                0
provstate           0
region_txt          0
country_txt         0
extended            0
iday                0
risk                0
dtype: int64

In [40]:
data.shape

(73194, 28)

In [41]:
data.to_csv('data_final.csv', index=False, encoding='utf_8_sig')