In [8]:
import pandas as pd
import numpy as np
import seaborn as sns
import calendar
import missingno as msno
from matplotlib import rcParams
from matplotlib import pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, roc_auc_score, make_scorer
from sklearn.metrics import confusion_matrix, precision_recall_curve, roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD

In [2]:
# Load the raw workbook; the path is relative to the notebook's working directory.
RAW_WORKBOOK = "globalterrorismdb_0522dist.xlsx"
df = pd.read_excel(RAW_WORKBOOK)

In [4]:
# Preview the raw frame; pandas abbreviates the display, so only the first and
# last rows (and a subset of columns) are rendered.
df

Unnamed: 0,eventid,iyear,imonth,iday,approxdate,extended,resolution,country,country_txt,region,...,addnotes,scite1,scite2,scite3,dbsource,INT_LOG,INT_IDEO,INT_MISC,INT_ANY,related
0,197000000001,1970,7,2,,0,NaT,58,Dominican Republic,2,...,,,,,PGIS,0,0,0,0,
1,197000000002,1970,0,0,,0,NaT,130,Mexico,1,...,,,,,PGIS,0,1,1,1,
2,197001000001,1970,1,0,,0,NaT,160,Philippines,5,...,,,,,PGIS,-9,-9,1,1,
3,197001000002,1970,1,0,,0,NaT,78,Greece,8,...,,,,,PGIS,-9,-9,1,1,
4,197001000003,1970,1,0,,0,NaT,101,Japan,4,...,,,,,PGIS,-9,-9,1,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209701,202012310015,2020,12,31,2020-12-31 00:00:00,0,NaT,228,Yemen,10,...,,"""Al Houthi militia escalated in Hays and targe...",,,START Primary Collection,0,0,0,0,
209702,202012310016,2020,12,31,2020-12-31 00:00:00,0,NaT,228,Yemen,10,...,,"""Al Houthi militia escalated in Hays and targe...",,,START Primary Collection,0,0,0,0,
209703,202012310017,2020,12,31,,0,NaT,75,Germany,8,...,,"""Far-left arson attack suspected on German asy...","""Fire of Bundeswehr vehicles in Leipzig, proba...","""Anarchist Antifa Take Credit for Arson Attack...",START Primary Collection,-9,-9,0,-9,
209704,202012310018,2020,12,31,,0,NaT,4,Afghanistan,6,...,,"""Civil society activist and tribal elder kille...","""Terrorism Digest: 1-2 Jan 21,"" BBC Monitoring...",,START Primary Collection,-9,-9,0,-9,


In [3]:
# Narrow the raw frame to the date parts, country, target type, attack type
# and group name used by the rest of the notebook.
KEPT_COLUMNS = [
    'iyear', 'imonth', 'iday',
    'country_txt', 'targtype1_txt', 'attacktype1_txt', 'gname',
]
df1 = df.loc[:, KEPT_COLUMNS]
df1

Unnamed: 0,iyear,imonth,iday,country_txt,targtype1_txt,attacktype1_txt,gname
0,1970,7,2,Dominican Republic,Private Citizens & Property,Assassination,MANO-D
1,1970,0,0,Mexico,Government (Diplomatic),Hostage Taking (Kidnapping),23rd of September Communist League
2,1970,1,0,Philippines,Journalists & Media,Assassination,Unknown
3,1970,1,0,Greece,Government (Diplomatic),Bombing/Explosion,Unknown
4,1970,1,0,Japan,Government (Diplomatic),Facility/Infrastructure Attack,Unknown
...,...,...,...,...,...,...,...
209701,2020,12,31,Yemen,Private Citizens & Property,Bombing/Explosion,Houthi extremists (Ansar Allah)
209702,2020,12,31,Yemen,Private Citizens & Property,Bombing/Explosion,Houthi extremists (Ansar Allah)
209703,2020,12,31,Germany,Military,Facility/Infrastructure Attack,Left-wing extremists
209704,2020,12,31,Afghanistan,Private Citizens & Property,Armed Assault,Unknown


In [4]:
# Give the selected columns readable names.
# The result is assigned back instead of using inplace=True: the cell then reads
# as a plain "input -> output" step and does not silently mutate an object that
# an earlier cell already displayed. Keys missing from the frame are ignored by
# rename, so re-running this cell is harmless.
COLUMN_RENAMES = {
    'iyear': 'year',
    'imonth': 'month',
    'iday': 'day',
    'country_txt': 'country',
    'targtype1_txt': 'targetType',
    'attacktype1_txt': 'attackType',
    'gname': 'terroristGroup',
}
df1 = df1.rename(columns=COLUMN_RENAMES)

In [5]:
# Per-column flag: True when the column contains at least one NaN.
# Note: this only detects NaN. Placeholder values visible in the preview
# (month/day of 0, group "Unknown") are not reported as missing here.
df1.isna().any()

year              False
month             False
day               False
country           False
targetType        False
attackType        False
terroristGroup    False
dtype: bool

In [7]:
# Display the frame again to check the renamed column headers.
df1

Unnamed: 0,year,month,day,country,targetType,attackType,terroristGroup
0,1970,7,2,Dominican Republic,Private Citizens & Property,Assassination,MANO-D
1,1970,0,0,Mexico,Government (Diplomatic),Hostage Taking (Kidnapping),23rd of September Communist League
2,1970,1,0,Philippines,Journalists & Media,Assassination,Unknown
3,1970,1,0,Greece,Government (Diplomatic),Bombing/Explosion,Unknown
4,1970,1,0,Japan,Government (Diplomatic),Facility/Infrastructure Attack,Unknown
...,...,...,...,...,...,...,...
209701,2020,12,31,Yemen,Private Citizens & Property,Bombing/Explosion,Houthi extremists (Ansar Allah)
209702,2020,12,31,Yemen,Private Citizens & Property,Bombing/Explosion,Houthi extremists (Ansar Allah)
209703,2020,12,31,Germany,Military,Facility/Infrastructure Attack,Left-wing extremists
209704,2020,12,31,Afghanistan,Private Citizens & Property,Armed Assault,Unknown


In [13]:
# Persist the trimmed frame as CSV; index=False keeps the row index out of the file.
df1.to_csv('dataset.csv', index=False)

In [9]:
# Reload the saved CSV; the modelling cells below read from `dataset`.
dataset = pd.read_csv('dataset.csv')

### 1. Predict Terrorist Group

In [9]:
# Task 1: predict the terrorist group from country, target type and attack type.
GROUP_TASK_FEATURES = ['country', 'targetType', 'attackType']
X = dataset.loc[:, GROUP_TASK_FEATURES]
y = dataset.loc[:, 'terroristGroup']

In [10]:
# One-hot encode the categorical feature columns so the model gets numeric inputs.
X = pd.get_dummies(data=X)

In [18]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [19]:
# Fit a 100-tree random forest on the training split and score it on the
# held-out split.
# n_jobs=-1 trains the trees on all available cores; random_state stays fixed so
# the run remains reproducible.
clf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Plain accuracy is dominated by the most frequent labels, so balanced accuracy
# (mean per-class recall) and macro F1 (unweighted mean of per-class F1) are
# reported next to it.
accuracy = accuracy_score(y_test, y_pred)
balanced_acc = balanced_accuracy_score(y_test, y_pred)
# zero_division=0 makes the handling of labels with no predicted/true samples
# explicit: they contribute 0 to the macro average, the same value the default
# produces, but without emitting a warning for each such label.
f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)

print("Accuracy: ", accuracy)
print("Balanced Accuracy: ", balanced_acc)
print("F1 score: ", f1)



Accuracy:  0.6137096774193549
Balanced Accuracy:  0.10478038293441813
F1 score:  0.08507298700622724


### 2. Predict Target Type

In [10]:
# Task 2: predict the target type from country, attack type and terrorist group.
TARGET_TASK_FEATURES = ['country', 'attackType', 'terroristGroup']
X = dataset.loc[:, TARGET_TASK_FEATURES]
y = dataset.loc[:, 'targetType']

In [11]:
# One-hot encode the categorical features, then hold out 20% of the rows for
# evaluation with a fixed seed so the split is repeatable.
X = pd.get_dummies(data=X)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [22]:
# Fit a 100-tree random forest on the training split and score it on the
# held-out split.
# n_jobs=-1 trains the trees on all available cores; random_state stays fixed so
# the run remains reproducible.
clf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Plain accuracy is dominated by the most frequent labels, so balanced accuracy
# (mean per-class recall) and macro F1 (unweighted mean of per-class F1) are
# reported next to it.
accuracy = accuracy_score(y_test, y_pred)
balanced_acc = balanced_accuracy_score(y_test, y_pred)
# zero_division=0 makes the handling of labels with no predicted/true samples
# explicit: they contribute 0 to the macro average, the same value the default
# produces, but without emitting a warning for each such label.
f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)

print("Accuracy: ", accuracy)
print("Balanced Accuracy: ", balanced_acc)
print("F1 score: ", f1)

Accuracy:  0.4415959252971138
Balanced Accuracy:  0.26293980039819337
F1 score:  0.28736790017273733
