In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.metrics import root_mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_pickle('data_ml.pickle')

In [3]:
# display every column and row
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
df.head()

Unnamed: 0,latitude,longitude,eventid,iyear,imonth,iday,extended,country,region,specificity,vicinity,crit1,crit2,crit3,doubtterr,multiple,success,suicide,attacktype1,targtype1,targsubtype1,natlty1,guncertain1,individual,nperpcap,claimed,weaptype1,weapsubtype1,nkill,nkillus,nkillter,nwound,nwoundus,nwoundte,property,propextent,ishostkid,ransom,INT_LOG,INT_IDEO,INT_MISC,INT_ANY
4,37.005105,-89.176269,197001010002,1970,1,1,0,217,1,1.0,0.0,1,1,1,0.0,0.0,1,0,2,3,22.0,217.0,0.0,0,0.127628,0.0,5,5.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
5,43.4685,-89.744299,197001050001,1970,1,1,0,217,1,1.0,0.0,1,1,0,1.0,0.0,0,0,3,4,27.0,217.0,0.0,0,0.127628,0.0,6,16.0,0.0,0.045981,0.508058,0.0,0.038944,0.107163,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
6,-34.891151,-56.187214,197001020001,1970,1,2,0,218,3,1.0,0.0,1,1,1,0.0,0.0,0,0,1,3,25.0,218.0,0.0,0,0.127628,0.0,5,2.0,0.0,0.045981,0.508058,0.0,0.038944,0.107163,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
7,37.791927,-122.225906,197001020002,1970,1,2,0,217,1,1.0,0.0,1,1,1,1.0,0.0,1,0,3,21,107.0,217.0,0.0,0,0.127628,0.0,6,16.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
8,43.076592,-89.412488,197001020003,1970,1,2,0,217,1,1.0,0.0,1,1,1,0.0,0.0,1,0,7,4,28.0,217.0,0.0,0,1.0,1.0,8,19.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


# Predict Success (CLassification)

#### Use Gradient Boosting to find feature importance

In [4]:
# columns
success = df[['success']]
cols = df.drop(df[['success']], axis=1)
# split dataset into earlier and later years
X_train, X_test, y_train, y_test = train_test_split(cols, success, train_size=0.8, shuffle=False, random_state=0)
print(X_train.shape)
print(X_test.shape)

(144640, 41)
(36160, 41)


In [5]:
gbc = GradientBoostingClassifier().fit(X_train, y_train.values.ravel())

In [6]:
y_train_pred = gbc.predict(X_train)
y_test_pred = gbc.predict(X_test)

In [7]:
# Define function to calculate feature importance
def feature_importance(model):
    importance = model.feature_importances_
    importance = pd.DataFrame(importance, columns=["importance"])
    variable = X_train.columns
    variable = pd.DataFrame(variable, columns=["variable"])
    # feature importance dataframe
    f_imp = pd.concat([variable,importance], axis=1)
    f_imp = f_imp.sort_values(by="importance", ascending=False)
    f_imp = f_imp.reset_index(drop=True)
    # get proportion of importance compared to highest variable
    top = f_imp['importance'][0]
    f_imp['proportion'] = f_imp['importance']/top
    return f_imp

In [8]:
importance = feature_importance(gbc)
importance.head(16)

Unnamed: 0,variable,importance,proportion
0,nkill,0.31346,1.0
1,attacktype1,0.22497,0.717702
2,property,0.201331,0.642287
3,ishostkid,0.08676,0.276781
4,nwound,0.077139,0.24609
5,targtype1,0.022763,0.07262
6,eventid,0.020855,0.066532
7,nkillter,0.010976,0.035015
8,weapsubtype1,0.00801,0.025552
9,latitude,0.00735,0.023447


- We won't be using nkill as a predictor since we will be predicting nkill using the predicted success.  
- The variables `'nwound'`, `'nkillter'` are related to `'nkill'` and would only be known after the success or failure of the terrorist attack and hence cannot be used to predict success.  
- `'eventid'` is not a variable relevant to the terrorist attack and will not be used.  
- `'latitude'` and `'longitude'` will be replaced by `'country'` since they are related.

In [9]:
# select variables
select = ['attacktype1','property','ishostkid','targtype1','targsubtype1','weapsubtype1','weaptype1', 'country']

### Model 1: Logistic Regression

In [10]:
success = df[['success']]   # response
var = df[select]            # predictor
# split dataset into earlier and later years
X_train, X_test, y_train, y_test = train_test_split(var, success, train_size=0.8, shuffle=False, random_state=0)
# standardise features
scalar = StandardScaler()
X_train = scalar.fit_transform(X_train)
X_test = scalar.transform(X_test)
print(X_train.shape)
print(X_test.shape)


(144640, 8)
(36160, 8)


In [11]:
lrc = LogisticRegression().fit(X_train, y_train.values.ravel())

In [12]:
y_train_pred = lrc.predict(X_train)
y_test_pred = lrc.predict(X_test)

#### Accuracy score

In [13]:
print("Logistic Regression Score")
print("f1 score:", f1_score(y_test, y_test_pred, average='weighted'))
print("roc auc :", roc_auc_score(y_test, y_test_pred))

Logistic Regression Score
f1 score: 0.748311194374463
roc auc : 0.5186245220799799


### Model 2: Multi-Layer Perceptron Classifier

In [14]:
# split dataset into earlier and later years
X_train, X_test, y_train, y_test = train_test_split(var, success, train_size=0.8, shuffle=False, random_state=0)
mlpc = MLPClassifier().fit(X_train, y_train.values.ravel())

In [15]:
y_train_pred = mlpc.predict(X_train)
y_test_pred = mlpc.predict(X_test)

#### Accuracy score

In [16]:
print("Multi-Layer Perceptron Score")
print("f1 score:", f1_score(y_test, y_test_pred, average='weighted'))
print("roc auc :", roc_auc_score(y_test, y_test_pred))

Multi-Layer Perceptron Score
f1 score: 0.8392161549650556
roc auc : 0.6646513196408674


### Model 3: Gradient Boosting

In [17]:
# train data again with selected features
gbc = GradientBoostingClassifier().fit(X_train, y_train.values.ravel())

In [18]:
y_train_pred = gbc.predict(X_train)
y_test_pred = gbc.predict(X_test)

#### Accuracy Score

In [19]:
print("Gradient Boosting Score")
print("f1 score:", f1_score(y_test, y_test_pred, average='weighted'))
print("roc auc :", roc_auc_score(y_test, y_test_pred))

Gradient Boosting Score
f1 score: 0.8562334645574373
roc auc : 0.695139205702856


Gradient Boosting Model predicts success with the highest accuracy

In [20]:
# combine predicted test and train as a new column
gb_success_pred_train = pd.DataFrame(y_train_pred, index=X_train.index,columns=["success_pred"])
gb_success_pred_test = pd.DataFrame(y_test_pred, index=X_test.index,columns=["success_pred"])
gb_success_pred = pd.concat([gb_success_pred_train, gb_success_pred_test], axis=0)

# Predicting nkill (Using success)

In [21]:
nkill = df['nkill']         # target           
success = gb_success_pred   # predictor --> previously predicted
variables = df.drop(columns=['nkill'])  # predictor
variables['success'] = success  # replace actual success data with predicted
# split dataset into earlier and later years
X_train, X_test, y_train, y_test = train_test_split(variables, nkill, train_size=0.8, shuffle=False, random_state=0)
print(X_train.shape)
print(X_test.shape)

(144640, 41)
(36160, 41)


## Get important variables

In [22]:
gbr = GradientBoostingRegressor().fit(X_train, y_train.values.ravel())

In [23]:
importance = feature_importance(gbr)
importance.head(15)

Unnamed: 0,variable,importance,proportion
0,nwound,0.375366,1.0
1,nkillus,0.278917,0.743053
2,nkillter,0.191137,0.509202
3,latitude,0.028796,0.076715
4,weapsubtype1,0.01928,0.051364
5,region,0.014866,0.039604
6,ishostkid,0.01479,0.039402
7,targtype1,0.011369,0.030287
8,attacktype1,0.00964,0.025681
9,eventid,0.00683,0.018194


- The variables `'nwound'`, `'nkillter'`, `'nkillus'` and `'nwoundte'` are related to `'nkill'` and would only be known after the success or failure of the terrorist attack and hence cannot be used to predict success.  
- `'eventid'` is not a variable relevant to the terrorist attack and will not be used.  
- `'latitude'` will be replaced by `'region'` since they are related.

In [24]:
# select variables
select = ['weapsubtype1','region','ishostkid','attacktype1','targtype1', 'natlty1', 'targsubtype1']

## Predicting without success

In [26]:
nkill = df['nkill']       # target           
variables = df[select]    # predictor
# split dataset into earlier and later years
X_train, X_test, y_train, y_test = train_test_split(variables, nkill, train_size=0.8, shuffle=False, random_state=0)
print(X_train.shape)
print(X_test.shape)

(144640, 7)
(36160, 7)


### Model 1: Random Forest Regressor

In [27]:
rfr = RandomForestRegressor(max_depth=4, random_state=0).fit(X_train, y_train.values.ravel())

#### Accuracy Score

In [28]:
y_test_pred = rfr.predict(X_test)
print(f"RMSE\t:", root_mean_squared_error(y_test, y_test_pred))
print(f"MAE\t:", mean_absolute_error(y_test, y_test_pred))

RMSE	: 9.46121824699501
MAE	: 2.934179956834248


### Model 2: Gradient Boosting Regressor

In [29]:
gbr = GradientBoostingRegressor().fit(X_train, y_train.values.ravel())

### Accuracy Score

In [30]:
y_test_pred = gbr.predict(X_test)
print("Gradient Boosting Score")
print(f"RMSE\t:", root_mean_squared_error(y_test, y_test_pred))
print(f"MAE\t:", mean_absolute_error(y_test, y_test_pred))

Gradient Boosting Score
RMSE	: 9.466076213998495
MAE	: 3.044484173799251


### Model 3: Multi-Layer Perceptron Regressor

In [31]:
mlpr = MLPRegressor(max_iter=500).fit(X_train, y_train.values.ravel())

#### Accuracy Score

In [32]:
y_test_pred = mlpr.predict(X_test)
print("Multi-Layer Perceptron Score")
print(f"RMSE\t:", root_mean_squared_error(y_test, y_test_pred))
print(f"MAE\t:", mean_absolute_error(y_test, y_test_pred))

Multi-Layer Perceptron Score
RMSE	: 9.577652837955348
MAE	: 3.0230113767793463


Random Forest Regressor model has the highest accuracy

In [33]:
variables = pd.concat([variables,success],axis=1)
# split dataset into earlier and later years
X_train, X_test, y_train, y_test = train_test_split(variables, nkill, train_size=0.8, shuffle=False)
print(X_train.shape)
print(X_test.shape)

(144640, 8)
(36160, 8)


## Predicting with success

In [34]:
rfr = RandomForestRegressor().fit(X_train, y_train.values.ravel())

In [36]:
y_test_pred = rfr.predict(X_test)
print("Ranfom Forest Score")
print(f"RMSE\t:", root_mean_squared_error(y_test, y_test_pred))
print(f"MAE\t:", mean_absolute_error(y_test, y_test_pred))

Gradient Boosting Score
RMSE	: 9.409707970721563
MAE	: 2.9879960827213354


Adding success as a predictor slightly improves accuracy of prediction of nkill 