In [73]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn import linear_model, metrics, preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import make_scorer

NOTE: we converted the values in INJURED and FATALITIES to 0 and 1 using bins (1 means there was at least one, 0 means none).

# Functions that will be used

In [74]:
def load_dataset(file_name):
    """Read a CSV file into a DataFrame, discarding the saved index column.

    Parameters
    ----------
    file_name : str or file-like
        Anything accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The file contents without the "Unnamed: 0" column, if that column
        is present.
    """
    df = pd.read_csv(file_name)
    # errors="ignore" keeps the loader usable on files written without an
    # index column instead of failing with a KeyError.
    return df.drop(columns=["Unnamed: 0"], errors="ignore")

In [75]:
def split_df_to_x_y(df, target_column):
    """Separate a DataFrame into its feature columns and its target column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the features and the target.
    target_column : str
        Name of the column to use as the target.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is ``df`` restricted to every other column
        (original order kept) and ``y`` is ``df[target_column]``.
    """
    target = df[target_column]
    feature_names = list(df.columns)
    # Discard the first entry matching the target name; the rest stay in order.
    del feature_names[feature_names.index(target_column)]
    return df[feature_names], target

In [4]:
def split_to_train_and_test(X, y, test_ratio, rand_state):
    """Split features and target into train and test partitions.

    Delegates to ``train_test_split`` with ``test_size=test_ratio`` and
    ``random_state=rand_state``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    partitions = train_test_split(
        X,
        y,
        test_size=test_ratio,
        random_state=rand_state,
    )
    X_train, X_test, y_train, y_test = partitions
    return X_train, X_test, y_train, y_test
    
    

In [5]:
def scale_features(X_train, scale_type):
    """Fit a scaler on the training features and return it with the scaled data.

    Parameters
    ----------
    X_train : array-like
        Training features used to fit the scaler.
    scale_type : str
        ``'minmax'`` selects ``MinMaxScaler(feature_range=(0, 1))``; any other
        value selects ``StandardScaler``.

    Returns
    -------
    tuple
        ``(scaler, X_train_scaled)``: the fitted scaler and the output of
        ``scaler.fit_transform(X_train)``.
    """
    if scale_type == 'minmax':
        scaler = MinMaxScaler(feature_range=(0, 1))
    else:
        # Every value other than 'minmax' falls back to standardisation.
        scaler = StandardScaler()

    # fit_transform already yields the scaled training data, so no second
    # transform pass over X_train is needed.
    X_train_scaled = scaler.fit_transform(X_train)
    return scaler, X_train_scaled


In [6]:
def scale_test_features(X_test, scaler):
    """Apply an already-fitted scaler to the test features.

    Returns whatever ``scaler.transform(X_test)`` returns.
    """
    X_test_scaled = scaler.transform(X_test)
    return X_test_scaled

In [7]:
def train_model(X_train, y_train):
    """Fit a default-configured ``LogisticRegression`` on the training data.

    Returns the value produced by ``LogisticRegression().fit(X_train, y_train)``.
    """
    classifier = LogisticRegression()
    return classifier.fit(X_train, y_train)

In [9]:
def predict(classifier, X_test, y_test):
    """Run the classifier on the test features and tabulate the outcome.

    Parameters
    ----------
    classifier : object
        Any object exposing ``predict(X_test)``.
    X_test : array-like
        Features to predict on.
    y_test : array-like or pandas.Series
        True labels, placed next to the predictions for comparison.

    Returns
    -------
    tuple
        ``(comparison, predicted_only)``: a frame with "Actual" and
        "Predicted" columns, and a frame with only the "Predicted" column.
    """
    predictions = classifier.predict(X_test)
    predicted_only = pd.DataFrame({"Predicted": predictions})
    comparison = pd.DataFrame({"Actual": y_test, "Predicted": predictions})
    return comparison, predicted_only

In [31]:
def evaluate_performance(y_test, y_predicted):
    """Return the F1 score of the predictions against the true labels.

    Thin wrapper around ``f1_score(y_test, y_predicted)`` with its default
    arguments.
    """
    return f1_score(y_test, y_predicted)


# Logistic Regression -Flow: 
## Question? will there be Fatalities? Yes/No

At first, we wanted to know whether we could predict fatalities in a terror attack using logistic regression.

In [32]:
# Load the modelling dataset (load_dataset also drops the saved "Unnamed: 0" index column)
df= load_dataset('df_to_model.csv')

In [33]:
df.head(5)

Unnamed: 0,COUNTRY,CITY,FATALITIES,INJURED,TARGET_TYPE,ATTACK_TYPE,SUCCESSFUL_ATTACK,HOSTAGES,RANSOM,PROPERTY_DAMAGE,SUICIDE_ATTACK,PART_OF_MULTIPLE_INCIDENT,TYPE_OF_WEAPON_LIST,SUB_TYPE_OF_WEAPON,MONTH,YEAR
0,32,11429,0,0,143,3,1,0,0,1,0,0,3,12,12,2019
1,76,2512,0,1,379,2,1,0,0,0,0,0,1,10,12,2019
2,166,8676,2,0,182,0,1,0,0,1,0,0,2,27,12,2019
3,80,2501,0,0,379,2,1,0,0,1,0,0,1,26,12,2019
4,136,11964,0,0,379,2,1,0,0,0,0,0,1,6,12,2019


In [34]:
# Binarise FATALITIES: label 0 for a count of 0, label 1 for any count above 0.
# The upper edge is open-ended (np.inf) so arbitrarily large counts are
# labelled 1 rather than falling outside the bins and becoming NaN.
new_fatalities=pd.cut(df.FATALITIES,bins=[-1,0,np.inf],labels=[0,1])

In [35]:
# Overwrite the raw fatality counts with the 0/1 labels held in new_fatalities
df["FATALITIES"] = new_fatalities

In [36]:
df.FATALITIES.describe()

count     114056
unique         2
top            0
freq       57121
Name: FATALITIES, dtype: int64

In [37]:
df.head(5)

Unnamed: 0,COUNTRY,CITY,FATALITIES,INJURED,TARGET_TYPE,ATTACK_TYPE,SUCCESSFUL_ATTACK,HOSTAGES,RANSOM,PROPERTY_DAMAGE,SUICIDE_ATTACK,PART_OF_MULTIPLE_INCIDENT,TYPE_OF_WEAPON_LIST,SUB_TYPE_OF_WEAPON,MONTH,YEAR
0,32,11429,0,0,143,3,1,0,0,1,0,0,3,12,12,2019
1,76,2512,0,1,379,2,1,0,0,0,0,0,1,10,12,2019
2,166,8676,1,0,182,0,1,0,0,1,0,0,2,27,12,2019
3,80,2501,0,0,379,2,1,0,0,1,0,0,1,26,12,2019
4,136,11964,0,0,379,2,1,0,0,0,0,0,1,6,12,2019


#### StandardScaler usage
We want to see the results with StandardScaler.

In [38]:
# Target: the binarised FATALITIES column; every other column is a feature.
target_column = 'FATALITIES'
X, y = split_df_to_x_y(df, target_column)

# Hold out 30% of the rows; the fixed random state keeps the split repeatable.
test_ratio = 0.3
rand_state = 11
X_train, X_test, y_train, y_test = split_to_train_and_test(X, y, test_ratio, rand_state)

# Fit the scaler on the training rows only, then reuse it on the test rows.
scale_type = 'standard'
standard_scaler, X_train_standard_scaled = scale_features(X_train, scale_type)
X_test_standard_scaled = scale_test_features(X_test, standard_scaler)

classification_standard_model = train_model(X_train_standard_scaled, y_train)

# predict returns (Actual-vs-Predicted frame, Predicted-only frame).
df_standard_res, y_predicted = predict(
    classification_standard_model,
    X_test_standard_scaled,
    y_test,
)

evaluate = evaluate_performance(y_test, y_predicted)

In [39]:
df_standard_res.head()

Unnamed: 0,Actual,Predicted
16940,0,0
48817,0,0
102424,0,0
38280,0,1
29268,0,0


In [40]:
evaluate

0.7753419852683269

As we can see, we got pretty good results at the beginning using StandardScaler (an F1 score of 0.77 out of 1).

#### MinMaxScaler usage
After StandardScaler, we wondered whether there was any way to make our prediction a little better.
<br> 
That's when we came across MinMaxScaler.

In [41]:
# Same FATALITIES question as before, this time with min-max scaled features.
target_column = 'FATALITIES'
X, y = split_df_to_x_y(df, target_column)

# Same ratio and random state as the standard-scaler run, so both runs use an identical split.
test_ratio = 0.3
rand_state = 11
X_train, X_test, y_train, y_test = split_to_train_and_test(X, y, test_ratio, rand_state)

# Fit the scaler on the training rows only, then reuse it on the test rows.
scale_type = 'minmax'
minmax_scaler, X_train_minmax_scaled = scale_features(X_train, scale_type)
X_test_minmax_scaled = scale_test_features(X_test, minmax_scaler)

classification_minmax_model = train_model(X_train_minmax_scaled, y_train)

# predict returns (Actual-vs-Predicted frame, Predicted-only frame).
df_minmax_res, y_predicted = predict(
    classification_minmax_model,
    X_test_minmax_scaled,
    y_test,
)

evaluate = evaluate_performance(y_test, y_predicted)

In [42]:
df_minmax_res.head()

Unnamed: 0,Actual,Predicted
16940,0,0
48817,0,0
102424,0,0
38280,0,1
29268,0,0


In [43]:
evaluate

0.7491250635637582

It appears that MinMaxScaler is almost as good as StandardScaler (0.749 compared to 0.775).

# Logistic Regression -Flow:
## Question? will there be Injured? Yes/No

We had a lot of relevant data, so we decided to ask another question: can we predict whether there will be injured people in a terror attack?

In [44]:
# Reload the CSV so FATALITIES is back to raw counts (it was overwritten with 0/1 labels earlier)
df= load_dataset('df_to_model.csv')

In [45]:
df.head(5)

Unnamed: 0,COUNTRY,CITY,FATALITIES,INJURED,TARGET_TYPE,ATTACK_TYPE,SUCCESSFUL_ATTACK,HOSTAGES,RANSOM,PROPERTY_DAMAGE,SUICIDE_ATTACK,PART_OF_MULTIPLE_INCIDENT,TYPE_OF_WEAPON_LIST,SUB_TYPE_OF_WEAPON,MONTH,YEAR
0,32,11429,0,0,143,3,1,0,0,1,0,0,3,12,12,2019
1,76,2512,0,1,379,2,1,0,0,0,0,0,1,10,12,2019
2,166,8676,2,0,182,0,1,0,0,1,0,0,2,27,12,2019
3,80,2501,0,0,379,2,1,0,0,1,0,0,1,26,12,2019
4,136,11964,0,0,379,2,1,0,0,0,0,0,1,6,12,2019


In [46]:
df.INJURED.describe()

count    114056.00000
mean          3.43241
std          37.77107
min           0.00000
25%           0.00000
50%           0.00000
75%           2.00000
max       10878.00000
Name: INJURED, dtype: float64

In [47]:
# Binarise INJURED: label 0 for a count of 0, label 1 for any count above 0.
# The upper edge is open-ended (np.inf) so arbitrarily large counts are
# labelled 1 rather than falling outside the bins and becoming NaN.
new_injured=pd.cut(df.INJURED,bins=[-1,0,np.inf],labels=[0,1])
df["INJURED"]=new_injured

In [48]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114056 entries, 0 to 114055
Data columns (total 16 columns):
 #   Column                     Non-Null Count   Dtype   
---  ------                     --------------   -----   
 0   COUNTRY                    114056 non-null  int64   
 1   CITY                       114056 non-null  int64   
 2   FATALITIES                 114056 non-null  int64   
 3   INJURED                    114056 non-null  category
 4   TARGET_TYPE                114056 non-null  int64   
 5   ATTACK_TYPE                114056 non-null  int64   
 6   SUCCESSFUL_ATTACK          114056 non-null  int64   
 7   HOSTAGES                   114056 non-null  int64   
 8   RANSOM                     114056 non-null  int64   
 9   PROPERTY_DAMAGE            114056 non-null  int64   
 10  SUICIDE_ATTACK             114056 non-null  int64   
 11  PART_OF_MULTIPLE_INCIDENT  114056 non-null  int64   
 12  TYPE_OF_WEAPON_LIST        114056 non-null  int64   
 13  SUB_TYPE_OF_WE

#### StandardScaler usage

In [49]:
# Target: the binarised INJURED column; every other column is a feature.
target_column = 'INJURED'
X, y = split_df_to_x_y(df, target_column)

# Hold out 30% of the rows; the fixed random state keeps the split repeatable.
test_ratio = 0.3
rand_state = 11
X_train, X_test, y_train, y_test = split_to_train_and_test(X, y, test_ratio, rand_state)

# Fit the scaler on the training rows only, then reuse it on the test rows.
scale_type = 'standard'
standard_scaler, X_train_standard_scaled = scale_features(X_train, scale_type)
X_test_standard_scaled = scale_test_features(X_test, standard_scaler)

classification_standard_model = train_model(X_train_standard_scaled, y_train)

# predict returns (Actual-vs-Predicted frame, Predicted-only frame).
df_standard_res, y_predicted = predict(
    classification_standard_model,
    X_test_standard_scaled,
    y_test,
)

evaluate = evaluate_performance(y_test, y_predicted)

In [50]:
df_standard_res

Unnamed: 0,Actual,Predicted
16940,1,0
48817,0,1
102424,0,0
38280,1,0
29268,1,1
...,...,...
95853,0,0
22579,0,1
1430,0,0
18463,1,0


In [51]:
evaluate

0.5211326081541144

As we can see, we got some pretty bad results with StandardScaler here (0.52 out of 1).
<br>
We decided to try MinMaxScaler as well.

#### MinMaxScaler usage

In [53]:
# Start again from the CSV: the previous section replaced INJURED with 0/1 labels in df
df= load_dataset('df_to_model.csv')

In [54]:
df.head(5)

Unnamed: 0,COUNTRY,CITY,FATALITIES,INJURED,TARGET_TYPE,ATTACK_TYPE,SUCCESSFUL_ATTACK,HOSTAGES,RANSOM,PROPERTY_DAMAGE,SUICIDE_ATTACK,PART_OF_MULTIPLE_INCIDENT,TYPE_OF_WEAPON_LIST,SUB_TYPE_OF_WEAPON,MONTH,YEAR
0,32,11429,0,0,143,3,1,0,0,1,0,0,3,12,12,2019
1,76,2512,0,1,379,2,1,0,0,0,0,0,1,10,12,2019
2,166,8676,2,0,182,0,1,0,0,1,0,0,2,27,12,2019
3,80,2501,0,0,379,2,1,0,0,1,0,0,1,26,12,2019
4,136,11964,0,0,379,2,1,0,0,0,0,0,1,6,12,2019


In [55]:
df.INJURED.describe()

count    114056.00000
mean          3.43241
std          37.77107
min           0.00000
25%           0.00000
50%           0.00000
75%           2.00000
max       10878.00000
Name: INJURED, dtype: float64

In [56]:
# Binarise INJURED: label 0 for a count of 0, label 1 for any count above 0.
# The upper edge is open-ended (np.inf) so arbitrarily large counts are
# labelled 1 rather than falling outside the bins and becoming NaN.
new_injured=pd.cut(df.INJURED,bins=[-1,0,np.inf],labels=[0,1])
df["INJURED"]=new_injured

In [57]:
# Same INJURED question as before, this time with min-max scaled features.
target_column = 'INJURED'
X, y = split_df_to_x_y(df, target_column)

# Hold out 30% of the rows; the fixed random state keeps the split repeatable.
test_ratio = 0.3
rand_state = 11
X_train, X_test, y_train, y_test = split_to_train_and_test(X, y, test_ratio, rand_state)

# Fit the scaler on the training rows only, then reuse it on the test rows.
scale_type = 'minmax'
minmax_scaler, X_train_minmax_scaled = scale_features(X_train, scale_type)
X_test_minmax_scaled = scale_test_features(X_test, minmax_scaler)

classification_minmax_model = train_model(X_train_minmax_scaled, y_train)

# predict returns (Actual-vs-Predicted frame, Predicted-only frame).
df_minmax_res, y_predicted = predict(
    classification_minmax_model,
    X_test_minmax_scaled,
    y_test,
)

evaluate = evaluate_performance(y_test, y_predicted)

In [58]:
df_minmax_res

Unnamed: 0,Actual,Predicted
16940,1,0
48817,0,1
102424,0,0
38280,1,0
29268,1,1
...,...,...
95853,0,0
22579,0,1
1430,0,0
18463,1,0


In [59]:
evaluate

0.5147553231228987

We got some pretty bad results here as well.
<br> We thought that maybe we had some irrelevant data regarding this question, and that this might be affecting the F1 score.

### Trying to get rid of bad data section:

In [60]:
# Start again from the CSV so the column inspection below sees the raw INJURED counts
df= load_dataset('df_to_model.csv')

First, we used `df.head(35)` to get a feel for the data.

In [62]:
df.head(35)

Unnamed: 0,COUNTRY,CITY,FATALITIES,INJURED,TARGET_TYPE,ATTACK_TYPE,SUCCESSFUL_ATTACK,HOSTAGES,RANSOM,PROPERTY_DAMAGE,SUICIDE_ATTACK,PART_OF_MULTIPLE_INCIDENT,TYPE_OF_WEAPON_LIST,SUB_TYPE_OF_WEAPON,MONTH,YEAR
0,32,11429,0,0,143,3,1,0,0,1,0,0,3,12,12,2019
1,76,2512,0,1,379,2,1,0,0,0,0,0,1,10,12,2019
2,166,8676,2,0,182,0,1,0,0,1,0,0,2,27,12,2019
3,80,2501,0,0,379,2,1,0,0,1,0,0,1,26,12,2019
4,136,11964,0,0,379,2,1,0,0,0,0,0,1,6,12,2019
5,143,17004,2,4,317,0,1,0,0,1,0,0,4,9,12,2019
6,136,22246,1,0,143,1,1,0,0,0,0,0,2,7,12,2019
7,76,19277,0,1,379,3,1,0,0,1,0,0,4,28,12,2019
8,76,17330,0,0,22,3,1,0,0,1,0,0,2,27,12,2019
9,0,10631,1,1,379,2,1,0,0,0,0,0,1,26,12,2019


As we can see, the following columns appear to have some unusual values:
<br>
HOSTAGES, RANSOM, SUICIDE_ATTACK
<br>
We thought we should take a closer look at them before running the model:

In [64]:
df.HOSTAGES.describe()

count    114056.000000
mean          0.025900
std           0.158836
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           1.000000
Name: HOSTAGES, dtype: float64

In [65]:
df.RANSOM.describe()

count    114056.000000
mean          0.001640
std           0.040458
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           1.000000
Name: RANSOM, dtype: float64

In [66]:
df.SUICIDE_ATTACK.describe()

count    114056.000000
mean          0.039244
std           0.194176
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           1.000000
Name: SUICIDE_ATTACK, dtype: float64

As we can see, there are very few 1s here and plenty of zeros, which means this data has little value to us.
<br>
Let's try dropping these columns and running the model once again.

In [67]:
# Remove the three mostly-zero flag columns inspected above before re-running the model
df=df.drop(columns=["HOSTAGES","RANSOM","SUICIDE_ATTACK"])

In [68]:
df.head()

Unnamed: 0,COUNTRY,CITY,FATALITIES,INJURED,TARGET_TYPE,ATTACK_TYPE,SUCCESSFUL_ATTACK,PROPERTY_DAMAGE,PART_OF_MULTIPLE_INCIDENT,TYPE_OF_WEAPON_LIST,SUB_TYPE_OF_WEAPON,MONTH,YEAR
0,32,11429,0,0,143,3,1,1,0,3,12,12,2019
1,76,2512,0,1,379,2,1,0,0,1,10,12,2019
2,166,8676,2,0,182,0,1,1,0,2,27,12,2019
3,80,2501,0,0,379,2,1,1,0,1,26,12,2019
4,136,11964,0,0,379,2,1,0,0,1,6,12,2019


In [69]:
# Binarise INJURED: label 0 for a count of 0, label 1 for any count above 0.
# The upper edge is open-ended (np.inf) so arbitrarily large counts are
# labelled 1 rather than falling outside the bins and becoming NaN.
new_injured=pd.cut(df.INJURED,bins=[-1,0,np.inf],labels=[0,1])
df["INJURED"]=new_injured

In [70]:
# INJURED question again, now without the HOSTAGES, RANSOM and SUICIDE_ATTACK columns.
target_column = 'INJURED'
X, y = split_df_to_x_y(df, target_column)

# Hold out 30% of the rows; the fixed random state keeps the split repeatable.
test_ratio = 0.3
rand_state = 11
X_train, X_test, y_train, y_test = split_to_train_and_test(X, y, test_ratio, rand_state)

# Fit the scaler on the training rows only, then reuse it on the test rows.
scale_type = 'minmax'
minmax_scaler, X_train_minmax_scaled = scale_features(X_train, scale_type)
X_test_minmax_scaled = scale_test_features(X_test, minmax_scaler)

classification_minmax_model = train_model(X_train_minmax_scaled, y_train)

# predict returns (Actual-vs-Predicted frame, Predicted-only frame).
df_minmax_res, y_predicted = predict(
    classification_minmax_model,
    X_test_minmax_scaled,
    y_test,
)

evaluate = evaluate_performance(y_test, y_predicted)

In [71]:
df_minmax_res

Unnamed: 0,Actual,Predicted
16940,1,0
48817,0,1
102424,0,0
38280,1,1
29268,1,1
...,...,...
95853,0,0
22579,0,1
1430,0,0
18463,1,0


In [72]:
evaluate

0.5442284807616026

As we can see, it got a little better (but not by much), so we decided to consult our practitioner, Idan Tubis, about the matter.
<br>
We learned that logistic regression is not a very "strong" (or not as robust as others) machine learning algorithm, and therefore the results for this question are probably the best we can get without corrupting the data (or deleting some of it).