In [238]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn import linear_model, metrics, preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, f1_score

# Functions that will be used

In [239]:
def load_dataset(file_name):
    """Read a CSV file into a DataFrame, discarding the saved-index column.

    Parameters
    ----------
    file_name : str or file-like
        Anything accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The file contents without the ``"Unnamed: 0"`` column, if present.
    """
    df = pd.read_csv(file_name)
    # errors="ignore" keeps the loader usable for CSVs that were written
    # without an index column instead of failing with a KeyError.
    df = df.drop(columns=["Unnamed: 0"], errors="ignore")
    return df

In [240]:
def split_df_to_x_y(df,target_column):
    """Separate a DataFrame into a feature frame and a target series.

    Returns ``(X, y)`` where ``y`` is ``df[target_column]`` and ``X`` holds
    the remaining columns in their original order.
    """
    target = df[target_column]
    feature_names = df.columns.tolist()
    feature_names.remove(target_column)
    return df[feature_names], target

In [241]:
def split_to_train_and_test(X, y, test_ratio, rand_state):
    """Split features and target into train and test partitions.

    ``test_ratio`` is forwarded as ``test_size`` and ``rand_state`` as
    ``random_state`` to ``train_test_split``.

    Returns the tuple ``(X_train, X_test, y_train, y_test)``.
    """
    partitions = train_test_split(
        X,
        y,
        test_size=test_ratio,
        random_state=rand_state,
    )
    return tuple(partitions)
    
    

In [242]:
def scale_features(X_train, scale_type):
    """Fit a feature scaler on the training data and scale that data.

    Parameters
    ----------
    X_train : array-like
        Training features; passed to the scaler's ``fit_transform``.
    scale_type : str
        ``'minmax'`` selects ``MinMaxScaler(feature_range=(0, 1))``.
        Any other value selects ``StandardScaler``.

    Returns
    -------
    tuple
        ``(scaler, X_train_scaled)``: the fitted scaler and the result of
        ``fit_transform`` on ``X_train``.
    """
    if scale_type == 'minmax':
        scaler = MinMaxScaler(feature_range=(0, 1))
    else:
        # Every value other than 'minmax' falls back to standardisation.
        scaler = StandardScaler()

    # A single fit_transform is all that is needed here; test data is scaled
    # separately with the returned scaler.
    X_train_scaled = scaler.fit_transform(X_train)
    return scaler, X_train_scaled


In [243]:
def scale_test_features(X_test, scaler):
    """Apply an already-fitted scaler to the test features and return the result."""
    X_test_scaled = scaler.transform(X_test)
    return X_test_scaled

In [244]:
def train_model(X_train, y_train):
    """Fit a default-configured LogisticRegression and return the fitted model."""
    classifier = LogisticRegression()
    # fit() returns the estimator itself, so the fitted instance is returned.
    return classifier.fit(X_train, y_train)

In [245]:
#def predict(trained_model, X_test):
 #   predicted_vals = train_model.predict(X_test)
  #  return predicted_vals

In [246]:
def predict(classifier, X_test, y_test):
    """Run the classifier on the test features and package the predictions.

    Returns a pair of DataFrames:
    the first has columns ``"Actual"`` (from ``y_test``) and ``"Predicted"``;
    the second has only the ``"Predicted"`` column.
    """
    predictions = classifier.predict(X_test)
    predicted_only = pd.DataFrame({"Predicted": predictions})
    comparison = pd.DataFrame({"Actual": y_test, "Predicted": predictions})
    return comparison, predicted_only

In [247]:
def evaluate_performance(y_test,y_predicted):
    """Return the F1 score of the predictions against the true labels."""
    return f1_score(y_test, y_predicted)


# Logistic Regression - Flow
## Question: Will there be fatalities? (Yes/No)

In [248]:
# Load the modelling dataset from df_to_model.csv using load_dataset.
df= load_dataset('df_to_model.csv')

In [249]:
# Preview the first 35 rows of the loaded frame.
df.head(35)

Unnamed: 0,COUNTRY,CITY,FATALITIES,INJURED,TARGET_TYPE,ATTACK_TYPE,SUCCESSFUL_ATTACK,HOSTAGES,RANSOM,PROPERTY_DAMAGE,SUICIDE_ATTACK,PART_OF_MULTIPLE_INCIDENT,TYPE_OF_WEAPON_LIST,SUB_TYPE_OF_WEAPON,MONTH,YEAR
0,32,11429,0,0,143,3,1,0,0,1,0,0,3,12,12,2019
1,76,2512,0,1,379,2,1,0,0,0,0,0,1,10,12,2019
2,166,8676,2,0,182,0,1,0,0,1,0,0,2,27,12,2019
3,80,2501,0,0,379,2,1,0,0,1,0,0,1,26,12,2019
4,136,11964,0,0,379,2,1,0,0,0,0,0,1,6,12,2019
5,143,17004,2,4,317,0,1,0,0,1,0,0,4,9,12,2019
6,136,22246,1,0,143,1,1,0,0,0,0,0,2,7,12,2019
7,76,19277,0,1,379,3,1,0,0,1,0,0,4,28,12,2019
8,76,17330,0,0,22,3,1,0,0,1,0,0,2,27,12,2019
9,0,10631,1,1,379,2,1,0,0,0,0,0,1,26,12,2019


In [250]:
# Alternative multi-class binning, kept for reference:
#new_fatalities=pd.cut(df.FATALITIES,bins=[-1,0,1,20,50,100,2000],labels=[0,1,2,3,4,5])
# Binarise the target: a count of 0 maps to label 0, any positive count to label 1.
# The upper bin edge is open-ended (np.inf) so that counts above a hard-coded
# limit are not silently turned into NaN.
new_fatalities=pd.cut(df.FATALITIES,bins=[-1,0,np.inf],labels=[0,1])

In [251]:
# Overwrite the FATALITIES column in place with the binned labels.
df["FATALITIES"]=new_fatalities

In [252]:
# Summarise the binned target (count, number of unique labels, most frequent label).
df.FATALITIES.describe()

count     114056
unique         2
top            0
freq       57121
Name: FATALITIES, dtype: int64

In [253]:
# Preview the first 35 rows again, now with the binned FATALITIES column.
df.head(35)

Unnamed: 0,COUNTRY,CITY,FATALITIES,INJURED,TARGET_TYPE,ATTACK_TYPE,SUCCESSFUL_ATTACK,HOSTAGES,RANSOM,PROPERTY_DAMAGE,SUICIDE_ATTACK,PART_OF_MULTIPLE_INCIDENT,TYPE_OF_WEAPON_LIST,SUB_TYPE_OF_WEAPON,MONTH,YEAR
0,32,11429,0,0,143,3,1,0,0,1,0,0,3,12,12,2019
1,76,2512,0,1,379,2,1,0,0,0,0,0,1,10,12,2019
2,166,8676,1,0,182,0,1,0,0,1,0,0,2,27,12,2019
3,80,2501,0,0,379,2,1,0,0,1,0,0,1,26,12,2019
4,136,11964,0,0,379,2,1,0,0,0,0,0,1,6,12,2019
5,143,17004,1,4,317,0,1,0,0,1,0,0,4,9,12,2019
6,136,22246,1,0,143,1,1,0,0,0,0,0,2,7,12,2019
7,76,19277,0,1,379,3,1,0,0,1,0,0,4,28,12,2019
8,76,17330,0,0,22,3,1,0,0,1,0,0,2,27,12,2019
9,0,10631,1,1,379,2,1,0,0,0,0,0,1,26,12,2019


### standard

In [265]:
# Target to predict: the binarised FATALITIES column.
target_column='FATALITIES'

# Separate the features from the target.
X,y= split_df_to_x_y(df,target_column)

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
test_ratio, rand_state =0.3,11
X_train, X_test, y_train, y_test =split_to_train_and_test(X,y,test_ratio,rand_state)

# Fit the scaler on the training features only.
scale_type= 'standard'
standard_scaler, X_train_standard_scaled = scale_features(X_train, scale_type)

# Reuse the fitted scaler to transform the test features.
X_test_standard_scaled = scale_test_features(X_test, standard_scaler)

# Train the logistic-regression classifier on the scaled training data.
classification_standard_model=train_model(X_train_standard_scaled, y_train)

# predict returns the actual-vs-predicted frame and a predictions-only frame.
df_standard_res,y_predicted = predict(classification_standard_model, X_test_standard_scaled, y_test)

# F1 score of the predictions on the test set.
evaluate = evaluate_performance(y_test,y_predicted)

In [261]:
df_standard_res

Unnamed: 0,Actual,Predicted
16940,0,0
48817,0,0
102424,0,0
38280,0,1
29268,0,0
...,...,...
95853,0,0
22579,1,1
1430,0,1
18463,0,0


In [262]:
evaluate

0.7753419852683269

### minmax

In [263]:
# Target to predict: the binarised FATALITIES column.
target_column='FATALITIES'
X,y= split_df_to_x_y(df,target_column)

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
test_ratio, rand_state =0.3,11
X_train, X_test, y_train, y_test =split_to_train_and_test(X,y,test_ratio,rand_state)

# Fit the min-max scaler on the training features only.
scale_type= 'minmax'
minmax_scaler, X_train_minmax_scaled = scale_features(X_train, scale_type)

# Reuse the fitted scaler to transform the test features.
X_test_minmax_scaled = scale_test_features(X_test, minmax_scaler)

# Train the logistic-regression classifier on the scaled training data.
classification_minmax_model = train_model(X_train_minmax_scaled, y_train)

# predict returns the actual-vs-predicted frame and a predictions-only frame.
df_minmax_res,y_predicted = predict(classification_minmax_model, X_test_minmax_scaled, y_test)

# F1 score of the predictions on the test set.
evaluate = evaluate_performance(y_test,y_predicted)

In [259]:
df_minmax_res

Unnamed: 0,Actual,Predicted
16940,0,0
48817,0,0
102424,0,0
38280,0,1
29268,0,0
...,...,...
95853,0,0
22579,1,1
1430,0,1
18463,0,0


In [260]:
evaluate

0.7753419852683269

# Logistic Regression - Flow
## Question: Will there be injured? (Yes/No)

In [266]:
# Reload the dataset from df_to_model.csv; this rebinds df to the values stored in the file.
df= load_dataset('df_to_model.csv')

In [267]:
# Preview the first 35 rows of the reloaded frame.
df.head(35)

Unnamed: 0,COUNTRY,CITY,FATALITIES,INJURED,TARGET_TYPE,ATTACK_TYPE,SUCCESSFUL_ATTACK,HOSTAGES,RANSOM,PROPERTY_DAMAGE,SUICIDE_ATTACK,PART_OF_MULTIPLE_INCIDENT,TYPE_OF_WEAPON_LIST,SUB_TYPE_OF_WEAPON,MONTH,YEAR
0,32,11429,0,0,143,3,1,0,0,1,0,0,3,12,12,2019
1,76,2512,0,1,379,2,1,0,0,0,0,0,1,10,12,2019
2,166,8676,2,0,182,0,1,0,0,1,0,0,2,27,12,2019
3,80,2501,0,0,379,2,1,0,0,1,0,0,1,26,12,2019
4,136,11964,0,0,379,2,1,0,0,0,0,0,1,6,12,2019
5,143,17004,2,4,317,0,1,0,0,1,0,0,4,9,12,2019
6,136,22246,1,0,143,1,1,0,0,0,0,0,2,7,12,2019
7,76,19277,0,1,379,3,1,0,0,1,0,0,4,28,12,2019
8,76,17330,0,0,22,3,1,0,0,1,0,0,2,27,12,2019
9,0,10631,1,1,379,2,1,0,0,0,0,0,1,26,12,2019


In [268]:
# Summary statistics of the raw INJURED counts before binning.
df.INJURED.describe()

count    114056.00000
mean          3.43241
std          37.77107
min           0.00000
25%           0.00000
50%           0.00000
75%           2.00000
max       10878.00000
Name: INJURED, dtype: float64

In [269]:
# Inspect the rows with at least 10,000 injured (the extreme end of the distribution).
df[df['INJURED']>=10000]

Unnamed: 0,COUNTRY,CITY,FATALITIES,INJURED,TARGET_TYPE,ATTACK_TYPE,SUCCESSFUL_ATTACK,HOSTAGES,RANSOM,PROPERTY_DAMAGE,SUICIDE_ATTACK,PART_OF_MULTIPLE_INCIDENT,TYPE_OF_WEAPON_LIST,SUB_TYPE_OF_WEAPON,MONTH,YEAR
72278,185,20148,1385,10878,382,0,1,1,0,1,1,1,3,5,9,2001


In [270]:
# Binarise the target: a count of 0 maps to label 0, any positive count to label 1.
# The upper bin edge is open-ended (np.inf) rather than a magic number, so no
# count can fall outside the bins and become NaN.
new_injured=pd.cut(df.INJURED,bins=[-1,0,np.inf],labels=[0,1])
# Overwrite the INJURED column in place with the binned labels.
df["INJURED"]=new_injured

In [271]:
# Check column dtypes and non-null counts after binning INJURED.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114056 entries, 0 to 114055
Data columns (total 16 columns):
 #   Column                     Non-Null Count   Dtype   
---  ------                     --------------   -----   
 0   COUNTRY                    114056 non-null  int64   
 1   CITY                       114056 non-null  int64   
 2   FATALITIES                 114056 non-null  int64   
 3   INJURED                    114056 non-null  category
 4   TARGET_TYPE                114056 non-null  int64   
 5   ATTACK_TYPE                114056 non-null  int64   
 6   SUCCESSFUL_ATTACK          114056 non-null  int64   
 7   HOSTAGES                   114056 non-null  int64   
 8   RANSOM                     114056 non-null  int64   
 9   PROPERTY_DAMAGE            114056 non-null  int64   
 10  SUICIDE_ATTACK             114056 non-null  int64   
 11  PART_OF_MULTIPLE_INCIDENT  114056 non-null  int64   
 12  TYPE_OF_WEAPON_LIST        114056 non-null  int64   
 13  SUB_TYPE_OF_WE

### standard

In [272]:
# Target to predict: the binarised INJURED column.
target_column='INJURED'

# Separate the features from the target.
X,y= split_df_to_x_y(df,target_column)

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
test_ratio, rand_state =0.3,11
X_train, X_test, y_train, y_test =split_to_train_and_test(X,y,test_ratio,rand_state)

# Fit the scaler on the training features only.
scale_type= 'standard'
standard_scaler, X_train_standard_scaled = scale_features(X_train, scale_type)

# Reuse the fitted scaler to transform the test features.
X_test_standard_scaled = scale_test_features(X_test, standard_scaler)

# Train the logistic-regression classifier on the scaled training data.
classification_standard_model=train_model(X_train_standard_scaled, y_train)

# predict returns the actual-vs-predicted frame and a predictions-only frame.
df_standard_res,y_predicted = predict(classification_standard_model, X_test_standard_scaled, y_test)

# F1 score of the predictions on the test set.
evaluate = evaluate_performance(y_test,y_predicted)

In [273]:
df_standard_res

Unnamed: 0,Actual,Predicted
16940,1,0
48817,0,1
102424,0,0
38280,1,0
29268,1,1
...,...,...
95853,0,0
22579,0,1
1430,0,0
18463,1,0


In [274]:
evaluate

0.5211326081541144

### minmax

In [275]:
# Target to predict: the binarised INJURED column.
target_column='INJURED'
X,y= split_df_to_x_y(df,target_column)

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
test_ratio, rand_state =0.3,11
X_train, X_test, y_train, y_test =split_to_train_and_test(X,y,test_ratio,rand_state)

# Fit the min-max scaler on the training features only.
scale_type= 'minmax'
minmax_scaler, X_train_minmax_scaled = scale_features(X_train, scale_type)

# Transform the test features with the scaler fitted just above. Using the
# explicit minmax_scaler name avoids depending on a stray global `scaler`,
# which does not exist on a fresh kernel.
X_test_minmax_scaled = scale_test_features(X_test, minmax_scaler)

# Train the logistic-regression classifier on the scaled training data.
classification_minmax_model = train_model(X_train_minmax_scaled, y_train)

# predict returns the actual-vs-predicted frame and a predictions-only frame.
df_minmax_res,y_predicted = predict(classification_minmax_model, X_test_minmax_scaled, y_test)

# F1 score of the predictions on the test set.
evaluate = evaluate_performance(y_test,y_predicted)

In [276]:
df_minmax_res

Unnamed: 0,Actual,Predicted
16940,1,0
48817,0,1
102424,0,0
38280,1,0
29268,1,1
...,...,...
95853,0,0
22579,0,1
1430,0,0
18463,1,0


In [277]:
evaluate

0.5056853942013175

# KNN - Flow
## Question: Estimate how many were injured (None, Low, Medium, High, ...)