In [76]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn import linear_model, metrics, preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import make_scorer

NOTE: we changed the data in INJURED and FATALITIES to 0 and 1 with the usage of bins (1 means there was at least one or more, 0 means none).

# Functions that will be used

In [77]:
def load_dataset(file_name):
    df = pd.read_csv(file_name)
    df=df.drop(columns=["Unnamed: 0"])
    return df

In [78]:
def split_df_to_x_y(df,target_column):
    y=df[target_column]
    features = [col for col in df.columns]
    features.remove(target_column)
    X=df[features]
    return X,y

In [79]:
def split_to_train_and_test(X, y, test_ratio, rand_state):
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=test_ratio,random_state=rand_state)
    return X_train, X_test, y_train, y_test
    
    

In [80]:
def predict(classifier, X_test, y_test):
    y_pred=classifier.predict(X_test)
    resDF=pd.DataFrame({"Actual":y_test,"Predicted":y_pred})
    temp = pd.DataFrame({"Predicted":y_pred})
    return resDF,temp

In [81]:
def find_best_k_for_KNN(XTrain,yTrain,XTest):
    k_s=[]
    train_accuracies=[]
    test_accuracies=[]
    for k in range(3,25,2):
        clf = KNeighborsClassifier(n_neighbors=k)
        clf.fit(XTrain, yTrain)
        y_pred_train=clf.predict(XTrain)
        y_pred=clf.predict(XTest)
        k_s.append(k)
        train_accuracies.append(metrics.accuracy_score(y_true = yTrain, y_pred = y_pred_train))
        test_accuracies.append(metrics.accuracy_score(y_true = yTest, y_pred = y_pred))

    return pd.DataFrame({"k":k_s,"train_accuracy":train_accuracies,"test_accuracy":test_accuracies})

# KNN -Flow:
## Question? will there be injured ? (Yes/No question)

### FIRST ATTEMPT

We didin't drop any data (just changed to true / false) if there were INJURED

In [82]:
df= load_dataset('df_to_model.csv')

In [83]:
df.head()

Unnamed: 0,COUNTRY,CITY,FATALITIES,INJURED,TARGET_TYPE,ATTACK_TYPE,SUCCESSFUL_ATTACK,HOSTAGES,RANSOM,PROPERTY_DAMAGE,SUICIDE_ATTACK,PART_OF_MULTIPLE_INCIDENT,TYPE_OF_WEAPON_LIST,SUB_TYPE_OF_WEAPON,MONTH,YEAR
0,32,11429,0,0,143,3,1,0,0,1,0,0,3,12,12,2019
1,76,2512,0,1,379,2,1,0,0,0,0,0,1,10,12,2019
2,166,8676,2,0,182,0,1,0,0,1,0,0,2,27,12,2019
3,80,2501,0,0,379,2,1,0,0,1,0,0,1,26,12,2019
4,136,11964,0,0,379,2,1,0,0,0,0,0,1,6,12,2019


In [84]:
new_injured=pd.cut(df.INJURED,bins=[-1,0,2000000],labels=[0,1])
df["INJURED"]=new_injured

In [85]:
new_fatalities=pd.cut(df.FATALITIES,bins=[-1,0,2000000],labels=[0,1])
df["FATALITIES"]=new_fatalities

In [86]:
df.head(5)

Unnamed: 0,COUNTRY,CITY,FATALITIES,INJURED,TARGET_TYPE,ATTACK_TYPE,SUCCESSFUL_ATTACK,HOSTAGES,RANSOM,PROPERTY_DAMAGE,SUICIDE_ATTACK,PART_OF_MULTIPLE_INCIDENT,TYPE_OF_WEAPON_LIST,SUB_TYPE_OF_WEAPON,MONTH,YEAR
0,32,11429,0,0,143,3,1,0,0,1,0,0,3,12,12,2019
1,76,2512,0,1,379,2,1,0,0,0,0,0,1,10,12,2019
2,166,8676,1,0,182,0,1,0,0,1,0,0,2,27,12,2019
3,80,2501,0,0,379,2,1,0,0,1,0,0,1,26,12,2019
4,136,11964,0,0,379,2,1,0,0,0,0,0,1,6,12,2019


In [87]:
target_column='INJURED'
X,y = split_df_to_x_y(df,target_column)
random_state=1
test_size=0.2
XTrain, XTest, yTrain, yTest=split_to_train_and_test(X, y, test_size, random_state)
df_best_k=find_best_k_for_KNN(XTrain,yTrain,XTest)

In [88]:
df_best_k

Unnamed: 0,k,train_accuracy,test_accuracy
0,3,0.803319,0.632255
1,5,0.756269,0.637208
2,7,0.732311,0.63905
3,9,0.717877,0.639839
4,11,0.706227,0.640584
5,13,0.699224,0.638524
6,15,0.694226,0.643389
7,17,0.688944,0.644047
8,19,0.684911,0.645231
9,21,0.680647,0.64488


In [89]:
df_best_k[df_best_k['test_accuracy']==df_best_k.test_accuracy.max()]

Unnamed: 0,k,train_accuracy,test_accuracy
8,19,0.684911,0.645231


In [90]:
k = df_best_k[df_best_k['test_accuracy']==df_best_k.test_accuracy.max()]['k'][8]
k

19

In [91]:
clf = KNeighborsClassifier(n_neighbors=k)
clf.fit(XTrain, yTrain)
y_pred=clf.predict(XTest)
print(metrics.confusion_matrix(y_true = yTest, y_pred = y_pred))

print('Accuracy = ', metrics.accuracy_score(y_true = yTest, y_pred = y_pred))

[[10471  3018]
 [ 5075  4248]]
Accuracy =  0.6452305803962827


In [92]:
evaluate_value = f1_score(yTest,y_pred,average='micro')
print(evaluate_value)

0.6452305803962827


After run NOTES:
<br>
1. KNN has preformed a lot better then Logistic regression when it comes to predicting INJURED in terror attack.
<br>
(Reminder Logistic regression got only 0.54 and KNN get 0.64 that a great improvment!)
<br>
2. KNN took a little longer to run then Logistic regression due to running a function named: find_best_k_for_KNN
<br>
which takes some time to get the best result. 

### SECOND ATTEMPT

We dropped the data that we discovered that is probably irelevent (HOSTAGES,RANSOM,SUICIDE_ATTACK).
<br>
The reason we think those columns are not helping our model is because we saw improvment when we dropped them at Logistic regression.

In [93]:
df=df.drop(columns=["HOSTAGES","RANSOM","SUICIDE_ATTACK"])

In [94]:
df.head()

Unnamed: 0,COUNTRY,CITY,FATALITIES,INJURED,TARGET_TYPE,ATTACK_TYPE,SUCCESSFUL_ATTACK,PROPERTY_DAMAGE,PART_OF_MULTIPLE_INCIDENT,TYPE_OF_WEAPON_LIST,SUB_TYPE_OF_WEAPON,MONTH,YEAR
0,32,11429,0,0,143,3,1,1,0,3,12,12,2019
1,76,2512,0,1,379,2,1,0,0,1,10,12,2019
2,166,8676,1,0,182,0,1,1,0,2,27,12,2019
3,80,2501,0,0,379,2,1,1,0,1,26,12,2019
4,136,11964,0,0,379,2,1,0,0,1,6,12,2019


In [95]:
target_column='INJURED'
X,y = split_df_to_x_y(df,target_column)
random_state=1
test_size=0.2
XTrain, XTest, yTrain, yTest=split_to_train_and_test(X, y, test_size, random_state)
df_best_k=find_best_k_for_KNN(XTrain,yTrain,XTest)

In [96]:
df_best_k

Unnamed: 0,k,train_accuracy,test_accuracy
0,3,0.80333,0.631992
1,5,0.756346,0.637033
2,7,0.732202,0.638918
3,9,0.717866,0.639663
4,11,0.706205,0.640672
5,13,0.69918,0.638524
6,15,0.694237,0.64317
7,17,0.689064,0.644047
8,19,0.684757,0.645055
9,21,0.680615,0.644792


In [97]:
df_best_k[df_best_k['test_accuracy']==df_best_k.test_accuracy.max()]

Unnamed: 0,k,train_accuracy,test_accuracy
8,19,0.684757,0.645055


In [98]:
k = df_best_k[df_best_k['test_accuracy']==df_best_k.test_accuracy.max()]['k'][8]
k

19

In [99]:
clf = KNeighborsClassifier(n_neighbors=k)
clf.fit(XTrain, yTrain)
y_pred=clf.predict(XTest)
print(metrics.confusion_matrix(y_true = yTest, y_pred = y_pred))

print('Accuracy = ', metrics.accuracy_score(y_true = yTest, y_pred = y_pred))

[[10468  3021]
 [ 5076  4247]]
Accuracy =  0.6450552340873225


In [100]:
evaluate_value = f1_score(yTest,y_pred,average='micro')
print(evaluate_value)

0.6450552340873225


It appears that this time giving up some data didin't do us any good with the final result (0.64 compared to 0.64)

# KNN -Flow:
## Question? will there be fatalities ? (Yes/No question)

In [101]:
df= load_dataset('df_to_model.csv')

In [102]:
df.head()

Unnamed: 0,COUNTRY,CITY,FATALITIES,INJURED,TARGET_TYPE,ATTACK_TYPE,SUCCESSFUL_ATTACK,HOSTAGES,RANSOM,PROPERTY_DAMAGE,SUICIDE_ATTACK,PART_OF_MULTIPLE_INCIDENT,TYPE_OF_WEAPON_LIST,SUB_TYPE_OF_WEAPON,MONTH,YEAR
0,32,11429,0,0,143,3,1,0,0,1,0,0,3,12,12,2019
1,76,2512,0,1,379,2,1,0,0,0,0,0,1,10,12,2019
2,166,8676,2,0,182,0,1,0,0,1,0,0,2,27,12,2019
3,80,2501,0,0,379,2,1,0,0,1,0,0,1,26,12,2019
4,136,11964,0,0,379,2,1,0,0,0,0,0,1,6,12,2019


In [103]:
new_injured=pd.cut(df.INJURED,bins=[-1,0,2000000],labels=[0,1])
df["INJURED"]=new_injured

In [104]:
new_fatalities=pd.cut(df.FATALITIES,bins=[-1,0,200000],labels=[0,1])
df["FATALITIES"]=new_fatalities

In [105]:
df.head()

Unnamed: 0,COUNTRY,CITY,FATALITIES,INJURED,TARGET_TYPE,ATTACK_TYPE,SUCCESSFUL_ATTACK,HOSTAGES,RANSOM,PROPERTY_DAMAGE,SUICIDE_ATTACK,PART_OF_MULTIPLE_INCIDENT,TYPE_OF_WEAPON_LIST,SUB_TYPE_OF_WEAPON,MONTH,YEAR
0,32,11429,0,0,143,3,1,0,0,1,0,0,3,12,12,2019
1,76,2512,0,1,379,2,1,0,0,0,0,0,1,10,12,2019
2,166,8676,1,0,182,0,1,0,0,1,0,0,2,27,12,2019
3,80,2501,0,0,379,2,1,0,0,1,0,0,1,26,12,2019
4,136,11964,0,0,379,2,1,0,0,0,0,0,1,6,12,2019


In [106]:
target_column='FATALITIES'
X,y = split_df_to_x_y(df,target_column)
random_state=1
test_size=0.2
XTrain, XTest, yTrain, yTest=split_to_train_and_test(X, y, test_size, random_state)
df_best_k=find_best_k_for_KNN(XTrain,yTrain,XTest)

In [107]:
df_best_k

Unnamed: 0,k,train_accuracy,test_accuracy
0,3,0.818728,0.65562
1,5,0.770374,0.658382
2,7,0.74657,0.660354
3,9,0.730525,0.659477
4,11,0.719938,0.657768
5,13,0.712178,0.657899
6,15,0.705942,0.656146
7,17,0.699071,0.655708
8,19,0.694183,0.657286
9,21,0.69084,0.654173


In [108]:
df_best_k[df_best_k['test_accuracy']==df_best_k.test_accuracy.max()]

Unnamed: 0,k,train_accuracy,test_accuracy
2,7,0.74657,0.660354


In [109]:
k = df_best_k[df_best_k['test_accuracy']==df_best_k.test_accuracy.max()]['k'][2]
k

7

In [110]:
clf = KNeighborsClassifier(n_neighbors=k)
clf.fit(XTrain, yTrain)
y_pred=clf.predict(XTest)
print(metrics.confusion_matrix(y_true = yTest, y_pred = y_pred))

print('Accuracy = ', metrics.accuracy_score(y_true = yTest, y_pred = y_pred))

[[7333 4109]
 [3639 7731]]
Accuracy =  0.6603541995440996


In [111]:
evaluate_value = f1_score(yTest,y_pred,average='micro')
print(evaluate_value)

0.6603541995440996


After run conclusion:
<br>
It appears that this model is not as good as the Logistic regression (0.77) when it comes to FATALITIES.
<br>
we're not sure if it will help but we'll try to get rid of the HOSTAGES,RANSOM,SUICIDE_ATTACK just to check if it helps.

### SECOND ATTEMPT

We dropped the data that we discovered that is probably irelevent (HOSTAGES,RANSOM,SUICIDE_ATTACK).

In [112]:
df= load_dataset('df_to_model.csv')

In [113]:
df=df.drop(columns=["HOSTAGES","RANSOM","SUICIDE_ATTACK"])

In [114]:
df.head()

Unnamed: 0,COUNTRY,CITY,FATALITIES,INJURED,TARGET_TYPE,ATTACK_TYPE,SUCCESSFUL_ATTACK,PROPERTY_DAMAGE,PART_OF_MULTIPLE_INCIDENT,TYPE_OF_WEAPON_LIST,SUB_TYPE_OF_WEAPON,MONTH,YEAR
0,32,11429,0,0,143,3,1,1,0,3,12,12,2019
1,76,2512,0,1,379,2,1,0,0,1,10,12,2019
2,166,8676,2,0,182,0,1,1,0,2,27,12,2019
3,80,2501,0,0,379,2,1,1,0,1,26,12,2019
4,136,11964,0,0,379,2,1,0,0,1,6,12,2019


In [115]:
target_column='FATALITIES'
X,y = split_df_to_x_y(df,target_column)
random_state=1
test_size=0.2
XTrain, XTest, yTrain, yTest=split_to_train_and_test(X, y, test_size, random_state)
df_best_k=find_best_k_for_KNN(XTrain,yTrain,XTest)

In [116]:
df_best_k

Unnamed: 0,k,train_accuracy,test_accuracy
0,3,0.656098,0.488076
1,5,0.610309,0.497764
2,7,0.586625,0.499255
3,9,0.571227,0.504296
4,11,0.561604,0.506839
5,13,0.554634,0.507847
6,15,0.549538,0.508811
7,17,0.545526,0.508329
8,19,0.542995,0.51017
9,21,0.539838,0.510082


In [117]:
df_best_k[df_best_k['test_accuracy']==df_best_k.test_accuracy.max()]

Unnamed: 0,k,train_accuracy,test_accuracy
10,23,0.537241,0.510828


In [118]:
k = df_best_k[df_best_k['test_accuracy']==df_best_k.test_accuracy.max()]['k'][10]
k

23

In [119]:
clf = KNeighborsClassifier(n_neighbors=k)
clf.fit(XTrain, yTrain)
y_pred=clf.predict(XTest)
print(metrics.confusion_matrix(y_true = yTest, y_pred = y_pred))

print('Accuracy = ', metrics.accuracy_score(y_true = yTest, y_pred = y_pred))

[[10177  1205    50 ...     0     0     0]
 [ 3557  1389    80 ...     0     0     0]
 [ 1368   500    77 ...     0     0     0]
 ...
 [    1     0     0 ...     0     0     0]
 [    1     0     0 ...     0     0     0]
 [    0     1     0 ...     0     0     0]]
Accuracy =  0.5108276345782922


In [120]:
evaluate_value = f1_score(yTest,y_pred,average='micro')
print(evaluate_value)

0.5108276345782922


Conclusions after dropping the data:
<br>
It appears that this time giving up some of the data hurt our model and gave us a lot worse results! 

### KNN MODEL FINAL CONCLUSIONS:

#### 1. without giving up any data we got better results when it comes to INJURED compared to Logistic regression

#### 2. without giving up any data we got worse results when it comes to FATALITIES compared to Logistic regression

#### 3. while giving up on some data we got similar results in INJURED compared to KNN without giving up on data (1)

#### 4. while giving up on some data we got worse results in FATALITIES compared to KNN without giving up on data (2)
