In [109]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn import linear_model, metrics, preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, f1_score

# Functions that will be used

In [110]:
def load_dataset(file_name, target_column, test_ratio=0.2, rand_state=42):
    """Read a CSV file and split it into train/test features and target.

    Parameters
    ----------
    file_name : str or path-like
        Location of the CSV file; forwarded to ``pd.read_csv``.
    target_column : str
        Name of the column to use as the prediction target.
    test_ratio : float, default 0.2
        Forwarded to ``train_test_split`` as ``test_size``.
    rand_state : int, default 42
        Forwarded to ``train_test_split`` as ``random_state`` so the split
        is repeatable.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, GTD_ID, df)`` where ``GTD_ID``
        is the identifier column taken from the raw file and ``df`` is the
        frame after the bookkeeping columns were dropped (it still contains
        ``target_column``).

    Raises
    ------
    KeyError
        If ``GTD_ID``, one of the dropped columns, or ``target_column`` is
        missing from the file.
    """
    df = pd.read_csv(file_name)

    # Keep the identifiers aside before removing them from the model inputs.
    GTD_ID = df["GTD_ID"]

    # "Unnamed: 0" / "Unnamed: 0.1" are presumably index columns left over
    # from earlier CSV round-trips -- TODO confirm against the export step.
    df = df.drop(columns=["Unnamed: 0", "Unnamed: 0.1", "GTD_ID"])

    y = df[target_column]
    # Every remaining column except the target is a feature (order preserved).
    X = df.drop(columns=[target_column])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_ratio, random_state=rand_state
    )
    return X_train, X_test, y_train, y_test, GTD_ID, df

In [111]:
def train_model(X_train, y_train):
    """Fit a linear regression on the training data and return it.

    Parameters
    ----------
    X_train : feature matrix used for fitting.
    y_train : target values aligned with ``X_train``.

    Returns
    -------
    The fitted ``linear_model.LinearRegression`` estimator.
    """
    # ``fit`` hands back the estimator itself, so build-and-fit chains cleanly.
    return linear_model.LinearRegression().fit(X_train, y_train)

In [112]:
def predict(trained_model, X_test):
    """Return the predictions of ``trained_model`` for ``X_test``.

    Parameters
    ----------
    trained_model : object
        Any fitted estimator exposing a ``predict(X)`` method.
    X_test : feature rows to predict for; passed to ``predict`` unchanged.

    Returns
    -------
    Whatever ``trained_model.predict(X_test)`` returns.
    """
    # Use the model that was passed in rather than a notebook-level variable,
    # so the function works for whichever estimator the caller supplies.
    predicted_vals = trained_model.predict(X_test)
    return predicted_vals

In [113]:
def evaluate_performance(y_test, y_predicted):
    """Score predictions against the true targets with ``r2_score``.

    Parameters
    ----------
    y_test : true target values.
    y_predicted : predicted values aligned with ``y_test``.

    Returns
    -------
    The value returned by ``r2_score`` for the two sequences.
    """
    return r2_score(y_true=y_test, y_pred=y_predicted)


In [114]:
# Load 'df_to_model.csv' and split it with FATALITIES as the target column.
X_train, X_test, y_train, y_test,GTD_ID,df= load_dataset('df_to_model.csv', 'FATALITIES')
# NOTE(review): this rebinds the name `train_model` from the function to the
# value it returns, so after this line the name no longer refers to the
# function. Re-running this cell requires re-running the `def train_model`
# cell first.
train_model=train_model(X_train, y_train)
# Predict on the held-out features, then score those predictions against
# y_test via evaluate_performance.
y_predicted=predict(train_model, X_test)
evaluate_value=evaluate_performance(y_test,y_predicted)

In [116]:
X_train

Unnamed: 0,COUNTRY,CITY,INJURED,TARGET_TYPE,ATTACK_TYPE,SUCCESSFUL_ATTACK,HOSTAGES,RANSOM,PROPERTY_DAMAGE,SUICIDE_ATTACK,PART_OF_MULTIPLE_INCIDENT,TYPE_OF_WEAPON_LIST,SUB_TYPE_OF_WEAPON,MONTH,YEAR
9263,136,5722,0,143,1,1,0,0,0,0,0,2,27,12,2017
43823,0,16069,0,143,2,0,0,0,0,1,0,1,29,9,2013
75572,87,1367,0,191,2,1,0,0,1,0,0,1,29,2,2002
66755,76,2747,0,379,2,1,0,0,0,0,1,1,26,9,2008
54283,0,25706,0,317,2,1,0,0,1,0,0,1,10,5,2012
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119879,179,12163,0,317,2,1,0,0,1,0,0,1,26,12,1978
103694,11,26866,0,379,0,1,0,0,0,0,0,2,1,5,1988
860,179,26811,2,379,2,1,0,0,1,0,1,1,18,10,2019
15795,130,26632,0,444,2,0,0,0,0,0,0,1,13,12,2016


In [45]:
X_test

Unnamed: 0,COUNTRY,CITY,INJURED,TARGET_TYPE,ATTACK_TYPE,SUCCESSFUL_ATTACK,HOSTAGES,RANSOM,PROPERTY_DAMAGE,SUICIDE_ATTACK,PART_OF_MULTIPLE_INCIDENT,TYPE_OF_WEAPON_LIST,SUB_TYPE_OF_WEAPON,MONTH,YEAR
20081,33,8844,0,234,0,1,0,0,0,0,0,2,20,5,2016
3064,130,5976,10,404,2,1,0,0,1,0,0,1,29,4,2019
54207,80,24290,1,485,2,1,0,0,1,0,0,1,26,5,2012
47732,130,12382,0,234,3,1,0,0,1,0,0,2,27,4,2013
632,76,12028,1,207,1,0,0,0,0,0,0,2,27,11,2019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81416,173,8392,0,234,1,1,0,0,0,0,0,2,27,2,1997
68039,76,12279,0,517,2,1,0,0,1,0,0,1,3,5,2008
21152,0,25417,4,317,0,1,0,0,0,0,1,2,27,3,2016
22328,25,4967,2,517,2,1,0,0,0,0,1,1,6,2,2016


In [46]:
y_train

9263      2
43823     0
75572     2
66755     0
54283     4
         ..
119879    0
103694    5
860       2
15795     0
121958    1
Name: FATALITIES, Length: 98953, dtype: int64

In [47]:
y_test

20081     1
3064      1
54207     0
47732     1
632       0
         ..
81416     1
68039     0
21152     5
22328     0
100753    0
Name: FATALITIES, Length: 24739, dtype: int64

In [48]:
evaluate_value

0.45306558048596945

In [49]:
y_predicted

array([2.47255953, 2.09230137, 1.16133116, ..., 2.81704235, 1.464204  ,
       1.44191008])

In [132]:
# Bucket the fatality counts into six ordered classes. pd.cut uses
# right-closed intervals by default, so the bins read as
# (-1, 0] -> 'a', (0, 1] -> 'b', (1, 20] -> 'c', (20, 50] -> 'd',
# (50, 100] -> 'e', (100, 2000] -> 'f'.
cut = pd.cut(
    df.FATALITIES,
    bins=[-1, 0, 1, 20, 50, 100, 2000],
    labels=['a', 'b', 'c', 'd', 'e', 'f'],
)

In [134]:
# Only the final expression of a cell is rendered as its output: the two
# lookups below are evaluated but their results are discarded.
# Rows with at least 1300 fatalities (not displayed).
df[df["FATALITIES"]>=1300]
# Bucket assigned to the row labelled 76047 (not displayed).
cut[76047]
# Displayed: the full bucket series.
cut

0         a
1         a
2         c
3         a
4         a
         ..
123687    a
123688    a
123689    a
123690    a
123691    a
Name: FATALITIES, Length: 123692, dtype: category
Categories (6, object): ['a' < 'b' < 'c' < 'd' < 'e' < 'f']

In [140]:
# Overwrites the FATALITIES column of df with the bucket labels held in
# `cut`; the previous values of that column are no longer available from df
# after this cell runs.
df["FATALITIES"]=cut

In [139]:
df.FATALITIES.describe()

count     123692
unique         6
top            a
freq       64941
Name: FATALITIES, dtype: object