In [170]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.colors import ListedColormap
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import CategoricalNB
from sklearn.naive_bayes import GaussianNB

%matplotlib inline

### Functions

In [171]:
def load_dataset(file_name):
    """Read a CSV file into a DataFrame, discarding the saved index column.

    Parameters
    ----------
    file_name : str or path-like
        Location of the CSV file; passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The file contents without the ``"Unnamed: 0"`` column.
    """
    df = pd.read_csv(file_name)
    # "Unnamed: 0" is the header pandas gives a first column with an empty name
    # (typically the index written by DataFrame.to_csv). errors="ignore" keeps
    # the loader usable for files that were saved without that column.
    return df.drop(columns=["Unnamed: 0"], errors="ignore")

In [172]:
def split_df_to_x_y(df, target_column):
    """Split a DataFrame into a feature frame and a target series.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding both the features and the target.
    target_column : str
        Name of the column to predict.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is ``df`` restricted to every column except
        ``target_column`` (original order kept) and ``y`` is that column.
    """
    target = df[target_column]
    feature_names = list(df.columns)
    feature_names.remove(target_column)
    return df[feature_names], target

In [173]:
def split_to_train_and_test(X, y, test_ratio, rand_state):
    """Split features and target into train and test parts.

    Thin wrapper around ``train_test_split``.

    Parameters
    ----------
    X, y
        Features and target to split together.
    test_ratio
        Forwarded as ``test_size``.
    rand_state
        Forwarded as ``random_state``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    train_features, test_features, train_target, test_target = train_test_split(
        X,
        y,
        test_size=test_ratio,
        random_state=rand_state,
    )
    return train_features, test_features, train_target, test_target


## Naive Bayes Flow

### Injured

In [174]:
# Load the modelling table from disk for the raw-INJURED experiment.
df = load_dataset(file_name='df_to_model.csv')

In [175]:
# Separate the INJURED label (y) from the remaining columns (X).
target_column = "INJURED"
X, y = split_df_to_x_y(df=df, target_column=target_column)

In [176]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
test_ratio = 0.20
rand_state = 1
XTrain, XTest, yTrain, yTest = split_to_train_and_test(
    X, y, test_ratio=test_ratio, rand_state=rand_state
)

In [177]:
# Fit Gaussian naive Bayes on the training split (fit() returns the estimator).
gnb = GaussianNB().fit(XTrain, yTrain)

# Predict on both splits so train and test accuracy can be compared.
y_pred_train = gnb.predict(XTrain)
y_pred = gnb.predict(XTest)

# Report accuracy on each split.
print('Accuracy on Train data= ', metrics.accuracy_score(yTrain, y_pred_train))
print('Accuracy on test data= ', metrics.accuracy_score(yTest, y_pred))

Accuracy on Train data=  0.48438253474201043
Accuracy on test data=  0.48058039628265825


### With bins

In [194]:
# Reload the table so the binning below starts from the unmodified columns.
df = load_dataset(file_name='df_to_model.csv')

In [195]:
# Bucket INJURED into six ordinal classes using right-closed intervals:
# (-1, 0], (0, 1], (1, 10], (10, 50], (50, 100], (100, inf].
# The top edge is open-ended (np.inf) instead of the hard-coded 200000 so a
# larger value lands in class 5 rather than silently becoming NaN.
new_injured = pd.cut(
    df.INJURED,
    bins=[-1, 0, 1, 10, 50, 100, np.inf],
    labels=[0, 1, 2, 3, 4, 5],
)
df["INJURED"] = new_injured

In [196]:
# Bucket FATALITIES into six ordinal classes using right-closed intervals:
# (-1, 0], (0, 1], (1, 10], (10, 50], (50, 100], (100, inf].
# The top edge is open-ended (np.inf) instead of the hard-coded 200000 so a
# larger value lands in class 5 rather than silently becoming NaN.
new_fatalities = pd.cut(
    df.FATALITIES,
    bins=[-1, 0, 1, 10, 50, 100, np.inf],
    labels=[0, 1, 2, 3, 4, 5],
)
df["FATALITIES"] = new_fatalities

In [197]:
# Use the binned INJURED column as the label; every other column is a feature.
target_column = "INJURED"
X, y = split_df_to_x_y(df=df, target_column=target_column)

In [198]:
# Same 80/20 split and seed as the other experiments.
test_ratio = 0.20
rand_state = 1
XTrain, XTest, yTrain, yTest = split_to_train_and_test(
    X, y, test_ratio=test_ratio, rand_state=rand_state
)

In [199]:
# Fit Gaussian naive Bayes on the six-class INJURED label.
gnb = GaussianNB().fit(XTrain, yTrain)

# Predictions for the training rows and for the held-out rows.
y_pred_train = gnb.predict(XTrain)
y_pred = gnb.predict(XTest)

# Report accuracy on each split.
print('Accuracy on Train data= ', metrics.accuracy_score(yTrain, y_pred_train))
print('Accuracy on test data= ', metrics.accuracy_score(yTest, y_pred))

Accuracy on Train data=  0.6175090964885362
Accuracy on test data=  0.6149395055234087


### Binary bins (labels [0, 1])

In [200]:
# Reload the table so the binary binning starts from the unmodified columns.
df = load_dataset(file_name='df_to_model.csv')

In [201]:
# Collapse INJURED to a binary label: 0 for values in (-1, 0], 1 for anything larger.
# np.inf replaces the hard-coded 200000 cap so a larger value cannot turn into NaN.
new_injured = pd.cut(df.INJURED, bins=[-1, 0, np.inf], labels=[0, 1])
df["INJURED"] = new_injured

In [202]:
# Collapse FATALITIES to a binary label: 0 for values in (-1, 0], 1 for anything larger.
# np.inf replaces the hard-coded 200000 cap so a larger value cannot turn into NaN.
new_fatalities = pd.cut(df.FATALITIES, bins=[-1, 0, np.inf], labels=[0, 1])
df["FATALITIES"] = new_fatalities

In [203]:
# Use the binary INJURED column as the label; every other column is a feature.
target_column = "INJURED"
X, y = split_df_to_x_y(df=df, target_column=target_column)

In [204]:
# Same 80/20 split and seed as the other experiments.
test_ratio = 0.20
rand_state = 1
XTrain, XTest, yTrain, yTest = split_to_train_and_test(
    X, y, test_ratio=test_ratio, rand_state=rand_state
)

In [205]:
# Fit Gaussian naive Bayes on the binary INJURED label.
gnb = GaussianNB().fit(XTrain, yTrain)

# Predictions for the training rows and for the held-out rows.
y_pred_train = gnb.predict(XTrain)
y_pred = gnb.predict(XTest)

# Report accuracy on each split.
print('Accuracy on Train data= ', metrics.accuracy_score(yTrain, y_pred_train))
print('Accuracy on test data= ', metrics.accuracy_score(yTest, y_pred))

Accuracy on Train data=  0.6568431896891851
Accuracy on test data=  0.6488251797299667


### try with BernoulliNB

In [206]:
# Reload the table for the BernoulliNB experiment on INJURED.
df = load_dataset(file_name='df_to_model.csv')

In [207]:
# Collapse FATALITIES to a binary column: 0 for values in (-1, 0], 1 for anything larger.
# np.inf replaces the hard-coded 200000 cap so a larger value cannot turn into NaN.
new_fatalities = pd.cut(df.FATALITIES, bins=[-1, 0, np.inf], labels=[0, 1])
df["FATALITIES"] = new_fatalities

In [208]:
# Collapse INJURED to a binary label: 0 for values in (-1, 0], 1 for anything larger.
# np.inf replaces the hard-coded 200000 cap so a larger value cannot turn into NaN.
new_injured = pd.cut(df.INJURED, bins=[-1, 0, np.inf], labels=[0, 1])
df["INJURED"] = new_injured

In [209]:
# Use the binary INJURED column as the label; every other column is a feature.
target_column = "INJURED"
X, y = split_df_to_x_y(df=df, target_column=target_column)

In [210]:
# Same 80/20 split and seed as the other experiments.
test_ratio = 0.20
rand_state = 1
XTrain, XTest, yTrain, yTest = split_to_train_and_test(
    X, y, test_ratio=test_ratio, rand_state=rand_state
)

In [211]:
# BernoulliNB is imported once in the notebook's top import cell instead of
# being re-imported inside this cell.
# NOTE(review): BernoulliNB thresholds every feature at `binarize` (default 0.0),
# so any feature that is not already 0/1 is reduced to "> 0" — confirm this is intended.
# The variable keeps the name `gnb` used by the other model cells.
gnb = BernoulliNB()

# Train classifier
gnb.fit(XTrain, yTrain)

y_pred = gnb.predict(XTest)
y_pred_train = gnb.predict(XTrain)

# Print results
print('Accuracy on Train data= ', metrics.accuracy_score(y_true=yTrain, y_pred=y_pred_train))
print('Accuracy on test data= ', metrics.accuracy_score(y_true=yTest, y_pred=y_pred))

Accuracy on Train data=  0.6593529437552058
Accuracy on test data=  0.6563650710152551


### try with CategoricalNB

In [256]:
# Reload the table for the CategoricalNB experiment on INJURED.
df = load_dataset(file_name='df_to_model.csv')

In [257]:
# Collapse FATALITIES to a binary column: 0 for values in (-1, 0], 1 for anything larger.
# np.inf replaces the hard-coded 200000 cap so a larger value cannot turn into NaN.
new_fatalities = pd.cut(df.FATALITIES, bins=[-1, 0, np.inf], labels=[0, 1])
df["FATALITIES"] = new_fatalities

In [258]:
# Collapse INJURED to a binary label: 0 for values in (-1, 0], 1 for anything larger.
# np.inf replaces the hard-coded 200000 cap so a larger value cannot turn into NaN.
new_injured = pd.cut(df.INJURED, bins=[-1, 0, np.inf], labels=[0, 1])
df["INJURED"] = new_injured

In [259]:
# Use the binary INJURED column as the label; every other column is a feature.
target_column = "INJURED"
X, y = split_df_to_x_y(df=df, target_column=target_column)

In [260]:
# Same 80/20 split and seed as the other experiments.
test_ratio = 0.20
rand_state = 1
XTrain, XTest, yTrain, yTest = split_to_train_and_test(
    X, y, test_ratio=test_ratio, rand_state=rand_state
)

In [261]:
# CategoricalNB is imported once in the notebook's top import cell instead of
# being re-imported inside this cell.
# NOTE(review): CategoricalNB expects every feature to hold non-negative integer
# category codes — confirm df_to_model.csv is encoded that way.
# The variable keeps the name `gnb` used by the other model cells.
gnb = CategoricalNB()

# Train classifier
gnb.fit(XTrain, yTrain)

y_pred = gnb.predict(XTest)
y_pred_train = gnb.predict(XTrain)

# Print results
print('Accuracy on Train data= ', metrics.accuracy_score(y_true=yTrain, y_pred=y_pred_train))
print('Accuracy on test data= ', metrics.accuracy_score(y_true=yTest, y_pred=y_pred))

Accuracy on Train data=  0.7466134759545833
Accuracy on test data=  0.6939768542872172


### Fatalities

In [212]:
# Reload the table for the raw-FATALITIES experiment.
df = load_dataset(file_name='df_to_model.csv')

In [213]:
# Separate the FATALITIES label (y) from the remaining columns (X).
target_column = "FATALITIES"
X, y = split_df_to_x_y(df=df, target_column=target_column)

In [214]:
# Same 80/20 split and seed as the other experiments.
test_ratio = 0.20
rand_state = 1
XTrain, XTest, yTrain, yTest = split_to_train_and_test(
    X, y, test_ratio=test_ratio, rand_state=rand_state
)

In [215]:
# Fit Gaussian naive Bayes on the unbinned FATALITIES label.
gnb = GaussianNB().fit(XTrain, yTrain)

# Predictions for the training rows and for the held-out rows.
y_pred_train = gnb.predict(XTrain)
y_pred = gnb.predict(XTest)

# Report accuracy on each split.
print('Accuracy on Train data= ', metrics.accuracy_score(yTrain, y_pred_train))
print('Accuracy on test data= ', metrics.accuracy_score(yTest, y_pred))

Accuracy on Train data=  0.5683442198939108
Accuracy on test data=  0.5655795195511134


### With bins

In [216]:
# Reload the table so the binning below starts from the unmodified columns.
df = load_dataset(file_name='df_to_model.csv')

In [217]:
# Bucket FATALITIES into six ordinal classes using right-closed intervals:
# (-1, 0], (0, 1], (1, 10], (10, 50], (50, 100], (100, inf].
# The top edge is open-ended (np.inf) instead of the hard-coded 200000 so a
# larger value lands in class 5 rather than silently becoming NaN.
new_fatalities = pd.cut(
    df.FATALITIES,
    bins=[-1, 0, 1, 10, 50, 100, np.inf],
    labels=[0, 1, 2, 3, 4, 5],
)
df["FATALITIES"] = new_fatalities

In [218]:
# Bucket INJURED into six ordinal classes using right-closed intervals:
# (-1, 0], (0, 1], (1, 10], (10, 50], (50, 100], (100, inf].
# The top edge is open-ended (np.inf) instead of the hard-coded 200000 so a
# larger value lands in class 5 rather than silently becoming NaN.
new_injured = pd.cut(
    df.INJURED,
    bins=[-1, 0, 1, 10, 50, 100, np.inf],
    labels=[0, 1, 2, 3, 4, 5],
)
df["INJURED"] = new_injured

In [219]:
# Use the binned FATALITIES column as the label; every other column is a feature.
target_column = "FATALITIES"
X, y = split_df_to_x_y(df=df, target_column=target_column)

In [220]:
# Same 80/20 split and seed as the other experiments.
test_ratio = 0.20
rand_state = 1
XTrain, XTest, yTrain, yTest = split_to_train_and_test(
    X, y, test_ratio=test_ratio, rand_state=rand_state
)

In [221]:
# Fit Gaussian naive Bayes on the six-class FATALITIES label.
gnb = GaussianNB().fit(XTrain, yTrain)

# Predictions for the training rows and for the held-out rows.
y_pred_train = gnb.predict(XTrain)
y_pred = gnb.predict(XTest)

# Report accuracy on each split.
print('Accuracy on Train data= ', metrics.accuracy_score(yTrain, y_pred_train))
print('Accuracy on test data= ', metrics.accuracy_score(yTest, y_pred))

Accuracy on Train data=  0.6219039060102582
Accuracy on test data=  0.6219533578818166


### Binary bins (labels [0, 1])

In [232]:
# Reload the table so the binary binning starts from the unmodified columns.
df = load_dataset(file_name='df_to_model.csv')

In [233]:
# Collapse FATALITIES to a binary label: 0 for values in (-1, 0], 1 for anything larger.
# np.inf replaces the hard-coded 200000 cap so a larger value cannot turn into NaN.
new_fatalities = pd.cut(df.FATALITIES, bins=[-1, 0, np.inf], labels=[0, 1])
df["FATALITIES"] = new_fatalities

In [234]:
# Collapse INJURED to a binary column: 0 for values in (-1, 0], 1 for anything larger.
# np.inf replaces the hard-coded 200000 cap so a larger value cannot turn into NaN.
new_injured = pd.cut(df.INJURED, bins=[-1, 0, np.inf], labels=[0, 1])
df["INJURED"] = new_injured

In [235]:
# Use the binary FATALITIES column as the label; every other column is a feature.
target_column = "FATALITIES"
X, y = split_df_to_x_y(df=df, target_column=target_column)

In [236]:
# Same 80/20 split and seed as the other experiments.
test_ratio = 0.20
rand_state = 1
XTrain, XTest, yTrain, yTest = split_to_train_and_test(
    X, y, test_ratio=test_ratio, rand_state=rand_state
)

In [237]:
# Fit Gaussian naive Bayes on the binary FATALITIES label.
gnb = GaussianNB().fit(XTrain, yTrain)

# Predictions for the training rows and for the held-out rows.
y_pred_train = gnb.predict(XTrain)
y_pred = gnb.predict(XTest)

# Report accuracy on each split.
print('Accuracy on Train data= ', metrics.accuracy_score(yTrain, y_pred_train))
print('Accuracy on test data= ', metrics.accuracy_score(yTest, y_pred))

Accuracy on Train data=  0.7644776642847749
Accuracy on test data=  0.7664387164650184


### try with BernoulliNB

In [238]:
# Reload the table for the BernoulliNB experiment on FATALITIES.
df = load_dataset(file_name='df_to_model.csv')

In [239]:
# Collapse FATALITIES to a binary label: 0 for values in (-1, 0], 1 for anything larger.
# np.inf replaces the hard-coded 200000 cap so a larger value cannot turn into NaN.
new_fatalities = pd.cut(df.FATALITIES, bins=[-1, 0, np.inf], labels=[0, 1])
df["FATALITIES"] = new_fatalities

In [240]:
# Collapse INJURED to a binary column: 0 for values in (-1, 0], 1 for anything larger.
# np.inf replaces the hard-coded 200000 cap so a larger value cannot turn into NaN.
new_injured = pd.cut(df.INJURED, bins=[-1, 0, np.inf], labels=[0, 1])
df["INJURED"] = new_injured

In [241]:
# Use the binary FATALITIES column as the label; every other column is a feature.
target_column = "FATALITIES"
X, y = split_df_to_x_y(df=df, target_column=target_column)

In [242]:
# Same 80/20 split and seed as the other experiments.
test_ratio = 0.20
rand_state = 1
XTrain, XTest, yTrain, yTest = split_to_train_and_test(
    X, y, test_ratio=test_ratio, rand_state=rand_state
)

In [243]:
# BernoulliNB is imported once in the notebook's top import cell instead of
# being re-imported inside this cell.
# NOTE(review): BernoulliNB thresholds every feature at `binarize` (default 0.0),
# so any feature that is not already 0/1 is reduced to "> 0" — confirm this is intended.
# The variable keeps the name `gnb` used by the other model cells.
gnb = BernoulliNB()

# Train classifier
gnb.fit(XTrain, yTrain)

y_pred = gnb.predict(XTest)
y_pred_train = gnb.predict(XTrain)

# Print results
print('Accuracy on Train data= ', metrics.accuracy_score(y_true=yTrain, y_pred=y_pred_train))
print('Accuracy on test data= ', metrics.accuracy_score(y_true=yTest, y_pred=y_pred))

Accuracy on Train data=  0.7485314102845119
Accuracy on test data=  0.7539014553743644


### try with CategoricalNB

In [244]:
# Reload the table for the CategoricalNB experiment on FATALITIES.
df = load_dataset(file_name='df_to_model.csv')

In [245]:
# Collapse FATALITIES to a binary label: 0 for values in (-1, 0], 1 for anything larger.
# np.inf replaces the hard-coded 200000 cap so a larger value cannot turn into NaN.
new_fatalities = pd.cut(df.FATALITIES, bins=[-1, 0, np.inf], labels=[0, 1])
df["FATALITIES"] = new_fatalities

In [246]:
# Collapse INJURED to a binary column: 0 for values in (-1, 0], 1 for anything larger.
# np.inf replaces the hard-coded 200000 cap so a larger value cannot turn into NaN.
new_injured = pd.cut(df.INJURED, bins=[-1, 0, np.inf], labels=[0, 1])
df["INJURED"] = new_injured

In [247]:
# Use the binary FATALITIES column as the label; every other column is a feature.
target_column = "FATALITIES"
X, y = split_df_to_x_y(df=df, target_column=target_column)

In [248]:
# Same 80/20 split and seed as the other experiments.
test_ratio = 0.20
rand_state = 1
XTrain, XTest, yTrain, yTest = split_to_train_and_test(
    X, y, test_ratio=test_ratio, rand_state=rand_state
)

In [249]:
# CategoricalNB is imported once in the notebook's top import cell instead of
# being re-imported inside this cell.
# NOTE(review): CategoricalNB expects every feature to hold non-negative integer
# category codes — confirm df_to_model.csv is encoded that way.
# The variable keeps the name `gnb` used by the other model cells.
gnb = CategoricalNB()

# Train classifier
gnb.fit(XTrain, yTrain)

y_pred = gnb.predict(XTest)
y_pred_train = gnb.predict(XTrain)

# Print results
print('Accuracy on Train data= ', metrics.accuracy_score(y_true=yTrain, y_pred=y_pred_train))
print('Accuracy on test data= ', metrics.accuracy_score(y_true=yTest, y_pred=y_pred))

Accuracy on Train data=  0.7907369251676822
Accuracy on test data=  0.7735840785551464


### second try with CategoricalNB, with different bins

In [250]:
# Reload the table for the second CategoricalNB experiment (six-class bins).
df = load_dataset(file_name='df_to_model.csv')

In [251]:
# Bucket FATALITIES into six ordinal classes using right-closed intervals:
# (-1, 0], (0, 1], (1, 10], (10, 50], (50, 100], (100, inf].
# The top edge is open-ended (np.inf) instead of the hard-coded 2000 (other
# sections used 200000) so a larger value lands in class 5 rather than
# silently becoming NaN.
new_fatalities = pd.cut(
    df.FATALITIES,
    bins=[-1, 0, 1, 10, 50, 100, np.inf],
    labels=[0, 1, 2, 3, 4, 5],
)
df["FATALITIES"] = new_fatalities

In [252]:
# Bucket INJURED into six ordinal classes using right-closed intervals:
# (-1, 0], (0, 1], (1, 10], (10, 50], (50, 100], (100, inf].
# The top edge is open-ended (np.inf) instead of the hard-coded 20000 (other
# sections used 200000) so a larger value lands in class 5 rather than
# silently becoming NaN.
new_injured = pd.cut(
    df.INJURED,
    bins=[-1, 0, 1, 10, 50, 100, np.inf],
    labels=[0, 1, 2, 3, 4, 5],
)
df["INJURED"] = new_injured

In [253]:
# Use the binned FATALITIES column as the label; every other column is a feature.
target_column = "FATALITIES"
X, y = split_df_to_x_y(df=df, target_column=target_column)

In [254]:
# Same 80/20 split and seed as the other experiments.
test_ratio = 0.20
rand_state = 1
XTrain, XTest, yTrain, yTest = split_to_train_and_test(
    X, y, test_ratio=test_ratio, rand_state=rand_state
)

In [255]:
# CategoricalNB is imported once in the notebook's top import cell instead of
# being re-imported inside this cell.
# NOTE(review): CategoricalNB expects every feature to hold non-negative integer
# category codes — confirm df_to_model.csv is encoded that way.
# The variable keeps the name `gnb` used by the other model cells.
gnb = CategoricalNB()

# Train classifier
gnb.fit(XTrain, yTrain)

y_pred = gnb.predict(XTest)
y_pred_train = gnb.predict(XTrain)

# Print results
print('Accuracy on Train data= ', metrics.accuracy_score(y_true=yTrain, y_pred=y_pred_train))
print('Accuracy on test data= ', metrics.accuracy_score(y_true=yTest, y_pred=y_pred))

Accuracy on Train data=  0.6717373197141729
Accuracy on test data=  0.63392074346835
