In [128]:
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
import pandas as pd
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import CategoricalNB
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
%matplotlib inline

# Functions that will be used

In [129]:
def load_dataset(file_name):
    """Read a CSV file into a DataFrame, discarding a saved index column.

    A column named ``"Unnamed: 0"`` (typically the index written out by
    ``DataFrame.to_csv``) is dropped when present. Files that do not contain
    that column are returned unchanged instead of raising ``KeyError``.

    Parameters
    ----------
    file_name : str, path-like or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded data without the ``"Unnamed: 0"`` column.
    """
    df = pd.read_csv(file_name)
    # errors="ignore" keeps the loader usable for CSVs saved with index=False.
    return df.drop(columns=["Unnamed: 0"], errors="ignore")

In [130]:
def split_df_to_x_y(df, target_column):
    """Separate a DataFrame into a feature matrix and a target series.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset, including the target column.
    target_column : str
        Name of the column to predict.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` holds every column except the target (original
        order preserved) and ``y`` is ``df[target_column]``.
    """
    target = df[target_column]
    column_names = list(df.columns)
    # Cut out the first occurrence of the target label, keeping the rest in order.
    position = column_names.index(target_column)
    feature_names = column_names[:position] + column_names[position + 1:]
    return df[feature_names], target

In [131]:
def split_to_train_and_test(X, y, test_ratio, rand_state):
    """Split features and target into train and test partitions.

    Thin wrapper around ``sklearn.model_selection.train_test_split``.

    Parameters
    ----------
    X : features to split.
    y : target values aligned with ``X``.
    test_ratio : forwarded as ``test_size``.
    rand_state : forwarded as ``random_state``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    partitions = train_test_split(
        X,
        y,
        test_size=test_ratio,
        random_state=rand_state,
    )
    features_train, features_test, target_train, target_test = partitions
    return features_train, features_test, target_train, target_test


## Naive Bayes - Flow:
## Question: Were there any injured? (Yes/No)

### Setting INJURED and FATALITIES to binary bins: 0 (none) or 1 (at least one)


### Try the model with GaussianNB classifier

In [132]:
# Load the modelling table from disk; the cells below overwrite its count columns.
df = load_dataset(file_name="df_to_model.csv")

In [133]:
# Collapse the INJURED count into a binary label:
#   (-1, 0]      -> 0
#   (0, 200000]  -> 1
# Values outside both intervals become NaN.
new_injured = pd.cut(
    x=df.INJURED,
    bins=[-1, 0, 200000],
    labels=[0, 1],
)
df["INJURED"] = new_injured

In [134]:
# Collapse the FATALITIES count into a binary label:
#   (-1, 0]      -> 0
#   (0, 200000]  -> 1
# Values outside both intervals become NaN.
new_fatalities = pd.cut(
    x=df.FATALITIES,
    bins=[-1, 0, 200000],
    labels=[0, 1],
)
df["FATALITIES"] = new_fatalities

In [135]:
# Predict the binary INJURED label; every other column of df becomes a feature.
target_column = "INJURED"
X, y = split_df_to_x_y(df, target_column=target_column)

In [136]:
# Hold out 20% of the rows for testing; the fixed seed is passed through to the split.
test_ratio = 0.2
rand_state = 1
XTrain, XTest, yTrain, yTest = split_to_train_and_test(
    X, y, test_ratio=test_ratio, rand_state=rand_state
)

In [137]:
# Build and train a Gaussian Naive Bayes classifier on the training split.
# fit() returns the estimator itself, so construction and training are chained.
gnb = GaussianNB().fit(XTrain, yTrain)

# Predict on both splits so train and test accuracy can be compared.
y_pred_train = gnb.predict(XTrain)
y_pred = gnb.predict(XTest)

# Print results
print('Accuracy on Train data= ', metrics.accuracy_score(yTrain, y_pred_train))
print('Accuracy on test data= ', metrics.accuracy_score(yTest, y_pred))

Accuracy on Train data=  0.6568431896891851
Accuracy on test data=  0.6488251797299667


### Try the model with the BernoulliNB classifier
#### because this classifier is suitable for discrete data and is designed for binary/boolean features.

In [138]:
# Reload the CSV so this experiment starts again from the file's contents.
df = load_dataset(file_name="df_to_model.csv")

In [139]:
# Binary FATALITIES label: (-1, 0] -> 0 and (0, 200000] -> 1.
# Anything outside those two intervals is left as NaN by pd.cut.
new_fatalities = pd.cut(
    x=df.FATALITIES,
    bins=[-1, 0, 200000],
    labels=[0, 1],
)
df["FATALITIES"] = new_fatalities

In [140]:
# Binary INJURED label: (-1, 0] -> 0 and (0, 200000] -> 1.
# Anything outside those two intervals is left as NaN by pd.cut.
new_injured = pd.cut(
    x=df.INJURED,
    bins=[-1, 0, 200000],
    labels=[0, 1],
)
df["INJURED"] = new_injured

In [141]:
# INJURED is the label to predict; the remaining columns form the feature matrix.
target_column = "INJURED"
X, y = split_df_to_x_y(df, target_column=target_column)

In [142]:
# 80/20 train/test split with a fixed seed.
test_ratio = 0.2
rand_state = 1
XTrain, XTest, yTrain, yTest = split_to_train_and_test(
    X, y, test_ratio=test_ratio, rand_state=rand_state
)

In [143]:
# BernoulliNB is already imported in the notebook's first cell, so it is not
# re-imported here (imports belong in one place at the top of the notebook).
# NOTE: the variable keeps the name `gnb` used throughout the notebook even
# though it holds a BernoulliNB model in this section.
# With its default settings BernoulliNB binarizes each feature at 0.0, so any
# non-binary column is reduced to "zero" versus "greater than zero".
gnb = BernoulliNB()

# Train classifier
gnb.fit(XTrain, yTrain)

# Predict on both splits so train and test accuracy can be compared.
y_pred = gnb.predict(XTest)
y_pred_train = gnb.predict(XTrain)

# Print results
print('Accuracy on Train data= ', metrics.accuracy_score(y_true=yTrain, y_pred=y_pred_train))
print('Accuracy on test data= ', metrics.accuracy_score(y_true=yTest, y_pred=y_pred))

Accuracy on Train data=  0.6593529437552058
Accuracy on test data=  0.6563650710152551


### Try with CategoricalNB classifier
#### because this classifier is suitable for classification with discrete features that are categorically distributed. The categories of each feature are drawn from a categorical distribution.

In [144]:
# Fresh copy of the modelling table for the CategoricalNB experiment.
df = load_dataset(file_name="df_to_model.csv")

In [145]:
# Replace the FATALITIES count with a 0/1 label
# (0 for the interval (-1, 0], 1 for (0, 200000], NaN otherwise).
new_fatalities = pd.cut(
    x=df.FATALITIES,
    bins=[-1, 0, 200000],
    labels=[0, 1],
)
df["FATALITIES"] = new_fatalities

In [146]:
# Replace the INJURED count with a 0/1 label
# (0 for the interval (-1, 0], 1 for (0, 200000], NaN otherwise).
new_injured = pd.cut(
    x=df.INJURED,
    bins=[-1, 0, 200000],
    labels=[0, 1],
)
df["INJURED"] = new_injured

In [147]:
# Use INJURED as the target and all remaining columns as features.
target_column = "INJURED"
X, y = split_df_to_x_y(df, target_column=target_column)

In [148]:
# Same split settings as the other experiments: 20% test rows, seed 1.
test_ratio = 0.2
rand_state = 1
XTrain, XTest, yTrain, yTest = split_to_train_and_test(
    X, y, test_ratio=test_ratio, rand_state=rand_state
)

In [149]:

# Build and train a Categorical Naive Bayes classifier (the variable keeps the
# notebook-wide name `gnb`). fit() returns the estimator, so the calls are chained.
# NOTE(review): assumes every feature column is already encoded as non-negative
# integer category codes, as CategoricalNB expects - TODO confirm.
gnb = CategoricalNB().fit(XTrain, yTrain)

# Predict on both splits so train and test accuracy can be compared.
y_pred_train = gnb.predict(XTrain)
y_pred = gnb.predict(XTest)

# Print results
print('Accuracy on Train data= ', metrics.accuracy_score(yTrain, y_pred_train))
print('Accuracy on test data= ', metrics.accuracy_score(yTest, y_pred))

Accuracy on Train data=  0.7466134759545833
Accuracy on test data=  0.6939768542872172


## Question: Were there any fatalities? (Yes/No)

### Setting INJURED and FATALITIES to binary bins: 0 (none) or 1 (at least one)

### Try the model with GaussianNB classifier

In [150]:
# Load the modelling table again for the FATALITIES experiments.
df = load_dataset(file_name="df_to_model.csv")

In [151]:
# Turn the FATALITIES count into a two-class label:
# interval (-1, 0] maps to 0, interval (0, 200000] maps to 1, the rest to NaN.
new_fatalities = pd.cut(
    x=df.FATALITIES,
    bins=[-1, 0, 200000],
    labels=[0, 1],
)
df["FATALITIES"] = new_fatalities

In [152]:
# Turn the INJURED count into a two-class label:
# interval (-1, 0] maps to 0, interval (0, 200000] maps to 1, the rest to NaN.
new_injured = pd.cut(
    x=df.INJURED,
    bins=[-1, 0, 200000],
    labels=[0, 1],
)
df["INJURED"] = new_injured

In [153]:
# Predict the binary FATALITIES label; every other column of df becomes a feature.
target_column = "FATALITIES"
X, y = split_df_to_x_y(df, target_column=target_column)

In [154]:
# Hold out 20% of the rows for testing; the fixed seed is passed through to the split.
test_ratio = 0.2
rand_state = 1
XTrain, XTest, yTrain, yTest = split_to_train_and_test(
    X, y, test_ratio=test_ratio, rand_state=rand_state
)

In [155]:
# Build and train a Gaussian Naive Bayes classifier for the FATALITIES target.
# fit() returns the estimator itself, so construction and training are chained.
gnb = GaussianNB().fit(XTrain, yTrain)

# Predict on both splits so train and test accuracy can be compared.
y_pred_train = gnb.predict(XTrain)
y_pred = gnb.predict(XTest)

# Print results
print('Accuracy on Train data= ', metrics.accuracy_score(yTrain, y_pred_train))
print('Accuracy on test data= ', metrics.accuracy_score(yTest, y_pred))

Accuracy on Train data=  0.7644776642847749
Accuracy on test data=  0.7664387164650184


### Try the model with the BernoulliNB classifier
#### because this classifier is suitable for discrete data and is designed for binary/boolean features.

In [156]:
# Reload the CSV so this experiment starts again from the file's contents.
df = load_dataset(file_name="df_to_model.csv")

In [157]:
# Binary FATALITIES label: (-1, 0] -> 0 and (0, 200000] -> 1.
# Anything outside those two intervals is left as NaN by pd.cut.
new_fatalities = pd.cut(
    x=df.FATALITIES,
    bins=[-1, 0, 200000],
    labels=[0, 1],
)
df["FATALITIES"] = new_fatalities

In [158]:
# Binary INJURED label: (-1, 0] -> 0 and (0, 200000] -> 1.
# Anything outside those two intervals is left as NaN by pd.cut.
new_injured = pd.cut(
    x=df.INJURED,
    bins=[-1, 0, 200000],
    labels=[0, 1],
)
df["INJURED"] = new_injured

In [159]:
# FATALITIES is the label to predict; the remaining columns form the feature matrix.
target_column = "FATALITIES"
X, y = split_df_to_x_y(df, target_column=target_column)

In [160]:
# 80/20 train/test split with a fixed seed.
test_ratio = 0.2
rand_state = 1
XTrain, XTest, yTrain, yTest = split_to_train_and_test(
    X, y, test_ratio=test_ratio, rand_state=rand_state
)

In [161]:

# Build and train a Bernoulli Naive Bayes classifier (the variable keeps the
# notebook-wide name `gnb`). fit() returns the estimator, so the calls are chained.
gnb = BernoulliNB().fit(XTrain, yTrain)

# Predict on both splits so train and test accuracy can be compared.
y_pred_train = gnb.predict(XTrain)
y_pred = gnb.predict(XTest)

# Print results
print('Accuracy on Train data= ', metrics.accuracy_score(yTrain, y_pred_train))
print('Accuracy on test data= ', metrics.accuracy_score(yTest, y_pred))

Accuracy on Train data=  0.7485314102845119
Accuracy on test data=  0.7539014553743644


### Try with CategoricalNB
#### because this classifier is suitable for classification with discrete features that are categorically distributed. The categories of each feature are drawn from a categorical distribution.

In [162]:
# Fresh copy of the modelling table for the CategoricalNB experiment.
df = load_dataset(file_name="df_to_model.csv")

In [163]:
# Replace the FATALITIES count with a 0/1 label
# (0 for the interval (-1, 0], 1 for (0, 200000], NaN otherwise).
new_fatalities = pd.cut(
    x=df.FATALITIES,
    bins=[-1, 0, 200000],
    labels=[0, 1],
)
df["FATALITIES"] = new_fatalities

In [164]:
# Replace the INJURED count with a 0/1 label
# (0 for the interval (-1, 0], 1 for (0, 200000], NaN otherwise).
new_injured = pd.cut(
    x=df.INJURED,
    bins=[-1, 0, 200000],
    labels=[0, 1],
)
df["INJURED"] = new_injured

In [165]:
# Use FATALITIES as the target and all remaining columns as features.
target_column = "FATALITIES"
X, y = split_df_to_x_y(df, target_column=target_column)

In [166]:
# Same split settings as the other experiments: 20% test rows, seed 1.
test_ratio = 0.2
rand_state = 1
XTrain, XTest, yTrain, yTest = split_to_train_and_test(
    X, y, test_ratio=test_ratio, rand_state=rand_state
)

In [167]:

# Build and train a Categorical Naive Bayes classifier for the FATALITIES target
# (the variable keeps the notebook-wide name `gnb`). fit() returns the estimator.
# NOTE(review): assumes every feature column is already encoded as non-negative
# integer category codes, as CategoricalNB expects - TODO confirm.
gnb = CategoricalNB().fit(XTrain, yTrain)

# Predict on both splits so train and test accuracy can be compared.
y_pred_train = gnb.predict(XTrain)
y_pred = gnb.predict(XTest)

# Print results
print('Accuracy on Train data= ', metrics.accuracy_score(yTrain, y_pred_train))
print('Accuracy on test data= ', metrics.accuracy_score(yTest, y_pred))

Accuracy on Train data=  0.7907369251676822
Accuracy on test data=  0.7735840785551464
