In [1]:
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
import pandas as pd
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import CategoricalNB
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
%matplotlib inline

# Functions that will be used

In [2]:
def load_dataset(file_name):
    """Read a CSV file into a DataFrame, discarding the saved index column.

    Parameters
    ----------
    file_name : str or path-like
        Location of the CSV file; passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The file contents without the ``"Unnamed: 0"`` column, if it exists.
    """
    df = pd.read_csv(file_name)
    # "Unnamed: 0" is the header pandas gives a column whose name is empty
    # (typically an index written out by to_csv). errors="ignore" keeps the
    # loader usable on files that were saved without such a column instead of
    # failing with a KeyError.
    return df.drop(columns=["Unnamed: 0"], errors="ignore")

In [3]:
def split_df_to_x_y(df, target_column):
    """Separate a DataFrame into a feature frame and a target column.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding both the features and the target.
    target_column : str
        Name of the column to use as the target.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` holds every column of ``df`` except
        ``target_column`` (original order kept) and ``y`` is that column.
    """
    target = df[target_column]
    feature_names = list(df.columns)
    feature_names.remove(target_column)
    return df[feature_names], target

In [4]:
def split_to_train_and_test(X, y, test_ratio, rand_state):
    """Split features and target into train and test parts.

    Thin wrapper around ``train_test_split``.

    Parameters
    ----------
    X, y
        Features and target to split together.
    test_ratio
        Forwarded as ``test_size``.
    rand_state
        Forwarded as ``random_state``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    train_features, test_features, train_target, test_target = train_test_split(
        X,
        y,
        test_size=test_ratio,
        random_state=rand_state,
    )
    return train_features, test_features, train_target, test_target


# Naive Bayes - Flow:
## Question: Will anyone be injured? (Yes/No)

Binarizing `INJURED` and `FATALITIES`: 0 = none, 1 = at least one.


### Trying the model with GaussianNB classifier

In [5]:
# Start from a fresh copy of the file: the following cells overwrite the
# INJURED and FATALITIES columns of `df` in place.
df = load_dataset('df_to_model.csv')

In [6]:
# Binarize INJURED: values in (-1, 0] become 0, anything above 0 becomes 1.
# The top edge is open-ended (np.inf) rather than a hard-coded 200000, so an
# unusually large value is labelled 1 instead of silently turning into NaN.
new_injured = pd.cut(df["INJURED"], bins=[-1, 0, np.inf], labels=[0, 1])
df["INJURED"] = new_injured

In [7]:
# Binarize FATALITIES: values in (-1, 0] become 0, anything above 0 becomes 1.
# The top edge is open-ended (np.inf) rather than a hard-coded 200000, so an
# unusually large value is labelled 1 instead of silently turning into NaN.
new_fatalities = pd.cut(df["FATALITIES"], bins=[-1, 0, np.inf], labels=[0, 1])
df["FATALITIES"] = new_fatalities

In [8]:
# Target is INJURED; every other column of df is used as a feature.
target_column = "INJURED"
X, y = split_df_to_x_y(df, target_column)

In [9]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
test_ratio = 0.20
rand_state = 1
XTrain, XTest, yTrain, yTest = split_to_train_and_test(X, y, test_ratio, rand_state)

In [10]:
# Fit Gaussian Naive Bayes on the training split
# (fit() returns the estimator itself, so construction and fitting are chained).
gnb = GaussianNB().fit(XTrain, yTrain)

# Predict on both splits so train and test accuracy can be compared.
y_pred_train = gnb.predict(XTrain)
y_pred = gnb.predict(XTest)

# Report accuracy on each split.
print('Accuracy on Train data= ', metrics.accuracy_score(yTrain, y_pred_train))
print('Accuracy on test data= ', metrics.accuracy_score(yTest, y_pred))

Accuracy on Train data=  0.6568431896891851
Accuracy on test data=  0.6488251797299667


As we can see, the results are OK (compared to other models) but not great (0.64 is pretty common for Linear Regression and KNN).

### Trying  the model with  BernoulliNB classifier
We read a little about this classifier online.
<br>
It seems to be better suited to discrete data and is designed for binary/boolean features.

In [11]:
# Start from a fresh copy of the file: the following cells overwrite the
# FATALITIES and INJURED columns of `df` in place.
df = load_dataset('df_to_model.csv')

In [12]:
# Binarize FATALITIES: values in (-1, 0] become 0, anything above 0 becomes 1.
# The top edge is open-ended (np.inf) rather than a hard-coded 200000, so an
# unusually large value is labelled 1 instead of silently turning into NaN.
new_fatalities = pd.cut(df["FATALITIES"], bins=[-1, 0, np.inf], labels=[0, 1])
df["FATALITIES"] = new_fatalities

In [13]:
# Binarize INJURED: values in (-1, 0] become 0, anything above 0 becomes 1.
# The top edge is open-ended (np.inf) rather than a hard-coded 200000, so an
# unusually large value is labelled 1 instead of silently turning into NaN.
new_injured = pd.cut(df["INJURED"], bins=[-1, 0, np.inf], labels=[0, 1])
df["INJURED"] = new_injured

In [14]:
# Target is INJURED; every other column of df is used as a feature.
target_column = "INJURED"
X, y = split_df_to_x_y(df, target_column)

In [15]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
test_ratio = 0.20
rand_state = 1
XTrain, XTest, yTrain, yTest = split_to_train_and_test(X, y, test_ratio, rand_state)

In [16]:
# BernoulliNB is already imported in the notebook's import cell, so it is not
# re-imported here. The estimator is named `bnb` (not `gnb`) to match its type.
# Note: with its default `binarize=0.0`, BernoulliNB thresholds every feature
# at zero before fitting, so non-binary columns are reduced to "> 0 or not".
bnb = BernoulliNB()

# Train classifier
bnb.fit(XTrain, yTrain)

# Predict on both splits so train and test accuracy can be compared.
y_pred = bnb.predict(XTest)
y_pred_train = bnb.predict(XTrain)

# Print results
print('Accuracy on Train data= ', metrics.accuracy_score(y_true=yTrain, y_pred=y_pred_train))
print('Accuracy on test data= ', metrics.accuracy_score(y_true=yTest, y_pred=y_pred))

Accuracy on Train data=  0.6593529437552058
Accuracy on test data=  0.6563650710152551


We can see the results are a little better (0.656 on the test set, compared to 0.649 earlier).

### Trying the model with CategoricalNB classifier
We found out that this classifier is suitable for classification with discrete features that are categorically distributed.
<br>
The categories of each feature are drawn from a categorical distribution.

In [17]:
# Start from a fresh copy of the file: the following cells overwrite the
# FATALITIES and INJURED columns of `df` in place.
df = load_dataset('df_to_model.csv')

In [18]:
# Binarize FATALITIES: values in (-1, 0] become 0, anything above 0 becomes 1.
# The top edge is open-ended (np.inf) rather than a hard-coded 200000, so an
# unusually large value is labelled 1 instead of silently turning into NaN.
new_fatalities = pd.cut(df["FATALITIES"], bins=[-1, 0, np.inf], labels=[0, 1])
df["FATALITIES"] = new_fatalities

In [19]:
# Binarize INJURED: values in (-1, 0] become 0, anything above 0 becomes 1.
# The top edge is open-ended (np.inf) rather than a hard-coded 200000, so an
# unusually large value is labelled 1 instead of silently turning into NaN.
new_injured = pd.cut(df["INJURED"], bins=[-1, 0, np.inf], labels=[0, 1])
df["INJURED"] = new_injured

In [20]:
# Target is INJURED; every other column of df is used as a feature.
target_column = "INJURED"
X, y = split_df_to_x_y(df, target_column)

In [21]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
test_ratio = 0.20
rand_state = 1
XTrain, XTest, yTrain, yTest = split_to_train_and_test(X, y, test_ratio, rand_state)

In [22]:

# The estimator is named `cnb` (not `gnb`) to match its type.
# NOTE(review): CategoricalNB expects every feature to be encoded as
# non-negative integer category codes, and predict() can fail if the test
# split contains a code never seen during fit - consider passing
# `min_categories` if that ever happens. Confirm the encoding of df_to_model.csv.
cnb = CategoricalNB()

# Train classifier
cnb.fit(XTrain, yTrain)

# Predict on both splits so train and test accuracy can be compared.
y_pred = cnb.predict(XTest)
y_pred_train = cnb.predict(XTrain)

# Print results
print('Accuracy on Train data= ', metrics.accuracy_score(y_true=yTrain, y_pred=y_pred_train))
print('Accuracy on test data= ', metrics.accuracy_score(y_true=yTest, y_pred=y_pred))

Accuracy on Train data=  0.7466134759545833
Accuracy on test data=  0.6939768542872172


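Now that's noticeably better (0.69 on the test set, compared to 0.66 and 0.65). Note, however, that the gap between train accuracy (0.75) and test accuracy (0.69) is larger here than for the other two classifiers.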

# Naive Bayes - Flow:

## Question: Will there be fatalities? (Yes/No)

Binarizing `INJURED` and `FATALITIES`: 0 = none, 1 = at least one.

### Trying the model with GaussianNB classifier

In [23]:
# Start from a fresh copy of the file: the following cells overwrite the
# FATALITIES and INJURED columns of `df` in place.
df = load_dataset('df_to_model.csv')

In [24]:
# Binarize FATALITIES: values in (-1, 0] become 0, anything above 0 becomes 1.
# The top edge is open-ended (np.inf) rather than a hard-coded 200000, so an
# unusually large value is labelled 1 instead of silently turning into NaN.
new_fatalities = pd.cut(df["FATALITIES"], bins=[-1, 0, np.inf], labels=[0, 1])
df["FATALITIES"] = new_fatalities

In [25]:
# Binarize INJURED: values in (-1, 0] become 0, anything above 0 becomes 1.
# The top edge is open-ended (np.inf) rather than a hard-coded 200000, so an
# unusually large value is labelled 1 instead of silently turning into NaN.
new_injured = pd.cut(df["INJURED"], bins=[-1, 0, np.inf], labels=[0, 1])
df["INJURED"] = new_injured

In [26]:
# Target is FATALITIES; every other column of df is used as a feature.
target_column = "FATALITIES"
X, y = split_df_to_x_y(df, target_column)

In [27]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
test_ratio = 0.20
rand_state = 1
XTrain, XTest, yTrain, yTest = split_to_train_and_test(X, y, test_ratio, rand_state)

In [28]:
# Fit Gaussian Naive Bayes on the training split
# (fit() returns the estimator itself, so construction and fitting are chained).
gnb = GaussianNB().fit(XTrain, yTrain)

# Predict on both splits so train and test accuracy can be compared.
y_pred_train = gnb.predict(XTrain)
y_pred = gnb.predict(XTest)

# Report accuracy on each split.
print('Accuracy on Train data= ', metrics.accuracy_score(yTrain, y_pred_train))
print('Accuracy on test data= ', metrics.accuracy_score(yTest, y_pred))

Accuracy on Train data=  0.7644776642847749
Accuracy on test data=  0.7664387164650184


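Now this result is much better! 0.77 accuracy on the test set is pretty accurate. (Caveat: accuracy alone can be misleading if one class dominates, so it should be compared against the share of the majority class, which we did not compute here.)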

### Trying  the model with  BernoulliNB classifier
We read a little about this classifier online.
<br>
It seems to be better suited to discrete data and is designed for binary/boolean features.

In [29]:
# Start from a fresh copy of the file: the following cells overwrite the
# FATALITIES and INJURED columns of `df` in place.
df = load_dataset('df_to_model.csv')

In [30]:
# Binarize FATALITIES: values in (-1, 0] become 0, anything above 0 becomes 1.
# The top edge is open-ended (np.inf) rather than a hard-coded 200000, so an
# unusually large value is labelled 1 instead of silently turning into NaN.
new_fatalities = pd.cut(df["FATALITIES"], bins=[-1, 0, np.inf], labels=[0, 1])
df["FATALITIES"] = new_fatalities

In [31]:
# Binarize INJURED: values in (-1, 0] become 0, anything above 0 becomes 1.
# The top edge is open-ended (np.inf) rather than a hard-coded 200000, so an
# unusually large value is labelled 1 instead of silently turning into NaN.
new_injured = pd.cut(df["INJURED"], bins=[-1, 0, np.inf], labels=[0, 1])
df["INJURED"] = new_injured

In [32]:
# Target is FATALITIES; every other column of df is used as a feature.
target_column = "FATALITIES"
X, y = split_df_to_x_y(df, target_column)

In [33]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
test_ratio = 0.20
rand_state = 1
XTrain, XTest, yTrain, yTest = split_to_train_and_test(X, y, test_ratio, rand_state)

In [34]:

# The estimator is named `bnb` (not `gnb`) to match its type.
# Note: with its default `binarize=0.0`, BernoulliNB thresholds every feature
# at zero before fitting, so non-binary columns are reduced to "> 0 or not".
bnb = BernoulliNB()

# Train classifier
bnb.fit(XTrain, yTrain)

# Predict on both splits so train and test accuracy can be compared.
y_pred = bnb.predict(XTest)
y_pred_train = bnb.predict(XTrain)

# Print results
print('Accuracy on Train data= ', metrics.accuracy_score(y_true=yTrain, y_pred=y_pred_train))
print('Accuracy on test data= ', metrics.accuracy_score(y_true=yTest, y_pred=y_pred))

Accuracy on Train data=  0.7485314102845119
Accuracy on test data=  0.7539014553743644


We can see that this time BernoulliNB got inferior results (0.75) compared to GaussianNB (0.77).
<br>
But the results here are still pretty accurate

### Trying the model with CategoricalNB classifier
We found out that this classifier is suitable for classification with discrete features that are categorically distributed.
<br>
The categories of each feature are drawn from a categorical distribution.

In [35]:
# Start from a fresh copy of the file: the following cells overwrite the
# FATALITIES and INJURED columns of `df` in place.
df = load_dataset('df_to_model.csv')

In [36]:
# Binarize FATALITIES: values in (-1, 0] become 0, anything above 0 becomes 1.
# The top edge is open-ended (np.inf) rather than a hard-coded 200000, so an
# unusually large value is labelled 1 instead of silently turning into NaN.
new_fatalities = pd.cut(df["FATALITIES"], bins=[-1, 0, np.inf], labels=[0, 1])
df["FATALITIES"] = new_fatalities

In [37]:
# Binarize INJURED: values in (-1, 0] become 0, anything above 0 becomes 1.
# The top edge is open-ended (np.inf) rather than a hard-coded 200000, so an
# unusually large value is labelled 1 instead of silently turning into NaN.
new_injured = pd.cut(df["INJURED"], bins=[-1, 0, np.inf], labels=[0, 1])
df["INJURED"] = new_injured

In [38]:
# Target is FATALITIES; every other column of df is used as a feature.
target_column = "FATALITIES"
X, y = split_df_to_x_y(df, target_column)

In [39]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
test_ratio = 0.20
rand_state = 1
XTrain, XTest, yTrain, yTest = split_to_train_and_test(X, y, test_ratio, rand_state)

In [40]:
# The estimator is named `cnb` (not `gnb`) to match its type.
# NOTE(review): CategoricalNB expects every feature to be encoded as
# non-negative integer category codes, and predict() can fail if the test
# split contains a code never seen during fit - consider passing
# `min_categories` if that ever happens. Confirm the encoding of df_to_model.csv.
cnb = CategoricalNB()

# Train classifier
cnb.fit(XTrain, yTrain)

# Predict on both splits so train and test accuracy can be compared.
y_pred = cnb.predict(XTest)
y_pred_train = cnb.predict(XTrain)

# Print results
print('Accuracy on Train data= ', metrics.accuracy_score(y_true=yTrain, y_pred=y_pred_train))
print('Accuracy on test data= ', metrics.accuracy_score(y_true=yTest, y_pred=y_pred))

Accuracy on Train data=  0.7907369251676822
Accuracy on test data=  0.7735840785551464


CategoricalNB got similar results to GaussianNB (both about 0.77 on the test set).

# Final conclusions:
### 1. Naive Bayes gives good results for each of our questions (test accuracy of roughly 0.65–0.69 for injuries and 0.75–0.77 for fatalities).
### 2. In general, CategoricalNB seems to give the best results (for our data).