# Naive Bayes Python Implementation



In [1]:
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [2]:
# Read the pre-split train/test CSV files that sit next to this notebook.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv')
)

In [3]:
# Sanity check: report (rows, columns) for each split before modelling.
for caption, frame in (('shape of train data:', train_data),
                       ('shape of test data:', test_data)):
    print(caption, frame.shape)

shape of train data: (712, 25)
shape of test data: (179, 25)


In [5]:
# Split each frame into predictors (every column except the label) and the
# 'Survived' label column that the classifier is trained to predict.
train_x, train_y = train_data.drop(columns='Survived'), train_data['Survived']
test_x, test_y = test_data.drop(columns='Survived'), test_data['Survived']

In [6]:
# Fit a Gaussian Naive Bayes classifier with default hyper-parameters on the
# training split; fit() returns the estimator itself, so it can be chained.
model = GaussianNB().fit(train_x, train_y)

# Predict on the same rows the model was trained on to get a training baseline.
predict_train = model.predict(train_x)
print('Target on train data', predict_train)

Target on train data [1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1
 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 0 1 1 1 1 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 0 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1
 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 0 1 1 1 0 1
 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0
 1 1

In [8]:
# Share of training rows whose predicted label equals the true label.
accuracy_train = accuracy_score(y_true=train_y, y_pred=predict_train)
print('accuracy score on train dataset:', accuracy_train)

accuracy score on train dataset: 0.44803370786516855


In [11]:
# Score the already-fitted model on the held-out test split.
preds_test = model.predict(test_x)
print(
    'accuracy_score on test dataset:',
    accuracy_score(y_true=test_y, y_pred=preds_test),
)

accuracy_score on test dataset: 0.35195530726256985


## Poor Accuracy Scores ??

### Methods to Improve performance of this algorithm.

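- If a continuous feature is not normally distributed, apply a transformation (for example a log or power transform) or another method to bring it closer to a normal distribution, since Gaussian Naive Bayes assumes each feature is Gaussian within each class. <br><br>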

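- If the test data set has a zero-frequency problem (a feature value that never occurred together with some class in the training data), apply a smoothing technique such as “Laplace correction” so that the class of those test samples can still be predicted. <br><br>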

- Remove correlated features: highly correlated features are effectively counted twice by the model, which can over-inflate their importance.
Naive Bayes classifiers have limited options for parameter tuning, such as alpha=1 for smoothing, fit_prior=[True|False] to learn class prior probabilities or not, and a few other options (see the scikit-learn documentation for details). I would recommend focusing on data pre-processing and feature selection instead. <br><br>

- You might think of applying a classifier-combination technique such as ensembling, bagging or boosting, but these methods are unlikely to help. “Ensembling, boosting, bagging” mainly work by reducing variance, and Naive Bayes is a high-bias, low-variance model, so there is very little variance to minimize.

In [12]:
# Preview the first rows of the feature matrix (head() returns 5 rows by default).
train_x.head()

Unnamed: 0,Age,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,SibSp_0,SibSp_1,SibSp_2,...,Parch_0,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Embarked_C,Embarked_Q,Embarked_S
0,28.5,7.2292,0,0,1,0,1,1,0,0,...,1,0,0,0,0,0,0,1,0,0
1,27.0,10.5,0,1,0,1,0,1,0,0,...,1,0,0,0,0,0,0,0,0,1
2,29.699118,16.1,0,0,1,1,0,0,1,0,...,1,0,0,0,0,0,0,0,0,1
3,29.699118,0.0,1,0,0,0,1,1,0,0,...,1,0,0,0,0,0,0,0,0,1
4,17.0,8.6625,0,0,1,0,1,1,0,0,...,1,0,0,0,0,0,0,0,0,1


In [13]:
# Per-column summary statistics of the training features
# (count, mean, std, min, quartiles, max).
train_x.describe()

Unnamed: 0,Age,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,SibSp_0,SibSp_1,SibSp_2,...,Parch_0,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Embarked_C,Embarked_Q,Embarked_S
count,712.0,712.0,712.0,712.0,712.0,712.0,712.0,712.0,712.0,712.0,...,712.0,712.0,712.0,712.0,712.0,712.0,712.0,712.0,712.0,712.0
mean,29.674341,32.777006,0.238764,0.20927,0.551966,0.36236,0.63764,0.688202,0.230337,0.029494,...,0.751404,0.140449,0.087079,0.007022,0.005618,0.007022,0.001404,0.179775,0.082865,0.73736
std,12.986095,51.48184,0.426628,0.407073,0.497642,0.48102,0.48102,0.463553,0.421345,0.169307,...,0.432503,0.347697,0.282148,0.083564,0.074795,0.083564,0.037477,0.38427,0.275872,0.440378
min,0.42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,22.0,7.925,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,29.699118,14.45625,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,35.0,31.275,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,80.0,512.3292,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


<div class='alert alert-success'>
    Study feature engineering to get acquainted with the many transformation techniques available.
</div>