<h2 style='color:purple' align='center'>Naive Bayes Tutorial Part 1: Predicting survival in the Titanic disaster</h2>

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the raw passenger records from the CSV next to this notebook and
# preview the first five rows.
df = pd.read_csv(filepath_or_buffer="titanic.csv")
df.head(n=5)

Unnamed: 0,PassengerId,Name,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,1,"Braund, Mr. Owen Harris",3,male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,"Heikkinen, Miss. Laina",3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,female,35.0,1,0,113803,53.1,C123,S,1
4,5,"Allen, Mr. William Henry",3,male,35.0,0,0,373450,8.05,,S,0


<h3>Data Preparation</h3>

In [3]:
# Keep only the columns used for modelling (three features candidates plus
# Sex and the Survived label). Selecting the columns to keep, instead of
# dropping the others in place, makes this cell safe to re-run: an in-place
# drop raises KeyError the second time because the columns are already gone.
df = df[['Pclass', 'Sex', 'Age', 'Fare', 'Survived']].copy()
df.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Survived
0,3,male,22.0,7.25,0
1,1,female,38.0,71.2833,1
2,3,female,26.0,7.925,1
3,1,female,35.0,53.1,1
4,3,male,35.0,8.05,0


In [4]:
# Feature matrix: every remaining column except the label.
inputs = df.drop(columns='Survived')
# Label vector: the Survived column.
target = df['Survived']

In [5]:
# Names of the feature columns that contain at least one missing value.
has_missing = inputs.isna().any()
inputs.columns[has_missing]

Index(['Age'], dtype='object')

In [6]:
# Show any rows whose label is missing. NaN never compares equal to anything
# (not even itself), so an `== NaN` comparison is always False and would hide
# missing labels; isna() is the correct test.
target[target.isna()]

Series([], Name: Survived, dtype: int64)

In [7]:
# Fill missing ages with the mean of the observed ages.
mean_age = inputs['Age'].mean()
inputs['Age'] = inputs['Age'].fillna(mean_age)

# Encode Sex as integers so the model receives numeric features.
sex_codes = {'male': 1, 'female': 2}
inputs['Sex'] = inputs['Sex'].map(sex_codes)

inputs.head()

Unnamed: 0,Pclass,Sex,Age,Fare
0,3,1,22.0,7.25
1,1,2,38.0,71.2833
2,3,2,26.0,7.925
3,1,2,35.0,53.1
4,3,1,35.0,8.05


<h3>Model training and evaluation</h3>

In [8]:
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.naive_bayes import GaussianNB

# 3-fold cross-validated score of a Gaussian Naive Bayes classifier
# on the full prepared dataset (one score per fold).
cross_val_score(estimator=GaussianNB(), X=inputs, y=target, cv=3)

array([0.75420875, 0.78451178, 0.77777778])

In [9]:
# Hold out 30% of the rows for testing. The seed is fixed so that the split,
# and therefore the score and confusion matrix computed from it, is the same
# on every Restart & Run All instead of changing from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    inputs, target, test_size=0.3, random_state=42
)

# Fit Gaussian Naive Bayes on the training portion and report the
# score on the held-out portion.
model = GaussianNB()
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.7798507462686567

In [10]:
from sklearn.metrics import confusion_matrix

# Predict labels for the held-out rows and tabulate them against the truth.
y_predicted = model.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_predicted)
cm

array([[150,  18],
       [ 41,  59]], dtype=int64)