In [None]:
#naive bayes
#eg: 1.Prob of getting a queen in pack of cards.
#Total cards = 52, Queens= 4
#P(queen) = 4/52 = 1/13
#2 Pick a random card, you know it is a DIAMOND. Now what is the probabilty of the card being a QUEEN? - CONDITIONAL PROBABILITY
#Total diamonds = 13
#Queen = 1
#P(Queen/Diamond) = 1/13
#Conditional Prob : P(A/B) = Prob of event A knowing that event B has already occurred 
#P(A/B) = P(B/A) * P(A) / P(B).      Bayes Theorem  
#used in email spam detection, character recognition, weather prediction, face detected, news article characterisation




In [8]:
import pandas as pd 
df = pd.read_csv('datasets/titanic.csv')
df.head()


Unnamed: 0,PassengerId,Name,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,1,"Braund, Mr. Owen Harris",3,male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,"Heikkinen, Miss. Laina",3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,female,35.0,1,0,113803,53.1,C123,S,1
4,5,"Allen, Mr. William Henry",3,male,35.0,0,0,373450,8.05,,S,0


In [9]:
#Data Exploration: Delete unwanted variables 
df.drop(['PassengerId','Name','SibSp','Parch','Ticket','Cabin','Embarked'],axis='columns',inplace=True)
df.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Survived
0,3,male,22.0,7.25,0
1,1,female,38.0,71.2833,1
2,3,female,26.0,7.925,1
3,1,female,35.0,53.1,1
4,3,male,35.0,8.05,0


In [11]:
inputs = df.drop('Survived',axis='columns')
target = df.Survived

In [4]:
#inputs.Sex = inputs.Sex.map({'male': 1, 'female': 2})

In [12]:
dummies = pd.get_dummies(inputs.Sex)
dummies.head(3)

Unnamed: 0,female,male
0,0,1
1,1,0
2,1,0


In [13]:
inputs = pd.concat([inputs,dummies],axis='columns')
inputs.head(3)

Unnamed: 0,Pclass,Sex,Age,Fare,female,male
0,3,male,22.0,7.25,0,1
1,1,female,38.0,71.2833,1,0
2,3,female,26.0,7.925,1,0


In [14]:
inputs.drop(['Sex','male'],axis='columns',inplace=True)
inputs.head(3)

Unnamed: 0,Pclass,Age,Fare,female
0,3,22.0,7.25,0
1,1,38.0,71.2833,1
2,3,26.0,7.925,1


In [15]:
inputs.columns[inputs.isna().any()]

Index(['Age'], dtype='object')

In [16]:
inputs.Age[:10]

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
5     NaN
6    54.0
7     2.0
8    27.0
9    14.0
Name: Age, dtype: float64

In [18]:
#filling the nan values with mean of the column values
inputs.Age = inputs.Age.fillna(inputs.Age.mean())
inputs.head()

Unnamed: 0,Pclass,Age,Fare,female
0,3,22.0,7.25,0
1,1,38.0,71.2833,1
2,3,26.0,7.925,1
3,1,35.0,53.1,1
4,3,35.0,8.05,0


In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(inputs,target,test_size=0.3)

In [20]:
#using gaussian naive bayes
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()

In [21]:
#training the model
model.fit(X_train,y_train)

GaussianNB()

In [22]:
#measure score
model.score(X_test,y_test)


0.7835820895522388

In [23]:
#check first 10 samples of x test.
X_test[0:10]


Unnamed: 0,Pclass,Age,Fare,female
37,3,21.0,8.05,0
809,1,33.0,53.1,1
295,1,29.699118,27.7208,0
468,3,29.699118,7.725,0
140,3,29.699118,15.2458,1
353,3,25.0,17.8,0
595,3,36.0,24.15,0
205,3,2.0,10.4625,1
309,1,30.0,56.9292,1
174,1,56.0,30.6958,0


In [24]:
##check first 10 samples of x test.
y_test[0:10]

37     0
809    1
295    0
468    0
140    0
353    0
595    0
205    0
309    1
174    0
Name: Survived, dtype: int64

In [25]:
#compare with y_test
model.predict(X_test[0:10])

array([0, 1, 0, 0, 0, 0, 0, 1, 1, 0])

In [26]:
#check prob of survival
model.predict_proba(X_test[:10])

array([[0.96200076, 0.03799924],
       [0.04507335, 0.95492665],
       [0.72084017, 0.27915983],
       [0.96778953, 0.03221047],
       [0.54705745, 0.45294255],
       [0.96633839, 0.03366161],
       [0.96954568, 0.03045432],
       [0.32527694, 0.67472306],
       [0.03631858, 0.96368142],
       [0.69414811, 0.30585189]])

In [27]:
#Calculate the score using cross validation
from sklearn.model_selection import cross_val_score
cross_val_score(GaussianNB(),X_train, y_train, cv=5)

array([0.776     , 0.704     , 0.728     , 0.77419355, 0.72580645])