# Naive Bayes Method

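Naive Bayes is a classification method built on conditional probability, specifically Bayes' theorem: $P(H \mid E) = \frac{P(E \mid H)\,P(H)}{P(E)}$, where $H$ is the hypothesis and $E$ is the event (the observed evidence). It is called "naive" because it assumes the features are conditionally independent of each other given the class. It allows the probability of a hypothesis to be deduced from previous observations.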

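Contains two kinds of probability estimate, both computed from counts in the data:

- Class A (probability of an event): $P(E) = \frac{\mathrm{Count}(E)}{\mathrm{Count}(E) + \mathrm{Count}(\neg E)}$
- Class B (conditional probability of a hypothesis given an event): $P(H \mid E) = \frac{\mathrm{Count}(H \land E)}{\mathrm{Count}(E)}$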

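Naive Bayes can be used as a classifier. It is easy to implement but relies on the quality of the data: e.g., an event may be absent from the training data, in which case its count-based probability estimate is zero unless smoothing is applied.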

In [55]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import StandardScaler
sns.set()

In [56]:
# Load seaborn's "diamonds" example dataset. `CD` is an independent copy,
# so changes made to it do not affect the freshly loaded `diamonds` frame.
diamonds = sns.load_dataset('diamonds')
CD = diamonds.copy(deep=True)

# Preview ten randomly chosen rows (unseeded, so the rows differ on each run).
CD.sample(n=10)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
27203,2.05,Premium,H,SI1,60.2,58.0,17521,8.25,8.22,4.96
44137,0.57,Good,D,SI1,59.3,61.0,1561,5.34,5.38,3.18
10514,0.9,Ideal,F,VS2,62.6,56.0,4801,6.16,6.23,3.88
27357,2.16,Premium,I,SI1,58.5,60.0,17934,8.54,8.49,4.98
44331,0.33,Premium,I,VS1,62.7,58.0,521,4.4,4.43,2.77
21162,1.53,Premium,J,VVS1,61.6,58.0,9322,7.4,7.37,4.55
44197,0.51,Ideal,D,SI1,61.7,55.0,1569,5.11,5.16,3.17
32174,0.39,Very Good,G,VS2,62.4,56.0,787,4.64,4.68,2.91
3937,0.9,Ideal,G,SI2,62.4,56.0,3500,6.14,6.19,3.85
23022,0.35,Premium,I,VS2,61.2,58.0,630,4.57,4.55,2.79


In [57]:
# Build the modelling frame from the untouched `diamonds` frame rather than
# mutating `CD` in place, so this cell gives the same result every time it is
# run (the in-place version raised KeyError on a second run because the
# x/y/z columns had already been dropped).
CD = (
    diamonds
    .assign(
        # Integer-encode each categorical column. pd.factorize numbers the
        # values in order of first appearance -- the same codes the earlier
        # enumerate(pd.unique(...)) mapping produced -- and always yields
        # plain integers. The codes are arbitrary identifiers, not a ranking.
        **{
            name: pd.factorize(diamonds[name])[0]
            for name in ('cut', 'color', 'clarity')
        },
        # Combine the three dimension columns into a single product feature.
        volume=diamonds['x'] * diamonds['y'] * diamonds['z'],
    )
    # A zero product means at least one dimension is recorded as 0; drop
    # those rows.
    .loc[lambda frame: frame['volume'] != 0]
    # The individual dimensions are now summarised by `volume`.
    .drop(columns=['x', 'y', 'z'])
)
CD.sample(10)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,volume
15227,1.01,2,6,3,67.0,57.0,6113,156.957346
27806,0.3,0,5,4,61.9,57.0,650,49.52654
15314,1.32,2,2,2,62.6,62.0,6147,211.500112
30464,0.31,3,6,1,63.4,58.0,732,51.678
15083,1.17,3,6,0,60.9,60.0,6062,187.258176
45393,0.54,1,4,3,58.9,59.0,1667,89.99046
19137,1.26,1,6,1,59.5,59.0,7903,212.425516
24946,1.11,1,6,5,62.0,54.0,13405,183.963936
7805,1.29,1,1,0,61.1,62.0,4291,206.9508
34067,0.3,1,6,3,62.0,62.0,851,47.796672


In [58]:
# Features are every column except the target; the target is the encoded cut.
data = CD.drop(columns='cut')
labels = CD['cut']

# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# split -- and therefore the accuracy computed from it -- reproducible across
# kernel restarts; without it a different split is drawn on every run.
data_train, data_test, labels_train, labels_test = train_test_split(
    data, labels, test_size=0.2, random_state=42
)

In [59]:
# Fit a multinomial Naive Bayes classifier on the diamonds training split.
# NOTE(review): scikit-learn documents MultinomialNB as intended for discrete
# count features, while most columns here are continuous (carat, depth,
# table, price, volume). GaussianNB may be a better fit -- TODO confirm
# against the intended model before changing it.
model = MultinomialNB()
model.fit(data_train, labels_train)

# Predict the encoded cut for every held-out row.
predicted = model.predict(data_test)

# Accuracy: fraction of held-out rows whose predicted cut equals the label.
print(np.mean(predicted == labels_test))

0.3821402077151335


In [73]:
# Load seaborn's "titanic" example dataset and remove the columns that are
# not used as model inputs here.
titanic = sns.load_dataset('titanic')
CT = titanic.drop(
    columns=['embark_town', 'alive', 'deck', 'sex', 'embarked', 'adult_male', 'class']
)

# Integer-encode `who` and `alone`: each distinct value receives a code in
# order of first appearance in the column.
CT = CT.assign(**{
    name: CT[name].map(
        {value: code for code, value in enumerate(pd.unique(CT[name]))}
    )
    for name in ('who', 'alone')
})

# Discard every row that still contains a missing value.
CT = CT.dropna()

# Preview ten randomly chosen rows (unseeded, so the rows differ on each run).
CT.sample(n=10)

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,who,alone
283,1,3,19.0,0,0,8.05,0,1
450,0,2,36.0,1,2,27.75,0,0
361,0,2,29.0,1,0,27.7208,0,0
720,1,2,6.0,0,1,33.0,2,0
809,1,1,33.0,1,0,53.1,1,0
204,1,3,18.0,0,0,8.05,0,1
99,0,2,34.0,1,0,26.0,0,0
305,1,1,0.92,1,2,151.55,2,0
677,1,3,18.0,0,0,9.8417,1,1
873,0,3,47.0,0,0,9.0,0,1


In [74]:
# Features are every column except the target; the target is `survived`.
data = CT.drop(columns='survived')
labels = CT['survived']

# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# split -- and therefore the accuracy computed from it -- reproducible across
# kernel restarts; without it a different split is drawn on every run.
data_train, data_test, labels_train, labels_test = train_test_split(
    data, labels, test_size=0.2, random_state=42
)

In [75]:
# Train a fresh multinomial Naive Bayes classifier on the Titanic training
# split; this rebinds `model`, replacing the classifier fitted earlier.
model = MultinomialNB()
model.fit(data_train, labels_train)

# Predict survival for every held-out passenger.
predicted = model.predict(data_test)

# Share of held-out passengers whose survival was predicted correctly.
print((predicted == labels_test).mean())

0.6573426573426573
