In [1]:
import pandas as pd
import numpy as np

In [2]:
# Pima Indians Diabetes Database.
# Column names are supplied explicitly; the last one ('class') is the label
# that later cells split off as the prediction target.
df = pd.read_csv(
    'pima-indians-diabetes.csv',
    names=[
        'times pregnant',
        'glucose',
        'pressure',
        'skin thickness (mm)',
        'insulin (mu U/ml)',
        'mass index',
        'pedigree',
        'age',
        'class',
    ],
)
df.head()

Unnamed: 0,times pregnant,glucose,pressure,skin thickness (mm),insulin (mu U/ml),mass index,pedigree,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
# Randomly split the dataset into train and test partitions with the given ratio.
# The generator is seeded so that the split -- and every statistic, prediction
# and accuracy figure derived from it -- is reproducible on a fresh kernel.
ratio = 0.8
seed = 42
rng = np.random.RandomState(seed)

# One uniform draw per row; rows whose draw falls below `ratio` go to train,
# so the train share is approximately (not exactly) `ratio`.
msk = rng.rand(len(df)) < ratio

# The last column is the label; every column before it is a feature.
train = df[msk]
train_x = train[train.columns[:-1]]
train_y = train[train.columns[-1]]

test = df[~msk]
test_x = test[test.columns[:-1]]
test_y = test[test.columns[-1]]

In [4]:
# Per-class statistics estimated from the train table: feature means,
# feature variances, and the prior probability of each class.
target = 'class'
by_class = train.groupby([target])
mean, var = by_class.mean(), by_class.var()
prior = train[target].value_counts().div(len(train))

In [5]:
def naive_bayes(test, class_mean=None, class_var=None, class_prior=None):
    """Return Gaussian naive Bayes predictions for the given test dataset.

    Parameters
    ----------
    test : pandas.DataFrame
        Feature rows to classify. Its columns are aligned by name with the
        columns of ``class_mean`` / ``class_var``.
    class_mean : pandas.DataFrame, optional
        Per-class feature means (index = class label, columns = features).
        Defaults to the notebook-level ``mean``.
    class_var : pandas.DataFrame, optional
        Per-class feature variances, same layout as ``class_mean``.
        Defaults to the notebook-level ``var``.
    class_prior : pandas.Series, optional
        Prior probability of each class, indexed by class label.
        Defaults to the notebook-level ``prior``.

    Returns
    -------
    list
        The predicted class label for every row of ``test``, in row order.
    """
    # Fall back to the statistics computed earlier in the notebook so that
    # the original one-argument call keeps working.
    if class_mean is None:
        class_mean = mean
    if class_var is None:
        class_var = var
    if class_prior is None:
        class_prior = prior

    # Nothing to classify.
    if len(test) == 0:
        return []

    log_joint = {}
    for cl in class_prior.index:
        # `.loc` is label-based lookup; the former `.ix` accessor no longer
        # exists in current pandas.
        mu = class_mean.loc[cl]
        sigma2 = class_var.loc[cl]
        # Log of the Gaussian density per feature. Working in log-space avoids
        # the product of many tiny densities underflowing to 0 for every class,
        # which would make the argmax below pick an arbitrary class.
        log_gauss = -0.5 * ((test - mu) ** 2) / sigma2 - 0.5 * np.log(2 * np.pi * sigma2)
        # Naive independence assumption: sum the per-feature log-likelihoods,
        # then add the log prior to get the (unnormalised) log posterior.
        log_joint[cl] = log_gauss.sum(axis=1) + np.log(class_prior.loc[cl])

    # For each row pick the class (column) with the highest log posterior.
    predictions = list(pd.DataFrame(log_joint).idxmax(axis=1))
    return predictions

In [6]:
# Classify the held-out test features with the hand-written naive_bayes function.
my_pred = naive_bayes(test_x)

# Comparing with sklearn

In [7]:
from sklearn.naive_bayes import GaussianNB

In [8]:
# Fit scikit-learn's Gaussian naive Bayes on the same training split and
# predict the same test rows, as a reference for the hand-written classifier.
model = GaussianNB()
model.fit(train_x, train_y)

sklearn_pred = model.predict(test_x)

# Count the test rows on which the two classifiers disagree. Comparing the
# labels directly (rather than summing |a - b|) gives the right count for any
# label encoding, not only 0/1.
print('Number of distinct values: ', np.count_nonzero(sklearn_pred != np.asarray(my_pred)))

Number of distinct values:  0


# Accuracy

In [9]:
# Accuracy = fraction of test rows whose predicted label equals the true label.
# Comparing labels for equality keeps the figure correct for any label
# encoding, whereas 1 - sum(|pred - y|) / n is only valid for 0/1 labels.
print('Accuracy: ', np.mean(np.asarray(my_pred) == np.asarray(test_y)))

Accuracy:  0.75
