Resources:
- https://www.edureka.co/blog/naive-bayes-tutorial/
- Stack overflow by Andy Hayden : https://stackoverflow.com/questions/24147278/how-do-i-create-test-and-train-samples-from-one-dataframe-with-pandas

In [1]:
import pandas as pd
import numpy as np
import math

In [2]:
data = pd.read_csv("./diabetes.csv")
data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [3]:
def splitData(data, ratio=0.8): 
    '''Split data based on ratio. Default ratio is 80% train, 20% test'''
    msk = np.random.rand(len(data)) < ratio
    train_set = data[msk]
    test_set = data[~msk]
    return [train_set, test_set]

In [4]:
def separateByClass(dataset, classCol, classVal):
    '''Get data based on a particular outcome (classVal)'''
    return dataset[dataset[classCol] == classVal]

In [5]:
def calc_likelihood(x, mean, std):
    '''Find gaussian probability'''
    top = math.exp((-1/2) * math.pow(((x - mean) / std), 2))
    bottom = std * math.sqrt(2 * math.pi)
    return top / bottom

In [6]:
def calc_prior(df, classCol, classVal):
    '''Calculate prior of a particular outcome based on given dataframe'''
    total = df.shape[0]
    return len(df[df[classCol] == classVal]) / total

In [7]:
def listOfClasses(df, classCol):
    '''Get a list of all outcome'''
    return df[classCol].unique()

In [50]:
def calc_stats(df, classCol):
    '''Get mean and standard deviation of a dataframe'''
    return [df.mean(axis=0), df.std(axis=0)]

In [41]:
def naive_bayes_posteriors(df, classCol):
    '''Calculate all posteriors for all outcome'''
    classList = listOfClasses(df, classCol) # Get list of different outcomes
    posteriors = [] # Store posterior for each outcome
    
    for eachClass in classList:  # Looping through each outcome
        prior = calc_prior(df, classCol, eachClass) # Get prior of a particular outcome
        class_df = separateByClass(df, classCol, eachClass) # Only data of a particular outcome
        means, stds = calc_stats(class_df, classCol, eachClass) # Mean and variance for data of a particular outcome
        features_name = class_df.drop(columns=classCol).columns # Lists of all features
        
        for _, row in class_df.iterrows(): # Go through each row in the data of a particular outcome
            f_products = 1; 
            for feature in features_name:
                print(feature, row[feature], means[feature], stds[feature], calc_likelihood(row[feature], means[feature], stds[feature]))
                # Use gaussian with value of a feature, its mean and stdev, then multiply with other features
                f_products *= calc_likelihood(row[feature], means[feature], stds[feature]) 
#                 print(posterior)
#             print(f_products)
            print("f_Products: ", f_products)
            posterior = prior * f_products # Get posterior 
            ### Problem: it comes out 0 ??????
            print(posterior)

In [42]:
classList = listOfClasses(data, "Outcome")
train_set, test_set = splitData(data)
naive_bayes_posteriors(train_set, "Outcome")

TypeError: calc_stats() takes 2 positional arguments but 3 were given

In [51]:
class_df = separateByClass(data, "Outcome", 0)
means, stds = calc_stats(class_df, "Outcome")

Pregnancies                   3.298000
Glucose                     109.980000
BloodPressure                68.184000
SkinThickness                19.664000
Insulin                      68.792000
BMI                          30.304200
DiabetesPedigreeFunction      0.429734
Age                          31.190000
Outcome                       0.000000
dtype: float64 Pregnancies                  3.017185
Glucose                     26.141200
BloodPressure               18.063075
SkinThickness               14.889947
Insulin                     98.865289
BMI                          7.689855
DiabetesPedigreeFunction     0.299085
Age                         11.667655
Outcome                      0.000000
dtype: float64
