In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


###  Conditional probability
Naive Bayes classification model is based on bayes theoram of conditional probability.

The formula for which is: 

$p(A|B) = \frac{p(B|A).p(A)}{p(B)}$

The above is read as the **_probability of event A occuring given that event B has alreay occured_**

### Naive Bayes Classifier

Naive Bayes works on the same formula with just one assumption; It assumes that all the input variables are indepandent to each other (hence the term _Naive_ in the name)

For a given set of inputs $\{x_1, x_2, x_3\}$, the probability that the output is $y_1$ is given by the following formula:

$p(y_1|{x_1,x_2,x_2}) = \frac{p(x_1|y_1).p(x_2|y_1).p(x_3|y_1).p(y_1)}{p(x_1).p(x_2).p(x_3)}$


In [28]:
data = pd.read_csv('heart-disease-data/heart.csv')

In [29]:
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


For our algorithm to work we only need discrete varaibles so we will drop any continous varaibles

In [30]:
data.drop(["age", "trestbps", "chol", "thalach", "oldpeak", "slope"],axis = 1 ,inplace=True)

In [31]:
data.head()

Unnamed: 0,sex,cp,fbs,restecg,exang,ca,thal,target
0,1,3,1,0,0,0,1,1
1,1,2,0,1,0,0,2,1
2,0,1,0,0,0,0,2,1
3,1,1,0,1,0,0,2,1
4,0,0,0,1,1,0,2,1


In [32]:
X = data[data.keys()[:-1]]
y = data[data.keys()[-1]]

Split the data into test and train samples

In [64]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

In [65]:
data_train = pd.concat([X_train, y_train],axis = 1)
data_test = pd.concat([X_test, y_test],axis = 1)

In [66]:
data_test.head()

Unnamed: 0,sex,cp,fbs,restecg,exang,ca,thal,target
179,1,0,0,0,1,1,1,0
228,1,3,0,0,0,0,3,0
111,1,2,1,1,0,1,3,1
246,0,0,0,0,1,2,3,0
60,0,2,1,0,0,1,2,1


Lets build a helper function to calculate the conditional probabilities of individual inputs

In [67]:
#Calcualting probabilites for inputs independantly

def get_probabilities_for_inputs(n, column_name, data_frame):
    
    temp = data_frame[column_name] #isolate targetted column
    temp = temp.value_counts() #get counts of occurences of each input variable
    
    return (temp/n) #return probiblity of occurence by dividing with total no. of data points
    

In [68]:
#calculating conditional probability

def get_conditional_probabilities(data_frame, n,target, given):
    
    focused_data = data[[target, given]] #isolate target column an dfocus input column

    targets_unique = data[target].unique()#list of unique outputs in data
    inputs_unique = data[given].unique()
    
    groups = focused_data.groupby(by = [given, target]).size().reset_index()
    groups[0] = groups[0]/ n
    
    
    for targets in targets_unique:
        current_target_length = len(focused_data[focused_data[target] == targets])
        groups[0] = np.where(groups[target] == targets, groups[0].div(current_target_length),groups[0])
    
    return groups  
    
    
    
            
            

In [69]:
def calculate_probabilities(data):
    #splititng input data
    x = data[data.keys()[:-1]]
    y = data[data.keys()[-1]]
    target = y.name
    
    #get length of dataframe
    n = len(data)
    
    #get probabilities for each individual input and output
    f_in = lambda lst: get_probabilities_for_inputs(n, lst, x)
    input_probablities = list(map(f_in,x.keys()))
    
    output_probabilities = get_probabilities_for_inputs(n ,target, y.to_frame())
    
    #get conditional probabilities for every input against every output
    f1 = lambda lst: get_conditional_probabilities(data, n, target,lst)
    conditional_probabilities = list(map(f1, data.keys()[:-1]))
    
    return input_probablities, output_probabilities, conditional_probabilities
    
    
    
    

The _calculate_probabilities_ function will actually calcualte all the important probabilities for us that we will need to make predictions on test data

In [70]:
def naive_bayes_calculator(target_values, input_values, in_prob, out_prob, cond_prob):
    
    target_values.sort()#sort the target values to assure ascending order
    classes = [] #initialise empty probabilites list
    
    for target_value in target_values:
        num = 1 #initilaise numerator
        den = 1 #initialise denominator

        #calculate denominator according to the formula
        for i,x in enumerate(input_values):
            den *= in_prob[i][x]

        #calculate numerator according to the formula    
        for i, x_1 in enumerate(input_values):
            temp_df = cond_prob[i]
            num *= temp_df[(temp_df.iloc[:,0] == x_1) & (temp_df.iloc[:,1] == target_value)][0].values[0]
        num *= out_prob[target_value]

        final_probability = (num/den) #final conditional probability value
        
        classes.append(final_probability) #append probability for current class in a list
        
    return (classes.index(max(classes)), classes)
        
    

In [71]:
in_prob, out_prob, cond_prob = calculate_probabilities(data_train)

In [72]:
naive_bayes_calculator([1,0], [1,1,0,2,1,3,3],in_prob,out_prob,cond_prob)

(0, [1.4299839158448542e-17, 1.2448594626538668e-19])

In [73]:
def naive_bayes_predictor(test_data, outputs, in_prob, out_prob, cond_prob):
    
    final_predictions = [] #initialise empty list to store test predictions
    
    for row in test_data:
        #get prediction for current data
        predicted_class, probabilities = naive_bayes_calculator(outputs, row, in_prob, out_prob, cond_prob)
        #append to list
        final_predictions.append(predicted_class)
        
        
    return final_predictions
    
    
    

In [74]:
test_data_as_list = X_test.values.tolist()
unique_targets = y_test.unique().tolist()

In [75]:
predicted_y = naive_bayes_predictor(test_data_as_list,unique_targets,in_prob,out_prob,cond_prob)

In [82]:
print("Accuracy:", (np.count_nonzero(y_test == predicted_y)/len(y_test)) *100)

Accuracy: 77.41935483870968
