In [192]:
#import libraries 

import pandas as pd 
import numpy as np
import scipy.stats as s

In [193]:
#read csv file

raw_data = pd.read_csv("heart.csv")

In [194]:
raw_data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


Now we'll find the unique values of the target variable.

In [195]:
number_of_unique_values = raw_data['target'].unique()

print(number_of_unique_values)

[1 0]


It is a thumb rule that 75% data should be training data and 25% data should be testing data.

In [196]:
training_data = raw_data.iloc[0:int(0.75*len(raw_data))]

training_data = pd.DataFrame(training_data)

testing_data = raw_data.iloc[int(0.75*len(raw_data)):]

testing_data = pd.DataFrame(testing_data)

actual_class = testing_data['target']

Columns having 0 or minimum variance are removed from the data in PCA because the probability of these columns will always be 1 and taking log of them will result in 0 which means they're not participating in determining the posterior probability.

In [197]:
feature_variances = training_data.var()

feature_variances.sort_values(inplace=True)

print(feature_variances)

fbs            0.127909
exang          0.186659
target         0.199407
sex            0.232973
restecg        0.259561
thal           0.311450
slope          0.399789
ca             0.893026
cp             1.020116
oldpeak        1.260725
age           85.141710
trestbps     277.419438
thalach      437.177966
chol        2763.491092
dtype: float64


In [198]:
training_data = training_data.drop(labels = ['fbs','exang','sex','restecg','thal','slope','ca','cp','oldpeak'],axis = 1)

Now we'll determine the mean vector and covariance matrix for the formula of Joint Gaussian Distribution.

In [199]:
def mean_vector_and_cov_mat(target_value):
    
    target_data = training_data[training_data['target'] == target_value]
    
    dropped_target_data = target_data.drop(['target'],axis=1)
    
    target_data_array = np.array(dropped_target_data)
    
    mean_vector = np.mean(target_data_array,axis=0)
    
    cov_mat = dropped_target_data.cov()
    
    return [mean_vector,cov_mat]

Natural parameters include mean vector, covariance matrix and prior probabilities.

In [200]:
natural_parameters = list(map(lambda x: mean_vector_and_cov_mat(x),number_of_unique_values))

print(natural_parameters)

[[array([ 52.4969697 , 129.3030303 , 242.23030303, 158.46666667]),                  age    trestbps         chol     thalach
age        91.214930   42.421656   131.525092  -96.288211
trestbps   42.421656  261.456393    80.783444    8.693089
chol      131.525092   80.783444  2867.910052   14.843089
thalach   -96.288211    8.693089    14.843089  367.652846], [array([ 56.88709677, 133.        , 251.56451613, 140.90322581]),                 age    trestbps         chol     thalach
age       55.970650   45.311475    95.572977   -9.896351
trestbps  45.311475  314.786885   237.000000  -35.344262
chol      95.572977  237.000000  2463.692491   21.596510
thalach   -9.896351  -35.344262    21.596510  403.367530]]


In [201]:
prior_class_probabilities = list(map(lambda target_value: len(training_data[training_data['target'] == target_value])/len(training_data),
                                     number_of_unique_values))

print(prior_class_probabilities)

[0.7268722466960352, 0.27312775330396477]


In [202]:
D = dict(zip(number_of_unique_values,natural_parameters))

In [203]:
for k,p_cap in zip(D.keys(),prior_class_probabilities):
    
    D[k].append(p_cap)

In [204]:
D[1]

[array([ 52.4969697 , 129.3030303 , 242.23030303, 158.46666667]),
                  age    trestbps         chol     thalach
 age        91.214930   42.421656   131.525092  -96.288211
 trestbps   42.421656  261.456393    80.783444    8.693089
 chol      131.525092   80.783444  2867.910052   14.843089
 thalach   -96.288211    8.693089    14.843089  367.652846,
 0.7268722466960352]

In [205]:
pooled_cov = 0

for i in [0,1]:
    
    pooled_cov += D[i][1]*(len(training_data[training_data['target'] == i])-1)
    
pooled_cov = pooled_cov/(len(training_data)-2)


In [206]:
def Naive_Bayes_Classifier_Result(heart_features):
    
    numerators = list(map(lambda target_value: ((s.multivariate_normal.pdf(x=heart_features,
                                                          mean=D[target_value][0],cov=pooled_cov))*D[target_value][2]),D.keys()))
    
    numerators = np.array(numerators)
    
    posterior_class_probabilities = list(map(lambda x: numerators[x]/np.sum(numerators),[0,1]))
    
    return (np.argmax(posterior_class_probabilities))

In [207]:
testing_data.drop(labels=['target'],axis=1,inplace=True)

testing_data = testing_data.drop(labels = ['fbs','exang','sex','restecg','thal','slope','ca','cp','oldpeak'],axis = 1)

In [208]:
true_values = 0

false_values = 0

for i in range(0,len(testing_data)):
    
    predicted_class = Naive_Bayes_Classifier_Result(testing_data.iloc[i,:])
    
    if predicted_class == actual_class.iloc[i]:
        
        true_values += 1
        
    else:
        
        false_values += 1

In [209]:
accuracy = true_values/(true_values+false_values)*100

In [210]:
accuracy

65.78947368421053