In [1]:
# Naive Bayes
import pandas as pd
import numpy as np
import math 
import random
import csv
import warnings
warnings.filterwarnings("ignore")

In [2]:
dataset = pd.read_csv("D:\COURSES\Fall 2020 courses\CSC5825 machine learning\homework\HW2\heart.csv")
dataset.head(5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [3]:
#As Naive Bayes deals with only continous features, creating a subset with just continous features.
heart = dataset[['age','trestbps','chol','thalach','oldpeak','target']]
heart.head()

Unnamed: 0,age,trestbps,chol,thalach,oldpeak,target
0,63,145,233,150,2.3,1
1,37,130,250,187,3.5,1
2,41,130,204,172,1.4,1
3,56,120,236,178,0.8,1
4,57,120,354,163,0.6,1


In [4]:
# Normalizing data using function F(X) = (X 􀀀 mean)=std
heart[['age','trestbps','chol','thalach','oldpeak']] = heart[['age','trestbps','chol','thalach','oldpeak']].apply(lambda data: abs(data - np.mean(data)) / np.std(data), axis = 0)

In [5]:
heart.head(5)

Unnamed: 0,age,trestbps,chol,thalach,oldpeak,target
0,0.952197,0.763956,0.256334,0.015443,1.087338,1
1,1.915313,0.092738,0.072199,1.633471,2.122573,1
2,1.474158,0.092738,0.816773,0.977514,0.310912,1
3,0.180175,0.663867,0.198357,1.239897,0.206705,1
4,0.290464,0.663867,2.08205,0.583939,0.379244,1


In [6]:
heart.shape

(303, 6)

In [7]:
print(list(heart))

['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'target']


In [8]:
# Separate by class
# creating a dictionary of classes with mean and std of each columns.
def separateUnderclass(data):
    sepclass = {}
    #calcualting mean and std for each attributes excluding target 
    sepclass['1'] = data.loc[data['target'] == 1, data.columns != 'target']
    sepclass['0'] = data.loc[data['target'] == 0, data.columns != 'target']
    return sepclass

In [9]:
#summarizing under each class
def summarizeUnderclass(data):
    info = {}
    sepclass = separateUnderclass(data)
    for classValue, instances in sepclass.items():
        info[classValue] = list(zip(instances.mean(axis =0), instances.std(axis = 0)))
    return info

In [10]:
# calculating gaussian probability density function
def calculateGp(x, mean, stddev):
    expo = math.exp(-(math.pow(x - mean,2) / (2 * math.pow(stddev,2))))
    return(1 / (math.sqrt(2*math.pi) * stddev)) * expo
#     return abs(x - mean) / stddev
#     if stddev != 0:
#         return abs(x - mean) / stddev
#     return mean

In [11]:
# calulating class probabilities
def calculateCp(info,test):
    probabilities = {}
    for classValue, classSummaries in info.items():
        probabilities[classValue] = 1
        for i in range(len(classSummaries)):
            mean,stddev = classSummaries[i]
            x = test[i]
            probabilities[classValue] = probabilities[classValue] * calculateGp(x, mean, stddev)
    return probabilities

In [12]:
# making prediction
def predict_probablity(info,test):
    probabilities = calculateCp(info,test)
    # return the class with max probability
    return max(probabilities, key = probabilities.get)

In [13]:
#return predictions 
def getPredictions(summaries,test):
    predictions = []
    for i in test.values.tolist():
        result = predict_probablity(summaries,i)
        predictions.append(int(result))
    return predictions

In [14]:
# Accuracy score
def accuracy_rate(test, predictions):
    correct = 0
    test = test.values.tolist()
    for i in range(len(test)):
        if test[i][-1] == float(predictions[i]):
            correct += 1
    return (correct / float(len(test))) * 100.0

In [15]:
train, test = np.split(heart.sample(frac=1), [int(.8*len(heart))])
print('heart_dataset = {0} , train = {1}, test = {2}'.format(len(heart), len(train), len(test)))
summaries = summarizeUnderclass(train)
predictions = getPredictions(summaries,test)
print("predictions = {0} ".format(predictions))
accuracy = accuracy_rate(test, predictions)
print('Accuracy = {0}%'.format(accuracy))

heart_dataset = 303 , train = 242, test = 61
predictions = [0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1] 
Accuracy = 73.77049180327869%


In [16]:
# CONFUSION METRIx
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
#target = test['target'].to_list()
confusion_matrix(predictions, test['target'])

array([[19,  4],
       [12, 26]], dtype=int64)

In [17]:
accuracy = accuracy_score(predictions, test['target'])
print('Accuracy = {0}%'.format(accuracy))

Accuracy = 0.7377049180327869%


In [18]:
precision = precision_score(predictions, test['target'])
print('Precision = {0}%'.format(precision))

Precision = 0.8666666666666667%


In [19]:
recall = recall_score(predictions, test['target'])
print('recall = {0}%'.format(recall))

recall = 0.6842105263157895%


In [20]:
f1 = f1_score(predictions, test['target'])
print('f1 = {0}%'.format(f1))

f1 = 0.7647058823529413%
