In [35]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import math

## PREPROCESSING

## NAIVE BAYES

In [38]:
# Feature groups used by the preprocessing and Naive Bayes cells.
# The order of each list is kept stable because the one-hot column order
# produced later depends on it.

# Numeric columns (standardised and modelled with a Gaussian).
continuous_features = [
    'age',
    'chol',
    'oldpeak',
    'thalch',
    'trestbps',
]

# Discrete columns (one-hot encoded before training).
categorical_features = [
    'ca',
    'cp',
    'restecg',
    'slope',
    'thal',
    'sex',
    'fbs',
    'exang',
]

# Target column.
label = ['num']

# Names of the one-hot columns; populated by readdata().
updated_categorical_features = []

# Read the dataset, impute missing values, standardise the continuous
# features with (X - mean) / std and one-hot encode the categorical ones.
def readdata(path='heart_disease_uci.csv'):
    """Load and preprocess the heart-disease data.

    Steps, in order:
      1. Continuous features: missing values are replaced by the column mean,
         then each column is standardised as ``(x - mean) / std`` (pandas'
         default sample standard deviation).
      2. Categorical features: missing values are replaced by the column mode
         and the columns are cast to ``category``.
      3. Label columns: missing values are replaced by the mode; ``num`` is
         then mapped to ``"positive"`` (value > 0) or ``"negative"``.
      4. Categorical features are one-hot encoded with ``drop_first=True``.

    Side effect: the module-level ``updated_categorical_features`` list is
    reset *in place* to the names of the one-hot columns, so calling this
    function repeatedly (e.g. re-running the cell) no longer accumulates
    duplicate names.

    NOTE: the imputation and standardisation statistics are computed over
    every row of the file, so rows that are later held out for testing still
    influence them.

    Parameters
    ----------
    path : str or file-like, optional
        Anything accepted by ``pd.read_csv``. Defaults to the CSV file name
        the notebook was written against.

    Returns
    -------
    pandas.DataFrame
        The preprocessed frame (still containing ``id`` and ``dataset`` when
        the input has them).
    """
    df = pd.read_csv(path)

    # Impute with the column mean, then z-score. Columns are independent, so
    # doing both steps per column gives the same result as two passes.
    for col in continuous_features:
        df[col] = df[col].fillna(df[col].mean())
        df[col] = (df[col] - df[col].mean()) / df[col].std()

    # Replace missing categorical values with the most frequent value.
    for col in categorical_features:
        mode_val = df[col].mode()[0]
        df.loc[df[col].isna(), col] = mode_val
    df[categorical_features] = df[categorical_features].astype("category")

    # Replace missing label values with the most frequent value.
    for col in label:
        mode_val = df[col].mode()[0]
        df.loc[df[col].isna(), col] = mode_val

    # Binarise the target: 0 -> negative, anything greater -> positive.
    df['num'] = df['num'].apply(lambda x: "positive" if x > 0 else "negative")

    # One-hot encode categorical features.
    df = pd.get_dummies(df, columns=categorical_features, prefix=categorical_features, drop_first=True)

    # Slice assignment mutates the shared list instead of appending to it,
    # which keeps the function idempotent across calls.
    excluded = set(continuous_features) | {'num', 'id', 'dataset'}
    updated_categorical_features[:] = [col for col in df.columns if col not in excluded]
    return df

In [39]:
# separate data into training and testing sets
def dataDivision(train_fraction=0.8, random_state=42):
    """Shuffle the preprocessed data and split it into train and test frames.

    The frame returned by ``readdata()`` is shuffled with a fixed seed, the
    first ``int(train_fraction * n)`` rows become the training set and the
    remainder the test set. The ``id`` and ``dataset`` columns are dropped
    from both parts.

    Parameters
    ----------
    train_fraction : float, optional
        Share of rows assigned to the training set; must lie strictly between
        0 and 1. Defaults to 0.8.
    random_state : int, optional
        Seed passed to ``DataFrame.sample`` so the shuffle is reproducible.
        Defaults to 42.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(train_df, test_df)``.

    Raises
    ------
    ValueError
        If ``train_fraction`` is not strictly between 0 and 1.
    """
    if not 0 < train_fraction < 1:
        raise ValueError("train_fraction must be strictly between 0 and 1")

    full_df = readdata()
    shuffled_df = full_df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # separating data for test and training
    train_length = int(train_fraction * len(shuffled_df))
    meta_columns = ['id', 'dataset']
    train_df_mod = shuffled_df.iloc[:train_length].drop(columns=meta_columns)
    test_df_mod = shuffled_df.iloc[train_length:].drop(columns=meta_columns)

    return train_df_mod, test_df_mod
    

In [40]:
# NB algorithm

#estimate, per class, the fraction of rows in which each binary feature is set
def calcCatProb(train_df):
    """Return per-class relative frequencies of the non-continuous columns.

    Every column of ``train_df`` that is neither listed in
    ``continuous_features`` nor the ``num`` label is treated as a 0/1
    indicator; its probability for a class is the column sum over the rows of
    that class divided by the number of such rows.

    Returns a dict ``{'positive': {col: p}, 'negative': {col: p}}`` with plain
    Python floats. No smoothing is applied.
    """
    indicator_cols = [
        name for name in train_df.columns
        if name != 'num' and name not in continuous_features
    ]

    frequencies = {}
    for target in ('positive', 'negative'):
        rows = train_df[train_df['num'] == target]
        n_rows = len(rows)
        frequencies[target] = {
            name: float(rows[name].sum() / n_rows) for name in indicator_cols
        }
    return frequencies
            

#estimate per-class Gaussian parameters (mean, std) for each continuous feature
def calcGaussProb(train_df):
    """Return the per-class mean and standard deviation of continuous columns.

    Only columns of ``train_df`` that appear in ``continuous_features`` are
    considered. The standard deviation comes from ``Series.std()``, whose
    pandas default is ``ddof=1`` (the sample estimate), so it is not the
    strict maximum-likelihood value.

    Returns a dict ``{'positive': {col: (mean, std)}, 'negative': {...}}``.
    """
    gauss_cols = [name for name in train_df.columns if name in continuous_features]

    estimates = {}
    for target in ('positive', 'negative'):
        rows = train_df[train_df['num'] == target]
        estimates[target] = {
            name: (rows[name].mean(), rows[name].std()) for name in gauss_cols
        }
    return estimates



In [41]:
#naive bayes classifier
def naiveBayes(train_df, test_df, eps=1e-6):
    """Classify every row of ``test_df`` with a mixed Bernoulli/Gaussian Naive Bayes.

    Per-class parameters are estimated from ``train_df`` via ``calcCatProb``
    and ``calcGaussProb``. For each test row the class with the largest
    log-posterior (log prior + sum of per-feature log-likelihoods) is chosen;
    on a tie ``'positive'`` wins because it is scored first.

    Numerical safeguards:
      * Bernoulli probabilities are clamped to ``[eps, 1 - eps]`` so a feature
        that is never (or always) set within a class cannot trigger
        ``math.log(0)``.
      * The Gaussian term is evaluated directly in log space, so a value far
        from the class mean no longer underflows to a zero density before the
        logarithm is taken.
      * A standard deviation that is zero or NaN is replaced by a tiny
        positive floor.
      * A class with no training rows is not scored (its log-prior is -inf).

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames holding the feature columns and the ``num`` label column.
    eps : float, optional
        Clamp for Bernoulli probabilities and the fallback probability for a
        test column that has no estimate. Defaults to 1e-6.

    Returns
    -------
    list of str
        Predicted label (``'positive'`` or ``'negative'``) per test row.
    """
    catProb = calcCatProb(train_df)
    gaussProb = calcGaussProb(train_df)
    classes = ['positive', 'negative']

    n_train = len(train_df)
    priorProb = {cls: int((train_df['num'] == cls).sum()) / n_train for cls in classes}
    scored_classes = [cls for cls in classes if priorProb[cls] > 0]

    # Loop-invariant pieces hoisted out of the per-row work.
    binary_cols = [col for col in test_df.columns
                   if col not in continuous_features and col != 'num']
    log_norm = 0.5 * math.log(2 * math.pi)
    min_std = 1e-9

    y_prediction = []
    for i in range(len(test_df)):
        sample = test_df.iloc[i]
        logProbs = {}
        for cls in scored_classes:
            log_prob = math.log(priorProb[cls])

            # Bernoulli likelihood of the 0/1 indicator columns.
            for col in binary_cols:
                prob = min(max(catProb[cls].get(col, eps), eps), 1 - eps)
                if sample[col] == 1:
                    log_prob += math.log(prob)
                else:
                    log_prob += math.log(1 - prob)

            # Gaussian log-density of the continuous columns.
            for col in continuous_features:
                mean, std = gaussProb[cls][col]
                if not std > min_std:  # also true for NaN
                    std = min_std
                x = sample[col]
                log_prob += -log_norm - math.log(std) - ((x - mean) ** 2) / (2 * std ** 2)

            logProbs[cls] = log_prob

        y_prediction.append(max(logProbs, key=logProbs.get))
    return y_prediction

In [42]:
#run naive bayes
# Split the data, predict a label for every held-out row, and keep the true
# labels of those rows for the evaluation cell below.
train_df,test_df = dataDivision()
y_prediction = naiveBayes(train_df, test_df)
y_true = test_df['num'].tolist()

In [43]:
# Print results of Naive Bayes classifier

def printConfusionMatrix(tp, fp, tn, fn):
    """Print a 2x2 confusion matrix as an ASCII table.

    Rows are the predicted class and columns the actual class (1 first, then
    0), so the cells read TP, FP on the first row and FN, TN on the second.
    The letters down the left margin spell "Pred.".
    """
    table = (
        ("\n%15sActual", ""),
        ("%6s %7s %7s", ("", "1", "0")),
        ("P%6s +--------+--------+", ""),
        ("r%6s | %-6s | %-6s |", ("1", 'TP='+str(tp), 'FP='+str(fp))),
        ("e%6s +--------+--------+", ""),
        ("d%6s | %-6s | %-6s |", ("0", 'FN='+str(fn), 'TN='+str(tn))),
        (".%6s +--------+--------+\n", ""),
    )
    for template, values in table:
        print(template % values)

def getConfusionMatrix(y_true, y_pred):
    """Count confusion-matrix cells for the 'positive' / 'negative' labels.

    ``y_true`` and ``y_pred`` are walked in parallel; a pair whose labels are
    not both one of 'positive' / 'negative' is not counted in any cell.

    Returns
    -------
    tuple of int
        ``(tp, fp, tn, fn)``.
    """
    tp = fp = tn = fn = 0
    for actual, predicted in zip(y_true, y_pred):
        # The four cells are mutually exclusive, so branch once per pair.
        if predicted == 'positive':
            if actual == 'positive':
                tp += 1
            elif actual == 'negative':
                fp += 1
        elif predicted == 'negative':
            if actual == 'negative':
                tn += 1
            elif actual == 'positive':
                fn += 1
    return tp, fp, tn, fn

def getAccuracy(tp, fp, tn, fn):
    """Return the share of correct predictions, (tp + tn) / total.

    Returns 0 when all four counts are zero, matching how the other metric
    helpers treat a zero denominator instead of raising ZeroDivisionError.
    """
    total = tp + tn + fp + fn
    return (tp + tn) / total if total != 0 else 0

def getPrecision(tp, fp, tn, fn):
    """Return tp / (tp + fp), or 0 when nothing was predicted positive."""
    predicted_positive = tp + fp
    if predicted_positive == 0:
        return 0
    return tp / predicted_positive

def getRecall(tp, fp, tn, fn):
    """Return tp / (tp + fn), or 0 when there are no actual positives."""
    actual_positive = tp + fn
    if actual_positive == 0:
        return 0
    return tp / actual_positive

def getFScore(tp, fp, tn, fn):
    """Return the harmonic mean of precision and recall (F1).

    Precision and recall come from ``getPrecision`` and ``getRecall``; when
    their sum is zero the result is 0.
    """
    p = getPrecision(tp, fp, tn, fn)
    r = getRecall(tp, fp, tn, fn)
    denominator = p + r
    if denominator == 0:
        return 0
    return 2 * p * r / denominator

# Score the predictions against the held-out labels and show the matrix.
tp, fp, tn, fn = getConfusionMatrix(y_true, y_prediction)
printConfusionMatrix(tp, fp, tn, fn)

# Summary metrics; every label is padded to the same width so the numbers align.
print('Accuracy:  %8.5f' % getAccuracy(tp, fp, tn, fn))
print('Precision: %8.5f' % getPrecision(tp, fp, tn, fn))
print('Recall:    %8.5f' % getRecall(tp, fp, tn, fn))
print('F-Measure: %8.5f' % getFScore(tp, fp, tn, fn))


               Actual
             1       0
P       +--------+--------+
r     1 | TP=85  | FP=26  |
e       +--------+--------+
d     0 | FN=13  | TN=60  |
.       +--------+--------+

Accuracy:   0.78804
Precison:   0.76577
Recall:     0.86735
F-Measure:  0.81340
