In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

# Dataset Preparation

We need to transform categorical data into one-hot or dummy encoding form. That provides better results because these allow the representation of categorical data to be more expressive. Many machine learning algorithms cannot work with categorical data directly. The categories must be converted into numbers. This is required for both input and output variables that are categorical. 

In fact, the categorical data is already numerical, but out of curiosity I want to measure how effective dummy variables are in a situation like this. My reasoning is that zero values may not contribute to the equation: for instance, before the sigmoid function is applied to the linear equation ($\theta^T x$), the weights multiplied by 0-valued features are effectively lost. Of course, I am not sure that classification really works that way, but I want to see the results and share them with you.

### Directly Using The Dataset

In [None]:
# Reminder: you can use 'ctrl + /' for multicomment lines.

# Variant 1: keep every column as-is and only rescale it.
df = pd.read_csv("heart.csv")

# Min-max scale each column to [0, 1] using that column's own extrema.
# DataFrame.min()/.max() always reduce column-wise, whereas np.min(df)/np.max(df)
# forward axis=None and, on pandas >= 2.0, collapse the whole frame to a single
# scalar -- which would scale every column by one global range.
col_min = df.min()
col_max = df.max()
final_df = (df - col_min) / (col_max - col_min)
final_df

### Using One-Hot Encoding  Even For Binary Categories

In [None]:
# Reminder: you can use 'ctrl + /' for multicomment lines.

# Alternative preparation (disabled): one-hot encode every categorical column,
# including the binary ones. Uncommenting this cell reassigns `df` and `final_df`.

# df = pd.read_csv("heart.csv")

# s_dummies = pd.get_dummies(df['sex'], prefix='sex')
# fbs_dummies = pd.get_dummies(df['fbs'], prefix='fbs')
# restecg_dummies = pd.get_dummies(df['restecg'], prefix='restecg')
# exang_dummies = pd.get_dummies(df['exang'], prefix='exang')
# cp_dummies = pd.get_dummies(df['cp'], prefix = "cp")
# thal_dummies = pd.get_dummies(df['thal'], prefix = "thal")
# slope_dummies = pd.get_dummies(df['slope'], prefix = "slope")

# final_df = df.drop(columns = ['sex', 'fbs', 'restecg', 'exang', 'cp', 'thal', 'slope'])
# (column-wise min-max scaling; .min()/.max() instead of np.min/np.max, which
#  reduce over the whole frame on pandas >= 2.0)
# final_df = (final_df - final_df.min()) / (final_df.max() - final_df.min())
# frames = [final_df, s_dummies, fbs_dummies, restecg_dummies, exang_dummies, cp_dummies, thal_dummies, slope_dummies]
# final_df = pd.concat(frames, axis = 1)
# final_df

The dimensionality of the dataset is greatly increased. This is a fair reason not to use this method, at least for binary categories.

### Using One-Hot Encoding Without Binary Categories

In [None]:
# Reminder: you can use 'ctrl + /' for multicomment lines.

# Alternative preparation (disabled): one-hot encode only the categorical columns
# with more than two categories (cp, thal, slope). Uncommenting this cell
# reassigns `df` and `final_df`.

# df = pd.read_csv("heart.csv")

# cp_dummies = pd.get_dummies(df['cp'], prefix = "cp")
# thal_dummies = pd.get_dummies(df['thal'], prefix = "thal")
# slope_dummies = pd.get_dummies(df['slope'], prefix = "slope")

# final_df = df.drop(columns = ['cp', 'thal', 'slope'])
# (column-wise min-max scaling; .min()/.max() instead of np.min/np.max, which
#  reduce over the whole frame on pandas >= 2.0)
# final_df = (final_df - final_df.min()) / (final_df.max() - final_df.min())
# frames = [final_df, cp_dummies, thal_dummies, slope_dummies]
# final_df = pd.concat(frames, axis = 1)
# final_df

### Separate Dataset Into 80% Train and 20% Test

In [None]:
# Labels are the `target` column; every other column is a feature.
y = final_df["target"].values
x = final_df.drop(columns=["target"])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)

# Store the splits transposed. Later cells call `.T` again before handing them
# to scikit-learn.
x_train, x_test = x_train.T, x_test.T
y_train, y_test = y_train.T, y_test.T

## Sklearn Logistic Regression

In [None]:
# Logistic regression with scikit-learn's default settings.
lr = LogisticRegression()
# The training split is stored transposed, so flip it back for fitting.
lr.fit(X=x_train.T, y=y_train.T)

## Sklearn Naive Bayes

In [None]:
# Gaussian naive Bayes with scikit-learn's default settings.
nb = GaussianNB()
# The training split is stored transposed, so flip it back for fitting.
nb.fit(X=x_train.T, y=y_train.T)

# Comparing Algorithms

# Confusion Matrix

A confusion matrix is a summary of prediction results on a classification problem. The number of correct and incorrect predictions are summarized with count values and broken down by each class. This is the key to the confusion matrix.


 Reference: https://machinelearningmastery.com/confusion-matrix-machine-learning/

In [None]:
def accuracy(tn, fp, fn, tp):
    """Overall performance of the model: share of all predictions that are correct.

    Takes the four confusion-matrix counts and returns (tp + tn) / total as a
    fraction (not a percentage).
    """
    correct = tp + tn
    total = correct + fp + fn
    return correct / total

def precision(tp, fp):
    """How accurate the positive predictions are: tp / (tp + fp).

    Returns a fraction (not a percentage). When nothing was predicted positive
    the ratio is undefined; 0.0 is returned instead of raising or yielding NaN.
    """
    predicted_positive = tp + fp
    if predicted_positive == 0:
        return 0.0
    return tp / predicted_positive

def recall_sensitivity(tp, fn):
    """Coverage of the actual positive samples: tp / (tp + fn).

    Returns a fraction (not a percentage). When there are no actual positives
    the ratio is undefined; 0.0 is returned instead of raising or yielding NaN.
    """
    actual_positive = tp + fn
    if actual_positive == 0:
        return 0.0
    return tp / actual_positive

def specificity(tn, fp):
    """Coverage of the actual negative samples: tn / (tn + fp).

    Returns a fraction (not a percentage). When there are no actual negatives
    the ratio is undefined; 0.0 is returned instead of raising or yielding NaN.
    """
    actual_negative = tn + fp
    if actual_negative == 0:
        return 0.0
    return tn / actual_negative

def f1_score(tp, fp, fn):
    """Hybrid metric useful for unbalanced classes: 2*tp / (2*tp + fp + fn).

    Returns a fraction (not a percentage). When tp, fp and fn are all zero the
    ratio is undefined; 0.0 is returned instead of raising or yielding NaN.
    """
    denominator = 2 * tp + fp + fn
    if denominator == 0:
        return 0.0
    return 2 * tp / denominator

In [None]:
from sklearn.metrics import confusion_matrix

# Predict on the held-out split (stored transposed, so flip it back first).
y_head_lr = lr.predict(x_test.T)
y_head_nb = nb.predict(x_test.T)

# Pin the row/column order to the classes each model was trained on. Without
# `labels`, a class missing from both y_test and the predictions would shrink the
# matrix and break the four-way unpacking below.
cm_lr = confusion_matrix(y_test, y_head_lr, labels=lr.classes_)
cm_nb = confusion_matrix(y_test, y_head_nb, labels=nb.classes_)

# For a binary problem the flattened matrix reads tn, fp, fn, tp.
tn_lr, fp_lr, fn_lr, tp_lr = cm_lr.ravel()
tn_nb, fp_nb, fn_nb, tp_nb = cm_nb.ravel()

# One dict per metric, keyed by model name; the plotting cells read these.
accuracies = {}
precisions = {}
recall_sensitivities = {}
specificities = {}
F1_scores = {}

confusion_counts = {
    'Logistic_Regression': (tn_lr, fp_lr, fn_lr, tp_lr),
    'Naive_Bayes': (tn_nb, fp_nb, fn_nb, tp_nb),
}
for model_name, (tn, fp, fn, tp) in confusion_counts.items():
    accuracies[model_name] = accuracy(tn, fp, fn, tp)
    precisions[model_name] = precision(tp, fp)
    recall_sensitivities[model_name] = recall_sensitivity(tp, fn)
    specificities[model_name] = specificity(tn, fp)
    F1_scores[model_name] = f1_score(tp, fp, fn)


# Draw one annotated heatmap per model in the first two slots of a 2x3 grid.
plt.figure(figsize=(24, 12))
plt.suptitle("Confusion Matrixes", fontsize=24, x=0.37)
plt.subplots_adjust(wspace=0.4, hspace=0.4)

heatmap_options = dict(
    annot=True, cmap="Blues", fmt="d", cbar=False, annot_kws={"size": 24}
)
panels = [
    ("Logistic Regression Confusion Matrix", cm_lr),
    ("Naive Bayes Confusion Matrix", cm_nb),
]
for slot, (panel_title, matrix) in enumerate(panels, start=1):
    plt.subplot(2, 3, slot)
    plt.title(panel_title)
    sns.heatmap(matrix, **heatmap_options)

plt.show()

## Accuracy

In [None]:
# The stored scores are fractions; convert once so the printed values, the bars
# and the "%" axis all share the same 0-100 scale.
accuracy_pct = {model: score * 100 for model, score in accuracies.items()}

print('Logistic Regression: ' + str(accuracy_pct['Logistic_Regression']))
print('Naive Bayes: ' + str(accuracy_pct['Naive_Bayes']))

sns.set_style("whitegrid")
plt.figure(figsize=(16,5))
sns.barplot(x=list(accuracy_pct.keys()), y=list(accuracy_pct.values()))
# Ticks run up to and including 100 so the full percentage range is visible.
plt.yticks(np.arange(0, 101, 10))
plt.ylabel("Accuracy %")
plt.xlabel("Algorithms")
plt.show()

# Accuracy (%) recorded from runs of this notebook with each dataset-preparation
# variant; the values mirror the summary listed below the plot.
variants = ['Directly', 'With Binary', 'Without Binary']
recorded_accuracy = [
    ("Logistic Regression ", [83.61, 86.89, 86.89]),
    ("Naive Bayes", [85.25, 85.25, 86.89]),
]

plt.figure(figsize=(24,5))
plt.suptitle("Accuracy Comparison",fontsize=24, x=0.37)
plt.subplots_adjust(wspace = 0.4, hspace= 0.4)

for position, (model_title, scores) in enumerate(recorded_accuracy, start=1):
    plt.subplot(2,3,position)
    plt.title(model_title)
    sns.barplot(x=variants, y=scores)
    # Ticks and labels are set per subplot; setting them before plt.subplot()
    # would put them on a separate implicit axes instead of on the panels.
    plt.yticks(np.arange(0, 101, 10))
    plt.ylabel("Accuracy %")
    # The x axis lists preparation variants, not algorithms.
    plt.xlabel("Dataset preparation")

plt.show()

Directly Using The Dataset: 
    
    Sklearn Logistic Regression Accuracy: 83.61%
    Sklearn Naive Bayes Accuracy: 85.25%

Using Dummy Variables Even For Binary Categories:
    
    Sklearn Logistic Regression Accuracy: 86.89%
    Sklearn Naive Bayes Accuracy: 85.25%

Using Dummy Variables Without Binary Categories:
    
    Sklearn Logistic Regression Accuracy: 86.89%
    Sklearn Naive Bayes Accuracy: 86.89%

## Precision

In [None]:
# The stored scores are fractions; convert once so the printed values, the bars
# and the "%" axis all share the same 0-100 scale.
precision_pct = {model: score * 100 for model, score in precisions.items()}

print('Logistic Regression: ' + str(precision_pct['Logistic_Regression']))
print('Naive Bayes: ' + str(precision_pct['Naive_Bayes']))

sns.set_style("whitegrid")
plt.figure(figsize=(16,5))
sns.barplot(x=list(precision_pct.keys()), y=list(precision_pct.values()))
# Ticks run up to and including 100 so the full percentage range is visible.
plt.yticks(np.arange(0, 101, 10))
plt.ylabel("Precision %")
plt.xlabel("Algorithms")
plt.show()

# Precision (%) recorded from runs of this notebook with each dataset-preparation
# variant; the values mirror the summary listed below the plot.
variants = ['Directly', 'With Binary', 'Without Binary']
recorded_precision = [
    ("Logistic Regression", [81.58, 88.24, 88.24]),
    ("Naive Bayes", [83.78, 85.71, 88.24]),
]

plt.figure(figsize=(24,5))
plt.suptitle("Precision Comparison",fontsize=24, x=0.37)
plt.subplots_adjust(wspace = 0.4, hspace= 0.4)

for position, (model_title, scores) in enumerate(recorded_precision, start=1):
    plt.subplot(2,3,position)
    plt.title(model_title)
    sns.barplot(x=variants, y=scores)
    # Ticks and labels are set per subplot; setting them before plt.subplot()
    # would put them on a separate implicit axes instead of on the panels.
    plt.yticks(np.arange(0, 101, 10))
    plt.ylabel("Precision %")
    # The x axis lists preparation variants, not algorithms.
    plt.xlabel("Dataset preparation")

plt.show()

Directly Using The Dataset: 
    
    Sklearn Logistic Regression Precision: 81.58%
    Sklearn Naive Bayes Precision: 83.78%

Using Dummy Variables Even For Binary Categories:
    
    Sklearn Logistic Regression Precision: 88.24%
    Sklearn Naive Bayes Precision: 85.71%

Using Dummy Variables Without Binary Categories:
    
    Sklearn Logistic Regression Precision: 88.24%
    Sklearn Naive Bayes Precision: 88.24%

## Recall Sensitivity

In [None]:
# The stored scores are fractions; convert once so the printed values, the bars
# and the axis ticks all share the same 0-100 scale.
recall_pct = {model: score * 100 for model, score in recall_sensitivities.items()}

print('Logistic Regression: ' + str(recall_pct['Logistic_Regression']))
print('Naive Bayes: ' + str(recall_pct['Naive_Bayes']))

sns.set_style("whitegrid")
plt.figure(figsize=(16,5))
sns.barplot(x=list(recall_pct.keys()), y=list(recall_pct.values()))
# Ticks run up to and including 100 so the full percentage range is visible.
plt.yticks(np.arange(0, 101, 10))
# The label carries the unit now that the bars are percentages.
plt.ylabel("Recall Sensitivities %")
plt.xlabel("Algorithms")
plt.show()

# Recall (%) recorded from runs of this notebook with each dataset-preparation
# variant; the values mirror the summary listed below the plot.
variants = ['Directly', 'With Binary', 'Without Binary']
recorded_recall = [
    ("Logistic Regression ", [91.18, 88.24, 88.24]),
    ("Naive Bayes", [91.18, 88.24, 88.24]),
]

plt.figure(figsize=(24,5))
plt.suptitle("Recall Sensitivities Comparison",fontsize=24, x=0.37)
plt.subplots_adjust(wspace = 0.4, hspace= 0.4)

for position, (model_title, scores) in enumerate(recorded_recall, start=1):
    plt.subplot(2,3,position)
    plt.title(model_title)
    sns.barplot(x=variants, y=scores)
    # Ticks and labels are set per subplot; setting them before plt.subplot()
    # would put them on a separate implicit axes instead of on the panels.
    plt.yticks(np.arange(0, 101, 10))
    plt.ylabel("Recall Sensitivities %")
    # The x axis lists preparation variants, not algorithms.
    plt.xlabel("Dataset preparation")

plt.show()

Directly Using The Dataset: 
    
    Sklearn Logistic Regression Recall Sensitivity: 91.18%
    Sklearn Naive Bayes Recall Sensitivity: 91.18%

Using Dummy Variables Even For Binary Categories:
    
    Sklearn Logistic Regression Recall Sensitivity: 88.24%
    Sklearn Naive Bayes Recall Sensitivity: 88.24%

Using Dummy Variables Without Binary Categories:
    
    Sklearn Logistic Regression Recall Sensitivity: 88.24%
    Sklearn Naive Bayes Recall Sensitivity: 88.24%

## Specificity

In [None]:
# The stored scores are fractions; convert once so the printed values, the bars
# and the axis ticks all share the same 0-100 scale.
specificity_pct = {model: score * 100 for model, score in specificities.items()}

print('Logistic Regression: ' + str(specificity_pct['Logistic_Regression']))
print('Naive Bayes: ' + str(specificity_pct['Naive_Bayes']))

sns.set_style("whitegrid")
plt.figure(figsize=(16,5))
sns.barplot(x=list(specificity_pct.keys()), y=list(specificity_pct.values()))
# Ticks run up to and including 100 so the full percentage range is visible.
plt.yticks(np.arange(0, 101, 10))
# The label carries the unit now that the bars are percentages.
plt.ylabel("Specificity %")
plt.xlabel("Algorithms")
plt.show()

# Specificity (%) recorded from runs of this notebook with each
# dataset-preparation variant; the values mirror the summary below the plot.
variants = ['Directly', 'With Binary', 'Without Binary']
recorded_specificity = [
    ("Logistic Regression", [74.07, 85.19, 85.19]),
    ("Naive Bayes", [77.78, 81.48, 85.19]),
]

plt.figure(figsize=(24,5))
plt.suptitle("Specificity Comparison",fontsize=24, x=0.37)
plt.subplots_adjust(wspace = 0.4, hspace= 0.4)

for position, (model_title, scores) in enumerate(recorded_specificity, start=1):
    plt.subplot(2,3,position)
    plt.title(model_title)
    sns.barplot(x=variants, y=scores)
    # Ticks and labels are set per subplot; setting them before plt.subplot()
    # would put them on a separate implicit axes instead of on the panels.
    plt.yticks(np.arange(0, 101, 10))
    plt.ylabel("Specificity %")
    # The x axis lists preparation variants, not algorithms.
    plt.xlabel("Dataset preparation")

plt.show()

Directly Using The Dataset: 
    
    Sklearn Logistic Regression Specificity: 74.07%
    Sklearn Naive Bayes Specificity: 77.78%

Using Dummy Variables Even For Binary Categories:
    
    Sklearn Logistic Regression Specificity: 85.19%
    Sklearn Naive Bayes Specificity: 81.48%

Using Dummy Variables Without Binary Categories:
    
    Sklearn Logistic Regression Specificity: 85.19%
    Sklearn Naive Bayes Specificity: 85.19%

## F1 Score

In [None]:
# The stored scores are fractions; convert once so the printed values, the bars
# and the axis ticks all share the same 0-100 scale.
f1_pct = {model: score * 100 for model, score in F1_scores.items()}

print('Logistic Regression: ' + str(f1_pct['Logistic_Regression']))
print('Naive Bayes: ' + str(f1_pct['Naive_Bayes']))

sns.set_style("whitegrid")
plt.figure(figsize=(16,5))
sns.barplot(x=list(f1_pct.keys()), y=list(f1_pct.values()))
# Ticks run up to and including 100 so the full percentage range is visible.
plt.yticks(np.arange(0, 101, 10))
# The label carries the unit now that the bars are percentages.
plt.ylabel("F1 Score %")
plt.xlabel("Algorithms")
plt.show()

# F1 score (%) recorded from runs of this notebook with each dataset-preparation
# variant; the values mirror the summary listed below the plot.
variants = ['Directly', 'With Binary', 'Without Binary']
recorded_f1 = [
    ("Logistic Regression", [86.11, 88.24, 88.24]),
    ("Naive Bayes", [87.32, 86.96, 88.24]),
]

plt.figure(figsize=(24,5))
plt.suptitle("F1 Score Comparison",fontsize=24, x=0.37)
plt.subplots_adjust(wspace = 0.4, hspace= 0.4)

for position, (model_title, scores) in enumerate(recorded_f1, start=1):
    plt.subplot(2,3,position)
    plt.title(model_title)
    sns.barplot(x=variants, y=scores)
    # Ticks and labels are set per subplot; setting them before plt.subplot()
    # would put them on a separate implicit axes instead of on the panels.
    plt.yticks(np.arange(0, 101, 10))
    plt.ylabel("F1 Score %")
    # The x axis lists preparation variants, not algorithms.
    plt.xlabel("Dataset preparation")

plt.show()

Directly Using The Dataset: 
    
    Sklearn Logistic Regression F1 Score: 86.11%
    Sklearn Naive Bayes F1 Score: 87.32%

Using Dummy Variables Even For Binary Categories:
    
    Sklearn Logistic Regression F1 Score: 88.24%
    Sklearn Naive Bayes F1 Score: 86.96%

Using Dummy Variables Without Binary Categories:
    
    Sklearn Logistic Regression F1 Score: 88.24%
    Sklearn Naive Bayes F1 Score: 88.24%