In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import KFold
from warnings import filterwarnings
filterwarnings('ignore')

In [6]:
# Importing dataset and converting to dataframe
data = pd.read_csv('heart.csv', header=None)
df = pd.DataFrame(data)  # Dataframe

In [7]:
FP = 0
FN = 0
TN = 0
TP = 0

In [8]:
def nbkmh(train_index, test_index):
    # Extracting columns x and y separately for K-means and Naive Bayes classifiers
    x_kmeans = df.iloc[:, 0:5]
    x_kmeans = x_kmeans.drop(x_kmeans.columns[1:3], axis=1)
    x_kmeans = pd.DataFrame(scale(x_kmeans))

    x_naive = df.iloc[:, 0:13]

    y = df.iloc[:, 13]
    y = y - 1

    y_train = pd.Series(y.iloc[train_index])
    y_test = pd.Series(y.iloc[test_index])

    x_train_kmeans = x_kmeans.iloc[train_index, :]
    x_test_kmeans = x_kmeans.iloc[test_index, :]

    x_train_naive = x_naive.iloc[train_index, :]
    x_test_naive = x_naive.iloc[test_index, :]

    # K-means model for the processed data
    clusters = 5
    model_kmeans = KMeans(init='k-means++', n_clusters=clusters, n_init=10, random_state=10000)
    model_kmeans.fit(x_train_kmeans)
    kmean_predictions = model_kmeans.predict(x_train_kmeans)

    # Building dataset according to clusters
    x = [pd.DataFrame() for _ in range(0, clusters)]
    y = [pd.Series() for _ in range(0, clusters)]

    for kmean_prediction, i in zip(kmean_predictions, range(len(x_train_kmeans))):
        row_x = x_train_naive.iloc[i, :]
        row_y = pd.Series(y_train.iloc[i])
        index = int(kmean_prediction)
        x[index] = x[index].append(row_x, ignore_index=True)
        y[index] = y[index].append(row_y)

    # Applying Naive Bayes classifier
    clstr_n = [MultinomialNB(alpha=2, fit_prior=True) for _ in range(0, clusters)]

    for i in range(0, clusters):
        clstr_n[i].fit(x[i], y[i])

    # Calculating predictions for the testing based on the hybrid algorithm
    predicts = []
    c = 0
    for i in range(len(x_test_kmeans)):
        prediction = model_kmeans.predict(x_test_kmeans.iloc[i, :].values.reshape(1, -1))
        prediction = int(prediction)
        pred_naive = clstr_n[prediction].predict(x_test_naive.iloc[i, :].values.reshape(1, -1))
        predicts.append(pred_naive)
        if pred_naive == y_test.iloc[i]:
            c += 1

    print((c * 100.0) / len(x_test_kmeans))

    # Metrics
    predicts = np.array(predicts)
    cm = metrics.confusion_matrix(y_test, predicts) / len(y_test)

    global FP
    global FN
    global TN
    global TP

    FP += cm[0][0]
    FN += cm[1][0]
    TN += cm[0][1]
    TP += cm[1][1]

    return ((c * 100.0) / len(x_test_kmeans))

def main():
    scores = []
    kf = KFold(n_splits=10)
    for (train_index, test_index), i in zip(kf.split(df), range(0, 10)):
        print("Iteration " + str(i + 1) + " : ")
        scores.append(nbkmh(train_index, test_index))
    print("\n10 Fold Accuracy", np.array(scores).mean())
    print("FP", FP * 10)
    print("FN", FN * 10)
    print("TN", TN * 10)
    print("TP", TP * 10)

if __name__ == '__main__':
    main()

Iteration 1 : 
78.94736842105263
Iteration 2 : 
77.19298245614036
Iteration 3 : 
89.47368421052632
Iteration 4 : 
87.71929824561404
Iteration 5 : 
85.96491228070175
Iteration 6 : 
82.45614035087719
Iteration 7 : 
70.17543859649123
Iteration 8 : 
83.92857142857143
Iteration 9 : 
85.71428571428571
Iteration 10 : 
80.35714285714286

10 Fold Accuracy 82.19298245614036
FP 47.08959899749374
FN 10.216165413533833
TN 7.590852130325814
TP 35.10338345864662
