## Import Libraries

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from torchmetrics.functional.classification import binary_accuracy, binary_auroc, binary_f1_score, binary_precision, binary_recall
import tensorflow as tf
import time
from keras.models import Sequential
from keras.layers import Dense, InputLayer, Dropout, InputSpec
from keras.utils import *
from keras.layers import *
from keras.models import *
from keras.callbacks import *
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN

## Read Data

In [3]:
data = pd.read_csv('heart_disease_data.csv')
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63,1,1,145,233,1,2,150,0,2.3,3,0.0,6.0,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3.0,3.0,2
2,67,1,4,120,229,0,2,129,1,2.6,2,2.0,7.0,1
3,37,1,3,130,250,0,0,187,0,3.5,3,0.0,3.0,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0.0,3.0,0


In [4]:
data = data.drop_duplicates(ignore_index=True)
data = data.fillna(0)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    float64
 12  thal      303 non-null    float64
 13  num       303 non-null    int64  
dtypes: float64(3), int64(11)
memory usage: 33.3 KB


In [6]:
data['num'].unique()

array([0, 2, 1, 3, 4], dtype=int64)

In [7]:
data['num'].value_counts()

num
0    164
1     55
2     36
3     35
4     13
Name: count, dtype: int64

In [20]:
combine = SMOTEENN()
X_combine, y_combine = combine.fit_resample(data.drop('num', axis=1), data['num'])
y_combine.value_counts()

num
4    119
2    104
3     97
1     69
0     39
Name: count, dtype: int64

In [11]:
smote = SMOTE(sampling_strategy='minority')
X_smote, y_smote = smote.fit_resample(data.drop('num', axis=1), data['num'])

Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



In [12]:
y_smote.value_counts()

num
0    164
4    164
1     55
2     36
3     35
Name: count, dtype: int64

In [6]:
from sklearn.preprocessing import OneHotEncoder

# Create an instance of OneHotEncoder
onehot_encoder = OneHotEncoder()

# One-hot encode the 'num' column
onehot_encoded = onehot_encoder.fit_transform(data[['num']]).toarray()

onehot_encoded

array([[1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0.],
       ...,
       [0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0.]])

In [7]:
onehot_encoded.shape

(303, 5)

## Data Preprocessing

In [8]:
scaler = MinMaxScaler()
scaler.fit(data)

# Transform the DataFrame to obtain the normalized data
data_normalized = scaler.transform(data)

In [9]:
X = data_normalized[:, :-1]
y = data_normalized[:, -1]

In [10]:
y

array([0.  , 0.5 , 0.25, 0.  , 0.  , 0.  , 0.75, 0.  , 0.5 , 0.25, 0.  ,
       0.  , 0.5 , 0.  , 0.  , 0.  , 0.25, 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.25, 0.75, 1.  , 0.  , 0.  , 0.  , 0.  , 0.75, 0.  , 0.5 , 0.25,
       0.  , 0.  , 0.  , 0.75, 0.25, 0.75, 0.  , 1.  , 0.  , 0.  , 0.  ,
       0.25, 1.  , 0.  , 1.  , 0.  , 0.  , 0.  , 0.  , 0.5 , 0.  , 0.25,
       0.25, 0.25, 0.25, 0.  , 0.  , 0.5 , 0.  , 0.25, 0.  , 0.5 , 0.5 ,
       0.25, 0.  , 0.5 , 0.25, 0.  , 0.75, 0.25, 0.25, 0.25, 0.  , 0.25,
       0.  , 0.  , 0.75, 0.  , 0.  , 0.  , 0.75, 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.75, 0.  , 0.  , 0.  , 0.25, 0.5 , 0.75, 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.75, 0.  , 0.5 , 0.25, 0.5 , 0.75,
       0.25, 0.25, 0.  , 0.5 , 0.5 , 0.  , 0.  , 0.  , 0.75, 0.5 , 0.75,
       1.  , 0.  , 0.75, 0.25, 0.  , 0.75, 0.75, 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 1.  , 0.75, 0.25, 0.  , 0.  , 0.25, 0.  ,
       0.25, 0.  , 0.25, 1.  , 0.  , 0.  , 0.  , 0.

In [11]:
y = onehot_encoded

## Modelling

In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [13]:
class MultiChannelWeightedDropout(tf.keras.Model):
    def __init__(self, out, p=0.5):
        super(MultiChannelWeightedDropout, self).__init__()

        self.in_layer = tf.keras.layers.Dense(3, activation='softmax')

        self.dropout1 = tf.keras.layers.Dropout(p)
        self.dropout2 = tf.keras.layers.Dropout(p)
        self.dropout3 = tf.keras.layers.Dropout(p)

        self.out_layer = tf.keras.layers.Dense(out, activation='sigmoid')

    def call(self, x, training=False):
        x = self.in_layer(x)
        weights = x

        # Apply weighted dropout
        channel1 = self.dropout1(x[:, 0], training=training) * weights[:, 0]
        channel2 = self.dropout2(x[:, 1], training=training) * weights[:, 1]
        channel3 = self.dropout3(x[:, 2], training=training) * weights[:, 2]

        x = tf.stack([channel1, channel2, channel3], axis=1)
        x = self.out_layer(x)
        return x

In [14]:
model = Sequential([
    tf.keras.layers.Input(shape=X.shape[-1]),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(8, activation='relu'),
    MultiChannelWeightedDropout(y.shape[-1], p=0.5),
])

In [15]:
model.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=0.01), metrics=['accuracy'])

## Cross Validation

In [16]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [17]:
fold_test = []
fold_pred = []

In [18]:
from torchmetrics.functional.classification import multilabel_accuracy

In [19]:
for fold, (train_idx, test_idx) in enumerate(kfold.split(X, y)):
    x_train, x_test, y_train, y_test = (X[train_idx]), (X[test_idx]), (y[train_idx]), (y[test_idx])
    
    train_init = time.time()
    model.fit(x_train, y_train, epochs=100, batch_size=32, verbose=0)
    train_time = time.time() - train_init

    test_init = time.time()
    prediction = model.predict(x_test)
    test_time = time.time() - test_init

    fold_test.append(y_test)
    fold_pred.append(prediction)

    preds = torch.tensor(prediction)
    trues = torch.tensor(y_test)

    print(f"Fold: {fold+1} | Accuracy: {multilabel_accuracy(preds=preds, target=trues, num_labels=5, average='micro').item():.5f} | Train Time: {train_time} | Test Time: {test_time}")  

Fold: 1 | Accuracy: 0.86452 | Train Time: 3.522820472717285 | Test Time: 0.11262083053588867
Fold: 2 | Accuracy: 0.89032 | Train Time: 2.78016996383667 | Test Time: 0.03856062889099121
Fold: 3 | Accuracy: 0.85806 | Train Time: 2.878084421157837 | Test Time: 0.03803873062133789
Fold: 4 | Accuracy: 0.88667 | Train Time: 2.9217123985290527 | Test Time: 0.03904533386230469
Fold: 5 | Accuracy: 0.88000 | Train Time: 2.9544618129730225 | Test Time: 0.037546396255493164
Fold: 6 | Accuracy: 0.92000 | Train Time: 2.910968542098999 | Test Time: 0.03957223892211914
Fold: 7 | Accuracy: 0.88667 | Train Time: 2.9549062252044678 | Test Time: 0.042440176010131836
Fold: 8 | Accuracy: 0.84000 | Train Time: 2.9668500423431396 | Test Time: 0.037046194076538086
Fold: 9 | Accuracy: 0.90000 | Train Time: 2.9199862480163574 | Test Time: 0.03899359703063965
Fold: 10 | Accuracy: 0.91333 | Train Time: 2.9519951343536377 | Test Time: 0.037302255630493164


In [20]:
from torchmetrics.functional.classification import multilabel_accuracy, multilabel_precision, multilabel_recall, multilabel_f1_score, multilabel_auroc

accuracy = []
precision = []
recall = []
f1_score = []
auroc = []

for i in range(10):
    trues = torch.tensor(fold_test[i])
    preds = torch.tensor(fold_pred[i])
    accuracy.append(multilabel_accuracy(preds=preds, target=trues, num_labels=5).item())
    precision.append(multilabel_precision(preds=preds, target=trues, num_labels=5).item())
    recall.append(multilabel_recall(preds=preds, target=trues, num_labels=5).item())
    f1_score.append(multilabel_f1_score(preds=preds, target=trues, num_labels=5).item())
    auroc.append(multilabel_auroc(preds.float(), trues.long(), num_labels=5).item())

print(f"Accuracy: {np.mean(accuracy):.5f} | Precision: {np.mean(precision):.5f} | Recall: {np.mean(recall):.5f} | F1 Score: {np.mean(f1_score):.5f} | AUC ROC: {np.mean(auroc):.5f}")

Accuracy: 0.88396 | Precision: 0.19687 | Recall: 0.18210 | F1 Score: 0.18347 | AUC ROC: 0.77974




In [21]:
import pickle

with open(f'../Results/heart_disease_pred.pkl', 'wb') as file:
    pickle.dump(fold_pred, file)
with open(f'../Results/heart_disease_true.pkl', 'wb') as file:
    pickle.dump(fold_test, file)