In [30]:
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split

pd.set_option('display.max_columns', None)


In [31]:
df= pd.read_csv('CVD_cleaned.csv')

# Data Preprocessing

In [32]:
df2 = df.drop(['Fruit_Consumption','Green_Vegetables_Consumption','FriedPotato_Consumption','Checkup','Height_(cm)','Weight_(kg)','General_Health'], axis=1)

df2= pd.get_dummies(df2, columns=['Diabetes'],drop_first=True,dtype=np.int8)

age_mapping = {"18-24": 1, "25-29": 2, "30-34": 3, "35-39": 4, "40-44": 5,
               "45-49": 6, "50-54": 7, "55-59": 8, "60-64": 9, "65-69": 10, "70-74": 11, "75-79": 12, "80+": 13}

df2["Age_Category"] = df2["Age_Category"].map(age_mapping)

yes_no_columns = ["Exercise", "Heart_Disease", "Skin_Cancer", "Other_Cancer", 
                  "Depression", "Arthritis", "Smoking_History"]

df2[yes_no_columns] = df2[yes_no_columns].replace({"Yes": 1, "No": 0})

df2["Sex"] = df2["Sex"].map({"Male": 1, "Female": 0})

df2.rename(columns={"Diabetes_No, pre-diabetes or borderline diabetes": "Pre-diabetes","Diabetes_Yes":"Diabetes","Diabetes_Yes, but female told only during pregnancy":"Pregnancy-diabetes"},inplace=True)

df2.head()

  df2[yes_no_columns] = df2[yes_no_columns].replace({"Yes": 1, "No": 0})


Unnamed: 0,Exercise,Heart_Disease,Skin_Cancer,Other_Cancer,Depression,Arthritis,Sex,Age_Category,BMI,Smoking_History,Alcohol_Consumption,Pre-diabetes,Diabetes,Pregnancy-diabetes
0,0,0,0,0,0,1,0,11,14.54,1,0.0,0,0,0
1,0,1,0,0,0,0,0,11,28.29,0,0.0,0,1,0
2,1,0,0,0,0,0,0,9,33.47,0,4.0,0,1,0
3,1,1,0,0,0,0,1,12,28.73,0,0.0,0,1,0
4,0,0,0,0,0,0,1,13,24.37,1,0.0,0,0,0


# Neural Networks

In [40]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

#scale continuous features
scaler = StandardScaler()
columns_to_scale = ['BMI', 'Alcohol_Consumption']
df2[columns_to_scale] = scaler.fit_transform(df2[columns_to_scale])


# Convert DataFrame to NumPy
X = df2.drop(columns=["Heart_Disease"]).values 
y = df2["Heart_Disease"].values  

# Train-Test Split (80-20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert to PyTorch Tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)  # Reshape for single output
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).view(-1, 1)

In [42]:
class HeartDiseaseNN(nn.Module):
    def __init__(self, input_dim):
        super(HeartDiseaseNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)  # First hidden layer
        self.bn1 = nn.BatchNorm1d(128)  # Batch Normalization
        self.fc2 = nn.Linear(128, 64)  # Second hidden layer
        self.bn2 = nn.BatchNorm1d(64)
        self.fc3 = nn.Linear(64, 32)  # Third hidden layer
        self.bn3 = nn.BatchNorm1d(32)
        self.fc4 = nn.Linear(32, 1)  # Output layer
        self.dropout = nn.Dropout(0.3)  # Dropout for regularization
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()  # Sigmoid for binary classification

    def forward(self, x):
        x = self.relu(self.bn1(self.fc1(x)))
        x = self.dropout(x)
        x = self.relu(self.bn2(self.fc2(x)))
        x = self.dropout(x)
        x = self.relu(self.bn3(self.fc3(x)))
        x = self.dropout(x)
        x = self.sigmoid(self.fc4(x))  # Sigmoid activation at output
        return x


input_dim = X_train.shape[1]
model = HeartDiseaseNN(input_dim)

# First Model with class imbalanced

In [44]:
criterion = nn.BCELoss()  
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Move data to device
X_train_tensor, y_train_tensor = X_train_tensor.to(device), y_train_tensor.to(device)
X_test_tensor, y_test_tensor = X_test_tensor.to(device), y_test_tensor.to(device)

# Training Loop
num_epochs = 30
batch_size = 32

for epoch in range(num_epochs):
    model.train()  
    optimizer.zero_grad() 
    outputs = model(X_train_tensor)  # Forward pass
    loss = criterion(outputs, y_train_tensor)  # Compute loss
    loss.backward()  # Backpropagation
    optimizer.step()  # Update weights

    # Calculate Training Accuracy
    y_train_pred = (outputs > 0.5).float()
    train_accuracy = (y_train_pred.eq(y_train_tensor).sum().item()) / y_train_tensor.shape[0]

    # Evaluate on Test Set
    if (epoch + 1) % 10 == 0:
        model.eval()
        with torch.no_grad():
            y_pred_weighted = (model(X_test_tensor) > 0.5).float().cpu().numpy()

        # Calculate recall for class 1
        recall = recall_score(y_test, y_pred_weighted, pos_label=1)
        
        # Calculate training accuracy
        y_train_pred = (outputs > 0.5).float()
        train_accuracy = (y_train_pred.eq(y_train_tensor).sum().item()) / y_train_tensor.shape[0]

        # Calculate test accuracy (no .cpu() needed on y_test)
        test_accuracy = (y_pred_weighted == y_test).mean()

        # Print metrics every 10 epochs
        print(f"Epoch [{epoch+1}/{num_epochs}] - Loss: {loss.item():.4f} | "
            f"Train Acc: {train_accuracy:.4f} | Test Acc: {test_accuracy:.4f} | "
            f"Test Recall (`1`): {recall:.4f}")

Epoch [10/50] - Loss: 0.4981 | Train Acc: 0.8681 | Test Acc: 0.7910 | Test Recall (`1`): 0.4317
Epoch [20/50] - Loss: 0.4550 | Train Acc: 0.8881 | Test Acc: 0.8202 | Test Recall (`1`): 0.3642
Epoch [30/50] - Loss: 0.4181 | Train Acc: 0.8981 | Test Acc: 0.8531 | Test Recall (`1`): 0.2680
Epoch [40/50] - Loss: 0.3861 | Train Acc: 0.9067 | Test Acc: 0.8868 | Test Recall (`1`): 0.1541
Epoch [50/50] - Loss: 0.3600 | Train Acc: 0.9135 | Test Acc: 0.9118 | Test Recall (`1`): 0.0382


In [45]:
model.eval()

with torch.no_grad():
    y_test_pred_prob = model(X_test_tensor).cpu().numpy().flatten()  # Convert to NumPy and flatten

# Convert actual y_test tensor to NumPy
y_test_actual = y_test_tensor.cpu().numpy().flatten()  # Ensure it's a NumPy array

# Convert to a DataFrame
df_test_results = pd.DataFrame(X_test, columns=df2.drop(columns=["Heart_Disease"]).columns)  # Create DataFrame from X_test features
df_test_results["Heart_Disease_Actual"] = y_test_actual  # Add actual labels
df_test_results["Heart_Disease_Risk"] = y_test_pred_prob
df_test_results.tail(10)

Unnamed: 0,Exercise,Skin_Cancer,Other_Cancer,Depression,Arthritis,Sex,Age_Category,BMI,Smoking_History,Alcohol_Consumption,Pre-diabetes,Diabetes,Pregnancy-diabetes,Heart_Disease_Actual,Heart_Disease_Risk
61761,0.0,0.0,0.0,0.0,0.0,1.0,8.0,-0.17267,0.0,-0.621527,0.0,0.0,0.0,0.0,0.215577
61762,1.0,0.0,0.0,0.0,0.0,1.0,10.0,-0.437914,0.0,-0.621527,0.0,0.0,0.0,0.0,0.225521
61763,1.0,0.0,0.0,0.0,1.0,1.0,8.0,0.377748,1.0,0.841932,0.0,1.0,0.0,0.0,0.359739
61764,1.0,0.0,0.0,0.0,0.0,1.0,4.0,-0.186469,0.0,-0.621527,0.0,0.0,0.0,0.0,0.218419
61765,1.0,0.0,0.0,1.0,1.0,0.0,11.0,2.0612,0.0,-0.621527,0.0,0.0,0.0,0.0,0.381057
61766,0.0,0.0,0.0,0.0,1.0,0.0,9.0,1.500049,0.0,-0.621527,0.0,0.0,0.0,0.0,0.23499
61767,0.0,0.0,0.0,1.0,0.0,0.0,13.0,1.15048,0.0,-0.499572,0.0,0.0,0.0,0.0,0.36675
61768,0.0,0.0,0.0,1.0,1.0,0.0,10.0,4.003762,1.0,-0.621527,0.0,1.0,0.0,1.0,0.504065
61769,1.0,0.0,0.0,0.0,1.0,1.0,4.0,0.304154,1.0,0.841932,0.0,0.0,0.0,0.0,0.231453
61770,1.0,0.0,0.0,0.0,0.0,0.0,10.0,-0.178803,0.0,-0.621527,0.0,0.0,0.0,0.0,0.229093


In [47]:
print("Performance without Class Weights in Loss Function:")
print(classification_report(y_test, y_test_pred_class))


Performance without Class Weights in Loss Function:
              precision    recall  f1-score   support

           0       0.95      0.86      0.90     56774
           1       0.22      0.44      0.29      4997

    accuracy                           0.83     61771
   macro avg       0.58      0.65      0.60     61771
weighted avg       0.89      0.83      0.85     61771



In [48]:
from torch.nn import BCELoss
from sklearn.metrics import classification_report, recall_score
import torch

# Calculate class weights based on the training set distribution
class_counts = pd.Series(y_train).value_counts()
weight_0 = class_counts[1] / class_counts.sum()  # Weight for 'No Disease' class (0)
weight_1 = class_counts[0] / class_counts.sum()  # Weight for 'Heart Disease' class (1)

# Create class weights tensor
class_weights = torch.tensor([weight_0, weight_1]).to(device)

# Use weighted loss function
criterion_weighted = BCELoss(weight=class_weights[y_train_tensor.long()])

# Re-train the model with weighted loss, focusing on recall
num_epochs = 50
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    
    # Forward pass and loss calculation
    outputs = model(X_train_tensor)
    loss = criterion_weighted(outputs, y_train_tensor)
    
    # Backpropagation and optimization
    loss.backward()
    optimizer.step()

    # Evaluate on the test set every 10 epochs
    if (epoch + 1) % 10 == 0:
        model.eval()
        with torch.no_grad():
            y_pred_weighted = (model(X_test_tensor) > 0.5).float().cpu().numpy()

        # Calculate recall for class 1
        recall = recall_score(y_test, y_pred_weighted, pos_label=1)
        
        # Calculate training accuracy
        y_train_pred = (outputs > 0.5).float()
        train_accuracy = (y_train_pred.eq(y_train_tensor).sum().item()) / y_train_tensor.shape[0]

        # Calculate test accuracy (no .cpu() needed on y_test)
        test_accuracy = (y_pred_weighted == y_test).mean()

        # Print metrics every 10 epochs
        print(f"Epoch [{epoch+1}/{num_epochs}] - Loss: {loss.item():.4f} | "
            f"Train Acc: {train_accuracy:.4f} | Test Acc: {test_accuracy:.4f} | "
            f"Test Recall (`1`): {recall:.4f}")



Epoch [10/50] - Loss: 0.0962 | Train Acc: 0.8829 | Test Acc: 0.8079 | Test Recall (`1`): 0.4263
Epoch [20/50] - Loss: 0.0896 | Train Acc: 0.8214 | Test Acc: 0.7915 | Test Recall (`1`): 0.4765
Epoch [30/50] - Loss: 0.0862 | Train Acc: 0.7908 | Test Acc: 0.7696 | Test Recall (`1`): 0.5327
Epoch [40/50] - Loss: 0.0845 | Train Acc: 0.7697 | Test Acc: 0.7228 | Test Recall (`1`): 0.6354
Epoch [50/50] - Loss: 0.0832 | Train Acc: 0.7581 | Test Acc: 0.6821 | Test Recall (`1`): 0.7090


In [49]:
model.eval()

with torch.no_grad():
    y_test_pred_prob = model(X_test_tensor).cpu().numpy().flatten()  # Convert to NumPy and flatten

# Convert actual y_test tensor to NumPy
y_test_actual = y_test_tensor.cpu().numpy().flatten()  # Ensure it's a NumPy array

# Convert to a DataFrame
df_test_results = pd.DataFrame(X_test, columns=df2.drop(columns=["Heart_Disease"]).columns)  # Create DataFrame from X_test features
df_test_results["Heart_Disease_Actual"] = y_test_actual  # Add actual labels
df_test_results["Heart_Disease_Risk"] = y_test_pred_prob
df_test_results.tail(10)

Unnamed: 0,Exercise,Skin_Cancer,Other_Cancer,Depression,Arthritis,Sex,Age_Category,BMI,Smoking_History,Alcohol_Consumption,Pre-diabetes,Diabetes,Pregnancy-diabetes,Heart_Disease_Actual,Heart_Disease_Risk
61761,0.0,0.0,0.0,0.0,0.0,1.0,8.0,-0.17267,0.0,-0.621527,0.0,0.0,0.0,0.0,0.314088
61762,1.0,0.0,0.0,0.0,0.0,1.0,10.0,-0.437914,0.0,-0.621527,0.0,0.0,0.0,0.0,0.37199
61763,1.0,0.0,0.0,0.0,1.0,1.0,8.0,0.377748,1.0,0.841932,0.0,1.0,0.0,0.0,0.585221
61764,1.0,0.0,0.0,0.0,0.0,1.0,4.0,-0.186469,0.0,-0.621527,0.0,0.0,0.0,0.0,0.058686
61765,1.0,0.0,0.0,1.0,1.0,0.0,11.0,2.0612,0.0,-0.621527,0.0,0.0,0.0,0.0,0.533614
61766,0.0,0.0,0.0,0.0,1.0,0.0,9.0,1.500049,0.0,-0.621527,0.0,0.0,0.0,0.0,0.357253
61767,0.0,0.0,0.0,1.0,0.0,0.0,13.0,1.15048,0.0,-0.499572,0.0,0.0,0.0,0.0,0.576921
61768,0.0,0.0,0.0,1.0,1.0,0.0,10.0,4.003762,1.0,-0.621527,0.0,1.0,0.0,1.0,0.710865
61769,1.0,0.0,0.0,0.0,1.0,1.0,4.0,0.304154,1.0,0.841932,0.0,0.0,0.0,0.0,0.144637
61770,1.0,0.0,0.0,0.0,0.0,0.0,10.0,-0.178803,0.0,-0.621527,0.0,0.0,0.0,0.0,0.277016


In [52]:
print("Performance with Class Weights in Loss Function:")
print(classification_report(y_test, y_pred_weighted))

Performance with Class Weights in Loss Function:
              precision    recall  f1-score   support

           0       0.97      0.75      0.85     56774
           1       0.20      0.71      0.32      4997

    accuracy                           0.75     61771
   macro avg       0.59      0.73      0.58     61771
weighted avg       0.91      0.75      0.80     61771



In [53]:
torch.save(model.state_dict(), "heart_disease_model.pth")