In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn import BCELoss

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, recall_score
from sklearn.model_selection import train_test_split

pd.set_option('display.max_columns', None)


In [2]:
df= pd.read_csv('CVD_cleaned.csv')

# Data Preprocessing

In [3]:
df2 = df.drop(['Fruit_Consumption','Green_Vegetables_Consumption','FriedPotato_Consumption','Checkup','Height_(cm)','Weight_(kg)','General_Health'], axis=1)

df2= pd.get_dummies(df2, columns=['Diabetes'],drop_first=True,dtype=np.int8)

age_mapping = {"18-24": 1, "25-29": 2, "30-34": 3, "35-39": 4, "40-44": 5,
               "45-49": 6, "50-54": 7, "55-59": 8, "60-64": 9, "65-69": 10, "70-74": 11, "75-79": 12, "80+": 13}

df2["Age_Category"] = df2["Age_Category"].map(age_mapping)


yes_no_columns = ["Exercise", "Heart_Disease", "Skin_Cancer", "Other_Cancer", 
                  "Depression", "Arthritis", "Smoking_History"]

df2[yes_no_columns] = df2[yes_no_columns].replace({"Yes": 1, "No": 0})

df2["Sex"] = df2["Sex"].map({"Male": 1, "Female": 0})

df2.rename(columns={"Diabetes_No, pre-diabetes or borderline diabetes": "Pre-diabetes","Diabetes_Yes":"Diabetes","Diabetes_Yes, but female told only during pregnancy":"Pregnancy-diabetes"},inplace=True)

#scale continuous features
scaler = StandardScaler()
columns_to_scale = ['BMI', 'Alcohol_Consumption']
df2[columns_to_scale] = scaler.fit_transform(df2[columns_to_scale])

df2.head()

  df2[yes_no_columns] = df2[yes_no_columns].replace({"Yes": 1, "No": 0})


Unnamed: 0,Exercise,Heart_Disease,Skin_Cancer,Other_Cancer,Depression,Arthritis,Sex,Age_Category,BMI,Smoking_History,Alcohol_Consumption,Pre-diabetes,Diabetes,Pregnancy-diabetes
0,0,0,0,0,0,1,0,11,-2.159696,1,-0.621527,0,0,0
1,0,1,0,0,0,0,0,11,-0.051548,0,-0.621527,0,1,0
2,1,0,0,0,0,0,0,9,0.742649,0,-0.133707,0,1,0
3,1,1,0,0,0,0,1,12,0.015913,0,-0.621527,0,1,0
4,0,0,0,0,0,0,1,13,-0.652562,1,-0.621527,0,0,0


In [4]:
scaler

# Neural Networks

In [5]:
# Convert DataFrame to NumPy
X = df2.drop(columns=["Heart_Disease"]).values 
y = df2["Heart_Disease"].values  

# Train-Test Split (80-20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert to PyTorch Tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1) 
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).view(-1, 1)

In [39]:
class HeartDiseaseNN(nn.Module):
    def __init__(self, input_dim):
        super(HeartDiseaseNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)  # First hidden layer
        self.bn1 = nn.BatchNorm1d(128)  
        self.fc2 = nn.Linear(128, 64)  # Second hidden layer
        self.bn2 = nn.BatchNorm1d(64)
        self.fc3 = nn.Linear(64, 32)  # Third hidden layer
        self.bn3 = nn.BatchNorm1d(32)
        self.fc4 = nn.Linear(32, 1)  # Output layer
        self.dropout = nn.Dropout(0.3)  # Dropout for regularization
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()  # Sigmoid for binary classification

    def forward(self, x):
        x = self.relu(self.bn1(self.fc1(x)))
        x = self.dropout(x)
        x = self.relu(self.bn2(self.fc2(x)))
        x = self.dropout(x)
        x = self.relu(self.bn3(self.fc3(x)))
        x = self.dropout(x)
        x = self.sigmoid(self.fc4(x))  # Sigmoid activation at output
        return x


input_dim = X_train.shape[1]
model = HeartDiseaseNN(input_dim)

# Model including class weights

In [None]:
# Calculate class weights based on the training set distribution
class_counts = pd.Series(y_train).value_counts()
weight_0 = class_counts[1] / class_counts.sum()  
weight_1 = class_counts[0] / class_counts.sum() 


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
optimizer = optim.Adam(model.parameters(), lr=0.001)
class_weights = torch.tensor([weight_0, weight_1]).to(device)

# Use weighted loss function
criterion_weighted = BCELoss(weight=class_weights[y_train_tensor.long()])

# Re-train the model with weighted loss, focusing on recall
num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    
    # Forward pass and loss calculation
    outputs = model(X_train_tensor)
    loss = criterion_weighted(outputs, y_train_tensor)
    
    # Backpropagation and optimization
    loss.backward()
    optimizer.step()


    if (epoch + 1) % 10 == 0:
        model.eval()
        with torch.no_grad():
            y_pred_weighted = (model(X_test_tensor) > 0.5).float().cpu().numpy()

        # Calculate recall for class 1
        recall = recall_score(y_test, y_pred_weighted, pos_label=1)
        
        # Calculate accuracy
        y_train_pred = (outputs > 0.7).float()
        train_accuracy = (y_train_pred.eq(y_train_tensor).sum().item()) / y_train_tensor.shape[0]
        test_accuracy = (y_pred_weighted.flatten() == y_test.flatten()).mean()


        print(f"Epoch [{epoch+1}/{num_epochs}] - Loss: {loss.item():.4f} | "
            f"Train Acc: {train_accuracy:.4f} | Test Acc: {test_accuracy:.4f} | "
            f"Test Recall (`1`): {recall:.4f}")


Epoch [10/100] - Loss: 0.0899 | Train Acc: 0.8066 | Test Acc: 0.4016 | Test Recall (`1`): 0.9594
Epoch [20/100] - Loss: 0.0858 | Train Acc: 0.8076 | Test Acc: 0.5430 | Test Recall (`1`): 0.9015
Epoch [30/100] - Loss: 0.0833 | Train Acc: 0.8329 | Test Acc: 0.6345 | Test Recall (`1`): 0.8467
Epoch [40/100] - Loss: 0.0817 | Train Acc: 0.8435 | Test Acc: 0.6708 | Test Recall (`1`): 0.8111
Epoch [50/100] - Loss: 0.0808 | Train Acc: 0.8474 | Test Acc: 0.6847 | Test Recall (`1`): 0.7977
Epoch [60/100] - Loss: 0.0802 | Train Acc: 0.8496 | Test Acc: 0.6885 | Test Recall (`1`): 0.7987
Epoch [70/100] - Loss: 0.0799 | Train Acc: 0.8507 | Test Acc: 0.6834 | Test Recall (`1`): 0.8069
Epoch [80/100] - Loss: 0.0796 | Train Acc: 0.8508 | Test Acc: 0.6755 | Test Recall (`1`): 0.8163
Epoch [90/100] - Loss: 0.0795 | Train Acc: 0.8510 | Test Acc: 0.6726 | Test Recall (`1`): 0.8219
Epoch [100/100] - Loss: 0.0793 | Train Acc: 0.8508 | Test Acc: 0.6710 | Test Recall (`1`): 0.8243


In [42]:
model.eval()

with torch.no_grad():
    y_test_pred_prob = model(X_test_tensor).cpu().numpy().flatten()  # Convert to NumPy and flatten 

# Convert actual y_test tensor to NumPy
y_test_actual = y_test_tensor.cpu().numpy().flatten()  # Ensure it's a NumPy array

# Convert to a DataFrame
df_test_results = pd.DataFrame(X_test, columns=df2.drop(columns=["Heart_Disease"]).columns)  # Create DataFrame from X_test features
df_test_results["Heart_Disease_Actual"] = y_test_actual  # Add actual labels
df_test_results["Heart_Disease_Risk"] = y_test_pred_prob
df_test_results.tail(20)

Unnamed: 0,Exercise,Skin_Cancer,Other_Cancer,Depression,Arthritis,Sex,Age_Category,BMI,Smoking_History,Alcohol_Consumption,Pre-diabetes,Diabetes,Pregnancy-diabetes,Heart_Disease_Actual,Heart_Disease_Risk
61751,1.0,0.0,0.0,0.0,0.0,0.0,3.0,-0.241664,1.0,0.232158,0.0,0.0,0.0,0.0,0.063749
61752,1.0,0.0,0.0,0.0,0.0,1.0,13.0,-0.043882,1.0,-0.377617,0.0,0.0,0.0,0.0,0.764231
61753,1.0,0.0,0.0,0.0,0.0,0.0,11.0,-1.155451,0.0,-0.621527,0.0,0.0,0.0,0.0,0.226748
61754,0.0,0.0,0.0,0.0,1.0,0.0,11.0,3.137505,1.0,-0.621527,0.0,1.0,0.0,0.0,0.808394
61755,1.0,0.0,0.0,0.0,0.0,0.0,3.0,0.216762,0.0,0.598022,0.0,0.0,0.0,0.0,0.046127
61756,1.0,0.0,0.0,0.0,0.0,0.0,9.0,1.179611,1.0,0.841932,0.0,0.0,0.0,0.0,0.313165
61757,0.0,0.0,0.0,1.0,0.0,0.0,11.0,-1.106388,0.0,-0.621527,0.0,0.0,0.0,0.0,0.553419
61758,1.0,0.0,0.0,0.0,0.0,0.0,9.0,-0.473177,0.0,-0.621527,0.0,0.0,0.0,0.0,0.167785
61759,0.0,0.0,0.0,1.0,1.0,0.0,10.0,2.645348,0.0,-0.621527,1.0,0.0,0.0,0.0,0.647902
61760,1.0,0.0,0.0,0.0,0.0,1.0,4.0,0.46514,0.0,-0.621527,0.0,0.0,0.0,0.0,0.092523


In [43]:
print("Performance with Class Weights in Loss Function:")
print(classification_report(y_test, y_pred_weighted))

Performance with Class Weights in Loss Function:
              precision    recall  f1-score   support

           0       0.98      0.66      0.79     56774
           1       0.17      0.82      0.29      4997

    accuracy                           0.67     61771
   macro avg       0.58      0.74      0.54     61771
weighted avg       0.91      0.67      0.75     61771



In [44]:
torch.save(model.state_dict(), "heart_disease_model.pth")

# Inference

In [41]:
new_patients = np.array([
    [1,0,0,0,0,1,2,30,0,5,0,0,0],
   [0,0,0,0,0,1,2,30,0,5,0,0,0],
])

# Standardize using the same scaler
columns_to_scale = [7, 9]
new_patients[:, columns_to_scale] = scaler.transform(new_patients[:, columns_to_scale])


# Convert to PyTorch tensor
new_patients_tensor = torch.tensor(new_patients, dtype=torch.float32).to(device)

# with torch.no_grad():
#     logits = model(new_patients_tensor)  # Raw logits
#     risk_scores = torch.sigmoid(logits).detach().cpu().numpy().flatten()

# Get predictions
with torch.no_grad():
    risk_scores = model(new_patients_tensor).cpu().numpy().flatten()  # Convert tensor to NumPy

# Convert probabilities to binary predictions
predictions = (risk_scores > 0.5).astype(int)

# Create results DataFrame
df_results = pd.DataFrame(new_patients, columns=df2.drop(columns='Heart_Disease').columns)
df_results["Heart_Disease_Risk"] = risk_scores
df_results["Heart_Disease_Prediction"] = predictions
df_results



Unnamed: 0,Exercise,Skin_Cancer,Other_Cancer,Depression,Arthritis,Sex,Age_Category,BMI,Smoking_History,Alcohol_Consumption,Pre-diabetes,Diabetes,Pregnancy-diabetes,Heart_Disease_Risk,Heart_Disease_Prediction
0,1,0,0,0,0,1,2,0,0,0,0,0,0,0.05293,0
1,0,0,0,0,0,1,2,0,0,0,0,0,0,0.0624,0


# Model with BCEWithLogitsLoss

In [None]:
# Calculate class weights based on the training set distribution
class_counts = pd.Series(y_train).value_counts()
weight_0 = class_counts[1] / class_counts.sum()  # Weight for 'No Disease' class (0)
weight_1 = class_counts[0] / class_counts.sum()  # Weight for 'Heart Disease' class (1)

# Create class weights tensor
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
optimizer = optim.Adam(model.parameters(), lr=0.001)
class_weights = torch.tensor([weight_0, weight_1]).to(device)

# Use weighted loss function
# criterion_weighted = BCELoss(weight=class_weights[y_train_tensor.long()])

pos_weight = torch.tensor([weight_1 / weight_0]).to(device)  # Weight for the positive class
criterion_weighted = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

# Re-train the model with weighted loss, focusing on recall
num_epochs = 50
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    
    # Forward pass and loss calculation (outputs are raw logits)
    outputs = model(X_train_tensor)
    loss = criterion_weighted(outputs, y_train_tensor)
    
    # Backpropagation and optimization
    loss.backward()
    optimizer.step()

    # Evaluate on the test set every 10 epochs
    if (epoch + 1) % 10 == 0:
        model.eval()
        with torch.no_grad():
            # Get raw logits and apply Sigmoid manually for probabilities
            y_test_logits = model(X_test_tensor)
            y_test_probs = torch.sigmoid(y_test_logits).detach().cpu().numpy().flatten()

            # Apply threshold to get binary predictions
            y_pred_weighted = (y_test_probs > 0.5).astype(float)

        # Calculate recall for class 1
        recall = recall_score(y_test, y_pred_weighted, pos_label=1)
        
        # Calculate training accuracy using probabilities
        y_train_probs = torch.sigmoid(outputs).detach().cpu().numpy().flatten()
        y_train_pred = (y_train_probs > 0.5).astype(float)
        y_train_true = y_train_tensor.detach().cpu().numpy().flatten()
        train_accuracy = (y_train_pred == y_train_true).mean()

        # Calculate test accuracy
        test_accuracy = (y_pred_weighted == y_test).mean()

        # Print metrics every 10 epochs
        print(f"Epoch [{epoch+1}/{num_epochs}] - Loss: {loss.item():.4f} | "
              f"Train Acc: {train_accuracy:.4f} | Test Acc: {test_accuracy:.4f} | "
              f"Test Recall (`1`): {recall:.4f}")
