Kaggle dataset "Diabetes, Hypertension and Stroke Prediction": https://www.kaggle.com/datasets/prosperchuks/health-dataset/data

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns
import time
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import Callback
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
import tensorflow as tf

# Report how many GPU devices TensorFlow can see
_gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available:", len(_gpu_devices))


The cells below shuffle the dataset so that the predictions are not measured unevenly.

In [None]:
# Input and output CSV locations; change these paths as needed
data_path = "diabetes_data.csv"
shuffled_data_path = "shuffled_diabetes_data.csv"

In [None]:
# Read the raw dataset from disk
df = pd.read_csv(data_path)

# Randomly permute every row (fixed seed) and renumber the index from zero
df_shuffled = df.sample(frac=1, random_state=42)
df_shuffled = df_shuffled.reset_index(drop=True)

# Write the shuffled copy to a new CSV file, without the index column
df_shuffled.to_csv(shuffled_data_path, index=False)

# Phase 1

In [None]:

# Display dataset shape and first few rows
print("Dataset shape:", df_shuffled.shape)
print(df_shuffled.head())

# Check for null values
print("\nMissing values:")
print(df_shuffled.isnull().sum())

# Target column: Diabetes (0 or 1)
# Separate features and target
X = df_shuffled.drop(columns=['Diabetes'])
y = df_shuffled['Diabetes']

# Full-dataset scaling. Only Phase 2 uses this: it deliberately trains on every
# row to demonstrate overfitting, so there is no held-out data to protect there.
X_scaled = pd.DataFrame(MinMaxScaler().fit_transform(X), columns=X.columns)

# Split the RAW features into training (90%) and validation (10%) sets first,
# so that the scaler below never sees the validation rows (no data leakage).
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)

# Normalize numeric input features: fit min/max on the training rows only,
# then apply that same transform to the validation rows.
scaler = MinMaxScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train_raw), columns=X.columns, index=X_train_raw.index)
X_val = pd.DataFrame(scaler.transform(X_val_raw), columns=X.columns, index=X_val_raw.index)

# Print resulting shapes
print("\nTraining set shape:", X_train.shape)
print("Validation set shape:", X_val.shape)

# Plot feature distribution examples (unscaled BMI values)
plt.figure(figsize=(12, 6))
sns.histplot(X['BMI'], kde=True, bins=30)
plt.title('BMI Distribution')
plt.xlabel('BMI')
plt.ylabel('Count')
plt.show()

# Class balance of the target label
sns.countplot(x=y)
plt.title('Target Label Distribution (Diabetes)')
plt.xlabel('Diabetes (0 = No, 1 = Yes)')
plt.ylabel('Count')
plt.show()


# Phase 2

Overfitting the data

In [None]:
# Phase 2 accumulator: one (architecture label, final training accuracy) entry per run
overfitting_results: list = []



class EarlyStoppingByAccuracy(Callback):
    """Keras callback that stops training once a logged metric reaches a threshold.

    Args:
        monitor: Key to look up in the per-epoch ``logs`` dict.
        value: Threshold; training stops when the monitored metric is
            greater than or equal to this value.
    """

    def __init__(self, monitor='accuracy', value=0.99):
        super().__init__()
        self.monitor = monitor
        self.value = value

    def on_epoch_end(self, epoch, logs=None):
        """Check the monitored metric after each epoch and request a stop if reached."""
        # `logs` defaults to None and the metric key may be absent; in either
        # case there is nothing to compare, so keep training instead of crashing.
        current = (logs or {}).get(self.monitor)
        if current is None:
            return

        if current >= self.value:
            print(f"Reached {self.value*100:.1f}% training accuracy. Stopping training.")
            self.model.stop_training = True


In [None]:
def build_and_train_overfit_model(hidden_layers, neurons_per_layer, input_dim, epochs=1000, batch_size=32):
    """Train a dense binary classifier on the full dataset to provoke overfitting.

    The network has ``hidden_layers`` ReLU layers of ``neurons_per_layer`` units
    each, followed by a single sigmoid output. It is fit on the module-level
    ``X_scaled`` / ``y`` and stops early once training accuracy reaches 0.99.

    Side effects: prints the training time and appends
    ``(architecture label, final training accuracy)`` to the module-level
    ``overfitting_results`` list.

    Args:
        hidden_layers: Number of hidden Dense layers.
        neurons_per_layer: Units in every hidden layer.
        input_dim: Number of input features.
        epochs: Maximum number of epochs.
        batch_size: Mini-batch size passed to ``fit``.

    Returns:
        The trained model and the ``History`` object returned by ``fit``.
    """
    model = Sequential()
    model.add(Input(shape=(input_dim,)))
    hidden_stack = [Dense(neurons_per_layer, activation='relu') for _ in range(hidden_layers)]
    for dense_layer in hidden_stack:
        model.add(dense_layer)
    model.add(Dense(1, activation='sigmoid'))

    model.compile(
        optimizer=Adam(learning_rate=0.0005),
        loss=BinaryCrossentropy(),
        metrics=['accuracy'],
    )

    # Halt as soon as the training accuracy hits 99%
    stop_at_target = EarlyStoppingByAccuracy(monitor='accuracy', value=0.99)

    started = time.time()
    history = model.fit(
        X_scaled,
        y,
        epochs=epochs,
        batch_size=batch_size,
        verbose=1,
        callbacks=[stop_at_target],
    )
    elapsed_time = time.time() - started

    print(f"Training time: {elapsed_time:.2f} seconds")

    # Record the accuracy of the last completed epoch under a readable label
    label = f"{neurons_per_layer}-neurons x {hidden_layers}-layers"
    overfitting_results.append((label, history.history['accuracy'][-1]))

    return model, history


In [None]:
# Testing model sizes
layer_sizes = [5, 6, 7]
neuron_options = [512, 1024]

# Flatten the (layers, neurons) grid into a single sequence, in the same order
# the nested loops would visit it, so that one `break` ends the whole search
# as soon as a model reaches 99% training accuracy.
search_grid = [(layers, neurons) for layers in layer_sizes for neurons in neuron_options]

for layers, neurons in search_grid:
    print(f"Training model: {neurons} neurons x {layers} layers")
    model, history = build_and_train_overfit_model(hidden_layers=layers, neurons_per_layer=neurons, input_dim=X_scaled.shape[1])
    if history.history['accuracy'][-1] >= 0.99:
        break

In [None]:
# Show Overfitting Results
print("\nOverfitting Results:")
# Each entry starts with (config, accuracy); an elapsed time may follow as an
# optional third field, so only print the time when it is actually present.
for config, acc, *extra_fields in overfitting_results:
    summary = f"{config} => Training Accuracy: {acc:.4f}"
    if extra_fields:
        summary += f", Time: {extra_fields[0]:.2f} seconds"
    print(summary)

In [None]:
# Plot accuracy vs model size
# Star-unpacking ignores any fields after (config, accuracy), so this cell
# keeps working if result entries carry extra data such as a training time.
labels = [config for config, *_ in overfitting_results]
accuracies = [acc for _, acc, *_ in overfitting_results]

plt.figure(figsize=(10, 5))
plt.plot(labels, accuracies, marker='o')
plt.xticks(rotation=90)
plt.ylabel("Training Accuracy")
plt.title("Training Accuracy vs. Model Architecture")
plt.grid(True)
plt.tight_layout()
plt.show()

# Phase 3: Model Selection and Evaluation

In [None]:
# Store evaluation results
evaluation_results = []

# Wrapper function to build and train model for Phase 3
#This function is built from the same function in Phase 2
def train_and_evaluate_model_phase3(hidden_layers, neurons_per_layer, input_dim, model_name):
    model = Sequential()
    model.add(Input(shape=(input_dim,)))
    for _ in range(hidden_layers):
        model.add(Dense(neurons_per_layer, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer=Adam(), loss=BinaryCrossentropy(), metrics=['accuracy'])

    checkpoint = ModelCheckpoint(f'{model_name}_best.keras', monitor='val_loss', save_best_only=True, mode='min')
    model.fit(X_train, y_train, epochs=100, validation_data=(X_val, y_val), callbacks=[checkpoint], verbose=0)

    preds = np.round(model.predict(X_val)).flatten()
    acc = accuracy_score(y_val, preds)
    precision = precision_score(y_val, preds)
    recall = recall_score(y_val, preds)
    f1 = f1_score(y_val, preds)
    params = model.count_params()
    evaluation_results.append((model_name, acc, precision, recall, f1, params))

In [None]:
# 1. Random Baseline Classifier
random_preds = np.random.choice([0, 1], size=y_val.shape)
random_acc = accuracy_score(y_val, random_preds)
random_precision = precision_score(y_val, random_preds)
random_recall = recall_score(y_val, random_preds)
random_f1 = f1_score(y_val, random_preds)
evaluation_results.append(("Random Baseline", random_acc, random_precision, random_recall, random_f1, "-"))

In [None]:
# 2. Logistic Regression (no hidden layers)
train_and_evaluate_model_phase3(hidden_layers=0, neurons_per_layer=1, input_dim=X_train.shape[1], model_name="Logistic Regression")


In [None]:
# 3. Neural Network Architectures to Evaluate
architectures = {
    "64-32-16-8-1": [64, 32, 16, 8],
    "32-16-8-1": [32, 16, 8],
    "16-8-1": [16, 8],
    "8-1": [8],
    "4-1": [4],
    "2-1": [2]
}

for name, layers in architectures.items():
    hidden_layers = len(layers)
    neurons_per_layer = layers[0]  # We pass the size of the first hidden layer
    train_and_evaluate_model_phase3(hidden_layers, neurons_per_layer, X_train.shape[1], model_name=name)

In [None]:
# Collect every evaluation row into one table and display it
result_columns = ["Model", "Accuracy", "Precision", "Recall", "F1 Score", "# Parameters"]
results_df = pd.DataFrame(evaluation_results, columns=result_columns)
print(results_df)

In [None]:
# Rank the models by validation accuracy (highest first) and take the top row
ranked_results = results_df.sort_values(by="Accuracy", ascending=False)
best_model = ranked_results.iloc[0]

print("\nBest Performing Model:")
print(best_model)

In [None]:
# Bar chart of validation accuracy, one bar per evaluated model
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(results_df['Model'], results_df['Accuracy'])
plt.xticks(rotation=45)
ax.set_title('Validation Accuracy by Model')
ax.set_ylabel('Accuracy')
ax.grid(True)
fig.tight_layout()
plt.show()