In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive so that files
# stored on Drive can be opened through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
# Load the dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Location of the CSV file on the mounted Google Drive.
path = '/content/drive/MyDrive/DiabetesPrediction/diabetes.csv'
df = pd.read_csv(path)

# Preview the first 5 rows of the dataset
first_rows = df.head(5)
print(first_rows)

In [None]:
# Print the dataset info (columns, non-null counts, dtypes).
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()

In [None]:
# Compute the summary statistics of the dataset, then print them
summary_stats = df.describe()
print(summary_stats)

In [None]:
# Handle missing values that are recorded as zeros

# Columns in which a zero is biologically implausible and is therefore
# treated as a missing measurement.
zero_columns = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# Per column: turn the zeros into NaN, take the median of what is left, and
# use that median to fill the gaps.
# NOTE(review): the median is taken over every row of df, including rows that
# are later held out for validation/testing.
for col in zero_columns:
  zeros_as_nan = df[col].replace(0, np.nan)
  median = zeros_as_nan.median()
  df[col] = zeros_as_nan.fillna(median)




In [None]:
# Visualizations

# Class distribution: number of rows for each value of the Outcome column.
plt.figure(figsize=(8, 6))
sns.countplot(data=df, x='Outcome')
plt.title('Diabetes Class Distribution (0 = No, 1 = Yes)')
plt.xlabel('Outcome')
plt.ylabel('Count')
plt.show()

In [None]:
# Feature distributions: one histogram (20 bins) per column of df.
df.hist(figsize=(12,10), bins=20)
# Only a figure-level title is set. plt.title() would label just the most
# recently drawn subplot, and the heatmap title does not belong on this figure.
plt.suptitle("Feature Distributions")
plt.show()

In [None]:
# Correlation heatmap
# Pairwise correlations between all columns of df (Outcome included).
corr_matrix = df.corr()

plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm')
plt.title('Feature Correlation Heatmap')
plt.show()

In [None]:
# Feature scaling (standardization)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Target is the Outcome column; every other column is a feature.
y = df['Outcome']
X = df.drop(columns='Outcome')

# Learn each feature's mean and standard deviation, then rescale the features.
# NOTE(review): the scaler is fit on every row of X, so its statistics include
# rows that are later held out for validation and testing.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

Standardization is generally a better fit for structured/tabular data such as this dataset (Pima Indians Diabetes).

Normalization (0–1) is more common with image data or bounded inputs (e.g., pixel intensities, audio samples).

We applied standardization to the features using StandardScaler so that each feature has a mean of 0 and a standard deviation of 1. This helps improve model convergence and ensures that the network treats all features on a comparable scale.

In [None]:
# Data splitting

# Stratified 70% / 20% / 10% split into training, validation and test sets.
# The split is performed on the unscaled features so that the scaler can be
# fit on the training rows only; fitting it on all rows lets validation/test
# statistics leak into the data the model is trained on.

# First split: training (70%) and a temporary hold-out (30%).
X_train_raw, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42)

# Second split: one third of the hold-out becomes the test set (10% of all
# rows); the remaining two thirds become the validation set (20% of all rows).
X_val_raw, X_test_raw, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=1/3, stratify=y_temp, random_state=42)

# Standardize every split with the mean/std estimated from the training set.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_val = scaler.transform(X_val_raw)
X_test = scaler.transform(X_test_raw)

print("\nDataset Split Summary:")
print(f"Training Set: {X_train.shape}, {y_train.shape}")
print(f"Validation Set: {X_val.shape}, {y_val.shape}")
print(f"Test set: {X_test.shape}, {y_test.shape}")


In [None]:
# Model development

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping

# Seed the Python, NumPy and TensorFlow random number generators so that
# weight initialization and batch shuffling repeat on a fresh
# "Restart & Run All"; without a seed every run reports different accuracies.
tf.keras.utils.set_random_seed(42)

# Define the base model: two hidden ReLU layers (16 and 8 units) followed by a
# single sigmoid unit that outputs the probability of the positive class.
model = Sequential([
    Dense(16, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(8, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Early stopping to prevent overfitting: stop after 5 epochs without a lower
# validation loss and roll back to the weights of the best epoch.
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model
history = model.fit(X_train, y_train,
                    validation_data=(X_val, y_val),
                    epochs=100,
                    batch_size=32,
                    callbacks=[early_stop],
                    verbose=1)

# Report the validation accuracy of the epoch with the lowest validation loss,
# i.e. the epoch whose weights early stopping restores.
base_val_loss_index = np.argmin(history.history['val_loss'])
base_val_acc = history.history['val_accuracy'][base_val_loss_index]

print(f"Base Model Early-Stopped Validation Accuracy: {base_val_acc:.4f}")

In [None]:
import matplotlib.pyplot as plt

# Training curves of the most recent fit stored in `history`:
# accuracy on the left panel, loss on the right panel.
plt.figure(figsize=(12, 5))

# Plot Accuracy
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Train Accuracy', color='blue')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy', color='orange')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Model Accuracy Over Epochs')
plt.legend()

# Plot Loss
plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Train Loss', color='blue')
plt.plot(history.history['val_loss'], label='Validation Loss', color='orange')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Model Loss Over Epochs')
plt.legend()

plt.tight_layout()
plt.show()


In [None]:
# Grid search over hidden-layer architectures
import matplotlib.pyplot as plt

# Each entry lists the number of units in the successive hidden layers.
network_configs = [
    [32, 16],
    [64, 32],
    [16, 8],
    [32, 16, 8],
    [64,32,16,8],
    [64, 32, 16],
    [128, 64, 32],
    [32, 32, 16]
]

val_accuracies = [base_val_acc]  # start with base model accuracy
labels = ['Base Model'] + [str(cfg) for cfg in network_configs]

# Track the best run automatically. On entry `history` and `model` still
# belong to the base model, which is the run to beat.
best_val_acc = base_val_acc
best_config = 'Base Model'
best_history = history
best_model = model

for config in network_configs:
    # Same layout as the base model: ReLU hidden layers, sigmoid output.
    candidate = Sequential()
    candidate.add(Dense(config[0], activation='relu', input_shape=(X_train.shape[1],)))
    for units in config[1:]:
        candidate.add(Dense(units, activation='relu'))
    candidate.add(Dense(1, activation='sigmoid'))

    candidate.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # NOTE(review): patience is 3 here but 5 for the base model, so the
    # candidates stop earlier than the run they are compared against.
    early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

    candidate_history = candidate.fit(X_train, y_train,
                                      validation_data=(X_val, y_val),
                                      epochs=100,
                                      batch_size=32,
                                      callbacks=[early_stop],
                                      verbose=0)

    # Validation accuracy at the epoch with the lowest validation loss
    best_epoch = np.argmin(candidate_history.history['val_loss'])
    val_acc = candidate_history.history['val_accuracy'][best_epoch]
    val_accuracies.append(val_acc)

    # Keep the first configuration that reaches the highest accuracy
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        best_config = config
        best_history = candidate_history
        best_model = candidate

# Leave `history` / `model` pointing at the winning run instead of whichever
# configuration happened to be trained last, so that later cells that read
# `history` as "the best model" really plot the best model.
history = best_history
model = best_model

# Visualization
plt.figure(figsize=(10, 6))
plt.plot(range(len(val_accuracies)), val_accuracies, marker='o', linestyle='-', color='blue')
plt.xticks(ticks=range(len(val_accuracies)), labels=labels, rotation=45)
plt.xlabel('Network Configuration')
plt.ylabel('Validation Accuracy at Early Stop')
plt.title('Grid Search vs Base Model (Early-Stopped Accuracy)')
plt.grid(True)
plt.tight_layout()
plt.show()

# Print best architecture selected
print(f"\n🏆 Best Architecture Selected Automatically: {best_config}")
print(f"✅ Best Validation Accuracy: {best_val_acc:.4f}")


In [None]:
# Identify the best model based on validation accuracy.
# val_accuracies[0] belongs to the base model and the remaining entries follow
# network_configs in order, so both lists below are aligned with it.
best_index = np.argmax(val_accuracies)

# The base model is represented by its real hidden-layer sizes (16 and 8
# units) rather than by the string 'Base Model': later cells index
# best_architecture as a list of unit counts, and a string would hand the
# character 'B' to Dense().
candidate_architectures = [[16, 8]] + network_configs
candidate_labels = ['Base Model'] + [str(cfg) for cfg in network_configs]

best_config = candidate_architectures[best_index]
best_accuracy = val_accuracies[best_index]

print(f"🏆 Best Architecture: {candidate_labels[best_index]}")
print(f"✅ Validation Accuracy: {best_accuracy:.4f}")

best_architecture = best_config


In [None]:
# L2 regularization

from tensorflow.keras.regularizers import l2
# L2 regularization factor (lambda)
l2_lambda = 0.01  # You can experiment with 0.001 or 0.0001 too

# Split the chosen architecture into the input-facing layer and the rest.
n_features = X_train.shape[1]
first_width = best_architecture[0]
deeper_widths = best_architecture[1:]

# Same layout as the chosen architecture, with an L2 weight penalty on every
# Dense layer (the sigmoid output layer included).
l2_model = Sequential()
l2_model.add(
    Dense(first_width, activation='relu', input_shape=(n_features,),
          kernel_regularizer=l2(l2_lambda))
)
for width in deeper_widths:
    l2_model.add(Dense(width, activation='relu', kernel_regularizer=l2(l2_lambda)))
l2_model.add(Dense(1, activation='sigmoid', kernel_regularizer=l2(l2_lambda)))

# Compile
l2_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Early stopping: wait 10 epochs for a lower validation loss, then restore
# the weights of the best epoch.
early_stop_l2 = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model
l2_history = l2_model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=32,
    callbacks=[early_stop_l2],
    verbose=1,
)

# Validation accuracy at the epoch with the lowest validation loss
l2_val_losses = l2_history.history['val_loss']
best_epoch_l2 = np.argmin(l2_val_losses)
best_val_acc_l2 = l2_history.history['val_accuracy'][best_epoch_l2]
print(f"\n✅ L2-Regularized Model Early-Stopped Validation Accuracy: {best_val_acc_l2:.4f}")


In [None]:
import matplotlib.pyplot as plt

# Per-epoch curves recorded while training the L2-regularized model
loss = l2_history.history['loss']
val_loss = l2_history.history['val_loss']
acc = l2_history.history['accuracy']
val_acc = l2_history.history['val_accuracy']
epochs = range(1, len(loss) + 1)

# Two side-by-side panels: loss on the left, accuracy on the right.
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 5))
panels = [
    (loss_ax, loss, val_loss, 'Loss'),
    (acc_ax, acc, val_acc, 'Accuracy'),
]

for ax, train_curve, val_curve, metric in panels:
    ax.plot(epochs, train_curve, label=f'Training {metric}', color='blue')
    ax.plot(epochs, val_curve, label=f'Validation {metric}', color='orange')
    # best_epoch_l2 is a 0-based index while the x axis starts at epoch 1.
    ax.axvline(x=best_epoch_l2 + 1, color='red', linestyle='--', label='Early Stop Epoch')
    ax.set_title(f'{metric} with L2 Regularization')
    ax.set_xlabel('Epoch')
    ax.set_ylabel(metric)
    ax.legend()
    ax.grid(True)

plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Epoch axes (1-based) for the two runs being compared.
# NOTE(review): `history` is whatever the most recent `history = ...fit(...)`
# assignment left behind; confirm it is the best architecture's run before
# trusting the "Best Model" labels.
epochs_best = range(1, len(history.history['loss']) + 1)
epochs_l2 = range(1, len(l2_history.history['loss']) + 1)

# (history object, epoch axis, legend prefix, line colour) for each run
runs = [
    (history, epochs_best, 'Best Model', 'blue'),
    (l2_history, epochs_l2, 'L2 Model', 'orange'),
]
# (subplot position, history key, display name) for each panel
panels = [
    (1, 'loss', 'Loss'),
    (2, 'accuracy', 'Accuracy'),
]

plt.figure(figsize=(14, 6))
for position, metric_key, metric_name in panels:
    plt.subplot(1, 2, position)
    for run, run_epochs, run_label, run_color in runs:
        # Solid line: validation curve; dashed line: training curve.
        plt.plot(run_epochs, run.history[f'val_{metric_key}'],
                 label=f'{run_label} Val {metric_name}', color=run_color, linestyle='-')
        plt.plot(run_epochs, run.history[metric_key],
                 label=f'{run_label} Train {metric_name}', color=run_color, linestyle='--')
    plt.xlabel('Epoch')
    plt.ylabel(metric_name)
    plt.title(f'Validation and Training {metric_name}')
    plt.legend()
    plt.grid(True)

plt.tight_layout()
plt.show()


In [None]:
# Adding dropout
from tensorflow.keras.layers import Dropout

# Candidate dropout rates, with one accuracy and one history collected per rate
dropout_rates = [0.1, 0.2, 0.3, 0.4, 0.5]
dropout_accuracies = []
dropout_histories = []

for rate in dropout_rates:
    # Chosen architecture with a Dropout layer after every hidden Dense layer.
    layer_stack = [
        Dense(best_architecture[0], activation='relu', input_shape=(X_train.shape[1],)),
        Dropout(rate),
    ]
    for units in best_architecture[1:]:
        layer_stack.append(Dense(units, activation='relu'))
        layer_stack.append(Dropout(rate))
    layer_stack.append(Dense(1, activation='sigmoid'))

    model = Sequential()
    for layer in layer_stack:
        model.add(layer)

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

    history = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=100,
        batch_size=32,
        callbacks=[early_stop],
        verbose=0,
    )

    # Score the rate by the validation accuracy at its lowest-val-loss epoch.
    best_epoch = np.argmin(history.history['val_loss'])
    val_acc = history.history['val_accuracy'][best_epoch]

    dropout_histories.append(history)
    dropout_accuracies.append(val_acc)

# Pick the rate with the highest early-stopped validation accuracy
best_idx = np.argmax(dropout_accuracies)
best_dropout_rate, best_val_acc = dropout_rates[best_idx], dropout_accuracies[best_idx]

# Accuracy as a function of the dropout rate
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(dropout_rates, dropout_accuracies, marker='o', linestyle='-', color='purple')
ax.set_xlabel('Dropout Rate')
ax.set_ylabel('Validation Accuracy')
ax.set_title('Dropout Rate vs Validation Accuracy (Early-Stopped)')
ax.grid(True)
plt.tight_layout()
plt.show()

# Report the winner
print(f"🏆 Best Dropout Rate: {best_dropout_rate}")
print(f"✅ Best Validation Accuracy with Dropout: {best_val_acc:.4f}")

In [None]:
# 1. Combine training and validation sets for final training
X_final_train = np.concatenate((X_train, X_val), axis=0)
y_final_train = np.concatenate((y_train, y_val), axis=0)


# 2. Build the final model with best architecture and best dropout rate
#    (built once; a Dropout layer follows every hidden Dense layer).
final_model = Sequential()
final_model.add(Dense(best_architecture[0], activation='relu', input_shape=(X_final_train.shape[1],)))
final_model.add(Dropout(best_dropout_rate))

for units in best_architecture[1:]:
    final_model.add(Dense(units, activation='relu'))
    final_model.add(Dropout(best_dropout_rate))

final_model.add(Dense(1, activation='sigmoid'))

# 3. Compile the model
final_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# 4. Train on combined train + val set with early stopping
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Per the Keras documentation, validation_split holds out the LAST fraction of
# the arrays (taken before shuffling). Because the validation rows were
# concatenated after the training rows, the 10% used for early stopping here
# comes from the tail of the former validation set.
history = final_model.fit(X_final_train, y_final_train,
                          validation_split=0.1,  # small split from combined for early stopping
                          epochs=100,
                          batch_size=32,
                          callbacks=[early_stop],
                          verbose=1)



In [None]:
import matplotlib.pyplot as plt

# Training curves of the most recent fit stored in `history`:
# accuracy on the left panel, loss on the right panel.
plt.figure(figsize=(12, 5))

# Plot Accuracy
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Train Accuracy', color='blue')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy', color='orange')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Model Accuracy Over Epochs')
plt.legend()

# Plot Loss
plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Train Loss', color='blue')
plt.plot(history.history['val_loss'], label='Validation Loss', color='orange')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Model Loss Over Epochs')
plt.legend()

plt.tight_layout()
plt.show()


In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, roc_curve, auc
import matplotlib.pyplot as plt
import numpy as np

# 5. Evaluate on test set

# Model output is the predicted probability of the positive class; a
# probability of at least 0.5 is turned into the class label 1.
y_prob = final_model.predict(X_test).ravel()
is_positive = y_prob >= 0.5
y_pred = is_positive.astype(int)

# Threshold-dependent metrics, computed from the hard class labels
conf_mat = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# ROC curve and the area under it, computed from the probabilities
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

# Report the metrics
print(f"Test Accuracy: {accuracy:.4f}")
print("Confusion Matrix:")
print(conf_mat)
scalar_metrics = (
    ("Precision", precision),
    ("Recall", recall),
    ("F1-score", f1),
    ("AUC", roc_auc),
)
for metric_name, metric_value in scalar_metrics:
    print(f"{metric_name}: {metric_value:.4f}")

# Draw the ROC curve against the diagonal of a random classifier
fig, ax = plt.subplots(figsize=(8,6))
ax.plot(fpr, tpr, label=f"ROC Curve (AUC = {roc_auc:.4f})", color='blue', linewidth=2)
ax.plot([0,1], [0,1], 'k--', label="Random Guessing")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate (Recall)")
ax.set_title("Receiver Operating Characteristic (ROC) Curve")
ax.legend(loc="lower right")
ax.grid(True)
plt.show()

print("""
The ROC curve plots the True Positive Rate (Recall) against the False Positive Rate for different classification thresholds.
A curve closer to the top-left corner indicates better classification performance.
The AUC (Area Under Curve) summarizes the model's overall ability to discriminate between positive and negative classes:
- AUC = 1 means perfect classification,
- AUC = 0.5 means no better than random guessing.
Your model's AUC indicates how well it can distinguish diabetes cases from non-cases across all thresholds.
""")

# 6. Save the final trained model
model_filename = 'final_best_model_with_dropout.h5'
final_model.save(model_filename)
print(f"✅ Final trained model saved as '{model_filename}'")
