In [None]:
# -*- coding: utf-8 -*-
"""Diabetes Prediction with MLP - Tutorial"""

In [None]:
# Import necessary libraries
import pandas as pd  # Data manipulation
import numpy as np  # Array operations
import seaborn as sns  # Data visualization
import tensorflow as tf  # Deep learning framework
import matplotlib.pyplot as plt  # Plotting
from keras.utils import to_categorical  # Categorical encoding for labels
from sklearn.model_selection import train_test_split  # Data splitting
from sklearn.preprocessing import StandardScaler  # Data normalization
from keras.layers import Dense  # Dense layers for neural network
from keras.models import Sequential  # Model structure
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix  # Evaluation metrics

In [None]:
# Read the diabetes dataset from 'diabetes.csv' (expected in the working directory)
data = pd.read_csv('diabetes.csv')

# Preview the first five rows to get a feel for the columns and value ranges
data.head(n=5)

In [None]:
# Column dtypes and non-null counts (printed by info() itself)
data.info()

# Per-column count of missing values; shown as the cell's output
data.isna().sum()

In [None]:
# Visualize the distribution of target variable (Diabetes vs. No Diabetes)
# value_counts() orders classes by frequency, not by label, so sort by the class
# index to guarantee the hard-coded "No"/"Yes" labels line up with classes 0/1
# regardless of which class is the majority.
# NOTE(review): assumes Outcome is coded 0 = no diabetes, 1 = diabetes - confirm against the dataset.
outcome_counts = data.Outcome.value_counts().sort_index()
outcome_labels = ["No", "Yes"]
outcome_colors = ['#3498db', '#e74c3c']

f, ax = plt.subplots(1, 2, figsize=(12, 6))
f.suptitle("Diabetes Distribution", fontsize=18)

# Bar plot for Outcome distribution (absolute counts per class)
outcome_counts.plot.bar(ax=ax[0], color=outcome_colors, rot=0)
ax[0].set_xticklabels(outcome_labels)

# Pie chart for Outcome distribution (class shares in percent);
# label="" suppresses the default y-axis label that pandas would add
outcome_counts.plot.pie(labels=outcome_labels, autopct="%.2f%%", label="", ax=ax[1],
                        colors=outcome_colors, wedgeprops={"linewidth": 1.5, "edgecolor": "#F7F7F9"})
plt.show()

In [None]:
# Histogram of every feature on a 4x2 grid, to inspect spread and potential outliers
fig, ax = plt.subplots(4, 2, figsize=(16, 16))

# (column name, panel title) in row-major panel order
panels = [
    ("Pregnancies", "Pregnancies"),
    ("Glucose", "Glucose"),
    ("BloodPressure", "Blood Pressure"),
    ("SkinThickness", "Skin Thickness"),
    ("Insulin", "Insulin"),
    ("BMI", "BMI"),
    ("DiabetesPedigreeFunction", "Diabetes Pedigree Function"),
    ("Age", "Age"),
]
# Left-hand column is drawn in blue, right-hand column in red
palette = ('#3498db', '#e74c3c')

for position, (column, title) in enumerate(panels):
    row, col = divmod(position, 2)
    sns.histplot(data[column], bins=20, ax=ax[row, col], color=palette[col]).set_title(title)

plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlation of all columns, rendered as an annotated heatmap
corr_fig, corr_ax = plt.subplots(figsize=(14, 10))
sns.heatmap(data.corr(), annot=True, cmap='YlGnBu', linewidths=0.5, ax=corr_ax)
corr_ax.set_title('Correlation between features')
plt.show()

In [None]:
# Split data into features and labels: every column except the last is a
# feature, the last column is the label.
X_raw = data.iloc[:, :-1].values
Y = data.iloc[:, -1].values

# Split data into training and testing sets (80% train, 20% test) BEFORE scaling.
# Fitting the scaler on the full dataset would leak test-set statistics
# (mean / standard deviation) into the training features.
X_train_raw, X_test_raw, Y_train, Y_test = train_test_split(
    X_raw, Y, test_size=0.20, random_state=12
)

# Normalize the features: learn mean/std from the training rows only, then
# apply that same transformation to both splits.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Keep `X` bound to a scaled version of the full feature matrix (as before),
# now scaled with the training-set statistics.
X = scaler.transform(X_raw)

In [None]:
# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialization and batch shuffling are repeatable across "Restart & Run All".
# NOTE(review): some GPU ops may still be non-deterministic.
tf.keras.utils.set_random_seed(12)

# Create a Sequential MLP Model
# The input width is taken from the training matrix instead of being hard-coded,
# so the model stays in sync with the number of feature columns.
n_features = X_train.shape[1]

model = Sequential()
model.add(Dense(32, input_dim=n_features, activation='relu'))  # First hidden layer with 32 neurons (also declares the input width)
model.add(Dense(16, activation='relu'))  # Hidden layer with 16 neurons
model.add(Dense(8, activation='relu'))  # Hidden layer with 8 neurons
model.add(Dense(1, activation='sigmoid'))  # Output layer: one sigmoid unit for binary classification

# Compile the model using binary cross-entropy for binary classification
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Show model structure
model.summary()

In [None]:
# Fit the network for 100 epochs; the returned History object (`hist`) stores
# the per-epoch metrics used by the plots further down.
# Note: the test split doubles as validation data here, so the "validation"
# curves are computed on the same rows as the final test metrics.
hist = model.fit(
    x=X_train,
    y=Y_train,
    epochs=100,
    validation_data=(X_test, Y_test),
)

In [None]:
# Score the trained model on both splits. evaluate() returns the loss followed
# by the compiled metrics, so index 1 of each result is the accuracy.
train_accuracy = model.evaluate(x=X_train, y=Y_train, verbose=0)
test_accuracy = model.evaluate(x=X_test, y=Y_test, verbose=0)

print(
    f"Training Accuracy: {train_accuracy[1] * 100:.2f}%",
    f"Testing Accuracy: {test_accuracy[1] * 100:.2f}%",
    sep="\n",
)

In [None]:
# Turn the sigmoid outputs for the test set into hard 0/1 labels by
# thresholding at 0.5, then summarize the errors.
Y_pred = np.greater(model.predict(X_test), 0.5).astype("int32")

print("Confusion Matrix:\n", confusion_matrix(y_true=Y_test, y_pred=Y_pred))
print("Classification Report:\n", classification_report(y_true=Y_test, y_pred=Y_pred))

In [None]:
# Draw the test-set confusion matrix as an annotated heatmap
# (rows = true class, columns = predicted class)
class_names = ["No Diabetes", "Diabetes"]

cm_fig, cm_ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    confusion_matrix(Y_test, Y_pred),
    annot=True,
    fmt="d",  # integer counts in each cell
    xticklabels=class_names,
    yticklabels=class_names,
    cmap='Blues',
    ax=cm_ax,
)
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('True')
cm_ax.set_title("Confusion Matrix")
plt.show()

In [None]:
# One figure per metric, each showing the training and validation curve over
# the epochs recorded in `hist.history`.
# Columns: training key, validation key, training label, validation label,
#          figure title, y-axis label
curve_specs = [
    ('accuracy', 'val_accuracy', 'Training Accuracy', 'Validation Accuracy', 'Model Accuracy', 'Accuracy'),
    ('loss', 'val_loss', 'Training Loss', 'Validation Loss', 'Model Loss', 'Loss'),
]

for train_key, val_key, train_label, val_label, title, y_label in curve_specs:
    plt.figure(figsize=(12, 5))
    plt.plot(hist.history[train_key], label=train_label)
    plt.plot(hist.history[val_key], label=val_label)
    plt.title(title)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.legend()
    plt.show()