In [67]:
## import the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_auc_score, roc_curve, classification_report

## neural network library
from keras.models import Sequential
from keras.layers import Dense, Dropout
from mlxtend.plotting import plot_confusion_matrix
import pickle

In [None]:
## Read the diabetes CSV (relative path) into a DataFrame and preview it
data = pd.read_csv('diabetes.csv')
# head() shows the first five rows by default
data.head()

In [None]:
## Count the null entries in every column
missing_values = data.isna().sum()
missing_values

In [None]:
## Check the descriptive statistics of the dataset columns
summary_stats = data.describe()
summary_stats

In [None]:
## Pairwise correlation between the columns, drawn as an annotated heatmap
fig, ax = plt.subplots(figsize=(15, 10))
corr = data.corr()
sns.heatmap(data=corr, annot=True, square=False, cmap='coolwarm', ax=ax)
ax.set_title('Correlation between features')
fig.tight_layout()
plt.show()

In [None]:
## One histogram per column to inspect how each feature is distributed
hist_figsize = (15, 10)
data.hist(figsize=hist_figsize)
plt.show()

In [None]:
## Blood pressure distribution for Non-Diabetes (Outcome == 0) and Diabetes (Outcome == 1) patients
fig, ax = plt.subplots(figsize=(16, 8))
# Each histogram carries its own label so the legend does not rely on the
# order in which artists were added; `ax` is a single Axes, not a tuple.
sns.histplot(data.loc[data['Outcome'] == 0, 'BloodPressure'], kde=True, color='blue', label='Non_Diabetes', ax=ax)
sns.histplot(data.loc[data['Outcome'] == 1, 'BloodPressure'], kde=True, color='red', label='Diabetes', ax=ax)
ax.legend()
ax.set_title('Blood Pressure distribution in Diabetes and Non-Diabetes patients')
plt.show()

In [None]:
## Age distribution for Non-Diabetes (Outcome == 0) and Diabetes (Outcome == 1) patients

fig, ax = plt.subplots(figsize=(16, 8))
# Each histogram carries its own label so the legend does not rely on the
# order in which artists were added; `ax` is a single Axes, not a tuple.
sns.histplot(data.loc[data['Outcome'] == 0, 'Age'], kde=True, color='blue', label='Non_Diabetes', ax=ax)
sns.histplot(data.loc[data['Outcome'] == 1, 'Age'], kde=True, color='red', label='Diabetes', ax=ax)
ax.legend()
ax.set_title('Age distribution in Diabetes and Non-Diabetes patients')
plt.show()

In [None]:
## Glucose distribution for Non-Diabetes (Outcome == 0) and Diabetes (Outcome == 1) patients

fig, ax = plt.subplots(figsize=(16, 8))
# Each histogram carries its own label so the legend does not rely on the
# order in which artists were added; `ax` is a single Axes, not a tuple.
sns.histplot(data.loc[data['Outcome'] == 0, 'Glucose'], kde=True, color='blue', label='Non_Diabetes', ax=ax)
sns.histplot(data.loc[data['Outcome'] == 1, 'Glucose'], kde=True, color='red', label='Diabetes', ax=ax)
ax.legend()
ax.set_title('Glucose distribution in Diabetes and Non-Diabetes patients')
plt.show()

In [None]:
## BMI distribution for Non-Diabetes (Outcome == 0) and Diabetes (Outcome == 1) patients

fig, ax = plt.subplots(figsize=(16, 8))
# Each histogram carries its own label so the legend does not rely on the
# order in which artists were added; `ax` is a single Axes, not a tuple.
sns.histplot(data.loc[data['Outcome'] == 0, 'BMI'], kde=True, color='blue', label='Non_Diabetes', ax=ax)
sns.histplot(data.loc[data['Outcome'] == 1, 'BMI'], kde=True, color='red', label='Diabetes', ax=ax)
ax.legend()
ax.set_title('BMI distribution in Diabetes and Non-Diabetes patients')
plt.show()


In [38]:
## Separate the target column ('Outcome') from the remaining feature columns

y = data['Outcome']
X = data.drop(columns='Outcome')

In [41]:
## Standardise the feature columns with StandardScaler
# NOTE(review): the scaler is fitted on every row of X, before any train/test
# split, so rows later used for testing contribute to the fitted statistics.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)


In [42]:
## Split the data into training and testing sets, then scale both

# Split the unscaled features first so the scaler can be fitted on the
# training rows only. Fitting it on all rows (as done for X_scaled) lets
# test-set statistics leak into the training data.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # statistics learned from the training rows
X_test = scaler.transform(X_test_raw)        # same statistics applied to the test rows

In [None]:
## Train the gradient boosting classifier

# Seed the estimator (same value as the train/test split) so that a
# Restart & Run All reproduces the same fitted model.
model = GradientBoostingClassifier(random_state=42)
model.fit(X_train, y_train)

## Predict on the test rows and report the mean accuracy on them
y_pred = model.predict(X_test)
accuracy = model.score(X_test, y_test)
accuracy


In [None]:
## Confusion matrix of the gradient boosting predictions (uses y_test and y_pred)
class_names = ['Non-Diabetes', 'Diabetes']
cm = confusion_matrix(y_test, y_pred)

# Draw the counts as a seaborn heatmap on an explicit Axes
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
ax.set_title('Confusion Matrix of Gradient Boosting Classifier')
plt.show()


In [None]:
## ROC curve and ROC-AUC score for the gradient boosting classifier

# Score with the predicted probability of the positive class. Hard 0/1 labels
# collapse the curve to a single operating point and discard the ranking
# information that ROC-AUC is meant to measure.
# This cell must run while `model` is still the fitted GradientBoostingClassifier.
y_score = model.predict_proba(X_test)[:, 1]  # column 1 = second entry of model.classes_
roc_auc = roc_auc_score(y_test, y_score)
fpr, tpr, thresholds = roc_curve(y_test, y_score)

# Plot the ROC curve against the chance diagonal
plt.figure(figsize=(6, 5))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='upper left')
plt.title('Receiver Operating Characteristic (ROC) Curve for GBC')
plt.show()

In [None]:
# classification_report expects (y_true, y_pred); with the arguments swapped
# the per-class precision and recall columns are exchanged.
print(classification_report(y_test, y_pred))

In [None]:
## Build Neural Network for dataset
## initialise the network
# NOTE: this rebinds `model`, which the earlier cells used for the gradient
# boosting classifier.
model = Sequential()

# first dense layer: 36 units, one input per feature column of X_train
model.add(Dense(units=36, input_dim=X_train.shape[1], kernel_initializer='uniform', activation='relu'))
# hidden layer with 18 units
model.add(Dense(18, activation='relu', kernel_initializer='uniform'))
# dropout (rate 0.4) as regularization to prevent overfitting
model.add(Dropout(0.4))
# hidden layer with 6 units
model.add(Dense(6, activation='relu', kernel_initializer='uniform'))
# dropout (rate 0.2) as regularization to prevent overfitting
model.add(Dropout(0.2))
# output layer with 1 unit and sigmoid activation function
model.add(Dense(1, kernel_initializer='uniform', activation='sigmoid'))

# compile the model

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# fit the model
# NOTE(review): the test split is also passed as validation data, so the
# validation metrics are not independent of the later test evaluation.

history = model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_test, y_test))




In [None]:
## Per-epoch accuracy curves recorded by Keras during fit()
training_log = history.history
accuracy = training_log['accuracy']
val_accuracy = training_log['val_accuracy']

# Report the values from the last epoch
final_accuracy = accuracy[-1]
final_val_accuracy = val_accuracy[-1]
print('Train Accuracy:', final_accuracy)
print('Validation Accuracy:', final_val_accuracy)

In [None]:
# Plot the training and validation accuracy per epoch
fig, ax = plt.subplots()
ax.plot(accuracy, label='Training Accuracy')
ax.plot(val_accuracy, label='Validation Accuracy')
ax.set_title('Training and Validation Accuracy')
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.legend(loc='best')
# Render the figure explicitly so the cell does not echo a Text(...) repr
plt.show()

In [None]:
# Plot the training and validation loss per epoch
history_df = pd.DataFrame(history.history)

fig, ax = plt.subplots()
# Plot each column as a Series rather than a one-column DataFrame
ax.plot(history_df['loss'], label='Training loss')
ax.plot(history_df['val_loss'], label='Validation loss')
ax.set_title('Training and Validation loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend(loc="best")
# Render the figure explicitly so the cell does not echo a Legend repr
plt.show()

In [None]:
# Predict the test set results with the neural network.
# The final layer is Dense(1) with a sigmoid, so each row gets one score.
y_pred_nn = model.predict(X_test)
# Threshold at 0.5 and flatten to a 1-D array of 0/1 integer labels so the
# predictions have the same layout as y_test. This rebinds `y_pred`, which
# previously held the gradient boosting predictions.
y_pred = (y_pred_nn.ravel() > 0.5).astype(int)

In [None]:
## Compute the ROC-AUC score and plot the ROC curve for the neural network

# y_pred_nn holds the raw model outputs (not thresholded labels)
roc_auc_nn = roc_auc_score(y_test, y_pred_nn)
fpr_nn, tpr_nn, thresholds_nn = roc_curve(y_test, y_pred_nn)

# Plot the ROC curve against the chance diagonal

plt.figure(figsize=(6, 5))
plt.plot(fpr_nn, tpr_nn, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc_nn)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='upper left')
plt.title('Receiver Operating Characteristic (ROC) Curve for Neural Network')

plt.show()

In [None]:
## Confusion matrix of the neural network predictions (uses y_test and y_pred)
nn_class_names = ['Non-Diabetes', 'Diabetes']
cm_nn = confusion_matrix(y_test, y_pred)

# Draw the counts as a seaborn heatmap on an explicit Axes
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(cm_nn, annot=True, fmt='d', cmap='Blues', xticklabels=nn_class_names, yticklabels=nn_class_names, ax=ax)
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
ax.set_title('Confusion Matrix of Neural Network')

plt.show()



In [None]:
## Per-class precision / recall / F1 summary for the neural network predictions
nn_report = classification_report(y_test, y_pred)
print(nn_report)