In [1]:
import pandas as pd

df = pd.read_csv("./diabetes.csv")
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [43]:
import matplotlib.pyplot as plt

def plot_history(history):
  fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
  ax1.plot(history.history['loss'], label='loss')
  ax1.plot(history.history['val_loss'], label='val_loss')
  ax1.set_xlabel('Epoch')
  ax1.set_ylabel('Binary crossentropy')
  ax1.grid(True)

  ax2.plot(history.history['accuracy'], label='accuracy')
  ax2.plot(history.history['val_accuracy'], label='val_accuracy')
  ax2.set_xlabel('Epoch')
  ax2.set_ylabel('Accuracy')
  ax2.grid(True)

  plt.show()

In [56]:
X = df.drop("Outcome", axis=1)
Y = df["Outcome"]

In [57]:
from sklearn.model_selection import train_test_split

# First split into train and remaining (test + validation)
X_train, X_rem, Y_train, Y_rem = train_test_split(X, Y, test_size=0.4, random_state=42)

# Then split the remaining into test and validation
X_valid, X_test, Y_valid, Y_test = train_test_split(X_rem, Y_rem, test_size=0.5, random_state=42)

In [58]:
import tensorflow as tf

def train_model(X_train, Y_train, num_nodes, dropout_prob, learning_rate, batch_size, epochs):
  model = tf.keras.Sequential([
      tf.keras.layers.Dense(num_nodes, activation='relu', input_shape=(8,)),
      tf.keras.layers.Dropout(dropout_prob),
      tf.keras.layers.Dense(num_nodes, activation='relu'),
      tf.keras.layers.Dropout(dropout_prob),
      tf.keras.layers.Dense(1, activation='sigmoid')
  ])

  model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate), loss='binary_crossentropy', metrics=['accuracy'])

  history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, validation_split=0.2, verbose=0)

  return model, history

In [59]:
least_val_loss = float('inf')
least_loss_model = None
epochs = 100

for num_nodes in [16, 32, 64]:
  for dropout_prob in [0, 0.2]:
    for learning_rate in [0.01, 0.005, 0.001]:
      for batch_size in [32, 64, 128]:
        print(f"{num_nodes} nodes, dropout {dropout_prob}, learning rate {learning_rate}, batch size {batch_size}")
        model, history = train_model(X_train, Y_train, num_nodes, dropout_prob, learning_rate, batch_size, epochs)
        plot_history(history)

        val_loss = model.evaluate(X_valid, Y_valid)[0]
        if val_loss < least_val_loss:
          least_val_loss = val_loss
          least_loss_model = model

Output hidden; open in https://colab.research.google.com to view.

In [60]:
predictions = least_loss_model.predict(X_test)
predictions = (predictions > 0.5).astype(int).reshape(-1,)
predictions

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step


array([0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0])

In [61]:
from sklearn.metrics import accuracy_score, classification_report

accuracy = accuracy_score(Y_test, predictions)
report = classification_report(Y_test, predictions, output_dict=True)

# Print accuracy with formatting
print(f"Accuracy: {accuracy * 100:.2f}%")
print("\n") # Add an extra line for better readability

# Print classification report with formatting and explanations
report_df = pd.DataFrame(report).transpose()
report_df = report_df.drop(index=['accuracy', 'macro avg', 'weighted avg'])
report_df.index.name = "Outcome"

# Add explanations as comments using pd.concat instead of append
legend_df = pd.DataFrame([
    {"precision": "Precision: Out of all the patients the model predicted as having/not having diabetes, what proportion actually did/did not have diabetes.",
     "recall": "Recall: Out of all the patients who actually did/did not have diabetes, what proportion did the model correctly identify?",
     "f1-score": "F1-Score: Harmonic mean/average of precision and recall, balancing both.",
     "support": "Support: Number of actual instances in the test set for each class."}],
    index=["Legend"])

report_df = pd.concat([report_df, legend_df]) # Use concat to combine DataFrames


display(report_df)

Accuracy: 71.43%




Unnamed: 0,precision,recall,f1-score,support
0,0.723077,0.921569,0.810345,102.0
1,0.666667,0.307692,0.421053,52.0
Legend,Precision: Out of all the patients the model p...,Recall: Out of all the patients who actually d...,F1-Score: Harmonic mean/average of precision a...,Support: Number of actual instances in the tes...
