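# Diabetes prediction with the Pima Indians Diabetes Database

Dataset: https://www.kaggle.com/datasets/uciml/pima-indians-diabetes-database

Task: supervised learning (binary classification of the `Outcome` column).
Models: binary logistic regression and a feed-forward neural network.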


In [1]:
# import libraries
import numpy, pandas, matplotlib
# `import matplotlib` alone does not load the pyplot submodule; the plotting
# cells access matplotlib.pyplot.*, so import it explicitly.
import matplotlib.pyplot

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import tensorflow

In [2]:
# Load the dataset; "diabetes.csv" is resolved relative to the notebook's working directory.
df = pandas.read_csv("diabetes.csv")
df.head() # preview the first 5 rows (the default for head())


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
#graphing to see trends per label
# cols = df.columns.tolist()  
# feature_cols = cols[:-1]    

# for feature in feature_cols:
#     matplotlib.pyplot.hist(
#         df[df["Outcome"] == 1][feature],
#         color='blue',
#         label='Diabetes (1)',
#         alpha=0.7,
#         density=True
#     )
#     matplotlib.pyplot.hist(
#         df[df["Outcome"] == 0][feature],
#         color='red',
#         label='No Diabetes (0)',
#         alpha=0.7,
#         density=True
#     )
#     matplotlib.pyplot.title(feature)
#     matplotlib.pyplot.ylabel("Probability")
#     matplotlib.pyplot.xlabel(feature)
#     matplotlib.pyplot.legend()
#     matplotlib.pyplot.show()

DATA SEPARATION


In [None]:
# Features are every column except the label; the label is the "Outcome" column.
X = df.drop(columns=["Outcome"]).to_numpy()
y = df["Outcome"].to_numpy()

# Two stratified splits give roughly 70% train / 20% validation / 10% test:
# first hold out 10% for testing, then take 22.22% of the remaining 90%
# (about 20% of all rows) for validation.
X_temp, x_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.10, stratify=y, random_state=42
)
X_train, x_valid, y_train, y_valid = train_test_split(
    X_temp, y_temp, test_size=0.2222, stratify=y_temp, random_state=42
)

# Standardise with statistics learned from the training rows only, then apply
# that same fitted transform to the validation and test rows.
scaler = StandardScaler()
x_train = scaler.fit_transform(X_train)
x_valid, x_test = scaler.transform(x_valid), scaler.transform(x_test)

# Oversample the training set only; validation and test keep their original
# class balance.
ros = RandomOverSampler(random_state=42)
x_train, y_train = ros.fit_resample(x_train, y_train)

TRAIN LOGISTIC REGRESSION MODEL AND PREDICT

In [5]:
# Fit logistic regression on the scaled, oversampled training set.
log_reg_model = LogisticRegression(max_iter=100, random_state=42)
log_reg_model.fit(x_train, y_train)


def _report_split(split_name, y_true, y_hat):
    """Print the accuracy (rounded to 4 places) and classification report for one split.

    Returns the unrounded accuracy so the caller can keep it.
    """
    acc = accuracy_score(y_true, y_hat)
    print(f"{split_name} Accuracy:", round(acc, 4))
    print(f"{split_name} Classification Report:\n", classification_report(y_true, y_hat))
    return acc


# Training split (the data the model was fit on).
y_train_pred = log_reg_model.predict(x_train)
train_acc = _report_split("Train", y_train, y_train_pred)

# Validation split.
y_valid_pred = log_reg_model.predict(x_valid)
valid_acc = _report_split("Validation", y_valid, y_valid_pred)

# Held-out test split.
y_test_pred = log_reg_model.predict(x_test)
test_acc = _report_split("Test", y_test, y_test_pred)

Train Accuracy: 0.7386
Train Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.76      0.74       350
           1       0.75      0.71      0.73       350

    accuracy                           0.74       700
   macro avg       0.74      0.74      0.74       700
weighted avg       0.74      0.74      0.74       700

Validation Accuracy: 0.7922
Validation Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.78      0.83       100
           1       0.67      0.81      0.73        54

    accuracy                           0.79       154
   macro avg       0.78      0.80      0.78       154
weighted avg       0.81      0.79      0.80       154

Test Accuracy: 0.7792
Test Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.74      0.81        50
           1       0.64      0.85      0.73        27

    accuracy   

Building a neural network with TensorFlow

In [6]:
# visualize loss and accuracy
def _plot_history_metric(history, metric, ylabel):
    """Plot one per-epoch training metric and, when recorded, its validation twin.

    Args:
        history: object with a ``history`` dict mapping metric names to
            per-epoch value lists (as returned by ``fit`` in ``train_model``).
        metric: key of the training curve, e.g. ``'loss'``; the validation
            curve is looked up under ``'val_' + metric``.
        ylabel: text for the y axis.
    """
    recorded = history.history
    matplotlib.pyplot.plot(recorded[metric], label=metric)

    # The val_* series only exists when the model was fit with validation data;
    # skip it instead of failing with a KeyError.
    val_key = f"val_{metric}"
    if val_key in recorded:
        matplotlib.pyplot.plot(recorded[val_key], label=val_key)

    matplotlib.pyplot.xlabel('Epoch')
    matplotlib.pyplot.ylabel(ylabel)
    matplotlib.pyplot.legend()
    matplotlib.pyplot.grid(True)
    matplotlib.pyplot.show()


def plot_loss(history):
    """Show training (and, if present, validation) loss per epoch."""
    _plot_history_metric(history, 'loss', 'Binary Crossentropy')


def plot_accuracy(history):
    """Show training (and, if present, validation) accuracy per epoch."""
    _plot_history_metric(history, 'accuracy', 'Accuracy')

In [9]:
features = x_train.shape[1]
# num_nodes = 17  # rule of thumb tried earlier: 2n+1, where n = number of features


def train_model(x_train, y_train, num_nodes, dropout_prob, learning_rate, batch_size, epochs,
                validation_data=None):
    """Build, compile and fit one feed-forward binary classifier.

    Written as a function so the grid search can train many configurations and
    keep the best one.

    Args:
        x_train: 2-D training feature array; its second dimension sets the
            width of the input layer.
        y_train: training labels.
        num_nodes: number of units in each of the two hidden Dense layers.
        dropout_prob: rate of the Dropout layer.
        learning_rate: learning rate passed to the Adam optimizer.
        batch_size: batch size passed to ``fit``.
        epochs: number of epochs passed to ``fit``.
        validation_data: optional ``(x, y)`` tuple evaluated after every epoch.
            When omitted, the notebook-level ``(x_valid, y_valid)`` is used,
            which is what this function always did before.

    Returns:
        ``(fnn_model, history)``: the fitted model and the object returned by
        ``fit``.
    """
    if validation_data is None:
        # Backward-compatible default: the validation split defined earlier in the notebook.
        validation_data = (x_valid, y_valid)

    # Size the input layer from the data actually passed in, not from a module
    # global that may describe a different array.
    n_features = x_train.shape[1]

    fnn_model = tensorflow.keras.Sequential([
        tensorflow.keras.layers.Input(shape=(n_features,)),  # shape must be a tuple
        # NOTE(review): this Dropout sits before the first Dense layer, so it
        # drops input features rather than hidden activations - confirm intended.
        tensorflow.keras.layers.Dropout(dropout_prob),
        tensorflow.keras.layers.Dense(num_nodes, activation='relu'),
        tensorflow.keras.layers.Dense(num_nodes, activation='relu'),
        tensorflow.keras.layers.Dense(1, activation='sigmoid'),  # output in (0, 1); thresholded to 0/1 later
    ])

    # The optimizer updates the weights from the loss; accuracy is only tracked as a metric.
    fnn_model.compile(
        optimizer=tensorflow.keras.optimizers.Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    history = fnn_model.fit(
        x_train, y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=validation_data,
        verbose=0,
    )

    return fnn_model, history

In [None]:
# Grid search: train one model per hyperparameter combination and keep the one
# with the lowest validation loss.

# Seed Python, NumPy and TensorFlow so weight initialisation and batch shuffling
# (and therefore the winning configuration) are repeatable across runs.
tensorflow.keras.utils.set_random_seed(42)

least_val_loss = float('inf')
least_loss_model = None
best_params = None  # hyperparameters of least_loss_model, kept so the winner can be reported
epochs = 100

for num_nodes in [17, 64]:
    for dropout_prob in [0.0, 0.2]:
        for learning_rate in [0.005, 0.001, 0.01]:
            for batch_size in [64, 128]:
                print(f"{num_nodes} nodes, {dropout_prob} dropout, {learning_rate} learning rate, {batch_size} batch size")
                model, history = train_model(
                    x_train, y_train,
                    num_nodes=num_nodes,
                    dropout_prob=dropout_prob,
                    learning_rate=learning_rate,
                    batch_size=batch_size,
                    epochs=epochs
                )

                # plot_loss(history)
                # plot_accuracy(history)

                validation_loss, validation_acc = model.evaluate(x_valid, y_valid, verbose=0)
                print(f" validation loss is {validation_loss}, validation accuracy is {validation_acc}")

                # Keep the best model seen so far, together with its settings.
                if validation_loss < least_val_loss:
                    least_val_loss = validation_loss
                    least_loss_model = model
                    best_params = {
                        "num_nodes": num_nodes,
                        "dropout_prob": dropout_prob,
                        "learning_rate": learning_rate,
                        "batch_size": batch_size,
                    }

print(f"best configuration: {best_params} (validation loss {least_val_loss})")


17 nodes, 0.0 dropout, 0.005 learning rate, 64 batch size
 validation loss is 1.0405486822128296, validation accuracy is 0.6948052048683167
17 nodes, 0.0 dropout, 0.005 learning rate, 128 batch size
 validation loss is 0.795328676700592, validation accuracy is 0.7467532753944397
17 nodes, 0.0 dropout, 0.001 learning rate, 64 batch size


In [None]:
# Final evaluation on the held-out test set, using the model the grid search selected.
if least_loss_model is None:
    # Stays None when no configuration produced a validation loss below the
    # initial infinity (for example, every loss was NaN).
    raise RuntimeError(
        "The grid search did not select a model (least_loss_model is None); "
        "re-run the grid-search cell and check the validation losses."
    )

# predict() returns one sigmoid output per row; flatten to 1-D and threshold at 0.5.
y_prob = least_loss_model.predict(x_test, verbose=0).ravel()
y_pred = (y_prob >= 0.5).astype(int)

print("Final Model Classification Report")
print(classification_report(y_test, y_pred, zero_division=0))

Final Model Classification Report
              precision    recall  f1-score   support

           0       0.89      0.59      0.71        54
           1       0.46      0.83      0.59        23

    accuracy                           0.66        77
   macro avg       0.68      0.71      0.65        77
weighted avg       0.76      0.66      0.68        77

