In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

diabetes_dataframe = pd.read_csv(
    "https://raw.githubusercontent.com/plotly/datasets/master/diabetes.csv"
)

# Shuffle the data points
diabetes_dataframe = shuffle(diabetes_dataframe)

diabetes_dataframe_label = diabetes_dataframe.pop("Outcome")
diabetes_dataframe_features = diabetes_dataframe

# Split into training and testing sets
(
    diabetes_dataframe_training_features,
    diabetes_dataframe_test_features,
    diabetes_dataframe_training_label,
    diabetes_dataframe_test_label,
) = train_test_split(
    diabetes_dataframe_features, diabetes_dataframe_label, test_size=0.2
)

print(len(diabetes_dataframe))
print(len(diabetes_dataframe_training_features))
print(len(diabetes_dataframe_training_label))
print(len(diabetes_dataframe_test_features))
print(len(diabetes_dataframe_test_label))

768
614
614
154
154


In [3]:
from sklearn.preprocessing import StandardScaler

diabetes_dataframe_training_features_normalized = pd.DataFrame(
    StandardScaler().fit_transform(diabetes_dataframe_training_features),
    columns = diabetes_dataframe_training_features.columns
)

diabetes_dataframe_test_features_normalized = pd.DataFrame(
    StandardScaler().fit_transform(diabetes_dataframe_test_features),
    columns = diabetes_dataframe_test_features.columns
)

print(diabetes_dataframe_training_features.head())
print(diabetes_dataframe_training_features_normalized.head())

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
181            0      119             64             18       92  34.9   
614           11      138             74             26      144  36.1   
473            7      136             90              0        0  29.9   
606            1      181             78             42      293  40.0   
231            6      134             80             37      370  46.2   

     DiabetesPedigreeFunction  Age  
181                     0.725   23  
614                     0.557   50  
473                     0.210   50  
606                     1.258   22  
231                     0.238   46  
   Pregnancies   Glucose  BloodPressure  SkinThickness   Insulin       BMI  \
0    -1.134457 -0.067244      -0.290700      -0.162162  0.104815  0.323747   
1     2.119099  0.524808       0.237846       0.340704  0.548712  0.481448   
2     0.935987  0.462487       1.083519      -1.293610 -0.680541 -0.333336   
3    -0.838679  1.864

In [5]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping

column_count = len(diabetes_dataframe_training_features.columns)

model1 = Sequential()
model1.add(Dense(8, input_dim=column_count, activation='relu'))
model1.add(Dense(4, activation='relu'))
model1.add(Dense(1, activation='sigmoid'))

model1.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

model1.optimizer.learning_rate = 0.001

monitor_val_acc = EarlyStopping(monitor='accuracy', patience=10)

history = model1.fit(x=diabetes_dataframe_training_features_normalized,
          y=diabetes_dataframe_training_label, 
          epochs=100, 
          verbose=0,
          callbacks=[monitor_val_acc])

predictions = model1.predict(diabetes_dataframe_test_features_normalized)

diabetes_dataframe_test_results1 = diabetes_dataframe_test_features.copy()

diabetes_dataframe_test_results1['Outcome'] = diabetes_dataframe_test_label
diabetes_dataframe_test_results1['Predicted Outcome'] = predictions

print(diabetes_dataframe_test_results1.head())

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
325            1      157             72             21      168  25.6   
650            1       91             54             25      100  25.2   
734            2      105             75              0        0  23.3   
71             5      139             64             35      140  28.6   
278            5      114             74              0        0  24.9   

     DiabetesPedigreeFunction  Age  Outcome  Predicted Outcome  
325                     0.123   24        0           0.346259  
650                     0.234   23        0           0.171761  
734                     0.560   53        0           0.268190  
71                      0.411   26        0           0.472002  
278                     0.744   57        0           0.313219  


In [7]:
from sklearn.linear_model import LogisticRegression

model2 = LogisticRegression(C=0.001, solver="liblinear")
model2.fit(
    diabetes_dataframe_training_features_normalized, diabetes_dataframe_training_label
)

prob = model2.predict_proba(diabetes_dataframe_test_features_normalized)

diabetes_dataframe_test_results2 = diabetes_dataframe_test_features.copy()

diabetes_dataframe_test_results2["Outcome"] = diabetes_dataframe_test_label
diabetes_dataframe_test_results2["Predicted Outcome"] = prob[:, 1]

print(diabetes_dataframe_test_results2.head())

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
325            1      157             72             21      168  25.6   
650            1       91             54             25      100  25.2   
734            2      105             75              0        0  23.3   
71             5      139             64             35      140  28.6   
278            5      114             74              0        0  24.9   

     DiabetesPedigreeFunction  Age  Outcome  Predicted Outcome  
325                     0.123   24        0           0.477118  
650                     0.234   23        0           0.418563  
734                     0.560   53        0           0.467319  
71                      0.411   26        0           0.492087  
278                     0.744   57        0           0.498982  
