In [13]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping
from sklearn.metrics import confusion_matrix, roc_curve
import seaborn as sns
from sklearn import preprocessing
import matplotlib.pyplot as plt

In [14]:
df = pd.read_csv('diabetic_data.csv')

In [15]:
df.tail()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
101761,443847548,100162476,AfricanAmerican,Male,[70-80),?,1,3,7,3,...,No,Down,No,No,No,No,No,Ch,Yes,>30
101762,443847782,74694222,AfricanAmerican,Female,[80-90),?,1,4,5,5,...,No,Steady,No,No,No,No,No,No,Yes,NO
101763,443854148,41088789,Caucasian,Male,[70-80),?,1,1,7,1,...,No,Down,No,No,No,No,No,Ch,Yes,NO
101764,443857166,31693671,Caucasian,Female,[80-90),?,2,3,7,10,...,No,Up,No,No,No,No,No,Ch,Yes,NO
101765,443867222,175429310,Caucasian,Male,[70-80),?,1,1,7,6,...,No,No,No,No,No,No,No,No,No,NO


In [16]:
df.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [17]:
column = ['encounter_id','patient_nbr','race','weight','admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses','readmitted']

In [18]:
df = df.drop(columns=column)

In [19]:
df.head()

Unnamed: 0,gender,age,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,...,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed
0,Female,[0-10),,,No,No,No,No,No,No,...,No,No,No,No,No,No,No,No,No,No
1,Female,[10-20),,,No,No,No,No,No,No,...,No,No,Up,No,No,No,No,No,Ch,Yes
2,Female,[20-30),,,No,No,No,No,No,No,...,No,No,No,No,No,No,No,No,No,Yes
3,Male,[30-40),,,No,No,No,No,No,No,...,No,No,Up,No,No,No,No,No,Ch,Yes
4,Male,[40-50),,,No,No,No,No,No,No,...,No,No,Steady,No,No,No,No,No,Ch,Yes


In [20]:
df.columns

Index(['gender', 'age', 'max_glu_serum', 'A1Cresult', 'metformin',
       'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
       'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed'],
      dtype='object')

In [21]:
df.shape

(101766, 29)

In [22]:
df['A1Cresult'] = df['A1Cresult'].replace(0, 'NaN')
df['max_glu_serum'] = df['max_glu_serum'].replace(0,'NaN')

In [23]:
df.head()

Unnamed: 0,gender,age,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,...,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed
0,Female,[0-10),,,No,No,No,No,No,No,...,No,No,No,No,No,No,No,No,No,No
1,Female,[10-20),,,No,No,No,No,No,No,...,No,No,Up,No,No,No,No,No,Ch,Yes
2,Female,[20-30),,,No,No,No,No,No,No,...,No,No,No,No,No,No,No,No,No,Yes
3,Male,[30-40),,,No,No,No,No,No,No,...,No,No,Up,No,No,No,No,No,Ch,Yes
4,Male,[40-50),,,No,No,No,No,No,No,...,No,No,Steady,No,No,No,No,No,Ch,Yes


In [24]:
df['gender'] = df['gender'].map({'Male': 0, 'Female': 1})

In [25]:
df['max_glu_serum'] = df['max_glu_serum'].map({'None': 0, 'Norm': 1, '>300': 2, '>200': 3})
df['A1Cresult'] = df['A1Cresult'].map({'None': 0, 'Norm': 1, '>7': 2, '>8': 3})

In [26]:

medication_columns = ['metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton', 'insulin', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone', 'change']

for col in medication_columns:
    df[col] = df[col].map({'No': 0, 'Steady': 1, 'Up': 2, 'Down': 3})

In [31]:
# Define a mapping for age ranges to numeric values
age_mapping = {
    '[0-10)': 0,
    '[10-20)': 1,
    '[20-30)': 2,
    '[30-40)': 3,
    '[40-50)': 4,
    '[50-60)': 5,
    '[60-70)': 6,
    '[70-80)': 7,
    '[80-90)': 8,
    '[90-100)': 9
}

df['age'] = df['age'].map(age_mapping)
# Convert the entire DataFrame to float
df = df.astype(float)

In [32]:
from sklearn.preprocessing import LabelEncoder

# Handle categorical columns
categorical_columns = ['max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton', 'insulin', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone', 'change', 'diabetesMed']

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Apply LabelEncoder to each categorical column
for col in categorical_columns:
    df[col] = label_encoder.fit_transform(df[col])

# Convert the entire DataFrame to float
df = df.astype(float)

x = df.loc[:,df.columns!='diabetesMed']
y = df.loc[:,'diabetesMed']

# Split input matrix to create the training set (80%) and testing set (20%)
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.2)
# Second split on training set to create the validation set (20% of training set)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2)


In [40]:
print(X_train.dtypes)


gender                      float64
age                         float64
max_glu_serum               float64
A1Cresult                   float64
metformin                   float64
repaglinide                 float64
nateglinide                 float64
chlorpropamide              float64
glimepiride                 float64
acetohexamide               float64
glipizide                   float64
glyburide                   float64
tolbutamide                 float64
pioglitazone                float64
rosiglitazone               float64
acarbose                    float64
miglitol                    float64
troglitazone                float64
tolazamide                  float64
examide                     float64
citoglipton                 float64
insulin                     float64
glyburide-metformin         float64
glipizide-metformin         float64
glimepiride-pioglitazone    float64
metformin-rosiglitazone     float64
metformin-pioglitazone      float64
change                      

In [41]:
print(y_train.dtypes)


float64


In [42]:
X_train = X_train.astype(float)
y_train = y_train.astype(float)

In [43]:
# Check the shape of your data
print("X_train shape:", X_train.shape)
print("X_val shape:", X_val.shape)
print("y_train shape:", y_train.shape)
print("y_val shape:", y_val.shape)


X_train shape: (65129, 28)
X_val shape: (16283, 28)
y_train shape: (65129,)
y_val shape: (16283,)


In [44]:
# Building the multilayer perceptron
print("\n* Building Multilayer Perceptron")
model = Sequential()

# Adding first hidden layer with 32 neurons
print(" - Adding first hidden layer with 32 neurons")
model.add(Dense(32, activation='relu', input_shape=(X_train.shape[1],)))

# Adding second hidden layer with 16 neurons
print(" - Adding second hidden layer with 16 neurons")
model.add(Dense(16, activation='relu'))

# Adding output layer
print(" - Adding output layer")
model.add(Dense(1, activation='sigmoid'))



* Building Multilayer Perceptron
 - Adding first hidden layer with 32 neurons
 - Adding second hidden layer with 16 neurons
 - Adding output layer


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [45]:
# Compile the network
print("\n* Compiling the network")
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])



* Compiling the network


In [46]:
# Train the network
print("\n* Training the network")
history = model.fit(X_train, y_train, epochs=10, validation_data=(X_val, y_val), verbose=1)


* Training the network
Epoch 1/10
[1m2036/2036[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step - accuracy: 0.7688 - loss: 0.6178 - val_accuracy: 0.7722 - val_loss: 0.5382
Epoch 2/10
[1m2036/2036[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.7695 - loss: 0.5403 - val_accuracy: 0.7722 - val_loss: 0.5367
Epoch 3/10
[1m2036/2036[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.7690 - loss: 0.5406 - val_accuracy: 0.7722 - val_loss: 0.5367
Epoch 4/10
[1m2036/2036[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - accuracy: 0.7699 - loss: 0.5394 - val_accuracy: 0.7722 - val_loss: 0.5367
Epoch 5/10
[1m2036/2036[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - accuracy: 0.7704 - loss: 0.5388 - val_accuracy: 0.7722 - val_loss: 0.5367
Epoch 6/10
[1m2036/2036[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.7715 - loss: 0.5375 - val_accuracy: 0.7722 - val_loss:

In [None]:
# Evaluate the accuracy with respect to the training set
train_loss, train_accuracy = model.evaluate(X_train, y_train, verbose=0)
print('Training Accuracy: %.2f%%' % (train_accuracy * 100))
# Evaluate the accuracy with respect to the testing set
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print('Testing Accuracy: %.2f%%' % (test_accuracy * 100))

Training Accuracy: 77.02%
Testing Accuracy: 77.01%
