In [39]:
import pandas as pd
import numpy as np
import tensorflow
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Load the diabetes dataset from its CSV file into a DataFrame
diab = pd.read_csv(filepath_or_buffer="/content/diabetes.csv")

In [6]:
# Peek at the first five rows to sanity-check the columns and their values
diab.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [7]:
# Get the dimensions of the DataFrame as a (rows, columns) tuple
diab.shape

(768, 9)

In [8]:
# Print a summary of the DataFrame: column names, non-null counts, dtypes, and memory usage
diab.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [9]:
# Count the rows that are exact duplicates of an earlier row (every column equal)
diab.duplicated().sum()

0

In [13]:
# Columns in which a value of 0 is treated as a placeholder for a missing measurement
columns_to_replace = [
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]

In [14]:
# Turn every 0 in the selected columns into NaN so it is counted and imputed as missing
diab[columns_to_replace] = diab[columns_to_replace].replace(
    to_replace=0,
    value=np.nan,
)

In [15]:
# Tally the missing (NaN) values in each column
diab.isna().sum()

Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

In [16]:
# Columns whose missing values are imputed with the column median
fill_with_median = [
    'Glucose',
    'SkinThickness',
    'Insulin',
    'DiabetesPedigreeFunction',
    'Age',
]

# Columns whose missing values are imputed with the column mean
fill_with_mean = [
    'BloodPressure',
    'BMI',
]

In [17]:
# Impute missing values column by column: the median for the columns listed in
# fill_with_median, the mean for the columns listed in fill_with_mean.
#
# The filled Series is assigned back to the column instead of calling
# diab[col].fillna(..., inplace=True). That form is chained assignment on a
# temporary Series: it raises a FutureWarning in pandas 2.x and no longer
# updates `diab` at all once Copy-on-Write is active (the default in pandas 3.0),
# which would silently leave the NaNs in place.
for col in fill_with_median:
    median_value = diab[col].median()  # NaNs are skipped when computing the statistic
    diab[col] = diab[col].fillna(median_value)

for col in fill_with_mean:
    mean_value = diab[col].mean()  # NaNs are skipped when computing the statistic
    diab[col] = diab[col].fillna(mean_value)

In [45]:
# Drop rows that fall outside the 1.5 * IQR fences of any feature column.
#
# The fences are computed for the feature columns only. 'Outcome' is the label:
# on a 0/1 column the IQR collapses to 0 whenever one class makes up less than
# a quarter of the rows, and the filter would then discard that entire class.
feature_cols = diab.columns.drop('Outcome')
features = diab[feature_cols]

Q1 = features.quantile(0.25)
Q3 = features.quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Compare every column against its own fence (axis=1 aligns the bounds with the
# column labels). Comparisons with NaN are False, so missing values are never
# flagged. A row is an outlier if at least one feature is outside its fences.
below_fence = features.lt(lower_bound, axis=1)
above_fence = features.gt(upper_bound, axis=1)
outlier_mask = (below_fence | above_fence).any(axis=1)

# .copy() makes the filtered frame independent of `diab`, so later modifications
# to it do not trigger chained-assignment warnings.
diab_no_outliers = diab[~outlier_mask].copy()

In [46]:
# Separate the label column from the predictor columns
y = diab_no_outliers['Outcome']
X = diab_no_outliers.drop(columns='Outcome')

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [47]:
# Fit a MinMaxScaler on the training data only, then rescale the training features with it
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)

# Rescale the test data with the ranges learned from the training set
X_test_scaled = scaler.transform(X_test)

In [119]:
# Seed the Python, NumPy and backend random generators so that weight
# initialisation, dropout masks and batch shuffling start from the same state
# on every run. (Bit-for-bit determinism on GPU may need further settings.)
keras.utils.set_random_seed(42)

# Small fully connected binary classifier: three ReLU hidden layers
# (16, 12 and 8 units), each followed by 20% dropout, and one sigmoid output unit.
# The input width is declared with an explicit Input object instead of the
# deprecated `input_dim` argument on the first Dense layer.
model = Sequential()
model.add(keras.Input(shape=(X_train_scaled.shape[1],)))
model.add(Dense(16, activation="relu"))
model.add(Dropout(0.2))
model.add(Dense(12, activation="relu"))
model.add(Dropout(0.2))
model.add(Dense(8, activation="relu"))
model.add(Dropout(0.2))
model.add(Dense(1, activation="sigmoid"))

In [120]:
# Configure training: Adam optimiser, binary cross-entropy loss, accuracy tracked as a metric
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [121]:
# Train for 100 epochs. The held-out test split is passed as validation data,
# so the per-epoch validation metrics are computed on it.
history = model.fit(
    x=X_train_scaled,
    y=y_train,
    epochs=100,
    validation_data=(X_test_scaled, y_test),
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [123]:
# Score the trained model on both splits; each call yields the loss followed by the accuracy metric
loss_test, accuracy_test = model.evaluate(X_test_scaled, y_test, verbose=0)
loss_train, accuracy_train = model.evaluate(X_train_scaled, y_train, verbose=0)

# Report the test metrics, a separator line, then the training metrics (one item per line)
print(
    'Testing Loss: %.2f' % loss_test,
    'Testing Accuracy: %.2f' % (accuracy_test * 100) + '%',
    "*" * 30,
    'Training Loss: %.2f' % loss_train,
    'Training Accuracy: %.2f' % (accuracy_train * 100) + '%',
    sep="\n",
)

Testing Loss: 0.54
Testing Accuracy: 72.00%
******************************
Training Loss: 0.47
Training Accuracy: 75.67%
