# Diabetes Diagnosis

### <u>Dataset Description</u>
This dataset is originally from the National Institute of Diabetes and Digestive and Kidney Diseases. <a href="https://www.kaggle.com/datasets/mathchi/diabetes-data-set">Kaggle Link</a> The objective is to predict based on diagnostic measurements whether a patient has diabetes.

In [2]:
import numpy as np
import pandas as pd
import pickle

import os
for dirname, _, filenames in os.walk('dataset'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [2]:
# To turn off warning messages
import warnings
warnings.filterwarnings('ignore')

## Importing Dataset

In [None]:
import pandas as pd
df = pd.read_csv('../dataset/diabetes.csv.xls')
df.head()

## Exploratory Data Analysis (EDA)

In [6]:
print(df.shape)
print('======================================')
print(df.columns)
print('======================================')
df.info()

(768, 9)
Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


## Calculate the correlation between features

In [6]:
# Calculate correlation matrix
corr = df.corr()
# constructing a heatmap to nderstand the correlation
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(12,12))
sns.heatmap(corr, cbar=True, square=True, fmt='.1f', annot=True, 
            annot_kws={'size':8}, cmap='Blues')

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6.0,148.0,72.0,35.0,0.0,33.6,0.627,50.0,1.0
1,1.0,85.0,66.0,29.0,0.0,26.6,0.351,31.0,0.0
2,8.0,183.0,64.0,0.0,0.0,23.3,0.672,32.0,1.0
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0.0
4,0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0,1.0


In [7]:
df1 = df.astype(float)
df1.head()

df duplicated values:  0
df missing values:
 Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [8]:
# Check duplicated values:
print('df duplicated values: ',df1.duplicated().sum())
print('======================================')
# Check missing values:
print('df missing values:\n',df1.isnull().sum())

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Pregnancies,768.0,3.845052,3.369578,0.0,1.0,3.0,6.0,17.0
Glucose,768.0,120.894531,31.972618,0.0,99.0,117.0,140.25,199.0
BloodPressure,768.0,69.105469,19.355807,0.0,62.0,72.0,80.0,122.0
SkinThickness,768.0,20.536458,15.952218,0.0,0.0,23.0,32.0,99.0
Insulin,768.0,79.799479,115.244002,0.0,0.0,30.5,127.25,846.0
BMI,768.0,31.992578,7.88416,0.0,27.3,32.0,36.6,67.1
DiabetesPedigreeFunction,768.0,0.471876,0.331329,0.078,0.24375,0.3725,0.62625,2.42
Age,768.0,33.240885,11.760232,21.0,24.0,29.0,41.0,81.0
Outcome,768.0,0.348958,0.476951,0.0,0.0,0.0,1.0,1.0


In [10]:
df1.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Pregnancies,768.0,3.845052,3.369578,0.0,1.0,3.0,6.0,17.0
Glucose,768.0,120.894531,31.972618,0.0,99.0,117.0,140.25,199.0
BloodPressure,768.0,69.105469,19.355807,0.0,62.0,72.0,80.0,122.0
SkinThickness,768.0,20.536458,15.952218,0.0,0.0,23.0,32.0,99.0
Insulin,768.0,79.799479,115.244002,0.0,0.0,30.5,127.25,846.0
BMI,768.0,31.992578,7.88416,0.0,27.3,32.0,36.6,67.1
DiabetesPedigreeFunction,768.0,0.471876,0.331329,0.078,0.24375,0.3725,0.62625,2.42
Age,768.0,33.240885,11.760232,21.0,24.0,29.0,41.0,81.0
Outcome,768.0,0.348958,0.476951,0.0,0.0,0.0,1.0,1.0


## Check the features with 0 value

In [11]:
zero_counts = df1.eq(0).sum()
print(zero_counts)

Pregnancies                 111
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                     500
dtype: int64


## Replace 0 values in important features with median value

In [12]:
df1['Glucose'] = df1['Glucose'].replace(0, np.median(df1['Glucose']))
df1['BloodPressure'] = df1['BloodPressure'].replace(0, np.median(df1['BloodPressure']))
df1['SkinThickness'] = df1['SkinThickness'].replace(0, np.median(df1['SkinThickness']))
df1['Insulin'] = df1['Insulin'].replace(0, np.median(df1['Insulin']))
df1['BMI'] = df1['BMI'].replace(0, np.median(df1['BMI']))

## Create Logistic Regression Model

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

## Split features and target

In [14]:
X = df1.drop('Outcome', axis=1) 
y = df1['Outcome']

## Split train and test sets

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

## Feature scaling

In [16]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Train model

In [17]:
model = LogisticRegression(solver = 'lbfgs')
model.fit(X_train, y_train)

## Evaluation model

In [19]:
y_pred = model.predict(X_test)
print('Model Accuracy: ', '%', accuracy_score(y_test, y_pred) * 100)

Model Accuracy:  % 81.81818181818183


## Save Model

In [18]:
filename = '../models/finalized_lg_model_diabetes.pkl'
pickle.dump(model, open(filename,'wb'))

In [19]:
loaded_model = pickle.load(open(filename, 'rb'))

In [20]:
person_data_diabetes = {
    'Pregnancies': 5,
    'Glucose': 150,
    'BloodPressure': 80,
    'SkinThickness': 35,
    'Insulin': 220,
    'BMI': 34.5,
    'DiabetesPedigreeFunction': 0.56,
    'Age': 45
}

It is predicted that this patient would have diabetes.


In [20]:
input_df = pd.DataFrame([person_data_diabetes])
pred = loaded_model.predict(input_df)[0]
if pred == 0:
    print('It is predicted that this patient does not have diabetes.')
else: 
    print('It is predicted that this patient would have diabetes.')