In [20]:
import numpy as np # for linear algebra
import pandas as pd # data processing, CSV file I/O, etc
import seaborn as sns # for plots
import plotly.graph_objects as go # for plots
import plotly.express as px #for plots
import matplotlib.pyplot as plt # for visualizations and plots

# this eliminates the requirement to use plt.show() after every plot
%matplotlib inline

# changing the default figure sizes
from pylab import rcParams
rcParams['figure.figsize'] = 15, 10

import random # random library
# Pool of colour-map / palette names.
# NOTE(review): presumably meant for seaborn/matplotlib plots picked via `random`;
# nothing in the visible cells reads it — confirm before removing.
pallete = [
    'Accent_r', 'Blues', 'BrBG', 'BrBG_r', 'BuPu',
    'CMRmap', 'CMRmap_r', 'Dark2', 'Dark2_r', 'GnBu',
    'GnBu_r', 'OrRd', 'Oranges', 'Paired', 'PuBu',
    'PuBuGn', 'PuRd', 'Purples', 'RdGy_r', 'RdPu',
    'Reds', 'autumn', 'cool', 'coolwarm', 'flag',
    'flare', 'gist_rainbow', 'hot', 'magma', 'mako',
    'plasma', 'prism', 'rainbow', 'rocket', 'seismic',
    'spring', 'summer', 'terrain', 'turbo', 'twilight',
]

import os


In [21]:
# Load the raw diabetes dataset; the path is relative to the notebook's working directory.
RAW_DATA_PATH = "raw/pima_diabetes.csv"
df = pd.read_csv(RAW_DATA_PATH)

# Preview the first five rows
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [22]:
# Inspect column names, dtypes and non-null counts of the raw frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [23]:
# Summary statistics for every numeric column; the `min` row is worth checking
# for values that cannot be real measurements (e.g. a minimum of 0).
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [24]:
# Number of explicit NaN entries per column in the raw frame
df.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

No null values are present in any column.

## Data Cleaning
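The summary statistics above show a minimum of 0 for Glucose, BloodPressure, SkinThickness, Insulin and BMI. Such readings are physiologically impossible — 0 blood pressure (dead person?), 0 skin thickness (skeleton?), 0 glucose (how do you even survive?) — so these zeros are treated as missing values rather than real measurements.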

In [25]:
# Columns in which a recorded 0 cannot be a real measurement and therefore
# stands for "missing". Pregnancies and Outcome are deliberately excluded:
# 0 is a legitimate value for both.
zero_as_missing_cols = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]

# Replace 0 with NaN in one vectorized call instead of a per-element lambda
# repeated for every column. Integer columns that receive a NaN become float64.
df[zero_as_missing_cols] = df[zero_as_missing_cols].replace(0, np.nan)

# Missing-value count per column after the conversion
df.isnull().sum()

Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

In [26]:
# Percentage of missing values per column (mean of the boolean NaN mask, times 100)
df.isna().mean() * 100

Pregnancies                  0.000000
Glucose                      0.651042
BloodPressure                4.557292
SkinThickness               29.557292
Insulin                     48.697917
BMI                          1.432292
DiabetesPedigreeFunction     0.000000
Age                          0.000000
Outcome                      0.000000
dtype: float64

In [27]:
# Insulin and SkinThickness have the highest share of missing values, so the
# columns are removed entirely; afterwards any row that still contains a NaN
# in one of the remaining columns is discarded.
high_missing_cols = ["Insulin", "SkinThickness"]
df_cleaned = df.drop(columns=high_missing_cols).dropna()

df_cleaned.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,33.6,0.627,50,1
1,1,85.0,66.0,26.6,0.351,31,0
2,8,183.0,64.0,23.3,0.672,32,1
3,1,89.0,66.0,28.1,0.167,21,0
4,0,137.0,40.0,43.1,2.288,33,1


In [28]:
# Skew check: sample skewness of every column of the cleaned frame.
# Values well above 0 indicate a long right tail and mark candidates for a
# variance-stabilising transform.
df_cleaned.skew()

Pregnancies                 0.905049
Glucose                     0.527117
BloodPressure               0.137629
BMI                         0.598186
DiabetesPedigreeFunction    1.921533
Age                         1.088526
Outcome                     0.658511
dtype: float64

In [29]:
# DiabetesPedigreeFunction is the most right-skewed feature, so add a
# log-transformed copy next to the original column.
# np.log1p(x) evaluates log(1 + x) directly and stays accurate for small x,
# which the hand-written np.log(x + 1) does not guarantee.
df_cleaned['DiabetesPedigreeFunction_log'] = np.log1p(df_cleaned['DiabetesPedigreeFunction'])
df_cleaned.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,BMI,DiabetesPedigreeFunction,Age,Outcome,DiabetesPedigreeFunction_log
0,6,148.0,72.0,33.6,0.627,50,1,0.486738
1,1,85.0,66.0,26.6,0.351,31,0,0.300845
2,8,183.0,64.0,23.3,0.672,32,1,0.514021
3,1,89.0,66.0,28.1,0.167,21,0,0.154436
4,0,137.0,40.0,43.1,2.288,33,1,1.190279


In [30]:
# Check row count, dtypes and non-null counts after dropping columns/rows
# and adding the log-transformed feature
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 724 entries, 0 to 767
Data columns (total 8 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Pregnancies                   724 non-null    int64  
 1   Glucose                       724 non-null    float64
 2   BloodPressure                 724 non-null    float64
 3   BMI                           724 non-null    float64
 4   DiabetesPedigreeFunction      724 non-null    float64
 5   Age                           724 non-null    int64  
 6   Outcome                       724 non-null    int64  
 7   DiabetesPedigreeFunction_log  724 non-null    float64
dtypes: float64(5), int64(3)
memory usage: 50.9 KB


In [31]:
# Summary statistics of the cleaned frame; the implausible zero minimums
# should no longer appear for Glucose, BloodPressure and BMI.
df_cleaned.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,BMI,DiabetesPedigreeFunction,Age,Outcome,DiabetesPedigreeFunction_log
count,724.0,724.0,724.0,724.0,724.0,724.0,724.0,724.0
mean,3.866022,121.882597,72.400552,32.467127,0.474765,33.350829,0.343923,0.367237
std,3.362803,30.75003,12.37987,6.888941,0.332315,11.765393,0.475344,0.198718
min,0.0,44.0,24.0,18.2,0.078,21.0,0.0,0.075107
25%,1.0,99.75,64.0,27.5,0.245,24.0,0.0,0.219136
50%,3.0,117.0,72.0,32.4,0.379,29.0,0.0,0.321358
75%,6.0,142.0,80.0,36.6,0.6275,41.0,1.0,0.487045
max,17.0,199.0,122.0,67.1,2.42,81.0,1.0,1.229641


## Feature Engineering

In [32]:
# Derive categorical features by binning the numeric measurements.
# pd.cut uses right-closed intervals by default, so a value equal to a bin
# edge falls into the LOWER category (e.g. BMI 30.0 -> 'Overweight', Age 50 -> '41-50').
# Values outside the outermost edges would become NaN.

# Weight class: (0, 18.5], (18.5, 25], (25, 30], (30, 100]
df_cleaned['BMI_Category'] = pd.cut(df_cleaned['BMI'],
                             bins=[0, 18.5, 25, 30, 100],
                             labels=['Underweight', 'Normal', 'Overweight', 'Obese'])

# Age band: (0, 30], (30, 40], (40, 50], (50, 100] — '50+' means strictly older than 50
df_cleaned['Age_Group'] = pd.cut(df_cleaned['Age'],
                          bins=[0, 30, 40, 50, 100],
                          labels=['21-30', '31-40', '41-50', '50+'])

# Glucose band: (0, 100], (100, 125], (125, 300]
df_cleaned['Glucose_Category'] = pd.cut(df_cleaned['Glucose'],
                                 bins=[0, 100, 125, 300],
                                 labels=['Normal', 'Prediabetes', 'Diabetes'])

# Blood-pressure band: (0, 80], (80, 90], (90, 120], (120, 200]
df_cleaned['BP_Category'] = pd.cut(df_cleaned['BloodPressure'],
                            bins=[0, 80, 90, 120, 200],
                            labels=['Normal', 'Elevated', 'High', 'Very High'])

# Pregnancy count band; the lower edge is -1 so that 0 lands in the first bin.
# The first label is 'None (0)' rather than the bare string 'None': pandas'
# read_csv treats 'None' as a missing-value marker by default, so the bare
# label would come back as NaN once this frame is written to CSV and reloaded.
df_cleaned['Pregnancy_Group'] = pd.cut(df_cleaned['Pregnancies'],
                                bins=[-1, 0, 3, 6, 20],
                                labels=['None (0)', 'Low (1-3)', 'Medium (4-6)', 'High (7+)'])

# Simple additive risk score (0-4): one point per risk flag that is set.
# The cut-offs mirror the category bins above.
df_cleaned['Clinical_Risk_Score'] = (
    (df_cleaned['BMI'] > 30).astype(int) +           # 'Obese' BMI band
    (df_cleaned['Glucose'] > 125).astype(int) +      # 'Diabetes' glucose band (prediabetes does not count)
    (df_cleaned['BloodPressure'] > 90).astype(int) + # 'High' / 'Very High' blood pressure
    (df_cleaned['Age'] > 35).astype(int)             # Older age
)

df_cleaned.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,BMI,DiabetesPedigreeFunction,Age,Outcome,DiabetesPedigreeFunction_log,BMI_Category,Age_Group,Glucose_Category,BP_Category,Pregnancy_Group,Clinical_Risk_Score
0,6,148.0,72.0,33.6,0.627,50,1,0.486738,Obese,41-50,Diabetes,Normal,Medium (4-6),3
1,1,85.0,66.0,26.6,0.351,31,0,0.300845,Overweight,31-40,Normal,Normal,Low (1-3),0
2,8,183.0,64.0,23.3,0.672,32,1,0.514021,Normal,31-40,Diabetes,Normal,High (7+),1
3,1,89.0,66.0,28.1,0.167,21,0,0.154436,Overweight,21-30,Normal,Normal,Low (1-3),0
4,0,137.0,40.0,43.1,2.288,33,1,1.190279,Obese,31-40,Diabetes,Normal,,2


In [33]:
# Confirm the engineered columns were added with the expected dtypes and
# that binning produced no nulls (a null would mean a value fell outside the bins)
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 724 entries, 0 to 767
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   Pregnancies                   724 non-null    int64   
 1   Glucose                       724 non-null    float64 
 2   BloodPressure                 724 non-null    float64 
 3   BMI                           724 non-null    float64 
 4   DiabetesPedigreeFunction      724 non-null    float64 
 5   Age                           724 non-null    int64   
 6   Outcome                       724 non-null    int64   
 7   DiabetesPedigreeFunction_log  724 non-null    float64 
 8   BMI_Category                  724 non-null    category
 9   Age_Group                     724 non-null    category
 10  Glucose_Category              724 non-null    category
 11  BP_Category                   724 non-null    category
 12  Pregnancy_Group               724 non-null    category


In [34]:
# Summary statistics after feature engineering; describe() covers numeric
# columns by default, so the new categorical columns are not listed here.
df_cleaned.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,BMI,DiabetesPedigreeFunction,Age,Outcome,DiabetesPedigreeFunction_log,Clinical_Risk_Score
count,724.0,724.0,724.0,724.0,724.0,724.0,724.0,724.0,724.0
mean,3.866022,121.882597,72.400552,32.467127,0.474765,33.350829,0.343923,0.367237,1.417127
std,3.362803,30.75003,12.37987,6.888941,0.332315,11.765393,0.475344,0.198718,1.021232
min,0.0,44.0,24.0,18.2,0.078,21.0,0.0,0.075107,0.0
25%,1.0,99.75,64.0,27.5,0.245,24.0,0.0,0.219136,1.0
50%,3.0,117.0,72.0,32.4,0.379,29.0,0.0,0.321358,1.0
75%,6.0,142.0,80.0,36.6,0.6275,41.0,1.0,0.487045,2.0
max,17.0,199.0,122.0,67.1,2.42,81.0,1.0,1.229641,4.0


In [35]:
# Persist the cleaned, feature-engineered frame to disk.
# to_csv does not create missing directories, so make sure 'new/' exists first;
# otherwise this cell fails when the notebook is run from a fresh checkout.
os.makedirs('new', exist_ok=True)

# index=False: the row index is not written as an extra column
df_cleaned.to_csv('new/pima.csv', index=False)