In [1]:
import pandas as pd
import numpy as np

### Pima Indians Diabetes Database
Source: https://www.kaggle.com/uciml/pima-indians-diabetes-database/

In [2]:
diabetes = pd.read_csv('../datasets/diabetes.csv')

diabetes.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


In [3]:
diabetes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


#### Describe data
Here we can see that for Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin and BMI min value is 0. For Pregnancies it is possible but for the rest it is impossible. So that means these values are missing values. 

In [6]:
diabetes.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Pregnancies,768.0,3.845052,3.369578,0.0,1.0,3.0,6.0,17.0
Glucose,763.0,121.686763,30.535641,44.0,99.0,117.0,141.0,199.0
BloodPressure,733.0,72.405184,12.382158,24.0,64.0,72.0,80.0,122.0
SkinThickness,541.0,29.15342,10.476982,7.0,22.0,29.0,36.0,99.0
Insulin,394.0,155.548223,118.775855,14.0,76.25,125.0,190.0,846.0
BMI,757.0,32.457464,6.924988,18.2,27.5,32.3,36.6,67.1
DiabetesPedigreeFunction,768.0,0.471876,0.331329,0.078,0.24375,0.3725,0.62625,2.42
Age,768.0,33.240885,11.760232,21.0,24.0,29.0,41.0,81.0
Outcome,768.0,0.348958,0.476951,0.0,0.0,0.0,1.0,1.0


#### Replace 0 values with NaN

In [5]:
diabetes['Glucose'].replace(0, np.nan, inplace= True)
diabetes['BloodPressure'].replace(0, np.nan, inplace= True)
diabetes['SkinThickness'].replace(0, np.nan, inplace= True)
diabetes['Insulin'].replace(0, np.nan, inplace= True)
diabetes['BMI'].replace(0, np.nan, inplace= True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  diabetes['Glucose'].replace(0, np.nan, inplace= True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  diabetes['BloodPressure'].replace(0, np.nan, inplace= True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which w

#### Sum the null values in every column

In [7]:
diabetes.isnull().sum()

Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

In [8]:
arr = diabetes['SkinThickness'].values.reshape(-1, 1)

arr.shape

(768, 1)

### Univariate feature imputation
#### SimpleImputer
https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html

##### Here Strategy = 'most_frequent' which means it will replace missing using the most frequent value in the column. 

In [11]:
from sklearn.impute import SimpleImputer

imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

imp.fit(diabetes['SkinThickness'].values.reshape(-1, 1))

diabetes['SkinThickness'] = imp.transform(diabetes['SkinThickness'].values.reshape(-1, 1))

diabetes['SkinThickness'].describe()

count    768.000000
mean      29.994792
std        8.886506
min        7.000000
25%       25.000000
50%       32.000000
75%       32.000000
max       99.000000
Name: SkinThickness, dtype: float64

##### Here Strategy = 'median' which means it will replace missing values using the median in the column. 

In [12]:
imp = SimpleImputer(missing_values=np.nan, strategy='median')

imp.fit(diabetes['Glucose'].values.reshape(-1, 1))

diabetes['Glucose'] = imp.transform(diabetes['Glucose'].values.reshape(-1, 1))

diabetes['Glucose'].describe()

count    768.000000
mean     121.656250
std       30.438286
min       44.000000
25%       99.750000
50%      117.000000
75%      140.250000
max      199.000000
Name: Glucose, dtype: float64

In [None]:
##### Here Strategy = 'mean' which means it will replace missing values using the mean in the column. 

In [13]:
imp = SimpleImputer(missing_values=np.nan, strategy='mean')

imp.fit(diabetes['BloodPressure'].values.reshape(-1, 1))

diabetes['BloodPressure'] = imp.transform(diabetes['BloodPressure'].values.reshape(-1, 1))

diabetes['BloodPressure'].describe()

count    768.000000
mean      72.405184
std       12.096346
min       24.000000
25%       64.000000
50%       72.202592
75%       80.000000
max      122.000000
Name: BloodPressure, dtype: float64

In [14]:
imp = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=32)

imp.fit(diabetes['BMI'].values.reshape(-1, 1))

diabetes['BMI'] = imp.transform(diabetes['BMI'].values.reshape(-1, 1))

diabetes['BMI'].describe()

count    768.000000
mean      32.450911
std        6.875366
min       18.200000
25%       27.500000
50%       32.000000
75%       36.600000
max       67.100000
Name: BMI, dtype: float64

In [15]:
diabetes.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,394.0,768.0,768.0,768.0,768.0
mean,3.845052,121.65625,72.405184,29.994792,155.548223,32.450911,0.471876,33.240885,0.348958
std,3.369578,30.438286,12.096346,8.886506,118.775855,6.875366,0.331329,11.760232,0.476951
min,0.0,44.0,24.0,7.0,14.0,18.2,0.078,21.0,0.0
25%,1.0,99.75,64.0,25.0,76.25,27.5,0.24375,24.0,0.0
50%,3.0,117.0,72.202592,32.0,125.0,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,190.0,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


#### Save the file in csv format

In [None]:
# diabetes.to_csv('datasets/diabetes_processed_incomplete.csv', index=False)