In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [3]:
# Read the raw diabetes dataset from its CSV file (relative path).
diabetes = pd.read_csv('../Dataset/diabetes.csv')

# Preview the first eight rows of the raw data.
diabetes.head(n=8)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0


In [4]:
# Values of 0 are replaced with NaN so that pandas' missing-value functions
# (isnull, skew, fillna, ...) treat them as missing.
# Two columns are excluded from the replacement because 0 is a valid value there:
#   - Outcome: 0 is the indicator for "No"
#   - Pregnancies: 0 is a valid count
columns_to_replace = diabetes.columns.difference(['Outcome', 'Pregnancies'])

# Work on a copy so the raw frame stays untouched and this cell can be re-run safely.
diabetes_cleaned = diabetes.copy()
# Use np.nan: the np.NaN alias was removed in NumPy 2.0 and raises AttributeError there.
diabetes_cleaned[columns_to_replace] = diabetes_cleaned[columns_to_replace].replace(0, np.nan)

diabetes_cleaned.head()



Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,,33.6,0.627,50,1
1,1,85.0,66.0,29.0,,26.6,0.351,31,0
2,8,183.0,64.0,,,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


In [5]:
print("Find columns with missing values:")
# Boolean mask over the columns: True where the column holds at least one NaN.
has_missing = diabetes_cleaned.isna().any()
missing_vals = diabetes_cleaned.columns[has_missing].tolist()
# Report the number of missing entries for each affected column.
for col in missing_vals:
    print(col, diabetes_cleaned[col].isna().sum(), "missing values")


Find columns with missing values:
Glucose 5 missing values
BloodPressure 35 missing values
SkinThickness 227 missing values
Insulin 374 missing values
BMI 11 missing values


In [6]:
# Imputation of missing values, chosen per column from its skewness
# (rounded to one decimal place):
#   |skew| <= 0.5 -> mean   (roughly symmetric distribution; the mean is a good
#                            measure of central tendency)
#   |skew| >  0.5 -> median (skewed distribution; the median is more robust to
#                            outliers and better represents central tendency)
SKEW_THRESHOLD = 0.5

skewness = diabetes_cleaned[missing_vals].skew().round(1)
print(skewness)

# Derive each column's fill value from the rule above instead of hard-coding one
# fillna call per column.
fill_values = {
    col: (diabetes_cleaned[col].mean() if abs(skew) <= SKEW_THRESHOLD
          else diabetes_cleaned[col].median())
    for col, skew in skewness.items()
}

# Assign the result back rather than calling `fillna(..., inplace=True)` on a column
# selection: that chained in-place form is deprecated in pandas 2.x and does not
# update the DataFrame when Copy-on-Write is enabled.
diabetes_cleaned = diabetes_cleaned.fillna(fill_values)

diabetes_cleaned.head()

Glucose          0.5
BloodPressure    0.1
SkinThickness    0.7
Insulin          2.2
BMI              0.6
dtype: float64


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,125.0,33.6,0.627,50,1
1,1,85.0,66.0,29.0,125.0,26.6,0.351,31,0
2,8,183.0,64.0,29.0,125.0,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1
