# Diabetes => Data Preparation

In [1]:
import pandas as pd

In [2]:
diabetes_data = pd.read_csv('diabetes.csv')
diabetes_data.sample(10, random_state = 9)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
456,1,135,54,0,0,26.7,0.687,62,0
260,3,191,68,15,130,30.9,0.299,34,0
651,1,117,60,23,106,33.8,0.466,27,0
143,10,108,66,0,0,32.4,0.272,42,1
666,4,145,82,18,0,32.5,0.235,70,1
390,1,100,66,29,196,32.0,0.444,42,0
591,2,112,78,50,140,39.4,0.175,24,0
77,5,95,72,33,0,37.7,0.37,27,0
414,0,138,60,35,167,34.6,0.534,21,1
494,3,80,0,0,0,0.0,0.174,22,0


In [3]:
diabetes_data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [4]:
diabetes_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


### Was fällt auf?
* Das Minimum von Glucose, BloodPressure, SkinThicknes, Insulin und BMI ist 0
* Erstes Quartal bei SkinThickness und Insulin ist 0
* Insulin hat einen maximalen Wert von 846

In [5]:
zero_count_dict = {}
for col in diabetes_data.columns:
    print('', col, diabetes_data[col].value_counts()[diabetes_data[col].value_counts().index == 0.0].values)  

 Pregnancies [111]
 Glucose [5]
 BloodPressure [35]
 SkinThickness [227]
 Insulin [374]
 BMI [11]
 DiabetesPedigreeFunction []
 Age []
 Outcome [500]


## 0 Werte entfernen

In [6]:
diabetes_drop = pd.read_csv('diabetes.csv')
diabetes_drop = diabetes_drop.drop(diabetes_drop[(diabetes_drop.Age != 0) & (diabetes_drop.Glucose != 0) & (diabetes_drop.SkinThickness != 0) & (diabetes_drop.BloodPressure != 0) & (diabetes_drop.Insulin != 0) & (diabetes_drop.BMI != 0)].index)
diabetes_drop.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 376 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               376 non-null    int64  
 1   Glucose                   376 non-null    int64  
 2   BloodPressure             376 non-null    int64  
 3   SkinThickness             376 non-null    int64  
 4   Insulin                   376 non-null    int64  
 5   BMI                       376 non-null    float64
 6   DiabetesPedigreeFunction  376 non-null    float64
 7   Age                       376 non-null    int64  
 8   Outcome                   376 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 29.4 KB


## Werte ersetzen mit Median

In [7]:
from sklearn.impute import SimpleImputer

In [8]:
simple_imputer = SimpleImputer(strategy='median',missing_values=0)

median_diabetes = pd.read_csv('diabetes.csv')

simple_imputer.fit(median_diabetes[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']])

median_diabetes[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']] = simple_imputer.transform(median_diabetes[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']])

median_diabetes

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,125.0,33.6,0.627,50,1
1,1,85.0,66.0,29.0,125.0,26.6,0.351,31,0
2,8,183.0,64.0,29.0,125.0,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101.0,76.0,48.0,180.0,32.9,0.171,63,0
764,2,122.0,70.0,27.0,125.0,36.8,0.340,27,0
765,5,121.0,72.0,23.0,112.0,26.2,0.245,30,0
766,1,126.0,60.0,29.0,125.0,30.1,0.349,47,1


In [9]:
median_diabetes.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,121.65625,72.386719,29.108073,140.671875,32.455208,0.471876,33.240885,0.348958
std,3.369578,30.438286,12.096642,8.791221,86.38306,6.875177,0.331329,11.760232,0.476951
min,0.0,44.0,24.0,7.0,14.0,18.2,0.078,21.0,0.0
25%,1.0,99.75,64.0,25.0,121.5,27.5,0.24375,24.0,0.0
50%,3.0,117.0,72.0,29.0,125.0,32.3,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


## Werte ersetzen mit Durchschnitt

In [10]:
from sklearn.impute import SimpleImputer

In [11]:
simple_imputer = SimpleImputer(strategy='mean',missing_values=0)

mean_diabetes = pd.read_csv('diabetes.csv')

simple_imputer.fit(mean_diabetes[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']])
mean_diabetes[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']] = simple_imputer.transform(mean_diabetes[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']])

mean_diabetes

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.00000,155.548223,33.6,0.627,50,1
1,1,85.0,66.0,29.00000,155.548223,26.6,0.351,31,0
2,8,183.0,64.0,29.15342,155.548223,23.3,0.672,32,1
3,1,89.0,66.0,23.00000,94.000000,28.1,0.167,21,0
4,0,137.0,40.0,35.00000,168.000000,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101.0,76.0,48.00000,180.000000,32.9,0.171,63,0
764,2,122.0,70.0,27.00000,155.548223,36.8,0.340,27,0
765,5,121.0,72.0,23.00000,112.000000,26.2,0.245,30,0
766,1,126.0,60.0,29.15342,155.548223,30.1,0.349,47,1


In [12]:
mean_diabetes.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,121.686763,72.405184,29.15342,155.548223,32.457464,0.471876,33.240885,0.348958
std,3.369578,30.435949,12.096346,8.790942,85.021108,6.875151,0.331329,11.760232,0.476951
min,0.0,44.0,24.0,7.0,14.0,18.2,0.078,21.0,0.0
25%,1.0,99.75,64.0,25.0,121.5,27.5,0.24375,24.0,0.0
50%,3.0,117.0,72.202592,29.15342,155.548223,32.4,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,155.548223,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


## Überlegung:
Spalten mit wenigen 0 Werten zu entfernen und die anderen Spalten mit Werten ersetzen.

In [13]:
diabetes_data = pd.read_csv('diabetes.csv')
diabetes_data.drop(diabetes_data[diabetes_data.Glucose == 0].index, inplace=True)
diabetes_data.drop(diabetes_data[diabetes_data.BloodPressure == 0].index, inplace=True)
diabetes_data.drop(diabetes_data[diabetes_data.BMI == 0].index, inplace=True)
diabetes_data.drop(diabetes_data[diabetes_data.Age == 0].index, inplace=True)

simple_imputer = SimpleImputer(strategy='mean',missing_values=0)

simple_imputer.fit(diabetes_data[['SkinThickness', 'Insulin']])
diabetes_data[['SkinThickness', 'Insulin']] = simple_imputer.transform(diabetes_data[['SkinThickness', 'Insulin']])

diabetes_data.describe()


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,724.0,724.0,724.0,724.0,724.0,724.0,724.0,724.0,724.0
mean,3.866022,121.882597,72.400552,29.182331,156.056122,32.467127,0.474765,33.350829,0.343923
std,3.362803,30.75003,12.37987,9.018907,87.395294,6.888941,0.332315,11.765393,0.475344
min,0.0,44.0,24.0,7.0,14.0,18.2,0.078,21.0,0.0
25%,1.0,99.75,64.0,25.0,118.25,27.5,0.245,24.0,0.0
50%,3.0,117.0,72.0,29.182331,156.056122,32.4,0.379,29.0,0.0
75%,6.0,142.0,80.0,33.0,156.056122,36.6,0.6275,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


### mean vs. median vs. most_frequent
* mean:
    * SkinThickness
        * mean: 29,18
        * min: 7
        * Erstes Quartal: 25
    * Insulin
        * mean: 156,05
        * min: 14
        * Erstes Quartal: 118,25
* median:
    * SkinThickness
        * mean: 29,13
        * min: 7
        * Erstes Quartal: 25
    * Insulin
        * mean: 142,04
        * min: 14
        * Erstes Quartal: 118,25
* most_frequent:
    * SkinThickness
        * mean: 29,93
        * min: 7
        * Erstes Quartal: 25
    * Insulin
        * mean: 132,64
        * min: 14
        * Erstes Quartal: 105

In [14]:
correlation = diabetes_data.corr().sort_values('Outcome', ascending=False)
correlation['Outcome']

Outcome                     1.000000
Glucose                     0.488384
BMI                         0.299375
Age                         0.245741
Pregnancies                 0.224417
Insulin                     0.219830
SkinThickness               0.216717
DiabetesPedigreeFunction    0.184947
BloodPressure               0.166703
Name: Outcome, dtype: float64