In [17]:
import pandas as pd
import numpy as np

In [3]:
# Read the raw diabetes table from the data folder and preview its first rows.
diabetes = pd.read_csv(filepath_or_buffer='data/diabetes_data.csv')
diabetes.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,Gender
0,6,98,58,33,190,34.0,0.43,43,0,Female
1,2,112,75,32,0,35.7,0.148,21,0,Female
2,2,108,64,0,0,30.8,0.158,21,0,Female
3,8,107,80,0,0,24.6,0.856,34,0,Female
4,7,136,90,0,0,29.9,0.21,50,0,Female


In [8]:
# Count the rows that repeat an earlier row across every column.
duple_list = diabetes.columns.tolist()
mask = diabetes.duplicated(subset=duple_list)
diabetes_duplicates = diabetes.loc[mask]
print(len(diabetes_duplicates))

10


In [10]:
# Keep the first occurrence of every fully duplicated row, then show the row count.
diabetes_dedupped = diabetes.drop_duplicates(subset=duple_list, keep='first')
len(diabetes_dedupped)

768

In [11]:
# Collect columns that carry almost no information: either a single value
# dominates the column, or nearly every value in it is distinct.
low_information_cols = list()

for col in diabetes_dedupped:
    # share of rows occupied by the single most frequent value
    top_freq = diabetes_dedupped[col].value_counts(normalize=True).max()
    # number of distinct values relative to the number of non-null values
    nunique_ratio = diabetes_dedupped[col].nunique() / diabetes_dedupped[col].count()

    # compare the dominant value's share against the threshold
    if top_freq > 0.95:
        low_information_cols += [col]
        print(f'{col}: {round(top_freq*100, 2)}% одинаковых значений')

    # compare the share of unique values against the threshold
    if nunique_ratio > 0.95:
        low_information_cols += [col]
        print(f'{col}: {round(nunique_ratio*100, 2)}% уникальных значений')

Gender: 100.0% одинаковых значений


In [13]:
# Remove the low-information columns collected above from the de-duplicated table.
diabetes_df = diabetes_dedupped.drop(columns=low_information_cols)

In [15]:
# Fraction of missing values in each column.
diabetes_df.isna().mean()

Pregnancies                 0.0
Glucose                     0.0
BloodPressure               0.0
SkinThickness               0.0
Insulin                     0.0
BMI                         0.0
DiabetesPedigreeFunction    0.0
Age                         0.0
Outcome                     0.0
dtype: float64

In [19]:
# Turn zeros in these measurement columns into NaN so they are counted as missing.
# NOTE(review): presumably a recorded 0 means "not measured" for these
# features - confirm against the data source.
for col in ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']:
    # vectorised replacement instead of calling a Python lambda per element
    diabetes_df[col] = diabetes_df[col].replace(0, np.nan)

105.0    11
140.0     9
130.0     9
120.0     8
180.0     7
         ..
485.0     1
370.0     1
342.0     1
65.0      1
235.0     1
Name: Insulin, Length: 185, dtype: int64

In [28]:
# Share of missing Insulin values, rounded to two decimals.
diabetes_df['Insulin'].isna().mean().round(2)

0.49

In [36]:
# Number of columns before dropping sparsely filled ones.
len(diabetes_df.columns)

9

In [49]:
# Drop every column in which more than 30% of the values are missing.
# The column labels are read once, before the loop starts, so rebinding
# diabetes_df inside the loop does not disturb the iteration.
for col in diabetes_df.columns:
    # percentage of missing values in the current column
    low_values = 100 * diabetes_df[col].isna().mean()

    if low_values > 30:
        diabetes_df = diabetes_df.drop(col, axis=1)


In [50]:
# Number of columns left after dropping sparsely filled ones.
len(diabetes_df.columns)

8

In [79]:
# Work on an independent copy so the row filtering below leaves diabetes_df untouched.
cop = diabetes_df.copy(deep=True)

In [81]:
# Keep only the rows that have at least 6 non-missing values.
cop = cop.dropna(axis='index', thresh=6)

In [82]:
# Show the (rows, columns) dimensions of the filtered copy.
cop.shape

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,98.0,58.0,33.0,34.0,0.430,43,0
1,2,112.0,75.0,32.0,35.7,0.148,21,0
2,2,108.0,64.0,,30.8,0.158,21,0
3,8,107.0,80.0,,24.6,0.856,34,0
4,7,136.0,90.0,,29.9,0.210,50,0
...,...,...,...,...,...,...,...,...
763,5,139.0,64.0,35.0,28.6,0.411,26,0
764,1,96.0,122.0,,22.4,0.207,27,0
765,10,101.0,86.0,37.0,45.6,1.136,38,1
766,0,141.0,,,42.4,0.205,29,1


In [83]:
# Build a column -> median mapping for the listed columns and use it to fill
# the remaining missing values.
values = {
    name: cop[name].median()
    for name in (
        'Pregnancies',
        'Glucose',
        'BloodPressure',
        'SkinThickness',
        'BMI',
        'DiabetesPedigreeFunction',
        'Age',
        'Outcome',
    )
}
data = cop.fillna(value=values)

In [84]:
# Mean SkinThickness after median imputation.
data.loc[:, 'SkinThickness'].mean()

29.109067017082786

In [85]:
def outliers_iqr(data, feature):
    """Split ``data`` into outlier and non-outlier rows with the 1.5 * IQR rule.

    Args:
        data: DataFrame holding the observations.
        feature: Name of the column whose values are tested.

    Returns:
        A tuple ``(outliers, cleaned)``. ``outliers`` holds the rows whose
        ``feature`` value lies strictly below ``Q1 - 1.5 * IQR`` or strictly
        above ``Q3 + 1.5 * IQR``. ``cleaned`` holds the rows whose value lies
        within those bounds, bounds included. Rows with a missing ``feature``
        value satisfy neither comparison and appear in neither frame.
    """
    x = data[feature]
    quartile_1, quartile_3 = x.quantile(0.25), x.quantile(0.75)
    iqr = quartile_3 - quartile_1
    lower_bound = quartile_1 - (iqr * 1.5)
    upper_bound = quartile_3 + (iqr * 1.5)
    outliers = data[(x < lower_bound) | (x > upper_bound)]
    # Inclusive comparisons: a value sitting exactly on a bound is not an
    # outlier, so it must stay in ``cleaned``. With strict comparisons such
    # rows vanished from both results (e.g. every row of a constant column).
    cleaned = data[(x >= lower_bound) & (x <= upper_bound)]
    return outliers, cleaned

In [88]:
# Count the SkinThickness outliers found by the IQR rule.
outliers, cleaned = outliers_iqr(data=data, feature='SkinThickness')
outliers.count()

Pregnancies                 87
Glucose                     87
BloodPressure               87
SkinThickness               87
BMI                         87
DiabetesPedigreeFunction    87
Age                         87
Outcome                     87
dtype: int64

In [97]:
def outliers_z_score(data, feature, log_scale=False):
    """Split ``data`` into outlier and non-outlier rows with the three-sigma rule.

    Args:
        data: DataFrame holding the observations.
        feature: Name of the column whose values are tested.
        log_scale: When True, the test is applied to the natural logarithm of
            the column instead of the raw values. Zero or negative values give
            ``-inf``/``NaN`` under ``np.log`` and will distort the mean and
            standard deviation, so use this only for strictly positive data.

    Returns:
        A tuple ``(outliers, cleaned)``. ``outliers`` holds the rows whose
        (optionally log-transformed) value lies strictly outside
        ``mean +/- 3 * std``. ``cleaned`` holds the rows whose value lies
        within those bounds, bounds included. Rows with a missing value
        satisfy neither comparison and appear in neither frame.
    """
    x = np.log(data[feature]) if log_scale else data[feature]
    mu = x.mean()
    sigma = x.std()
    lower_bound = mu - 3 * sigma
    upper_bound = mu + 3 * sigma
    outliers = data[(x < lower_bound) | (x > upper_bound)]
    # Inclusive comparisons: a value exactly on a bound is not an outlier and
    # must stay in ``cleaned``. With strict comparisons a zero-variance column
    # (sigma == 0) produced an empty ``cleaned`` frame.
    cleaned = data[(x >= lower_bound) & (x <= upper_bound)]
    return outliers, cleaned

In [94]:
# Count the SkinThickness outliers found by the three-sigma rule.
outliers, cleaned = outliers_z_score(data, 'SkinThickness')
# shape[0] is the number of outlier rows; shape[1] would only report the
# number of columns, which is the same for any result.
print(outliers.shape[0])

8


In [95]:
# Count the DiabetesPedigreeFunction outliers found by the IQR rule.
outliers, cleaned = outliers_iqr(data=data, feature='DiabetesPedigreeFunction')
outliers.count()

Pregnancies                 29
Glucose                     29
BloodPressure               29
SkinThickness               29
BMI                         29
DiabetesPedigreeFunction    29
Age                         29
Outcome                     29
dtype: int64

In [99]:
# Count the DiabetesPedigreeFunction outliers found by the three-sigma rule on the log scale.
outliers, cleaned = outliers_z_score(
    data=data,
    feature='DiabetesPedigreeFunction',
    log_scale=True,
)
outliers.count()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64