In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Read the Pima Indians diabetes CSV from the local datasets folder.
diabetes_df = pd.read_csv('./datasets/pima_indians_diabetes.csv')

# Show how many rows fall into each Outcome class, then the per-column
# dtype / non-null summary of the frame.
print(diabetes_df.Outcome.value_counts())
diabetes_df.info()


Outcome
0    500
1    268
Name: count, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [3]:
# Boolean mask marking rows that repeat an earlier row; its sum is the
# number of duplicated rows.
duplicate_mask = diabetes_df.duplicated()
duplicate_mask.sum()

0

In [8]:
# Summary statistics, transposed so that each feature is a row and each
# statistic is a column.
diabetes_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Pregnancies,768.0,3.845052,3.369578,0.0,1.0,3.0,6.0,17.0
Glucose,768.0,120.894531,31.972618,0.0,99.0,117.0,140.25,199.0
BloodPressure,768.0,69.105469,19.355807,0.0,62.0,72.0,80.0,122.0
SkinThickness,768.0,20.536458,15.952218,0.0,0.0,23.0,32.0,99.0
Insulin,768.0,79.799479,115.244002,0.0,0.0,30.5,127.25,846.0
BMI,768.0,31.992578,7.88416,0.0,27.3,32.0,36.6,67.1
DiabetesPedigreeFunction,768.0,0.471876,0.331329,0.078,0.24375,0.3725,0.62625,2.42
Age,768.0,33.240885,11.760232,21.0,24.0,29.0,41.0,81.0
Outcome,768.0,0.348958,0.476951,0.0,0.0,0.0,1.0,1.0


In [14]:
# Names of the features to inspect for 0 values.
zero_features = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# Non-null row count of the Glucose column; used as the denominator for
# every feature's percentage below.
total_count = diabetes_df['Glucose'].count()

# Report, per feature, how many rows hold 0 and what share of the rows that is.
for feature in zero_features:
    column = diabetes_df[feature]
    zero_count = column[column == 0].count()
    zero_ratio = np.round(zero_count / total_count * 100, 4)
    print(f'{feature} : {zero_count}건 {zero_ratio}%')

Glucose : 5건 0.651%
BloodPressure : 35건 4.5573%
SkinThickness : 227건 29.5573%
Insulin : 374건 48.6979%
BMI : 11건 1.4323%


In [15]:
# This cell treats 0 in `zero_features` as a missing-value marker and replaces
# it with the column mean. The mean is taken over the non-zero readings only:
# averaging with the zeros still present would pull the imputed value toward 0
# (strongly so for columns where a large share of the rows are 0).
zero_as_missing = diabetes_df[zero_features].replace(0, np.nan)
nonzero_means = zero_as_missing.mean()  # NaN entries are skipped by default
diabetes_df[zero_features] = zero_as_missing.fillna(nonzero_means)

# Re-run the zero count to confirm that no 0 values remain in these features.
for feature in zero_features:
    zero_count = diabetes_df[diabetes_df[feature] == 0][feature].count()
    print(f'{feature} : {zero_count}건 {np.round(zero_count / total_count * 100, 4)}%')

Glucose : 0건 0.0%
BloodPressure : 0건 0.0%
SkinThickness : 0건 0.0%
Insulin : 0건 0.0%
BMI : 0건 0.0%


### 표준화와 정규화
##### 표준화(Standardization)
- 데이터의 분포를 정규 분포로 변경시켜준다(평균: 0, 표준 편차: 1).
- 다양한 기준으로 형성된 정규분포를 평균이 0 표준편차가 1인 하나의 기준으로 통합시켜 비교를 용이하게 하는 것이다.

##### 정규화(Normalization)
- 데이터의 범위 차이를 유지한 채 공통 척도로 변경하는 것이다.
- 필요한 feature에 대해서 같은 범위(0 ~ 1 또는 -1 ~ 1)를 가지게 할 때 사용한다.


#### 🚩표준화를 통해 이상치를 제거한 뒤 데이터를 정규화함으로써 상대적 크기에 대한 영향력을 줄인 다음 분석을 시작한다.

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Every column except the last is a feature; `Outcome` is the label.
features, targets = diabetes_df.iloc[:, :-1], diabetes_df.Outcome

# Split BEFORE scaling so the held-out rows never contribute to the scaler's
# mean / standard deviation (fitting on the full data leaks test statistics).
X_train, X_test, y_train, y_test = train_test_split(
    features, targets, test_size=0.2, stratify=targets, random_state=256
)

# Fit the scaler on the training rows only, then apply the same transform
# to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Kept so any later cell that reads `features_scalered` still finds it;
# it is produced with the train-fitted scaler, not a scaler fit on all rows.
features_scalered = scaler.transform(features)

# Seeded so repeated runs build the same trees (same seed as the split).
decision_tree_classfier = DecisionTreeClassifier(random_state=256)

# Hyper-parameter grid to search over.
parameters = {'max_depth': [5, 6, 7], 'min_samples_split': [7, 8, 9]}

# NOTE(review): the original cell ended on a bare, undefined
# `grid_decision_tree`, which raises NameError. Constructing the search with
# GridSearchCV's default settings is an assumption -- confirm the intended
# cv / scoring / refit options before fitting.
grid_decision_tree = GridSearchCV(decision_tree_classfier, param_grid=parameters)


