# 피마 인디언 당뇨병 예측

In [1]:
import numpy as np
import pandas as pd

In [3]:
from google.colab import files
# Upload the dataset file into the Colab session via google.colab's `files`
# helper; whatever upload() returns is kept in `up`.
up=files.upload()

Saving pima-indians-diabetes.csv to pima-indians-diabetes.csv


### 데이터 전처리

In [8]:
# Load the raw CSV. The first 9 lines of the file are skipped and no header
# row is read, so pandas labels the columns with integers.
df = pd.read_csv(
    'pima-indians-diabetes.csv',
    header=None,
    skiprows=9,
)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [9]:
# Column descriptions

# 1. Number of times pregnant
# 2. Plasma glucose concentration at 2 hours in an oral glucose tolerance test
# 3. Diastolic blood pressure (mm Hg)
# 4. Triceps skin fold thickness (mm)
# 5. 2-Hour serum insulin (mu U/ml)
# 6. Body mass index (weight in kg/(height in m)^2)
# 7. Diabetes pedigree function
# 8. Age (years)
# 9. Class variable (0 or 1)

In [6]:
# Replace the default integer labels with short feature names: eight
# predictors followed by the class label in the last position.
df.columns = [
    'pregnant', 'glucose', 'BP', 'skin',
    'I', 'BMI', 'D', 'AGE', 'Class',
]
df.head()

Unnamed: 0,pregnant,glucose,BP,skin,I,BMI,D,AGE,Class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [12]:
# Check for missing values: count the NaN entries in each column.
df.isna().sum()

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
dtype: int64

In [16]:
# Split the frame into features (every column except the last) and target
# (the last column) as NumPy arrays. to_numpy() is the accessor the pandas
# documentation recommends over .values. Because the feature columns mix
# integers and floats, the feature array takes a common float dtype, so the
# integer columns come out as floats.
X = df.iloc[:, :-1].to_numpy()
y = df.iloc[:, -1].to_numpy()

In [22]:
# Show the feature matrix and the first five target labels.
X, y[:5]

(array([[  6.   , 148.   ,  72.   , ...,  33.6  ,   0.627,  50.   ],
        [  1.   ,  85.   ,  66.   , ...,  26.6  ,   0.351,  31.   ],
        [  8.   , 183.   ,  64.   , ...,  23.3  ,   0.672,  32.   ],
        ...,
        [  5.   , 121.   ,  72.   , ...,  26.2  ,   0.245,  30.   ],
        [  1.   , 126.   ,  60.   , ...,  30.1  ,   0.349,  47.   ],
        [  1.   ,  93.   ,  70.   , ...,  30.4  ,   0.315,  23.   ]]),
 array([1, 0, 1, 0, 1]))

In [23]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing. Passing stratify=y keeps the class
# ratio the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=2022,
)
# Display the shape of each of the four resulting arrays.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((614, 8), (154, 8), (614,), (154,))

In [24]:
# Class distribution of the training labels: unique values and their counts.
np.unique(y_train, return_counts=True)

(array([0, 1]), array([400, 214]))

In [26]:
# Class distribution of the test labels: unique values and their counts.
np.unique(y_test, return_counts=True)

(array([0, 1]), array([100,  54]))

### Decision Tree Classifier를 사용한 머신러닝

In [27]:
from sklearn.tree import DecisionTreeClassifier
# Base decision tree with a fixed seed; every other hyperparameter is left at
# its default.
dtc = DecisionTreeClassifier(random_state=2022)
# List the estimator's hyperparameters to see what can be tuned.
dtc.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': 2022,
 'splitter': 'best'}

In [29]:
# Hyperparameter grid for the decision tree: candidate values 2 through 8 for
# both the maximum depth and the minimum number of samples needed to split.
params = {
    'max_depth': list(range(2, 9)),
    'min_samples_split': list(range(2, 9)),
}

#### GridSearchCV 적용

In [32]:
from sklearn.model_selection import GridSearchCV
# Search the grid in `params` with 5-fold cross-validation, scoring each
# candidate by accuracy.
grid_dt = GridSearchCV(
    estimator=dtc,
    param_grid=params,
    cv=5,
    scoring='accuracy',
)
grid_dt.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=2022),
             param_grid={'max_depth': [2, 3, 4, 5, 6, 7, 8],
                         'min_samples_split': [2, 3, 4, 5, 6, 7, 8]},
             scoring='accuracy')

In [34]:
# Hyperparameter combination selected by the grid search.
grid_dt.best_params_

{'max_depth': 5, 'min_samples_split': 3}

In [36]:
# Take the best estimator found by the grid search and score it on the
# held-out test split.
best_dt = grid_dt.best_estimator_
best_dt.score(X=X_test, y=y_test)

0.6558441558441559

#### 훈련한 모델의 실제 적용

In [37]:
# First test sample: its feature vector and its true label.
X_test[0], y_test[0]

(array([  0.   , 135.   ,  94.   ,  46.   , 145.   ,  40.6  ,   0.284,
         26.   ]), 0)

In [43]:
X_test.shape, X_test[0].shape
# The first is 2-D (the whole test set); the second is 1-D (a single sample).

((154, 8), (8,))

In [47]:
# best_dt.predict(X_test[0]) would raise an error: the input must be 2-D.
# Slicing with 0:1 keeps the first test sample as a single-row 2-D array.
best_dt.predict(X_test[0:1])  # the result is itself an array

array([0])

In [49]:
# Predict the first test sample. predict() needs 2-D input, so the 1-D sample
# is reshaped to a single row; the result is an array of predicted labels.
pred = best_dt.predict(X_test[0].reshape(1, -1))

# Index the single prediction explicitly instead of relying on the truth value
# of the array comparison `pred == 1`, which raises ValueError as soon as more
# than one sample is predicted. Printed labels: '양성' = positive,
# '음성' = negative.
print('양성' if pred[0] == 1 else '음성')

음성
