# 피마 인디언 당뇨병 예측

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Colab-only step: open the browser file picker and upload the dataset CSV
# into the runtime's working directory.
# NOTE(review): files.upload() presumably returns a mapping of uploaded
# filenames to their contents — confirm before relying on `up`.
from google.colab import files
up = files.upload()

Saving pima-indians-diabetes.csv to pima-indians-diabetes.csv


- 데이터 전처리

In [3]:
# The first 9 lines of the file are not needed as data, so skip them.
# The file carries no header row, so columns get integer labels for now.
df = pd.read_csv(
    'pima-indians-diabetes.csv',
    header=None,
    skiprows=9,
)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
# Short field names, in file order, and what each one stands for:
#   P    - number of times pregnant
#   G    - plasma glucose concentration at 2 hours in an oral glucose tolerance test
#   BP   - diastolic blood pressure (mm Hg)
#   S    - triceps skin fold thickness (mm)
#   I    - 2-hour serum insulin (mu U/ml)
#   BMI  - body mass index (weight in kg / (height in m)^2)
#   D    - diabetes pedigree function
#   Age  - age (years)
# The ninth and last column is named 'Class'.
field_names = ['P', 'G', 'BP', 'S', 'I', 'BMI', 'D', 'Age', 'Class']
df.columns = field_names
df.head()

Unnamed: 0,P,G,BP,S,I,BMI,D,Age,Class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
# Count missing (NaN) values in each column.
# NOTE(review): isna() only detects NaN; the 0 entries visible in S and I in
# df.head() are not counted — confirm whether zeros encode missing measurements.
df.isna().sum(axis=0)

P        0
G        0
BP       0
S        0
I        0
BMI      0
D        0
Age      0
Class    0
dtype: int64

In [6]:
# Total number of missing values across all columns:
# first count per column, then add the per-column counts together.
missing_per_column = df.isna().sum()
missing_per_column.sum()

0

In [11]:
# Pull the data out as plain NumPy ndarrays. Without the conversion, X would
# remain a DataFrame and y a Series.
X = df.iloc[:, :-1].to_numpy()   # every column except the last -> features
y = df.iloc[:, -1].to_numpy()    # last column -> labels

In [12]:
# X is 2-D (samples x features); y is 1-D (one label per sample).
(X.shape, y.shape)

((768, 8), (768,))

- Train / Test dataset으로 분리

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. stratify=y asks for the class ratio
# to be preserved in both splits, and the fixed random_state makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=2022
)

# The training labels were originally bound to the misspelled name `y_trian`.
# Keep that name as an alias so any cell that still references it keeps working.
y_trian = y_train

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((614, 8), (154, 8), (614,), (154,))

In [14]:
# Class labels present in the training split and how many samples each has.
train_labels, train_counts = np.unique(y_trian, return_counts=True)
train_labels, train_counts

(array([0, 1]), array([400, 214]))

In [15]:
# Class labels present in the test split and how many samples each has.
test_labels, test_counts = np.unique(y_test, return_counts=True)
test_labels, test_counts

(array([0, 1]), array([100,  54]))

- `Decision Tree Classifier`

In [18]:
# Create an unfitted decision tree with a fixed random_state, then list its
# current hyperparameters to see which ones are available for tuning.
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(random_state=2022)
dtc.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': 2022,
 'splitter': 'best'}

In [19]:
# Hyperparameter grid for the search: tree depths 2..6 crossed with
# min_samples_split 2..4 (5 x 3 = 15 combinations).
params = {
    'max_depth': list(range(2, 7)),
    'min_samples_split': list(range(2, 5)),
}

- `GridSearchCV`

In [22]:
from sklearn.model_selection import GridSearchCV

# Search the `params` grid with 5-fold cross-validation, ranking the
# candidates by accuracy.
grid_dt = GridSearchCV(
    estimator=dtc,
    param_grid=params,
    scoring='accuracy',
    cv=5,
)
# Train: fit every candidate on the training split.
grid_dt.fit(X_train, y_trian)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=2022),
             param_grid={'max_depth': [2, 3, 4, 5, 6],
                         'min_samples_split': [2, 3, 4]},
             scoring='accuracy')

In [23]:
# Hyperparameter combination the grid search selected as best
# (scored by accuracy over the 5 CV folds configured above).
grid_dt.best_params_

{'max_depth': 5, 'min_samples_split': 3}

In [25]:
# Take the estimator selected by the grid search and evaluate it on the
# held-out test split.
best_dt = grid_dt.best_estimator_
best_dt.score(X=X_test, y=y_test)

0.6558441558441559

- 실제 적용 

In [26]:
# Look at the first test sample (its 8 feature values) together with its true label.
X_test[0], y_test[0]

(array([  0.   , 135.   ,  94.   ,  46.   , 145.   ,  40.6  ,   0.284,
         26.   ]), 0)

In [28]:
# Demonstration of a common mistake: predict() expects a 2-D array
# (n_samples, n_features), but X_test[0] is a single 1-D sample, so the call
# fails with a dimension mismatch.
# The error is caught and printed so this cell still shows the failure
# without aborting a "Restart & Run All".
try:
    best_dt.predict(X_test[0])
except ValueError as err:
    print(f'{type(err).__name__}: {err}')

ValueError: ignored

In [29]:
# Passing the whole 2-D test matrix has the right dimensionality for predict().
best_dt.predict(X_test)

array([0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0])

In [30]:
# Compare shapes: the full test matrix is 2-D, while a single row is 1-D.
X_test.shape , X_test[0].shape

((154, 8), (8,))

In [31]:
# Turn the 1-D sample into a 2-D array with one row; the -1 tells reshape to
# work out that dimension's size automatically.
X_test[0].reshape(1,-1)

array([[  0.   , 135.   ,  94.   ,  46.   , 145.   ,  40.6  ,   0.284,
         26.   ]])

In [33]:
# Predict for one sample: reshape it to a single-row 2-D array first, then
# report class 1 as positive ('양성') and anything else as negative ('음성').
sample = X_test[0].reshape(1, -1)
pred = best_dt.predict(sample)
if pred[0] == 1:
    print('양성')
else:
    print('음성')

음성
