# 피마 인디언 당뇨병 예측

In [2]:
import pandas as pd
import numpy as np

In [2]:
# Load the Pima Indians diabetes data. The first 9 lines of the file are skipped
# (presumably a descriptive preamble -- verify against the file) and the nine
# columns are named explicitly.
column_names = [
    'times pregnant', '포도당 농도', '혈압', '피부두께', '2시간 혈청 인슐린',
    '체질량지수', '당뇨병혈통기능', '나이', '당뇨여부',
]
df = pd.read_csv(
    'pima-indians-diabetes.csv',
    sep=',',
    skiprows=9,
    names=column_names,
)
df

Unnamed: 0,times pregnant,포도당 농도,혈압,피부두께,2시간 혈청 인슐린,체질량지수,당뇨병혈통기능,나이,당뇨여부
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [26]:
# Reload the same file, this time labelling the nine columns with the integers 1..9.
df = pd.read_csv(
    'pima-indians-diabetes.csv',
    names=range(1, 10),
    skiprows=9,
)
df

Unnamed: 0,1,2,3,4,5,6,7,8,9
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [17]:
# Feature columns only. When df is loaded with names=range(1, 10) the column labels
# are the integers 1..9, so the label slice needs integer bounds; .loc slices are
# inclusive, so 1:8 selects the eight feature columns and leaves out target column 9.
# Positional equivalents: df.iloc[:, :8] or df.iloc[:, :-1].
df.loc[:, 1:8]

Unnamed: 0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63
764,2,122,70,27,0,36.8,0.340,27
765,5,121,72,23,112,26.2,0.245,30
766,1,126,60,0,0,30.1,0.349,47


In [30]:
from sklearn.model_selection import train_test_split
# Stratified 80/20 split. Columns 1..8 are the features and column 9 is the target;
# the labels are integers (names=range(1, 10)), so the .loc slice uses integer bounds.
X_train, X_test, y_train, y_test = train_test_split(
    df.loc[:, 1:8], df[9], random_state=2021,
    test_size=0.2, stratify=df[9]
)

In [31]:
import numpy as np

In [32]:
# Distinct class labels in the training target together with how often each occurs.
np.unique(y_train, return_counts=True)

(array([0, 1], dtype=int64), array([400, 214], dtype=int64))

In [33]:
# Shapes of the four parts produced by the split, in the order
# X_train, X_test, y_train, y_test.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((614, 8), (154, 8), (614,), (154,))

In [34]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree classifier with a fixed random_state for reproducible results.
dtc = DecisionTreeClassifier(random_state=2021)
# Show the hyperparameters of the not-yet-fitted estimator.
dtc.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': 2021,
 'splitter': 'best'}

In [35]:
# Train the decision tree on the training split.
dtc.fit(X=X_train, y=y_train)

DecisionTreeClassifier(random_state=2021)

In [36]:
# Predicted class labels for the held-out test samples.
pred = dtc.predict(X=X_test)

In [37]:
from sklearn.metrics import accuracy_score

# Fraction of test samples whose predicted label equals the true label.
accuracy_score(y_true=y_test, y_pred=pred)

0.7077922077922078

In [38]:
# Score the fitted tree on the test split in a single call (predict + compare).
dtc.score(X=X_test, y=y_test)

0.7077922077922078

In [39]:
from sklearn.model_selection import GridSearchCV

In [40]:
# Hyperparameter grid for the search: every max_depth value is combined with
# every min_samples_split value.
params = dict(
    max_depth=[2, 3, 4, 5, 6],
    min_samples_split=[2, 3, 4],
)

In [41]:
# 3-fold cross-validated grid search over `params`, ranked by accuracy.
grid_dt = GridSearchCV(
    estimator=dtc,
    param_grid=params,
    scoring='accuracy',
    cv=3,
)
grid_dt.fit(X=X_train, y=y_train)

GridSearchCV(cv=3, estimator=DecisionTreeClassifier(random_state=2021),
             param_grid={'max_depth': [2, 3, 4, 5, 6],
                         'min_samples_split': [2, 3, 4]},
             scoring='accuracy')

In [42]:
# Best cross-validation score (scoring='accuracy') found by the grid search.
grid_dt.best_score_

0.7443328550932568

In [43]:
# Parameter combination from `params` that achieved the best cross-validation score.
grid_dt.best_params_

{'max_depth': 3, 'min_samples_split': 2}

In [44]:
# Keep the estimator that the grid search selected as best.
best_dt = grid_dt.best_estimator_

In [45]:
# Evaluate the tuned tree on the held-out test split.
best_dt.score(X=X_test, y=y_test)

0.7142857142857143

### 수업

In [37]:
# With header=None pandas fills in the column labels with integers starting at 0.
df = pd.read_csv(
    'pima-indians-diabetes.csv',
    header=None,
    skiprows=9,
)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [35]:
# Work on a copy: with a plain alias (pdf = df) the assignment to pdf.columns would
# also rename the columns of df and break code that still expects df's integer
# column labels.
pdf = df.copy()
# NOTE(review): judging by the Korean names used in the first load cell, the second
# column is the glucose concentration and the fifth is the insulin level, so the
# 'I' and 'G' labels look swapped -- confirm before relying on them.
pdf.columns = ['P', 'I', 'BP', 'ST', 'G', 'BMI', 'DPF', 'A', 'Class']
pdf.head()

Unnamed: 0,P,I,BP,ST,G,BMI,DPF,A,Class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [38]:
# Features: every column except the last; target: the last column.
# Both are selected by position so the cell does not depend on the column labels
# (a label lookup such as df[8] fails once the columns have been renamed).
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
X.shape, y.shape

((768, 8), (768,))

In [39]:
# Check the container type of the target selected above.
type(y)

pandas.core.series.Series

In [40]:
# Same selection as NumPy arrays (.values drops the pandas index and labels).
# Both are selected by position so the cell does not depend on the column labels
# (a label lookup such as df[8] fails once the columns have been renamed).
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values
X.shape, y.shape

((768, 8), (768,))

In [41]:
# Check the container type of the target after the .values conversion.
type(y)

numpy.ndarray

In [42]:
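# Passing pandas objects vs. NumPy arrays (via .values) to train_test_split:
# - A Series keeps its original index labels after the train/test split, so the
#   labels of the resulting parts are no longer 0, 1, 2, ...
# - A NumPy array is split into new arrays that are indexed purely by position.
#   In other words, positional indexing such as y_test[33] should be done on NumPy arrays.
# At this stage the difference hardly matters, but remember in which form the
# training and test sets were created.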

In [43]:
# Distinct class labels in the full target together with how often each occurs.
np.unique(y, return_counts=True)

(array([0, 1], dtype=int64), array([500, 268], dtype=int64))

In [44]:
from sklearn.model_selection import train_test_split

# Stratified split on y: 20% of the samples go to the test set, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2021,
    stratify=y,
)

In [45]:
# Class labels and their counts in the training target after the stratified split.
np.unique(y_train, return_counts=True)

(array([0, 1], dtype=int64), array([400, 214], dtype=int64))

- Model 생성 및 학습

In [46]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree classifier with a fixed random_state for reproducible results.
dtc = DecisionTreeClassifier(random_state=2021)

In [47]:
# Train the decision tree on the training split.
dtc.fit(X=X_train, y=y_train)

DecisionTreeClassifier(random_state=2021)

- 예측 및 평가

In [48]:
# Predicted class labels for the held-out test samples.
pred = dtc.predict(X=X_test)

In [49]:
from sklearn.metrics import accuracy_score

# Fraction of test samples whose predicted label equals the true label.
accuracy_score(y_true=y_test, y_pred=pred)

0.7077922077922078

In [50]:
# Score the fitted tree on the test split in a single call (predict + compare).
dtc.score(X=X_test, y=y_test)

0.7077922077922078

In [None]:
# Recap of the whole workflow in a single cell.
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Preprocessing: stratified 80/20 split into training and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=2021
)
# A notebook cell only displays its last expression, so intermediate results
# are printed explicitly instead of being silently discarded.
print(np.unique(y_train, return_counts=True))

# Model creation and training.
dtc = DecisionTreeClassifier(random_state=2021)
dtc.fit(X_train, y_train)

# Prediction and evaluation.
pred = dtc.predict(X_test)
print(accuracy_score(y_test, pred))
dtc.score(X_test, y_test)

- 최적의 하이퍼파라미터 도출 및 교차 검증

In [51]:
from sklearn.model_selection import GridSearchCV

In [53]:
# Hyperparameter grid for the search: every max_depth value is combined with
# every min_samples_split value.
params = dict(
    max_depth=[2, 4, 6],
    min_samples_split=[2, 4, 6],
)

In [54]:
from sklearn.model_selection import GridSearchCV

# 3-fold cross-validated grid search over `params`, ranked by accuracy.
grid_dt = GridSearchCV(
    estimator=dtc,
    param_grid=params,
    scoring='accuracy',
    cv=3,
)
grid_dt.fit(X=X_train, y=y_train)

GridSearchCV(cv=3, estimator=DecisionTreeClassifier(random_state=2021),
             param_grid={'max_depth': [2, 4, 6],
                         'min_samples_split': [2, 4, 6]},
             scoring='accuracy')

In [55]:
# Parameter combination from `params` that achieved the best cross-validation score.
grid_dt.best_params_

{'max_depth': 2, 'min_samples_split': 2}

In [56]:
# Take the estimator selected by the grid search and evaluate it on the
# held-out test split.
best_dt = grid_dt.best_estimator_
best_dt.score(X=X_test, y=y_test)

0.7337662337662337

- 실제 값 하나가 주어졌을 때 당뇨병 여부를 확인하는 법

In [57]:
# True class label of the test sample at position 33.
y_test[33]

0

In [58]:
# Feature vector (8 values) of the same test sample.
X_test[33]

array([  0.   , 126.   ,  86.   ,  27.   , 120.   ,  27.4  ,   0.515,
        21.   ])

In [59]:
# Pick one test sample to demonstrate a single-record prediction.
test_data = X_test[33]

In [63]:
# predict() expects a 2-D array of shape (n_samples, n_features); passing the 1-D
# sample directly (best_dt.predict(test_data)) raises an error. reshape(1, -1) turns
# the sample into a single row without hard-coding the number of features.
result = best_dt.predict(test_data.reshape(1, -1))[0]
# Class 0 is reported as '음성' (negative), anything else as '양성' (positive).
print('음성' if result == 0 else '양성')

음성
