# 데이터 불러오기

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('data/diabetes_feature.csv')

In [3]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,Pregnancies_high,Age_low,Age_middle,Age_high,Insulin_nan,Insulin_log,low_glu_insulin
0,6,148,72,35,0,33.6,0.627,50,1,False,False,True,False,169.5,5.138735,False
1,1,85,66,29,0,26.6,0.351,31,0,False,False,True,False,102.5,4.639572,True
2,8,183,64,0,0,23.3,0.672,32,1,True,False,True,False,169.5,5.138735,False
3,1,89,66,23,94,28.1,0.167,21,0,False,True,False,False,94.0,4.553877,True
4,0,137,40,35,168,43.1,2.288,33,1,False,False,True,False,168.0,5.129899,False


# 학습과 예측에 사용할 데이터셋 만들기

In [5]:
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome', 'Pregnancies_high',
       'Age_low', 'Age_middle', 'Age_high', 'Insulin_nan', 'Insulin_log',
       'low_glu_insulin'],
      dtype='object')

In [67]:
X = df[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Pregnancies_high',
       'Insulin_nan', 'Insulin_log',
       'low_glu_insulin']]

In [68]:
y = df[['Outcome']]

In [69]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                    test_size=0.2, random_state=42)

In [70]:
X_train.shape, y_test.shape

((614, 11), (154, 1))

train_test_split으로 학습, 예측 셋을 나누면 DataFrame 형태로 만들진다. 근데 y_predict는 array이므로 나중에
y_test - y_predict를 할 때 데이터형태가 달라 연산을 할 수 없다. 그러므로 y_test.iloc[:,0]으로 인덱싱을 하여
series값으로 반환하면 연산을 할 수 있다.

23

In [71]:
X_train

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Pregnancies_high,Insulin_nan,Insulin_log,low_glu_insulin
60,84,0,0,0,0.0,0.304,21,False,102.5,4.639572,True
618,112,82,24,0,28.2,1.282,50,True,169.5,5.138735,False
346,139,46,19,83,28.7,0.654,22,False,83.0,4.430817,False
294,161,50,0,0,21.9,0.254,65,False,102.5,4.639572,False
231,134,80,37,370,46.2,0.238,46,False,370.0,5.916202,False
...,...,...,...,...,...,...,...,...,...,...,...
71,139,64,35,140,28.6,0.411,26,False,140.0,4.948760,False
106,96,122,0,0,22.4,0.207,27,False,102.5,4.639572,True
270,101,86,37,0,45.6,1.136,38,True,169.5,5.138735,False
435,141,0,0,0,42.4,0.205,29,False,169.5,5.138735,False


# 학습과 예측하기

In [72]:
from sklearn.tree import DecisionTreeClassifier

In [82]:
# GrivdSearchCV 사용하기 
from sklearn.model_selection import GridSearchCV

model = DecisionTreeClassifier(random_state=42)
param_grid = {'max_depth': range(3,12), 
              'max_features': [0.3, 0.5, 0.7, 0.9 ,1]}
clf = GridSearchCV(model, param_grid=param_grid, n_jobs=-1, cv=5, verbose=2)
clf.fit(X_train, y_train)

Fitting 5 folds for each of 45 candidates, totalling 225 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 202 out of 225 | elapsed:    1.8s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 225 out of 225 | elapsed:    1.8s finished


GridSearchCV(cv=5, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=42,
                                              splitter='best'),
             iid='deprecated', n_jobs=-1,
             param_grid={'max_depth': range(3, 12),
                         'max_

In [83]:
clf.best_estimator_

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=4, max_features=0.9, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=42, splitter='best')

In [84]:
clf.best_params_

{'max_depth': 4, 'max_features': 0.9}

In [85]:
clf.best_score_

0.8974010395841663

In [89]:
pd.DataFrame(clf.cv_results_).sort_values(by='rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_max_features,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
8,0.002992,4.370285e-07,0.000997,6.289914e-07,4,0.9,"{'max_depth': 4, 'max_features': 0.9}",0.878049,0.934959,0.878049,0.894309,0.901639,0.897401,0.020915,1
2,0.007382,0.001491159,0.001594,0.0004877478,3,0.7,"{'max_depth': 3, 'max_features': 0.7}",0.878049,0.926829,0.837398,0.886179,0.893443,0.88438,0.028756,2
3,0.007179,0.0003991856,0.002193,0.0003995073,3,0.9,"{'max_depth': 3, 'max_features': 0.9}",0.878049,0.918699,0.837398,0.894309,0.893443,0.88438,0.026855,3
6,0.003989,0.001092841,0.000798,0.000398995,4,0.5,"{'max_depth': 4, 'max_features': 0.5}",0.829268,0.902439,0.845528,0.886179,0.909836,0.87465,0.031783,4
24,0.004409,0.000787806,0.000997,4.370285e-07,7,1.0,"{'max_depth': 7, 'max_features': 1}",0.853659,0.894309,0.853659,0.861789,0.901639,0.873011,0.020728,5
18,0.005187,0.0003983753,0.001396,0.0004888509,6,0.9,"{'max_depth': 6, 'max_features': 0.9}",0.845528,0.926829,0.837398,0.878049,0.877049,0.872971,0.031501,6
1,0.006383,0.001197322,0.001198,0.000398046,3,0.5,"{'max_depth': 3, 'max_features': 0.5}",0.869919,0.902439,0.837398,0.878049,0.868852,0.871331,0.020844,7
33,0.005688,0.001243727,0.001397,0.0004885391,9,0.9,"{'max_depth': 9, 'max_features': 0.9}",0.845528,0.902439,0.821138,0.902439,0.877049,0.869719,0.032063,8
13,0.00379,0.0007459453,0.001197,0.0003989939,5,0.9,"{'max_depth': 5, 'max_features': 0.9}",0.837398,0.918699,0.837398,0.894309,0.836066,0.864774,0.034938,9
10,0.002593,0.0004888502,0.000599,0.0004887335,5,0.3,"{'max_depth': 5, 'max_features': 0.3}",0.845528,0.878049,0.813008,0.886179,0.893443,0.863241,0.029982,10


In [90]:
clf.predict(X_test)

array([1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0],
      dtype=int64)

In [91]:
clf.score(X_test, y_test)

0.8961038961038961

In [94]:
# 이번에는 RandomizeSearchCV로 modeling 해보기
max_depth = np.random.randint(3,20,10)

max_features = np.random.uniform(0.7, 1.0, 100)

param_distributions = {'max_depth': max_depth,
                      'max_features': max_features,
                      'min_samples_split': list(range(2,7))}
param_distributions

{'max_depth': array([17,  7, 17, 17, 16,  7, 18, 18, 15,  4]),
 'max_features': array([0.86067723, 0.72554256, 0.84565396, 0.84743666, 0.91396942,
        0.8018118 , 0.95117247, 0.8609665 , 0.82256233, 0.87764788,
        0.911191  , 0.97083342, 0.93385468, 0.96705664, 0.70072276,
        0.71523681, 0.90164018, 0.983093  , 0.8103865 , 0.80066801,
        0.86683923, 0.82909686, 0.80100041, 0.70310312, 0.84726583,
        0.94851661, 0.98045238, 0.83381846, 0.8169087 , 0.83557521,
        0.92866423, 0.71500347, 0.74524185, 0.82352026, 0.79189299,
        0.97026609, 0.95303562, 0.89996097, 0.70152193, 0.87272089,
        0.97712011, 0.86294942, 0.72659679, 0.86314362, 0.80508684,
        0.87351635, 0.95735487, 0.81579358, 0.96203259, 0.82303778,
        0.72616047, 0.7948319 , 0.78605751, 0.73206343, 0.91516895,
        0.73601645, 0.96668614, 0.88595689, 0.74423361, 0.89640369,
        0.93436256, 0.7107648 , 0.73870643, 0.90998119, 0.86227275,
        0.91377948, 0.77658203, 0.880

In [96]:
from sklearn.model_selection import RandomizedSearchCV

clf = RandomizedSearchCV(model,
                        param_distributions = param_distributions,
                        n_iter=1000,
                        scoring='accuracy',
                        n_jobs=-1,
                        cv=5,
                        random_state=42)

clf.fit(X_train, y_train)

RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features=None,
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    presort='deprecated',
                                                    random_state=42,
            

In [97]:
clf.best_params_

{'min_samples_split': 3, 'max_features': 0.8497728024886211, 'max_depth': 4}

In [98]:
clf.best_score_

0.8974010395841663

In [81]:
from sklearn.metrics import accuracy_score

for max_depth in range(3, 12):
    model = DecisionTreeClassifier(max_depth=max_depth, random_state=42)
    y_predict = model.fit(X_train, y_train).predict(X_test)
    score = accuracy_score(y_test, y_predict) * 100
    print(max_depth, score)

3 86.36363636363636
4 88.31168831168831
5 85.71428571428571
6 85.71428571428571
7 85.06493506493507
8 86.36363636363636
9 85.71428571428571
10 85.06493506493507
11 85.06493506493507


In [58]:
# model

# model.fit(X_train, y_train)

# y_predict = model.predict(X_test)
# y_predict

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=11, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=42, splitter='best')

In [61]:
# y_predict = pd.Series(y_predict)

# 정답률 구하기

In [62]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_predict)

0.8506493506493507