In [2]:
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv('./data/diabetes.csv')

In [4]:
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [5]:
df['Outcome'].unique()

array([1, 0], dtype=int64)

In [6]:
df['Outcome'].value_counts() # 이미 숫자형이라 LabelEncoder 활용할 필요가 없음.

0    500
1    268
Name: Outcome, dtype: int64

In [8]:
df_target = df.pop('Outcome') # 날리는 데이터를 df_target으로 보냄
df_target

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64

계층적 데이터 추출옵션(분류모델에서 추천됨)

여러층으로 분할 후 각 층별로 랜덤 데이터를 추출, 

원래 데이터의 분포와 유사하게 데이터를 추출하는 방식

In [67]:
x_train, x_test, y_train, y_test = train_test_split(
    df, df_target, stratify=df_target,  test_size=0.2, random_state=42
) # staratify

In [68]:
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, stratify=y_train,  test_size=0.25, random_state=42
)

In [69]:
print (x_train.shape, y_train.shape)
print (x_test.shape, y_test.shape) # 768의 0.2
print (x_val.shape, y_val.shape) # 768의 0.25

(460, 8) (460,)
(154, 8) (154,)
(154, 8) (154,)


In [70]:
y_train.value_counts()

0    300
1    160
Name: Outcome, dtype: int64

In [71]:
y_test.value_counts()

0    100
1     54
Name: Outcome, dtype: int64

In [72]:
x_train.value_counts()

Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin  BMI   DiabetesPedigreeFunction  Age
0            74       52             10             36       27.8  0.269                     22     1
5            116      74             0              0        25.6  0.201                     30     1
             115      76             0              0        31.2  0.343                     44     1
             114      74             0              0        24.9  0.744                     57     1
             112      66             0              0        37.8  0.261                     41     1
                                                                                                   ..
2            68       70             32             66       25.0  0.187                     25     1
             56       56             28             45       24.2  0.332                     22     1
1            181      78             42             293      40.0  1.258               

In [73]:
x_test.value_counts()

Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin  BMI   DiabetesPedigreeFunction  Age
0            57       60             0              0        21.7  0.735                     67     1
5            124      74             0              0        34.0  0.220                     38     1
4            154      62             31             284      32.8  0.237                     23     1
             171      72             0              0        43.6  0.479                     26     1
             184      78             39             277      37.0  0.264                     31     1
                                                                                                   ..
2            88       74             19             53       29.0  0.229                     22     1
             90       68             42             0        38.2  0.503                     27     1
             91       62             0              0        27.3  0.525               

### MinMaxScaler

In [74]:
from sklearn.preprocessing import MinMaxScaler

In [75]:
scaler = MinMaxScaler()
scaler.fit(x_train)

MinMaxScaler()

In [76]:
x_train_scale = scaler.transform(x_train)
x_val_scale = scaler.transform(x_val)
x_train_scale.shape

(460, 8)

### Drop out

In [147]:
model = tf.keras.models.Sequential([
      tf.keras.layers.Flatten(input_shape=[ x_train_scale.shape[1] ]),
      tf.keras.layers.Dense(60, activation='relu'),
      tf.keras.layers.Dropout(0.3), # 70퍼센트만
      tf.keras.layers.Dense(30, activation='relu'),
      tf.keras.layers.Dense(1, activation='sigmoid')
      ])

#optimizer = tf.keras.optimizers.RMSprop(0.001)

model.compile(loss='binary_crossentropy',
            optimizer='adam',
            metrics=['accuracy'])

In [148]:
model.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_8 (Flatten)          (None, 8)                 0         
_________________________________________________________________
dense_24 (Dense)             (None, 60)                540       
_________________________________________________________________
dropout_8 (Dropout)          (None, 60)                0         
_________________________________________________________________
dense_25 (Dense)             (None, 30)                1830      
_________________________________________________________________
dense_26 (Dense)             (None, 1)                 31        
Total params: 2,401
Trainable params: 2,401
Non-trainable params: 0
_________________________________________________________________


In [149]:
model.fit(x_train_scale, y_train, epochs=50, validation_data=(x_val_scale, y_val))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x15263bfbc88>

In [150]:
model.save('saved_model/my_model') # 모델 저장

INFO:tensorflow:Assets written to: saved_model/my_model\assets


In [151]:
x_test_scale = scaler.transform(x_test)
model.evaluate(x_test_scale,  y_test)



[0.4736891984939575, 0.7532467246055603]

In [152]:
model.predict_classes(x_test_scale)



array([[1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
    

In [153]:
x_test_scale[0]

array([0.5       , 0.8071066 , 0.56140351, 0.        , 0.        ,
       0.40834575, 0.0935412 , 0.31666667])

### 오차행렬

In [154]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

In [155]:
confusion_matrix(y_test ,model.predict_classes(x_test_scale) )

array([[86, 14],
       [24, 30]], dtype=int64)

In [156]:
accuracy_score(y_test,model.predict_classes(x_test_scale) )



0.7532467532467533

In [157]:
# 정밀도
# 예측을 Positive로 한 대상 중에 예측과 실제 값이 Positive로 일치한 데이터의 비율
precision_score(y_test ,model.predict_classes(x_test_scale) )



0.6818181818181818

In [158]:
# 재현율
# 실제 값이 Positive인 대상 중에 예측과 실제 값이 Positive로 일치한 데이터의 비율
recall_score(y_test,model.predict_classes(x_test_scale) )



0.5555555555555556

### 정밀도와 재현율 트레이드 오프

In [159]:
model.predict_proba(x_test_scale)



array([[0.6531359 ],
       [0.18741041],
       [0.2476933 ],
       [0.40174642],
       [0.0422397 ],
       [0.23059002],
       [0.3385483 ],
       [0.8783926 ],
       [0.09169453],
       [0.71856856],
       [0.24895132],
       [0.78860044],
       [0.16222593],
       [0.14302361],
       [0.36909896],
       [0.3798227 ],
       [0.49417904],
       [0.11836416],
       [0.8394594 ],
       [0.37807548],
       [0.22720891],
       [0.76192844],
       [0.19592416],
       [0.86748904],
       [0.4185335 ],
       [0.0879384 ],
       [0.6832818 ],
       [0.07094657],
       [0.33930945],
       [0.05767778],
       [0.05463523],
       [0.07311481],
       [0.46747744],
       [0.5142867 ],
       [0.7151557 ],
       [0.13739127],
       [0.26998895],
       [0.08453196],
       [0.4911453 ],
       [0.508893  ],
       [0.44859233],
       [0.30542856],
       [0.11735594],
       [0.39652905],
       [0.1600163 ],
       [0.31907874],
       [0.23308364],
       [0.129

In [160]:
from sklearn.preprocessing import Binarizer

In [161]:
custom_threshold = 0.5

In [162]:
pred_proba_1 = model.predict_proba(x_test_scale)[:,-1].reshape(-1,1)
binarizer = Binarizer(threshold=custom_threshold).fit(pred_proba_1)
custom_predict = binarizer.transform(pred_proba_1)

In [163]:
confusion_matrix(y_test, custom_predict )

array([[86, 14],
       [24, 30]], dtype=int64)

In [164]:
for thre in [0.5, 0.4, 0.3, 0.2]:
    custom_threshold = thre
    pred_proba_1 = model.predict_proba(x_test_scale)[:,-1].reshape(-1,1)
    binarizer = Binarizer(threshold=custom_threshold).fit(pred_proba_1)
    custom_predict = binarizer.transform(pred_proba_1)
    print (confusion_matrix(y_test, custom_predict ))
    print (precision_score(y_test, custom_predict ))
    print (recall_score(y_test, custom_predict ))

[[86 14]
 [24 30]]
0.6818181818181818
0.5555555555555556
[[77 23]
 [18 36]]
0.6101694915254238
0.6666666666666666
[[68 32]
 [ 8 46]]
0.5897435897435898
0.8518518518518519
[[50 50]
 [ 3 51]]
0.504950495049505
0.9444444444444444




In [165]:
from sklearn.metrics import precision_recall_curve
import numpy as np

In [166]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker


def precision_recall_curve_plot(y_test , pred_proba_c1):
    # threshold ndarray와 이 threshold에 따른 정밀도, 재현율 ndarray 추출. 
    precisions, recalls, thresholds = precision_recall_curve( y_test, pred_proba_c1)
    
    # X축을 threshold값으로, Y축은 정밀도, 재현율 값으로 각각 Plot 수행. 정밀도는 점선으로 표시
    plt.figure(figsize=(8,6))
    threshold_boundary = thresholds.shape[0]
    plt.plot(thresholds, precisions[0:threshold_boundary], linestyle='--', label='precision')
    plt.plot(thresholds, recalls[0:threshold_boundary],label='recall')
    
    # threshold 값 X 축의 Scale을 0.1 단위로 변경
    start, end = plt.xlim()
    plt.xticks(np.round(np.arange(start, end, 0.1),2))
    
    # x축, y축 label과 legend, 그리고 grid 설정
    plt.xlabel('Threshold value'); plt.ylabel('Precision and Recall value')
    plt.legend(); plt.grid()
    plt.show()

In [167]:
# 정밀도와 재현율을 결합한 f1스코어
precision_recall_curve_plot( y_test, model.predict_proba(x_test_scale)[:, 1] )



IndexError: index 1 is out of bounds for axis 1 with size 1