In [None]:
import numpy as np 
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

<img width="731" alt="스크린샷 2021-08-10 오후 4 22 45" src="https://user-images.githubusercontent.com/60789129/128825601-cb81e837-ee92-4a5d-a356-1a2f9bd5e3d6.png">

In [None]:
# Download the heart-disease CSV used throughout this notebook.
# Fetched over HTTPS (not plain HTTP) so the file cannot be altered in transit.
file_url = "https://storage.googleapis.com/download.tensorflow.org/data/heart.csv"
df = pd.read_csv(file_url)

In [None]:
# Preview the first two rows to check that the CSV was parsed as expected.
df.head(2)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,1,145,233,1,2,150,0,2.3,3,0,fixed,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3,normal,1


In [None]:
# (number of rows, number of columns) of the raw frame.
df.shape

(303, 14)

In [None]:
# Per-column dtype and non-null counts; used to spot non-numeric columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    object 
 13  target    303 non-null    int64  
dtypes: float64(1), int64(12), object(1)
memory usage: 33.3+ KB


In [None]:
# Number of missing values in each column.
df.isna().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [None]:
# Data split (manual pandas approach): sample 20% of the rows for validation
# and keep every remaining row for training.
# NOTE(review): the model below is trained on the train_test_split result
# (x_train / x_val), not on these frames.
val_df = df.sample(frac=0.2, random_state=123)
train_df = df.drop(val_df.index)

In [None]:
# Sanity-check the sizes of the two manually split frames.
print(f"val_df shape : {val_df.shape} , train_df shape : {train_df.shape} ")

val_df shape : (61, 14) , train_df shape : (242, 14) 


In [None]:
# Seed every random generator the notebook relies on. np.random.seed alone does
# not control TensorFlow/Keras randomness (e.g. weight initialisation), so the
# training run would differ on each execution without tf.random.set_seed.
np.random.seed(123)
tf.random.set_seed(123)

In [None]:
# Separate the label column from the manually split frames.
# pop() removes "target" from the frame in place and returns it, so this cell
# cannot be run twice without re-creating val_df / train_df.
# NOTE(review): val_label / train_label are not referenced again in the visible cells.
val_label = val_df.pop("target")
train_label = train_df.pop("target")

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Dataset split the scikit-learn way: an 80/20 split stratified on the label,
# so both parts keep the same class ratio.
x_train, x_val, y_train, y_val = train_test_split(
    df.drop(columns="target"),
    df["target"],
    test_size=0.2,
    stratify=df["target"],
    random_state=123,
)

In [None]:
# Shapes of the training features and training labels.
x_train.shape, y_train.shape

((242, 13), (242,))

In [None]:
# Tell categorical from continuous features: print the number of distinct
# values in each column, followed by a blank line.
for name, n_unique in x_train.nunique().items():
    print(f"{name} : unique num {n_unique} ", end="\n\n")

age : unique num 37 

sex : unique num 2 

cp : unique num 5 

trestbps : unique num 46 

chol : unique num 138 

fbs : unique num 2 

restecg : unique num 3 

thalach : unique num 85 

exang : unique num 2 

oldpeak : unique num 36 

slope : unique num 3 

ca : unique num 4 

thal : unique num 5 



In [None]:
# Columns treated as categorical (few distinct values); these get one-hot encoded.
cat_col = ["sex", "cp", "fbs", "restecg", "exang", "slope", "ca", "thal"]
cat_col

['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

In [None]:
# Continuous columns = every feature that is not categorical.
# Built with an ordered comprehension instead of a set difference: iterating a
# set of strings yields an arbitrary order that can change between interpreter
# sessions, which would silently reorder the scaled feature columns fed to the
# model from one kernel restart to the next.
con_col = [col for col in x_train.columns if col not in cat_col]

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# One-hot encode the categorical columns, fitting on the training data only.
# handle_unknown="ignore" makes transform() encode a category that was never
# seen during fit as an all-zero group instead of raising, so validation rows
# or hand-written samples with a rare value do not crash the notebook.
one_hot = OneHotEncoder(handle_unknown="ignore")
one_hot_encoded = one_hot.fit_transform(x_train[cat_col])

In [None]:
# Convert the encoder output to a dense array to inspect the 0/1 indicators.
one_hot_encoded.toarray()

array([[0., 1., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 1., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 1., 0.]])

In [None]:
# 연속형 scaling
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise the continuous columns. The scaler is fitted on the training data
# here and reused (transform only) on the validation data and on new samples.
scaler = StandardScaler()
scaled = scaler.fit_transform(x_train[con_col])

In [None]:
# Final training matrix: one-hot columns first, then the scaled continuous columns.
# NOTE(review): this rebinds x_train from a DataFrame to an ndarray, so earlier
# cells that select columns with x_train[...] cannot be re-run without first
# re-running the train_test_split cell.
x_train = np.concatenate( [one_hot_encoded.toarray(), scaled], axis=1 )

In [None]:
# Shape of the preprocessed training matrix (rows, encoded feature count).
x_train.shape

(242, 31)

In [None]:
# Illustration of one-hot encoding for the `cp` column:
#
# cp | cp_0  cp_1  cp_2  cp_3
#  0 |   1     0     0     0
#  1 |   0     1     0     0
#  2 |   0     0     1     0
#  3 |   0     0     0     1

SyntaxError: ignored

In [None]:
# Apply the same preprocessing to the validation data, using the encoder and
# scaler already fitted on the training data (transform only, no re-fitting).
# Column layout matches x_train: one-hot columns first, scaled columns after.
scaled_val = scaler.transform(x_val[con_col])
one_hot_encoded_val = one_hot.transform(x_val[cat_col])
x_val = np.hstack((one_hot_encoded_val.toarray(), scaled_val))

In [None]:
# Model: start from an empty Sequential container; layers are added in a later cell.
model = keras.Sequential()

In [None]:
# Re-check the feature count; its second entry is the model's input width.
x_train.shape

(242, 31)

In [None]:
# Network: two ReLU hidden layers (32 and 16 units) and one sigmoid output unit.
# The input width is taken from the preprocessed training matrix.
model.add(layers.Dense(units=32, activation="relu", input_shape=(x_train.shape[1],)))
model.add(layers.Dense(units=16, activation="relu"))
# sigmoid is only used on the output layer of a binary-classification task
model.add(layers.Dense(units=1, activation="sigmoid"))

In [None]:
# Binary classification setup: Adam optimiser, binary cross-entropy loss,
# accuracy reported as the metric.
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [None]:
# Training hyper-parameters: mini-batch size and number of epochs.
batch_size, epoch = 32, 30

In [None]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 32)                1024      
_________________________________________________________________
dense_1 (Dense)              (None, 16)                528       
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 17        
Total params: 1,569
Trainable params: 1,569
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Look at the raw rows again (default: first five) before training.
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,1,145,233,1,2,150,0,2.3,3,0,fixed,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3,normal,1
2,67,1,4,120,229,0,2,129,1,2.6,2,2,reversible,0
3,37,1,3,130,250,0,0,187,0,3.5,3,0,normal,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0,normal,0


In [None]:
# Train on the preprocessed training matrix for `epoch` epochs in mini-batches of
# `batch_size`, passing (x_val, y_val) as validation data.
model.fit(x_train, y_train, epochs=epoch, batch_size=batch_size, validation_data=(x_val, y_val))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7fabd52fc110>

In [None]:
# Test input: one hand-written row with the same feature columns as the dataset
# (no "target"). Each value is wrapped in a list so it becomes a one-row DataFrame.
sample = dict(
    age=[60],
    sex=[1],
    cp=[1],
    trestbps=[145],
    chol=[233],
    fbs=[1],
    restecg=[2],
    thalach=[150],
    exang=[0],
    oldpeak=[2.3],
    slope=[3],
    ca=[0],
    thal=["fixed"],
)

In [None]:
# Turn the hand-written record into a one-row DataFrame so the fitted
# preprocessors can select columns from it by name.
sample_df = pd.DataFrame(sample)

In [None]:
# Display the one-row sample frame.
sample_df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,60,1,1,145,233,1,2,150,0,2.3,3,0,fixed


In [None]:
# Preprocess the sample with the already-fitted encoder and scaler, laid out like
# the training matrix (one-hot columns first, scaled columns after).
# Note: `sample` is rebound here from the input dict to the model-ready array.
sample_scaled = scaler.transform(sample_df[con_col])
sample_one_hot = one_hot.transform(sample_df[cat_col])
sample = np.hstack((sample_one_hot.toarray(), sample_scaled))
sample.shape

(1, 31)

In [None]:
# Run the trained model on the preprocessed sample.
prediction = model.predict(sample)

In [None]:
# Raw model output for the sample.
prediction

array([[0.21310705]], dtype=float32)

In [None]:
# Extract the single scalar (first row, first output unit) from the result array.
prediction[0][0]

0.21310705

In [None]:
# Report the sigmoid output as a percentage with two decimals.
# (The message reads: "Probability of having heart disease".)
print(f"심장병이 있을 확률 : {prediction[0][0] * 100:.2f}%")

심장병이 있을 확률 : 21.31%
