In [1]:
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.font_manager as fm
import matplotlib.pyplot as plt
import seaborn as sns

# Hide warning messages in cell outputs.
warnings.filterwarnings(action="ignore")

# matplotlib does not render Korean text with its default font, so switch to
# Malgun Gothic when the font file is present. The path only exists on
# Windows; without this guard the font lookup fails on other machines and the
# notebook stops at its very first cell.
KOREAN_FONT_PATH = Path("C:/Windows/Fonts/malgun.ttf")

if KOREAN_FONT_PATH.is_file():
    # Resolve the family name stored in the font file and make it the default.
    font_name = fm.FontProperties(fname=str(KOREAN_FONT_PATH)).get_name()
    plt.rc("font", family=font_name)
else:
    # Keep matplotlib's current default family; Korean labels may not render.
    font_name = mpl.rcParams["font.family"][0]
    print(f"Korean font not found at {KOREAN_FONT_PATH}; keeping default font '{font_name}'.")

# With a Korean font active the minus sign can show up as a broken glyph;
# use the plain ASCII hyphen for negative tick labels instead.
mpl.rcParams["axes.unicode_minus"] = False

### 관상동맥 위험 예측

* https://www.kaggle.com/jiantay33/ten-year-coronary-risk-prediction

In [96]:
# Heart-disease dataset. Column notes carried over from the original author:
#   male            : sex
#   age             : age
#   education       : highest education level
#   currentSmoker   : whether the person has smoked recently
#   cigsPerDay      : number of cigarettes smoked per day
#   BPMeds          : whether blood-pressure medication is taken
#   prevalentStroke : history of stroke
#   prevalentHyp    : history of hypertension
#   diabetes        : history of diabetes
#   totChol         : cholesterol level
#   sysBP           : systolic blood pressure
#   diaBP           : diastolic blood pressure
#   BMI             : body mass index
#   heartRate       : heart-rate reading
#   glucose         : glucose level
#   TenYearCHD      : whether coronary heart disease risk materialises within
#                     the next 10 years (the prediction target)

# Path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer="./data/coronary_prediction.csv")

# Preview the first five rows.
data.head(n=5)

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [97]:
# Candidate feature columns (everything except the TenYearCHD target),
# using the post-rename name "sex" for the original "male" column.
columns = [
    "sex",
    "age",
    "education",
    "currentSmoker",
    "cigsPerDay",
    "BPMeds",
    "prevalentStroke",
    "prevalentHyp",
    "diabetes",
    "totChol",
    "sysBP",
    "diaBP",
    "BMI",
    "heartRate",
    "glucose",
]

In [98]:
# Work on a renamed copy so the raw `data` frame stays untouched.
# Only the column label changes; the stored values are not altered.
renamed_columns = {"male": "sex"}
df = data.rename(columns=renamed_columns)

In [99]:
#### Check missing values
# Rows containing any missing value are simply dropped for convenience.
# (To inspect the per-column counts before dropping, display df.isna().sum()
# in its own cell — as a non-final expression it shows nothing.)
df.dropna(inplace=True)

# Per-column missing counts after the drop
remaining_nulls = df.isna().sum()
print(remaining_nulls)

# Number of rows that survived the drop
print(len(df))

sex                0
age                0
education          0
currentSmoker      0
cigsPerDay         0
BPMeds             0
prevalentStroke    0
prevalentHyp       0
diabetes           0
totChol            0
sysBP              0
diaBP              0
BMI                0
heartRate          0
glucose            0
TenYearCHD         0
dtype: int64
3656


In [100]:
#### Coronary heart disease (CHD) risk by sex
# NOTE(review): "sex" is the renamed "male" indicator column, so presumably
# 1 = male and 0 = female — confirm against the dataset description. The
# earlier notes in this cell (and a disabled label mapping) had the two
# labels swapped; any relabelling should map 0 -> female, 1 -> male.

df["sex"].value_counts() # recorded run: sex=0 -> 2034 rows / sex=1 -> 1622 rows (not displayed: not the last expression)

df.groupby("sex")["TenYearCHD"].value_counts() # recorded run: 250 of the 2034 sex=0 rows and 307 of the 1622 sex=1 rows are CHD-positive

sex  TenYearCHD
0    0             1784
     1              250
1    0             1315
     1              307
Name: TenYearCHD, dtype: int64

In [101]:
#### CHD by smoking status
# Group sizes for the recent-smoker flag.
# Recorded run: 0 (has not smoked) -> 1868 rows / 1 (has smoked) -> 1788 rows
smoker_counts = df["currentSmoker"].value_counts()

# TenYearCHD outcome counts within each smoking group (displayed as the cell result)
chd_by_smoking = df.groupby("currentSmoker")["TenYearCHD"].value_counts()
chd_by_smoking

currentSmoker  TenYearCHD
0              0             1596
               1              272
1              0             1503
               1              285
Name: TenYearCHD, dtype: int64

In [102]:
#### Check correlations
# Rules of thumb from the original notes:
#   - a coefficient of 0.3 or more is treated as "related"
#   - negative correlations are set aside
#   - columns whose coefficient does not exceed 0.1 are removed
target = df["TenYearCHD"]
for col in columns:
    # Row 0 of the 2x2 correlation matrix: [1.0, r(col, TenYearCHD)]
    print("col : ", col, np.corrcoef(df[col], target)[0])

print("-"*70)

# Columns to remove
del_cols = [
    "sex",
    "education",
    "currentSmoker",
    "cigsPerDay",
    "BPMeds",
    "prevalentStroke",
    "diabetes",
    "totChol",
    "BMI",
    "heartRate",
]

# Remove them from df itself (same frame object, as the original del loop did).
df.drop(columns=del_cols, inplace=True)

col :  sex [1.         0.09174489]
col :  age [1.         0.23381045]
col :  education [ 1.         -0.06306773]
col :  currentSmoker [1.        0.0191762]
col :  cigsPerDay [1.         0.05215873]
col :  BPMeds [1.        0.0891157]
col :  prevalentStroke [1.         0.04835057]
col :  prevalentHyp [1.        0.1815564]
col :  diabetes [1.         0.09339742]
col :  totChol [1.         0.09112675]
col :  sysBP [1.         0.22288534]
col :  diaBP [1.         0.15034173]
col :  BMI [1.         0.08193118]
col :  heartRate [1.         0.02052342]
col :  glucose [1.         0.12194204]
----------------------------------------------------------------------


In [103]:
# Preview the frame after the low-correlation columns were removed.
df.head(n=5)

Unnamed: 0,age,prevalentHyp,sysBP,diaBP,glucose,TenYearCHD
0,39,0,106.0,70.0,77.0,0
1,46,0,121.0,81.0,76.0,0
2,48,0,127.5,80.0,70.0,0
3,61,1,150.0,95.0,103.0,1
4,46,0,130.0,84.0,85.0,0


In [104]:
#### Blood pressure vs. CHD
by_chd = df.groupby(["TenYearCHD"])

# Mean sysBP (pressure while the heart contracts) per outcome class
print(by_chd["sysBP"].mean())
print("-"*70)
# Mean diaBP (pressure while the heart relaxes) per outcome class
print(by_chd["diaBP"].mean())

TenYearCHD
0    130.280736
1    143.981149
Name: sysBP, dtype: float64
----------------------------------------------------------------------
TenYearCHD
0    82.148919
1    87.157989
Name: diaBP, dtype: float64


In [105]:
# Hypertension history vs. CHD: summing the prevalentHyp flag per outcome
# class gives the total of that flag within each class.
hyp_by_chd = df.groupby(["TenYearCHD"])["prevalentHyp"].sum()
hyp_by_chd

TenYearCHD
0    855
1    284
Name: prevalentHyp, dtype: int64

In [121]:
# CHD cases by age decade
print(df["age"].max())  # oldest participant (recorded run: 70)
print(df["age"].min())  # youngest participant (recorded run: 32)

# Left-closed decade bins: [30, 40), [40, 50), [50, 60), [60, 70), [70, 80).
# The previous right-closed edges (30, 39], (39, 49], ... left age 30 without
# a label and would place a fractional age such as 39.5 in the next decade.
# For the integer ages 32-70 in this data the assignment is unchanged.
age_bins = [30, 40, 50, 60, 70, 80]
age_labels = ["30대","40대","50대","60대","70대"]
df["age_range"] = pd.cut(df["age"], bins=age_bins, labels=age_labels, right=False)

# Number of CHD-positive rows in each decade
print(df.groupby(["age_range"])["TenYearCHD"].sum())

# Row count is unaffected by adding the column
print(len(df))

df.head()

70
32
age_range
30대     21
40대    138
50대    230
60대    168
70대      0
Name: TenYearCHD, dtype: int64
3656


Unnamed: 0,age,prevalentHyp,sysBP,diaBP,glucose,TenYearCHD,age_range
0,39,0,106.0,70.0,77.0,0,30대
1,46,0,121.0,81.0,76.0,0,40대
2,48,0,127.5,80.0,70.0,0,40대
3,61,1,150.0,95.0,103.0,1,60대
4,46,0,130.0,84.0,85.0,0,40대


#### 학습

In [123]:
## Normalization - Data preprocessing

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [129]:
# Min-max scale every numeric column to the [0, 1] range.
#
# "age_range" holds categorical string labels that MinMaxScaler cannot convert
# to float. This cell used to pick up every column of df, so it only ran if
# that column had been deleted by hand beforehand and failed on a fresh
# Restart & Run All. Selecting numeric columns leaves df intact and keeps the
# cell re-runnable.
#
# NOTE(review): the scaler is fit on the full dataset before the train/test
# split, so test rows influence the scaling; consider fitting on the training
# rows only. The 0/1 TenYearCHD target is included so downstream cells can
# keep reading it from df_scaled.
scaler = MinMaxScaler()
scaled_cols = list(df.select_dtypes(include="number").columns)
df_scaled = pd.DataFrame(scaler.fit_transform(df[scaled_cols]), columns=scaled_cols)

Unnamed: 0,age,prevalentHyp,sysBP,diaBP,glucose,TenYearCHD
0,0.184211,0.0,0.106383,0.232804,0.104520,0.0
1,0.368421,0.0,0.177305,0.349206,0.101695,0.0
2,0.421053,0.0,0.208038,0.338624,0.084746,0.0
3,0.763158,1.0,0.314421,0.497354,0.177966,1.0
4,0.368421,0.0,0.219858,0.380952,0.127119,0.0
...,...,...,...,...,...,...
3651,0.684211,1.0,0.271868,0.349206,0.115819,0.0
3652,0.947368,1.0,0.399527,0.518519,0.110169,1.0
3653,0.473684,1.0,0.451537,0.465608,0.129944,1.0
3654,0.500000,0.0,0.203310,0.338624,0.079096,0.0


In [133]:
# Feature matrix and target for the whole (scaled) dataset.
# NOTE: despite their names, X_train / Y_train hold ALL rows; the actual
# training and test portions are x_train / x_test / y_train / y_test below.
feature_cols = ["age", "prevalentHyp", "sysBP", "diaBP", "glucose"]
X_train = df_scaled[feature_cols].values
Y_train = df_scaled["TenYearCHD"].values

# Fixed seed so the split (and every metric computed from it) is reproducible
# across kernel restarts.
SPLIT_SEED = 42

# 60/40 split. Stratifying on the target keeps the share of CHD-positive rows
# the same in both portions, which matters because positives are a minority.
x_train, x_test, y_train, y_test = train_test_split(
    X_train,
    Y_train,
    test_size=0.4,
    shuffle=True,
    random_state=SPLIT_SEED,
    stratify=Y_train,
)

In [137]:
### Training
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential

# Fully connected binary classifier over the 5 scaled input features,
# built layer by layer: 124 -> 62 -> dropout(0.3) -> 15 -> 1 (sigmoid).
model = Sequential()
model.add(Dense(124, input_shape=[5], activation='relu'))
model.add(Dense(62, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(15, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# NOTE(review): the test split is also passed as validation data, so the
# per-epoch validation metrics and the final evaluate() call below describe
# the same rows.
model.fit(
    x_train,
    y_train,
    epochs=50,
    batch_size=30,
    validation_data=(x_test,y_test),
)

# Prints [loss, accuracy] on the test split.
# NOTE(review): the recorded accuracy (~0.835) is no higher than the share of
# CHD-negative rows in the cleaned data (3099 of 3656, ~0.848) — compare
# against a majority-class baseline before drawing conclusions.
print(model.evaluate(x_test, y_test))

# Sigmoid outputs in [0, 1], one row per test sample
predict = model.predict(x_test)

print(predict)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
[0.4023219645023346, 0.8352699875831604]
[[0.2195935 ]
 [0.17201257]
 [0.10894877]
 ...
 [0.06349343]
 [0.04623875]
 [0.23063058]]
