# 당뇨와 고혈압 질병 예측
- BTH_G : 연령(그룹)
- SBP : 수축기혈압
- DBP : 이완기혈압
- FBS : 공복혈당
- SEX : 성별(원본 코드: 남성 1, 여성 2 — 아래 데이터에는 0/1로 재부호화되어 있으므로 매핑 확인 필요)
- DIS : 고혈압/당뇨병 진료여부
  - 고혈압/당뇨병 진료내역 있음: 1
  - 고혈압 진료내역 있음: 2
  - 당뇨병 진료내역 있음: 3
  - 고혈압/당뇨병 진료내역 없음: 4
- BMI : 체질량지수
- Dis01 : 예측 대상(타깃) 이진 변수 — 아래 표시된 데이터 기준으로 DIS가 4(진료내역 없음)이면 0, DIS가 1~3이면 1

# 데이터 재가공

In [1]:
from google.colab import drive

# Mount Google Drive inside the Colab runtime so that the CSV files under
# /content/drive/MyDrive can be read and written by this notebook.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from pandas.core.common import random_state
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

from sklearn.datasets import make_classification
from collections import Counter

from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE

## 데이터 불러오기

In [3]:
import os

# Show every column when a DataFrame is displayed instead of letting pandas
# elide the middle columns of a wide frame.
pd.set_option('display.max_columns', None)

# Read the source CSV from the mounted Drive, decoding it as CP949.
csv_path = '/content/drive/MyDrive/health/data/data(0vs1).csv'
df2 = pd.read_csv(csv_path, encoding='cp949')

# Work on a copy so the freshly loaded frame `df2` is left untouched.
df = df2.copy()

# A bare expression on the last line makes the notebook render the frame
# (sample/head/tail are the usual alternatives for a shorter preview).
df

Unnamed: 0,SEX,BTH_G,SBP,DBP,FBS,BMI,DIS,Dis01
0,0,1,116,78,94,16.6,4,0
1,0,1,100,60,79,22.3,4,0
2,0,1,100,60,87,21.9,4,0
3,0,1,111,70,72,20.2,4,0
4,0,1,120,80,98,20.0,4,0
...,...,...,...,...,...,...,...,...
999995,1,27,120,70,81,23.1,2,1
999996,1,27,110,70,104,27.2,2,1
999997,1,27,115,53,110,25.2,1,1
999998,1,27,120,70,90,19.7,2,1


In [4]:
# df = df[['SEX', 'BTH_G', 'SBP', 'DBP','FBS','BMI','DIS']]

---

In [5]:
# Select the rows whose `Dis01` flag equals 1 and display the subset.
positive_mask = df['Dis01'] == 1
df_1 = df.loc[positive_mask]
df_1

Unnamed: 0,SEX,BTH_G,SBP,DBP,FBS,BMI,DIS,Dis01
52,0,1,90,50,86,20.1,2,1
97,0,1,120,80,90,27.0,3,1
133,0,1,100,60,112,31.8,2,1
204,0,1,170,120,86,23.3,2,1
255,0,1,135,80,104,35.0,2,1
...,...,...,...,...,...,...,...,...
999993,1,27,126,70,123,25.2,3,1
999995,1,27,120,70,81,23.1,2,1
999996,1,27,110,70,104,27.2,2,1
999997,1,27,115,53,110,25.2,1,1


In [6]:
# Separate the features from the target.
# `Dis01` is the label; every remaining column of `df` is kept as a feature.
# NOTE(review): `DIS` stays in the feature set. In the rows displayed in this
# notebook, Dis01 is 0 when DIS == 4 and 1 when DIS is 1-3, so DIS looks like a
# direct encoding of the label (target leakage) - confirm before training on X.
y = np.array(df['Dis01'])
X = df.drop(columns=['Dis01'])
print(X, y)

# Dead stores removed from this cell: `X_train = np.array(X)` was overwritten
# by the split below before being read, and `X_res` / `y_res` were bound to
# class-filtered frames that the over-sampling step replaces before any use.

# Data normalization (currently disabled):
# scaler=MinMaxScaler()
# x_train=scaler.fit_transform(x_train)

# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): the over-sampling step in this notebook resamples the full
# X / y rather than X_train / y_train - confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

        SEX  BTH_G  SBP  DBP  FBS   BMI  DIS
0         0      1  116   78   94  16.6    4
1         0      1  100   60   79  22.3    4
2         0      1  100   60   87  21.9    4
3         0      1  111   70   72  20.2    4
4         0      1  120   80   98  20.0    4
...     ...    ...  ...  ...  ...   ...  ...
999995    1     27  120   70   81  23.1    2
999996    1     27  110   70  104  27.2    2
999997    1     27  115   53  110  25.2    1
999998    1     27  120   70   90  19.7    2
999999    1     27  116   73   92  17.6    4

[1000000 rows x 7 columns] [0 0 0 ... 1 1 0]


In [7]:
from imblearn.over_sampling import RandomOverSampler, SMOTE # BorderlineSMOTE, ADASTN
from collections import Counter

# ROS = Random Over Sampler. Resample X / y with a fixed seed so the result
# is reproducible; the recorded output shows both classes at the same count
# afterwards.
ros = RandomOverSampler(random_state=42)
X_res, y_res = ros.fit_resample(X, y)

# Print the class counts before (y) and after (y_res) resampling.
# NOTE(review): both lines carry the same 'Rasampled' label, so the first
# printed line is actually the ORIGINAL class distribution.
for labels in (y, y_res):
    print('Rasampled dataset shape %s' % Counter(labels))

Rasampled dataset shape Counter({0: 740662, 1: 259338})
Rasampled dataset shape Counter({0: 740662, 1: 740662})


In [8]:
# Display the resampled feature frame (rendered as the cell output).
X_res

Unnamed: 0,SEX,BTH_G,SBP,DBP,FBS,BMI,DIS
0,0,1,116,78,94,16.6,4
1,0,1,100,60,79,22.3,4
2,0,1,100,60,87,21.9,4
3,0,1,111,70,72,20.2,4
4,0,1,120,80,98,20.0,4
...,...,...,...,...,...,...,...
1481319,1,25,120,80,78,27.9,1
1481320,1,27,150,74,133,32.0,1
1481321,0,25,120,70,118,26.9,3
1481322,1,20,136,68,93,21.5,2


In [9]:
# Number of labels after resampling (rendered as the cell output).
len(y_res)

1481324

In [10]:
# Attach the resampled labels to the feature frame as a new `Dis` column, so
# `X_res` now holds features and target together (mutates X_res in place).
# NOTE(review): the source column was named `Dis01` but is written back as
# `Dis` - confirm that downstream readers expect the new name.
X_res['Dis'] = y_res

In [11]:
# Display the frame again to check the appended `Dis` column.
X_res

Unnamed: 0,SEX,BTH_G,SBP,DBP,FBS,BMI,DIS,Dis
0,0,1,116,78,94,16.6,4,0
1,0,1,100,60,79,22.3,4,0
2,0,1,100,60,87,21.9,4,0
3,0,1,111,70,72,20.2,4,0
4,0,1,120,80,98,20.0,4,0
...,...,...,...,...,...,...,...,...
1481319,1,25,120,80,78,27.9,1,1
1481320,1,27,150,74,133,32.0,1,1
1481321,0,25,120,70,118,26.9,3,1
1481322,1,20,136,68,93,21.5,2,1


In [12]:
# Write the over-sampled table to Drive as CSV; index=False leaves the
# DataFrame row index out of the file.
output_csv_path = '/content/drive/MyDrive/health/data/data_over_sampling.csv'
X_res.to_csv(output_csv_path, index=False)