# 당뇨와 고혈압 질병 예측
- BTH_G : 연령(그룹)
- SBP : 수축기혈압
- DBP : 이완기혈압
- FBS : 공복혈당
- SEX : 성별(남성:1, 여성:2) — 단, 이 노트북에서 불러오는 CSV에는 0/1 값으로 저장되어 있으므로 실제 인코딩 매핑은 확인이 필요함
- DIS : 고혈압/당뇨병 진료여부
  - 고혈압/당뇨병 진료내역 있음: 1
  - 고혈압 진료내역 있음: 2
  - 당뇨병 진료내역 있음: 3
  - 고혈압/당뇨병 진료내역 없음: 4
- BMI : 체질량지수

# 데이터 재가공

In [1]:
# Mount Google Drive inside the Colab runtime so files under
# /content/drive can be read and written by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
# NOTE(review): `random_state` comes from pandas' internal `pandas.core.common`
# module and is not referenced by name in the visible cells (only the
# `random_state=` keyword is used) — looks like an accidental auto-import;
# confirm before removing.
from pandas.core.common import random_state
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

from sklearn.datasets import make_classification
from collections import Counter

from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE

## 데이터 불러오기

In [3]:
import os

# Show every column when a DataFrame is rendered instead of eliding the
# middle ones.
pd.options.display.max_columns = None

# Read the CSV from the mounted Drive; the file is decoded as cp949.
csv_path = '/content/drive/MyDrive/health/data/data_over_sampling(1:1:1).csv'
df2 = pd.read_csv(filepath_or_buffer=csv_path, encoding='cp949')

# Keep df2 as the untouched load and do all further work on an independent copy.
df = df2.copy(deep=True)

# Last expression of the cell: renders a preview of the frame.
df

Unnamed: 0,SEX,BTH_G,SBP,DBP,FBS,BMI,Dis01,DIS
0,0,1,90,50,86,20.1,1,2
1,0,1,120,80,90,27.0,1,3
2,0,1,100,60,112,31.8,1,2
3,0,1,170,120,86,23.3,1,2
4,0,1,135,80,104,35.0,1,2
...,...,...,...,...,...,...,...,...
1229135,1,27,130,90,86,27.0,0,4
1229136,1,27,110,70,89,23.6,0,4
1229137,1,27,110,70,122,26.3,0,4
1229138,1,27,130,80,82,21.6,0,4


In [4]:
# df = df[['SEX', 'BTH_G', 'SBP', 'DBP','FBS','BMI','DIS']]

---

In [5]:
# Keep only the rows whose Dis01 flag equals 1, then preview the subset.
df_1 = df.loc[df['Dis01'].eq(1)]
df_1

Unnamed: 0,SEX,BTH_G,SBP,DBP,FBS,BMI,Dis01,DIS
0,0,1,90,50,86,20.1,1,2
1,0,1,120,80,90,27.0,1,3
2,0,1,100,60,112,31.8,1,2
3,0,1,170,120,86,23.3,1,2
4,0,1,135,80,104,35.0,1,2
...,...,...,...,...,...,...,...,...
488473,0,19,137,83,112,24.7,1,3
488474,1,15,122,70,123,30.8,1,3
488475,1,16,134,70,106,24.1,1,3
488476,1,20,115,57,132,24.3,1,3


In [6]:
# feature와 target 분리
y = np.array(df['Dis01'])
X = df.drop(['Dis01'], axis=1)
X_train=np.array(X)
print(X,y)

X_res = df_1
y_res = df[df['Dis01'] == 0]

# # 데이터 정규화
# scaler=MinMaxScaler()
# x_train=scaler.fit_transform(x_train)

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2, random_state=42)

         SEX  BTH_G  SBP  DBP  FBS   BMI  DIS
0          0      1   90   50   86  20.1    2
1          0      1  120   80   90  27.0    3
2          0      1  100   60  112  31.8    2
3          0      1  170  120   86  23.3    2
4          0      1  135   80  104  35.0    2
...      ...    ...  ...  ...  ...   ...  ...
1229135    1     27  130   90   86  27.0    4
1229136    1     27  110   70   89  23.6    4
1229137    1     27  110   70  122  26.3    4
1229138    1     27  130   80   82  21.6    4
1229139    1     27  116   73   92  17.6    4

[1229140 rows x 7 columns] [1 1 1 ... 0 0 0]


In [7]:
# BorderlineSMOTE and ADASYN are further over-samplers available in imblearn.
from imblearn.over_sampling import RandomOverSampler, SMOTE
from collections import Counter

# Random over-sampling of (X, y) with a fixed seed so the resampled set is
# reproducible.
ros = RandomOverSampler(random_state=42)
X_res, y_res = ros.fit_resample(X, y)

# Report the class counts before (y) and after (y_res) resampling.
# NOTE(review): both lines carry the same "Rasampled" label, although the
# first one describes the labels *before* resampling.
print(
    *('Rasampled dataset shape %s' % Counter(labels) for labels in (y, y_res)),
    sep='\n',
)

Rasampled dataset shape Counter({0: 740662, 1: 488478})
Rasampled dataset shape Counter({1: 740662, 0: 740662})


In [8]:
# Preview the resampled feature frame.
X_res

Unnamed: 0,SEX,BTH_G,SBP,DBP,FBS,BMI,DIS
0,0,1,90,50,86,20.1,2
1,0,1,120,80,90,27.0,3
2,0,1,100,60,112,31.8,2
3,0,1,170,120,86,23.3,2
4,0,1,135,80,104,35.0,2
...,...,...,...,...,...,...,...
1481319,1,21,130,90,185,26.3,3
1481320,1,18,140,100,120,28.8,2
1481321,0,12,100,60,101,23.0,3
1481322,1,20,134,72,168,23.1,3


In [9]:
# Number of labels after resampling.
len(y_res)

1481324

In [10]:
# Attach the resampled labels to the feature frame as a column named 'Dis'.
# NOTE(review): the label column in the loaded data was 'Dis01', and the frame
# already holds a differently-cased 'DIS' column — confirm that downstream
# readers expect the name 'Dis'.
X_res['Dis'] = y_res

In [11]:
# Preview the frame again, now including the 'Dis' column.
X_res

Unnamed: 0,SEX,BTH_G,SBP,DBP,FBS,BMI,DIS,Dis
0,0,1,90,50,86,20.1,2,1
1,0,1,120,80,90,27.0,3,1
2,0,1,100,60,112,31.8,2,1
3,0,1,170,120,86,23.3,2,1
4,0,1,135,80,104,35.0,2,1
...,...,...,...,...,...,...,...,...
1481319,1,21,130,90,185,26.3,3,1
1481320,1,18,140,100,120,28.8,2,1
1481321,0,12,100,60,101,23.0,3,1
1481322,1,20,134,72,168,23.1,3,1


In [12]:
# Write the resampled frame to Drive as CSV; index=False omits the row index.
X_res.to_csv('/content/drive/MyDrive/health/data/data(over_dis_vs_nor).csv',index=False)