# 당뇨와 고혈압 질병 예측
- BTH_G : 연령(그룹)
- SBP : 수축기혈압
- DBP : 이완기혈압
- FBS : 공복혈당
- SEX : 성별(남성:1, 여성:2)
- DIS : 고혈압/당뇨병 진료여부
  - 고혈압/당뇨병 진료내역 있음: 1
  - 고혈압 진료내역 있음: 2
  - 당뇨병 진료내역 있음: 3
  - 고혈압/당뇨병 진료내역 없음: 4
- BMI : 체질량지수

## 정상 vs 비정상
- DIS 4(정상) → Dis01 = 0
- DIS 1 / 2 / 3(비정상) → Dis01 = 1

# 데이터 재가공

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so the data
# files stored on Drive can be read and written by path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
# NOTE(review): `random_state` is a private pandas helper and is not referenced
# in the visible cells — looks like an accidental editor auto-import; confirm
# and remove it.
from pandas.core.common import random_state
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

from sklearn.datasets import make_classification
from collections import Counter

# Over-sampling helpers from imbalanced-learn.
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE

## 데이터 불러오기

In [None]:
import os

# Show every column when a frame is rendered instead of eliding the middle ones.
pd.set_option('display.max_columns', None)

# Read the raw CSV (cp949-encoded) from the mounted Drive folder.
csv_path = '/content/drive/MyDrive/health/data/data_2.csv'
df2 = pd.read_csv(filepath_or_buffer=csv_path, encoding='cp949')

# Work on a deep copy so the freshly loaded frame `df2` stays untouched.
df = df2.copy(deep=True)

# Last expression of the cell: renders the frame as the cell output.
df

Unnamed: 0,SEX,BTH_G,SBP,DBP,FBS,DIS,BMI
0,1,1,116,78,94,4,16.6
1,1,1,100,60,79,4,22.3
2,1,1,100,60,87,4,21.9
3,1,1,111,70,72,4,20.2
4,1,1,120,80,98,4,20.0
...,...,...,...,...,...,...,...
999995,2,27,120,70,81,2,23.1
999996,2,27,110,70,104,2,27.2
999997,2,27,115,53,110,1,25.2
999998,2,27,120,70,90,2,19.7


In [None]:
# Recode SEX from the original 1/2 coding to 0/1:
# rows where SEX equals 1 become 0, every other row becomes 1.
df['SEX'] = np.where(df['SEX'].eq(1), 0, 1)

In [None]:
# Keep these seven columns, in this order (DIS last).
# The explicit .copy() makes the result an independent DataFrame rather than
# a slice pandas still associates with the original frame, so adding a column
# to `df` later does not raise SettingWithCopyWarning.
df = df[['SEX', 'BTH_G', 'SBP', 'DBP', 'FBS', 'BMI', 'DIS']].copy()

In [None]:
# # Print only the column dtypes
# df.dtypes

# Use info() to see the frame's size, column dtypes, memory usage, etc.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 7 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   SEX     1000000 non-null  int64  
 1   BTH_G   1000000 non-null  int64  
 2   SBP     1000000 non-null  int64  
 3   DBP     1000000 non-null  int64  
 4   FBS     1000000 non-null  int64  
 5   BMI     1000000 non-null  float64
 6   DIS     1000000 non-null  int64  
dtypes: float64(1), int64(6)
memory usage: 53.4 MB


### 결측치 확인

In [None]:
# Count the missing values in each column.
df.isna().sum()

SEX      0
BTH_G    0
SBP      0
DBP      0
FBS      0
BMI      0
DIS      0
dtype: int64

### 기초 통계 분석

In [None]:
# Look at the basic descriptive statistics of every column.
df.describe()

Unnamed: 0,SEX,BTH_G,SBP,DBP,FBS,BMI,DIS
count,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0
mean,0.489773,13.905268,121.871763,75.787874,98.864428,23.804029,3.47104
std,0.499896,7.006442,14.561706,9.793411,22.9813,3.297287,0.946151
min,0.0,1.0,82.0,50.0,60.0,14.8,1.0
25%,0.0,9.0,110.0,70.0,87.0,21.5,3.0
50%,0.0,14.0,120.0,76.0,94.0,23.6,4.0
75%,1.0,19.0,130.0,80.0,104.0,25.8,4.0
max,1.0,27.0,190.0,120.0,358.0,40.3,4.0


In [None]:
# Binary label derived from the DIS code:
#   "1" = disease (DIS 1, 2 or 3)
#   "0" = normal  (DIS 4)

def Dis01(DIS):
  """Map a DIS code to the binary disease label, returned as a string.

  Args:
    DIS: disease code; the documented values are 1, 2, 3 (some treatment
      record exists) and 4 (no treatment record).

  Returns:
    "1" for DIS 1, 2 or 3, "0" for DIS 4, and None for any other value.
  """
  # Exact membership rather than a `0 <= DIS <= 3` range: 0 and fractional
  # values are not documented codes and must not be labelled as disease.
  if DIS in (1, 2, 3):
    return "1"
  if DIS == 4:
    return "0"
  # Undocumented code (including NaN): leave the label missing so it shows up
  # in a null check instead of being silently counted in either class.
  return None

# Add the binary label column produced by Dis01 from each row's DIS code.
# assign() builds and returns a new DataFrame, so the column is never written
# into a frame pandas tracks as a slice of another one; the plain
# `df['Dis01'] = ...` form raised SettingWithCopyWarning here.
df = df.assign(Dis01=df['DIS'].apply(Dis01))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


In [None]:
# Preview the first rows to check the new Dis01 column.
df.head()

Unnamed: 0,SEX,BTH_G,SBP,DBP,FBS,BMI,DIS,Dis01
0,0,1,116,78,94,16.6,4,0
1,0,1,100,60,79,22.3,4,0
2,0,1,100,60,87,21.9,4,0
3,0,1,111,70,72,20.2,4,0
4,0,1,120,80,98,20.0,4,0


In [None]:
# Preview the last rows to check the new Dis01 column.
df.tail()

Unnamed: 0,SEX,BTH_G,SBP,DBP,FBS,BMI,DIS,Dis01
999995,1,27,120,70,81,23.1,2,1
999996,1,27,110,70,104,27.2,2,1
999997,1,27,115,53,110,25.2,1,1
999998,1,27,120,70,90,19.7,2,1
999999,1,27,116,73,92,17.6,4,0


In [None]:
# Write the processed frame to Drive; index=False leaves the row index out of the file.
df.to_csv(
    path_or_buf='/content/drive/MyDrive/health/data/data(0vs1).csv',
    index=False,
)