# 당뇨와 고혈압 질병 예측
- BTH_G : 연령(그룹)
- SBP : 수축기혈압
- DBP : 이완기혈압
- FBS : 공복혈당
- SEX : 성별(남성:1, 여성:2)
- DIS : 고혈압/당뇨병 진료여부
  - 고혈압/당뇨병 모두 진료내역 있음: 1
  - 고혈압 진료내역 있음: 2
  - 당뇨병 진료내역 있음: 3
  - 고혈압/당뇨병 진료내역 없음: 4
- BMI : 체질량지수

# 데이터 재가공

In [1]:
# Mount Google Drive at /content/drive; the data paths used later in this
# notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

# 레티나 설정을 해주면 글씨가 좀 더 선명하게 보입니다.
# 폰트의 주변이 흐릿하게 보이는 것을 방지합니다.
%config InlineBackend.figure_format = 'retina'

## 데이터 불러오기

In [3]:
import os

# Read the downloaded CSV file with pandas read_csv.
# The file is decoded with the cp949 codec.
csv_path = '/content/drive/MyDrive/health/data/data_2.csv'
df2 = pd.read_csv(csv_path, encoding='cp949')

# Work on a copy; df2 keeps the data exactly as it was loaded.
df = df2.copy()
pd.set_option('display.max_columns', None) # Show every column instead of an abbreviated view.
# sample, head, tail: other ways to preview the data
df # Display the data (must stay the last expression of the cell to be rendered).

Unnamed: 0,SEX,BTH_G,SBP,DBP,FBS,DIS,BMI
0,1,1,116,78,94,4,16.6
1,1,1,100,60,79,4,22.3
2,1,1,100,60,87,4,21.9
3,1,1,111,70,72,4,20.2
4,1,1,120,80,98,4,20.0
...,...,...,...,...,...,...,...
999995,2,27,120,70,81,2,23.1
999996,2,27,110,70,104,2,27.2
999997,2,27,115,53,110,1,25.2
999998,2,27,120,70,90,2,19.7


In [4]:
# Alternative that prints only the column dtypes:
# df.dtypes

# info() reports the frame's size, column dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 7 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   SEX     1000000 non-null  int64  
 1   BTH_G   1000000 non-null  int64  
 2   SBP     1000000 non-null  int64  
 3   DBP     1000000 non-null  int64  
 4   FBS     1000000 non-null  int64  
 5   DIS     1000000 non-null  int64  
 6   BMI     1000000 non-null  float64
dtypes: float64(1), int64(6)
memory usage: 53.4 MB


### 결측치 확인

In [5]:
# Count the null values in each column.
df.isnull().sum()

SEX      0
BTH_G    0
SBP      0
DBP      0
FBS      0
DIS      0
BMI      0
dtype: int64

### 기초 통계 분석

In [6]:
# Summary statistics per column: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,SEX,BTH_G,SBP,DBP,FBS,DIS,BMI
count,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0
mean,1.489773,13.905268,121.871763,75.787874,98.864428,3.47104,23.804029
std,0.499896,7.006442,14.561706,9.793411,22.9813,0.946151,3.297287
min,1.0,1.0,82.0,50.0,60.0,1.0,14.8
25%,1.0,9.0,110.0,70.0,87.0,3.0,21.5
50%,1.0,14.0,120.0,76.0,94.0,4.0,23.6
75%,2.0,19.0,130.0,80.0,104.0,4.0,25.8
max,2.0,27.0,190.0,120.0,358.0,4.0,40.3


In [7]:
# Preview the first rows of the loaded data.
df.head()

Unnamed: 0,SEX,BTH_G,SBP,DBP,FBS,DIS,BMI
0,1,1,116,78,94,4,16.6
1,1,1,100,60,79,4,22.3
2,1,1,100,60,87,4,21.9
3,1,1,111,70,72,4,20.2
4,1,1,120,80,98,4,20.0


In [8]:
# Preview the last rows of the loaded data.
df.tail()

Unnamed: 0,SEX,BTH_G,SBP,DBP,FBS,DIS,BMI
999995,2,27,120,70,81,2,23.1
999996,2,27,110,70,104,2,27.2
999997,2,27,115,53,110,1,25.2
999998,2,27,120,70,90,2,19.7
999999,2,27,116,73,92,4,17.6


In [9]:
# Recode SEX from 1/2 to 0/1: the value 1 becomes 0, any other value becomes 1.
# NOTE: not idempotent - running this cell a second time swaps the 0/1 labels.
df['SEX'] = df['SEX'].ne(1).astype(int)

In [10]:
def B_pressure(DIS):
    """Return the hypertension label for a DIS code.

    DIS codes: 1 = hypertension and diabetes, 2 = hypertension only,
    3 = diabetes only, 4 = neither.

    Args:
        DIS: Treatment-history code of one record.

    Returns:
        "1" when the code includes hypertension (1 or 2), "0" when it does
        not (3 or 4), and None for any other value.
    """
    # Code 3 is diabetes only, so it must not count as hypertension.
    if DIS in (1, 2):
        return "1"
    if DIS in (3, 4):
        return "0"
    return None

def B_sugar(DIS):
    """Return the diabetes label for a DIS code.

    DIS codes: 1 = hypertension and diabetes, 2 = hypertension only,
    3 = diabetes only, 4 = neither.

    Args:
        DIS: Treatment-history code of one record.

    Returns:
        "1" when the code includes diabetes (1 or 3), "0" when it does
        not (2 or 4), and None for any other value.
    """
    # Code 3 (diabetes only) must be labelled positive; leaving it out
    # would give those records a missing label.
    if DIS in (1, 3):
        return "1"
    if DIS in (2, 4):
        return "0"
    return None

# Build the two label columns by passing every DIS code through
# B_pressure and B_sugar.
df['B_pressure'] = df['DIS'].map(B_pressure)
df['B_sugar'] = df['DIS'].map(B_sugar)

In [11]:
# Check the first rows, now including the B_pressure and B_sugar columns.
df.head()

Unnamed: 0,SEX,BTH_G,SBP,DBP,FBS,DIS,BMI,B_pressure,B_sugar
0,0,1,116,78,94,4,16.6,0,0
1,0,1,100,60,79,4,22.3,0,0
2,0,1,100,60,87,4,21.9,0,0
3,0,1,111,70,72,4,20.2,0,0
4,0,1,120,80,98,4,20.0,0,0


In [12]:
# Check the last rows, now including the B_pressure and B_sugar columns.
df.tail()

Unnamed: 0,SEX,BTH_G,SBP,DBP,FBS,DIS,BMI,B_pressure,B_sugar
999995,1,27,120,70,81,2,23.1,1,0
999996,1,27,110,70,104,2,27.2,1,0
999997,1,27,115,53,110,1,25.2,1,1
999998,1,27,120,70,90,2,19.7,1,0
999999,1,27,116,73,92,4,17.6,0,0


In [13]:
# Write the processed frame to a new CSV file; index=False leaves the row
# index out of the file.
df.to_csv('/content/drive/MyDrive/health/data/data_dis01.csv',index=False)