# 건강검진정보 데이터를 통한 성별 및 연령대 분류

1. import & Data Check
2. EDA
3. Feature Engineering 
4. Modeling

- train : 가입자일련번호 끝자리가 5가 아닌 데이터
- test : 가입자일년번호 끝자리가 5인 데이터(성별코드, 연령대코드 없앨 것)

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from collections import Counter

plt.style.use('seaborn')
sns.set(font_scale=1.5)

import missingno as msno

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [3]:
os.name
plt.rc('font', family = 'Malgun Gothic')
plt.rc('axes', unicode_minus='False')

## 1. import & Data Check

In [4]:
os.listdir('dataset/medical-checkup_2017')

['medical-checkup-2017.csv', 'test.csv', 'train.csv']

In [5]:
data = pd.read_csv('dataset/medical-checkup_2017/medical-checkup-2017.csv')

In [6]:
data 
# 100만 개의 데이터, 34개의 변수로 구성

Unnamed: 0,기준년도,가입자일련번호,성별코드,연령대코드(5세단위),시도코드,신장(5Cm단위),체중(5Kg단위),허리둘레,시력(좌),시력(우),...,감마지티피,흡연상태,음주여부,구강검진수검여부,치아우식증유무,결손치유무,치아마모증유무,제3대구치(사랑니)이상,치석,데이터공개일자
0,2017,1,1,8,43,170,75,90.0,1.0,1.0,...,40.0,1.0,Y,Y,,,,,Y,2018-11-26
1,2017,2,1,7,11,180,80,89.0,0.9,1.2,...,27.0,3.0,N,Y,,,,,,2018-11-26
2,2017,3,1,9,41,165,75,91.0,1.2,1.5,...,68.0,1.0,N,N,,,,,,2018-11-26
3,2017,4,1,11,48,175,80,91.0,1.5,1.2,...,18.0,1.0,N,Y,,,,,N,2018-11-26
4,2017,5,1,11,30,165,60,80.0,1.0,1.2,...,25.0,1.0,N,Y,,,,,N,2018-11-26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,2017,999996,1,10,48,175,80,92.1,1.5,1.5,...,27.0,1.0,N,N,,,,,,2018-11-26
999996,2017,999997,1,8,41,170,75,86.0,1.0,1.5,...,15.0,1.0,N,N,,,,,,2018-11-26
999997,2017,999998,2,9,26,155,50,68.0,1.0,0.7,...,17.0,3.0,Y,N,,,,,,2018-11-26
999998,2017,999999,1,6,29,175,60,72.0,1.5,1.0,...,17.0,1.0,N,N,,,,,,2018-11-26


In [9]:
print(data.columns)
data.info()

Index(['기준년도', '가입자일련번호', '성별코드', '연령대코드(5세단위)', '시도코드', '신장(5Cm단위)',
       '체중(5Kg단위)', '허리둘레', '시력(좌)', '시력(우)', '청력(좌)', '청력(우)', '수축기혈압',
       '이완기혈압', '식전혈당(공복혈당)', '총콜레스테롤', '트리글리세라이드', 'HDL콜레스테롤', 'LDL콜레스테롤',
       '혈색소', '요단백', '혈청크레아티닌', '(혈청지오티)AST', '(혈청지오티)ALT', '감마지티피', '흡연상태',
       '음주여부', '구강검진수검여부', '치아우식증유무', '결손치유무', '치아마모증유무', '제3대구치(사랑니)이상', '치석',
       '데이터공개일자'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 34 columns):
 #   Column        Non-Null Count    Dtype  
---  ------        --------------    -----  
 0   기준년도          1000000 non-null  int64  
 1   가입자일련번호       1000000 non-null  int64  
 2   성별코드          1000000 non-null  int64  
 3   연령대코드(5세단위)   1000000 non-null  int64  
 4   시도코드          1000000 non-null  int64  
 5   신장(5Cm단위)     1000000 non-null  int64  
 6   체중(5Kg단위)     1000000 non-null  int64  
 7   허리둘레          999734 non-null   float64
 8   시력(좌)         99

### 1) 데이터 확인 및 분리

train과 test 데이터 분리하기 
1) 가입자 일련번호로 분리 수행
- 데이터 타입 수정
- 5로 안끝나면 train
- 5로 끝나면 test

train 데이터에 대해서만 수행
1) 연령대 데이터 10,20,30,40.. 으로 변경 처리
- 연도에 따라 다르게 수행할 것!


In [10]:
# 가입자일련번호 데이터 타입 변환

df = data
df = df.astype({'가입자일련번호':'str'})

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 34 columns):
 #   Column        Non-Null Count    Dtype  
---  ------        --------------    -----  
 0   기준년도          1000000 non-null  int64  
 1   가입자일련번호       1000000 non-null  object 
 2   성별코드          1000000 non-null  int64  
 3   연령대코드(5세단위)   1000000 non-null  int64  
 4   시도코드          1000000 non-null  int64  
 5   신장(5Cm단위)     1000000 non-null  int64  
 6   체중(5Kg단위)     1000000 non-null  int64  
 7   허리둘레          999734 non-null   float64
 8   시력(좌)         999817 non-null   float64
 9   시력(우)         999811 non-null   float64
 10  청력(좌)         999842 non-null   float64
 11  청력(우)         999844 non-null   float64
 12  수축기혈압         999981 non-null   float64
 13  이완기혈압         999982 non-null   float64
 14  식전혈당(공복혈당)    999958 non-null   float64
 15  총콜레스테롤        999957 non-null   float64
 16  트리글리세라이드      999955 non-null   float64
 17  HDL콜레스테롤      999956 non-nul

In [12]:
df_train = df[-df['가입자일련번호'].str.endswith('5')]
df_test = df[df['가입자일련번호'].str.endswith('5')]

In [14]:
df_train.shape, df_test.shape

((900000, 34), (100000, 34))

test 데이터 target 변수 삭제

In [15]:
df_test.columns

Index(['기준년도', '가입자일련번호', '성별코드', '연령대코드(5세단위)', '시도코드', '신장(5Cm단위)',
       '체중(5Kg단위)', '허리둘레', '시력(좌)', '시력(우)', '청력(좌)', '청력(우)', '수축기혈압',
       '이완기혈압', '식전혈당(공복혈당)', '총콜레스테롤', '트리글리세라이드', 'HDL콜레스테롤', 'LDL콜레스테롤',
       '혈색소', '요단백', '혈청크레아티닌', '(혈청지오티)AST', '(혈청지오티)ALT', '감마지티피', '흡연상태',
       '음주여부', '구강검진수검여부', '치아우식증유무', '결손치유무', '치아마모증유무', '제3대구치(사랑니)이상', '치석',
       '데이터공개일자'],
      dtype='object')

In [16]:
df_test.drop(['성별코드', '연령대코드(5세단위)'], axis = 1, inplace = True) 

In [17]:
df_test.columns

Index(['기준년도', '가입자일련번호', '시도코드', '신장(5Cm단위)', '체중(5Kg단위)', '허리둘레', '시력(좌)',
       '시력(우)', '청력(좌)', '청력(우)', '수축기혈압', '이완기혈압', '식전혈당(공복혈당)', '총콜레스테롤',
       '트리글리세라이드', 'HDL콜레스테롤', 'LDL콜레스테롤', '혈색소', '요단백', '혈청크레아티닌',
       '(혈청지오티)AST', '(혈청지오티)ALT', '감마지티피', '흡연상태', '음주여부', '구강검진수검여부',
       '치아우식증유무', '결손치유무', '치아마모증유무', '제3대구치(사랑니)이상', '치석', '데이터공개일자'],
      dtype='object')

In [None]:
df_train['연령대코드(5세단위)'].value_counts() # 5부터 18까지

https://velog.io/@dlskawns/Dataframe-%EB%82%B4-%ED%8A%B9%EC%A0%95Column-row%EC%9D%98-%EC%84%A0%ED%83%9D-%EC%A1%B0%EA%B1%B4%EB%B6%80-%EC%84%A0%ED%83%9D-%EB%B3%80%EA%B2%BD%ED%95%98%EA%B8%B0

연령대코드 이름 변경하고, 나이대로 나눠서 저장하기

In [19]:
df_train.rename(columns = {'연령대코드(5세단위)':'연령대코드'}, inplace=True)

In [20]:
k = 20
for i in range(5, 19, 2):
    df_train.loc[(df_train['연령대코드']== i) |(df_train['연령대코드'] == i+1), '연령대코드'] = str(k)
    k += 10
    
df_train['연령대코드'].value_counts()

40    225596
50    218163
30    147255
60    144280
20     78356
70     69370
80     16980
Name: 연령대코드, dtype: int64

In [None]:
# df_train[df_train['연령대코드']== '80'] = '80대이상'
# 이거 하면 dtype이 모두 object로 변경됨ㅜ

In [21]:
df_train.shape, df_test.shape

((900000, 34), (100000, 32))

변수 이름 변경

In [22]:
df_train.rename(columns={'기준년도' :'year', '가입자일련번호':'id', '성별코드':'target_sex','연령대코드':'target_age', '시도코드':'sido', 
                          '신장(5Cm단위)':'height', '체중(5Kg단위)':'weight', '허리둘레' : 'waist',
                          '시력(좌)':'sight_L', '시력(우)':'sight_R', '청력(좌)':'hear_L', '청력(우)':'hear_R',
                          '수축기혈압':'bp_high', '이완기혈압':'bp_low', '식전혈당(공복혈당)':'blds',
                          '총콜레스테롤':'tot_chole', '트리글리세라이드':'triglyceride', 'HDL콜레스테롤':'hdl_chole', 
                          'LDL콜레스테롤':'ldl_chole', '혈색소':'hmg', '요단백':'olig_prote_cd',
                          '혈청크레아티닌':'creatinine','(혈청지오티)AST':'sgot_ast', '(혈청지오티)ALT':'sgot_alt', '감마지티피':'gamma_gtp', 
                          '흡연상태':'smk_type', '음주여부':'drk_yn', '구강검진수검여부':'hchk_yn',
                          '치아우식증유무':'crs_yn', '결손치유무':'tth_yn', '치아마모증유무':'odt_yn',
                          '제3대구치(사랑니)이상':'wsdm_dis_yn', '치석':'ttr_yn', '데이터공개일자':'data_dt'}, inplace = True)

In [23]:
df_test.rename(columns={'기준년도' :'year', '가입자일련번호':'id', '시도코드':'sido', 
                          '신장(5Cm단위)':'height', '체중(5Kg단위)':'weight', '허리둘레' : 'waist',
                          '시력(좌)':'sight_L', '시력(우)':'sight_R', '청력(좌)':'hear_L', '청력(우)':'hear_R',
                          '수축기혈압':'bp_high', '이완기혈압':'bp_low', '식전혈당(공복혈당)':'blds',
                          '총콜레스테롤':'tot_chole', '트리글리세라이드':'triglyceride', 'HDL콜레스테롤':'hdl_chole', 
                          'LDL콜레스테롤':'ldl_chole', '혈색소':'hmg', '요단백':'olig_prote_cd',
                          '혈청크레아티닌':'creatinine','(혈청지오티)AST':'sgot_ast', '(혈청지오티)ALT':'sgot_alt', '감마지티피':'gamma_gtp', 
                          '흡연상태':'smk_type', '음주여부':'drk_yn', '구강검진수검여부':'hchk_yn',
                          '치아우식증유무':'crs_yn', '결손치유무':'tth_yn', '치아마모증유무':'odt_yn',
                          '제3대구치(사랑니)이상':'wsdm_dis_yn', '치석':'ttr_yn', '데이터공개일자':'data_dt'}, inplace = True)

In [24]:
print(df_train.columns)
print(df_test.columns)

Index(['year', 'id', 'target_sex', 'target_age', 'sido', 'height', 'weight',
       'waist', 'sight_L', 'sight_R', 'hear_L', 'hear_R', 'bp_high', 'bp_low',
       'blds', 'tot_chole', 'triglyceride', 'hdl_chole', 'ldl_chole', 'hmg',
       'olig_prote_cd', 'creatinine', 'sgot_ast', 'sgot_alt', 'gamma_gtp',
       'smk_type', 'drk_yn', 'hchk_yn', 'crs_yn', 'tth_yn', 'odt_yn',
       'wsdm_dis_yn', 'ttr_yn', 'data_dt'],
      dtype='object')
Index(['year', 'id', 'sido', 'height', 'weight', 'waist', 'sight_L', 'sight_R',
       'hear_L', 'hear_R', 'bp_high', 'bp_low', 'blds', 'tot_chole',
       'triglyceride', 'hdl_chole', 'ldl_chole', 'hmg', 'olig_prote_cd',
       'creatinine', 'sgot_ast', 'sgot_alt', 'gamma_gtp', 'smk_type', 'drk_yn',
       'hchk_yn', 'crs_yn', 'tth_yn', 'odt_yn', 'wsdm_dis_yn', 'ttr_yn',
       'data_dt'],
      dtype='object')


In [25]:
# data_dt는 의미가 없어서 drop하겠음
df_train.drop('data_dt', axis=1, inplace=True)
df_test.drop('data_dt', axis=1, inplace=True)

In [26]:
df_train.shape, df_test.shape

((900000, 33), (100000, 31))

### 2) 결측치 확인

In [27]:
for col in df_train.columns:
    msperc = 'column: {:>10}\t Percent of NaN value: {:.2f}%'.format(col, 100 * (df_train[col].isnull().sum() / df_train[col].shape[0]))
    print(msperc)

column:       year	 Percent of NaN value: 0.00%
column:         id	 Percent of NaN value: 0.00%
column: target_sex	 Percent of NaN value: 0.00%
column: target_age	 Percent of NaN value: 0.00%
column:       sido	 Percent of NaN value: 0.00%
column:     height	 Percent of NaN value: 0.00%
column:     weight	 Percent of NaN value: 0.00%
column:      waist	 Percent of NaN value: 0.03%
column:    sight_L	 Percent of NaN value: 0.02%
column:    sight_R	 Percent of NaN value: 0.02%
column:     hear_L	 Percent of NaN value: 0.02%
column:     hear_R	 Percent of NaN value: 0.02%
column:    bp_high	 Percent of NaN value: 0.00%
column:     bp_low	 Percent of NaN value: 0.00%
column:       blds	 Percent of NaN value: 0.00%
column:  tot_chole	 Percent of NaN value: 0.00%
column: triglyceride	 Percent of NaN value: 0.00%
column:  hdl_chole	 Percent of NaN value: 0.00%
column:  ldl_chole	 Percent of NaN value: 0.29%
column:        hmg	 Percent of NaN value: 0.01%
column: olig_prote_cd	 Percent of NaN 

In [28]:
for col in df_test.columns:
    msperc = 'column: {:>10}\t Percent of NaN value: {:.2f}%'.format(col, 100 * (df_test[col].isnull().sum() / df_test[col].shape[0]))
    print(msperc)

column:       year	 Percent of NaN value: 0.00%
column:         id	 Percent of NaN value: 0.00%
column:       sido	 Percent of NaN value: 0.00%
column:     height	 Percent of NaN value: 0.00%
column:     weight	 Percent of NaN value: 0.00%
column:      waist	 Percent of NaN value: 0.03%
column:    sight_L	 Percent of NaN value: 0.02%
column:    sight_R	 Percent of NaN value: 0.02%
column:     hear_L	 Percent of NaN value: 0.02%
column:     hear_R	 Percent of NaN value: 0.02%
column:    bp_high	 Percent of NaN value: 0.00%
column:     bp_low	 Percent of NaN value: 0.00%
column:       blds	 Percent of NaN value: 0.00%
column:  tot_chole	 Percent of NaN value: 0.01%
column: triglyceride	 Percent of NaN value: 0.01%
column:  hdl_chole	 Percent of NaN value: 0.01%
column:  ldl_chole	 Percent of NaN value: 0.28%
column:        hmg	 Percent of NaN value: 0.01%
column: olig_prote_cd	 Percent of NaN value: 0.42%
column: creatinine	 Percent of NaN value: 0.00%
column:   sgot_ast	 Percent of NaN 

In [29]:
# crs_yn tth_yn odt_yn wsdm_dis_yn : 100퍼.
# 지움

df_train.drop(['crs_yn', 'tth_yn','odt_yn','wsdm_dis_yn'],axis = 1, inplace=True)
df_test.drop(['crs_yn', 'tth_yn','odt_yn','wsdm_dis_yn'],axis = 1, inplace=True)

In [30]:
df_train.shape, df_test.shape

((900000, 29), (100000, 27))

### 3) 이상치 확인

In [31]:
# 기본적으로 이상치는 numerical 데이터에서 확인될 수 있기 때문에
numerical_features = df_train.dtypes[df_train.dtypes != object].index
print(len(numerical_features), numerical_features)
print('*'*100)
categorical_features = df_train.dtypes[df_train.dtypes == object].index
print(len(categorical_features), categorical_features)

# 29개 맞음

24 Index(['year', 'target_sex', 'sido', 'height', 'weight', 'waist', 'sight_L',
       'sight_R', 'hear_L', 'hear_R', 'bp_high', 'bp_low', 'blds', 'tot_chole',
       'triglyceride', 'hdl_chole', 'ldl_chole', 'hmg', 'olig_prote_cd',
       'creatinine', 'sgot_ast', 'sgot_alt', 'gamma_gtp', 'smk_type'],
      dtype='object')
****************************************************************************************************
5 Index(['id', 'target_age', 'drk_yn', 'hchk_yn', 'ttr_yn'], dtype='object')


In [32]:
# 여기서 n이 뭘뜻하지? - 확인해둘 것..........
# 그리고 왜 test데이터에 대해서는 확인 안하지?


def detect_outliers(df, n, features):
    outlier_indices = []
    for col in features:
        Q1 = np.percentile(df[col], 25)
        Q3 = np.percentile(df[col], 75)
        IQR = Q3 - Q1
        
        outlier_step = 1.5*IQR
        
        outlier_list_col = df[(df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step)].index
        outlier_indices.extend(outlier_list_col)
        
    outlier_indices = Counter(outlier_indices)
    multiple_outliers = list(k for k,v in outlier_indices.items() if v> n)
    
    return multiple_outliers

Outlier_to_drop = detect_outliers(df_train, 2,['height', 'weight', 'waist', 'sight_L',
       'sight_R','bp_high', 'bp_low', 'blds', 'tot_chole',
       'triglyceride', 'hdl_chole', 'ldl_chole', 'hmg', 'olig_prote_cd',
       'creatinine', 'sgot_ast', 'sgot_alt', 'gamma_gtp', 'smk_type'])

In [33]:
df_train.loc[Outlier_to_drop] # 0개

Unnamed: 0,year,id,target_sex,target_age,sido,height,weight,waist,sight_L,sight_R,...,hmg,olig_prote_cd,creatinine,sgot_ast,sgot_alt,gamma_gtp,smk_type,drk_yn,hchk_yn,ttr_yn


In [34]:
df_train.shape

(900000, 29)

In [None]:
# 여기까지 데이터 저장 
df_train.to_csv('dataset/medical-checkup_2017/train.csv',index=False,encoding='CP949')
df_test.to_csv('dataset/medical-checkup_2017/test.csv',index=False,encoding='CP949')

### 4) 비대칭성 확인

이번 target 변수는 범주형 자료임
이 데이터의 비대칭성도 확인해봐야하는 걸까?

일단은 수치형 변수들에 대한 비대칭성을 확인해보자


In [35]:
for col in numerical_features:
    print('{:15}'.format(col),
          'Skewness : {:05.2f}'.format(df_train[col].skew()),
         '  ', 
         'Kurtosis : {:06.2f}'.format(df_train[col].kurt()),)

year            Skewness : 00.00    Kurtosis : 000.00
target_sex      Skewness : 00.12    Kurtosis : -01.99
sido            Skewness : -0.72    Kurtosis : -00.82
height          Skewness : -0.02    Kurtosis : -00.53
weight          Skewness : 00.58    Kurtosis : 000.36
waist           Skewness : 26.32    Kurtosis : 2041.14
sight_L         Skewness : 09.98    Kurtosis : 143.73
sight_R         Skewness : 10.02    Kurtosis : 144.72
hear_L          Skewness : 05.34    Kurtosis : 026.50
hear_R          Skewness : 05.45    Kurtosis : 027.66
bp_high         Skewness : 00.48    Kurtosis : 000.99
bp_low          Skewness : 00.40    Kurtosis : 000.89
blds            Skewness : 04.60    Kurtosis : 040.22
tot_chole       Skewness : 01.61    Kurtosis : 052.07
triglyceride    Skewness : 06.83    Kurtosis : 184.80
hdl_chole       Skewness : 153.72    Kurtosis : 59659.65
ldl_chole       Skewness : 05.45    Kurtosis : 516.24
hmg             Skewness : -0.39    Kurtosis : 000.71
olig_prote_cd   Skewness

## EDA

In [None]:
df_train = pd.read_csv('dataset/medical-checkup_2017/train.csv')
df_test = pd.read_csv('dataset/medical-checkup_2017/test.csv')

In [None]:
df_train.shape, df_test.shape

### 1) numerical feature
heat map, 상관관계 파악
데이터 형태 파악  

1) target_sex  

2) target_age

In [None]:
numerical_features = df_train[['']]

### 2) categorical feature
박스플랏, count 그려보고 치우친 데이터나 결측치 어떻게 처리할 지 파악
데이터에 대한 정보 전달