In [1]:
# base tools
import os
import pandas as pd
# Show up to 50 columns / 100 rows when a frame is displayed.
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows',100)
import numpy as np
from sklearn.model_selection import train_test_split
import copy

import warnings
# Silences every warning from this point on (including the imports below).
warnings.filterwarnings('ignore')

# visualization
from IPython.display import display
import matplotlib.pyplot as plt
from matplotlib import font_manager, rc
# Gulim is a Korean font shipped with Windows; presumably chosen so Hangul
# labels render in matplotlib figures. The file does not exist on other
# systems, so only apply it when present instead of failing the setup cell.
font_path = "C:/Windows/Fonts/gulim.ttc"
if os.path.isfile(font_path):
    font = font_manager.FontProperties(fname=font_path).get_name()
    rc('font', family=font)
else:
    # print rather than warnings.warn: warnings are globally ignored above.
    font = None
    print(f"Font file not found: {font_path}; keeping matplotlib's default font.")
import seaborn as sns

import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
# Use the white-background plotly theme for every figure.
pio.templates.default = "plotly_white"


In [2]:
def summary(df, pred=None):
    """Build a per-column overview table for a DataFrame.

    Prints the frame's shape and how many columns there are of each dtype,
    then returns a table with one row per column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.
    pred : column label, optional
        A numeric column of ``df``. When given, the result gets an extra
        ``'Corr <pred>'`` column with each numeric column's correlation
        against it (``DataFrame.corrwith`` default method); non-numeric
        columns get NaN there. Previously any non-None value crashed with
        ``UnboundLocalError`` because only the ``pred is None`` path existed.

    Returns
    -------
    pandas.DataFrame
        Indexed by the column names of ``df`` with the columns
        ``Types, Counts, Uniques, Nulls, Min, Max`` (plus ``Corr <pred>``
        when ``pred`` is given).

    Raises
    ------
    ValueError
        If ``pred`` is given but is not a numeric column of ``df``.
    """
    stats = {
        'Types': df.dtypes,
        'Counts': df.count(),                  # non-null values per column
        'Uniques': df.nunique(dropna=False),   # NaN counts as a distinct value
        'Nulls': df.isnull().sum(),
        'Min': df.min(),
        'Max': df.max(),
    }
    print('Data shape:', df.shape)

    if pred is not None:
        numeric = df.select_dtypes(include='number')
        if pred not in numeric.columns:
            raise ValueError(
                f"pred={pred!r} must be a numeric column of df to compute correlations"
            )
        stats[f'Corr {pred}'] = numeric.corrwith(df[pred])

    # sort=True orders the rows (original column names) when aligning the pieces.
    st = pd.concat(list(stats.values()), axis=1, sort=True)
    st.columns = list(stats.keys())

    print('___________________________\nData Types:')
    print(st['Types'].value_counts())
    print('___________________________')
    return st

In [7]:
# Load the health-screening CSV (file name is Korean); the file is read with
# the cp949 encoding. read_csv already returns a DataFrame, so the former
# pd.DataFrame(raw_data) re-wrap was redundant and has been dropped.
raw_data = pd.read_csv('./국민건강보험공단_건강검진정보_20211229.CSV', encoding='cp949')

In [9]:
# Size check: (rows, columns) of the loaded frame.
raw_data.shape

(1000000, 31)

In [12]:
# Keep only the columns from position 3 up to (not including) the last one,
# i.e. drop the first three columns and the final column.
# NOTE(review): raw_data is reassigned from itself, so re-running this cell
# trims further columns each time; run it once per fresh load.
raw_data  = raw_data.iloc[:,3:-1]
# Preview the first 10 rows of the trimmed frame.
raw_data.head(10)

Unnamed: 0,성별코드,연령대 코드(5세단위),신장(5Cm단위),체중(5Kg 단위),허리둘레,시력(좌),시력(우),청력(좌),청력(우),수축기 혈압,이완기 혈압,식전혈당(공복혈당),총 콜레스테롤,트리글리세라이드,HDL 콜레스테롤,LDL 콜레스테롤,혈색소,요단백,혈청크레아티닌,(혈청지오티)AST,(혈청지오티)ALT,감마 지티피,흡연상태,음주여부,구강검진 수검여부,치아우식증유무,치석
0,1,9,165,60,72.1,1.2,1.5,1.0,1.0,127.0,79.0,90.0,188.0,58.0,58.0,118.0,15.0,1.0,1.1,21.0,27.0,21.0,1.0,0.0,0,,
1,2,13,150,65,81.0,0.8,0.8,1.0,1.0,110.0,73.0,87.0,,,,,12.7,1.0,0.5,18.0,15.0,15.0,1.0,0.0,0,,
2,2,12,155,55,70.0,0.6,0.7,1.0,1.0,123.0,80.0,102.0,,,,,12.8,1.0,0.7,27.0,25.0,7.0,1.0,0.0,0,,
3,1,13,160,70,90.8,1.0,1.0,1.0,2.0,134.0,84.0,146.0,,,,,16.4,1.0,1.2,65.0,97.0,72.0,1.0,0.0,1,0.0,0.0
4,2,12,155,50,75.2,1.5,1.2,1.0,1.0,144.0,89.0,110.0,220.0,171.0,53.0,133.0,12.4,1.0,0.7,18.0,17.0,14.0,1.0,0.0,0,,
5,1,9,185,85,94.0,1.2,1.2,1.0,1.0,114.0,72.0,86.0,234.0,183.0,50.0,147.0,16.4,1.0,1.1,25.0,32.0,26.0,3.0,1.0,0,,
6,1,9,165,80,93.0,0.8,0.7,1.0,2.0,112.0,73.0,250.0,119.0,265.0,26.0,40.0,15.7,1.0,0.7,18.0,20.0,35.0,3.0,1.0,1,0.0,1.0
7,1,13,160,65,92.0,0.5,0.5,1.0,1.0,131.0,79.0,93.0,,,,,15.9,1.0,1.2,18.0,17.0,19.0,3.0,0.0,1,0.0,2.0
8,2,17,150,50,82.0,0.4,0.5,2.0,2.0,136.0,65.0,104.0,177.0,61.0,63.0,101.0,13.3,1.0,0.7,42.0,48.0,39.0,1.0,0.0,0,,
9,2,14,150,45,71.2,0.8,0.7,2.0,1.0,124.0,83.0,85.0,,,,,13.2,1.0,0.6,22.0,11.0,10.0,1.0,0.0,0,,


In [13]:
# Per-column overview (dtype, non-null count, unique count, null count, min, max)
# using the summary() helper defined earlier in this notebook.
summary(raw_data)

Data shape: (1000000, 27)
___________________________
Data Types:
float64    22
int64       5
Name: Types, dtype: int64
___________________________


Unnamed: 0,Types,Counts,Uniques,Nulls,Min,Max
(혈청지오티)ALT,float64,992398,638,7602,1.0,5990.0
(혈청지오티)AST,float64,992399,581,7601,1.0,8712.0
HDL 콜레스테롤,float64,402315,341,597685,1.0,960.0
LDL 콜레스테롤,float64,394471,363,605529,1.0,2395.0
감마 지티피,float64,992397,972,7603,1.0,2630.0
구강검진 수검여부,int64,1000000,2,0,0.0,1.0
성별코드,int64,1000000,2,0,1.0,2.0
수축기 혈압,float64,992468,175,7532,64.0,260.0
시력(우),float64,999748,25,252,0.1,9.9
시력(좌),float64,999743,25,257,0.1,9.9


In [15]:
# Frequency of each value in the '요단백' column (Korean for urine protein).
# value_counts() leaves NaN out by default, so missing rows are not listed.
raw_data['요단백'].value_counts()

1.0    914825
2.0     39974
3.0     22219
4.0      7652
5.0      2465
6.0       724
Name: 요단백, dtype: int64