## 파일 업로드/불러오기 ##

In [None]:
# Record the interpreter and library versions used for this analysis.

import sys
import pandas as pd
import numpy as np

# One "<name> version: <value>" line per component.
for label, version in (
    ("Python", sys.version),
    ("pandas", pd.__version__),
    ("numpy", np.__version__),
):
    print(f"{label} version:", version)

Python version: 3.9.6 (default, Mar 12 2025, 20:22:46) 
[Clang 17.0.0 (clang-1700.0.13.3)]
pandas version: 2.2.3
numpy version: 1.24.3


In [10]:
import pandas as pd

# Load the raw dataset: try UTF-8 first and fall back to cp949 if decoding fails.
try:
    df = pd.read_csv('Health_2023.csv', encoding='utf-8')
except UnicodeDecodeError:
    df = pd.read_csv('Health_2023.csv', encoding='cp949')

# Drop the columns that are not needed, then discard every row whose
# '총콜레스테롤' (total cholesterol) value is missing.
df = (
    df
    .drop(columns=['결손치 유무', '치아마모증유무', '제3대구치(사랑니) 이상'])
    .dropna(subset=['총콜레스테롤'])
)

# Save the cleaned frame; the index is not written to the file.
df.to_csv('health_2023_cleaned.csv', index=False)

In [11]:
# Reload the cleaned dataset from disk so the following cells work from the saved file.
cleaned_path = 'health_2023_cleaned.csv'
df = pd.read_csv(cleaned_path)

# Print the column dtypes and non-null counts, then preview the first rows
# (the last expression is what the notebook displays).
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 338606 entries, 0 to 338605
Data columns (total 30 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   기준년도         338606 non-null  int64  
 1   가입자일련번호      338606 non-null  int64  
 2   시도코드         338606 non-null  int64  
 3   성별코드         338606 non-null  int64  
 4   연령대코드(5세단위)  338606 non-null  int64  
 5   신장(5cm단위)    338606 non-null  int64  
 6   체중(5kg단위)    338606 non-null  int64  
 7   허리둘레         338590 non-null  float64
 8   시력(좌)        338545 non-null  float64
 9   시력(우)        338546 non-null  float64
 10  청력(좌)        338559 non-null  float64
 11  청력(우)        338559 non-null  float64
 12  수축기혈압        338604 non-null  float64
 13  이완기혈압        338604 non-null  float64
 14  식전혈당(공복혈당)   338606 non-null  float64
 15  총콜레스테롤       338606 non-null  float64
 16  트리글리세라이드     338605 non-null  float64
 17  HDL콜레스테롤     338605 non-null  float64
 18  LDL콜레스테롤     332753 non-

Unnamed: 0,기준년도,가입자일련번호,시도코드,성별코드,연령대코드(5세단위),신장(5cm단위),체중(5kg단위),허리둘레,시력(좌),시력(우),...,요단백,혈청크레아티닌,혈청지오티(AST),혈청지피티(ALT),감마지티피,흡연상태,음주여부,구강검진수검여부,치아우식증유무,치석
0,2023,34735,46,2,9,155,70,92.0,1.2,1.2,...,1.0,0.5,26.0,24.0,50.0,1.0,1.0,0,,
1,2023,362482,36,2,13,150,65,96.0,1.0,0.8,...,1.0,1.1,22.0,29.0,24.0,1.0,1.0,0,,
2,2023,653166,11,1,13,160,70,85.0,1.0,1.2,...,1.0,0.9,22.0,21.0,27.0,3.0,1.0,1,0.0,0.0
3,2023,722514,49,1,10,165,80,89.0,1.2,1.2,...,1.0,0.7,53.0,69.0,53.0,2.0,1.0,0,,
4,2023,2555336,28,2,13,150,70,97.0,1.5,1.5,...,1.0,0.6,41.0,45.0,61.0,1.0,1.0,0,,


In [12]:
import matplotlib.pyplot as plt

# Korean font setup (macOS: AppleGothic) so Hangul column names render in titles.
plt.rc('font', family='AppleGothic')
# Draw negative signs with an ASCII hyphen instead of the Unicode minus glyph.
plt.rc('axes', unicode_minus=False)

# Select numeric columns only: a box plot cannot be drawn for non-numeric data,
# so iterating over every column would fail on the first non-numeric one.
numeric_cols = df.select_dtypes(include='number').columns

# Draw one box plot per numeric column and save each to its own PNG file.
for col in numeric_cols:
    fig, ax = plt.subplots(figsize=(8, 4))
    df.boxplot(column=col, ax=ax)
    ax.set_title(f'Boxplot of {col} (Outlier Visualization)')
    fig.tight_layout()
    fig.savefig(f'boxplot_{col}.png')
    # Close the figure explicitly so figures do not pile up in memory across the loop.
    plt.close(fig)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Korean font setup (macOS: AppleGothic) so Hangul labels render correctly.
plt.rc('font', family='AppleGothic')
plt.rc('axes', unicode_minus=False)

# Keep only the float64 / int64 columns.
df_numeric = df.select_dtypes(include=['float64', 'int64'])

# Plot a histogram with a KDE overlay for each column and save it as a PNG.
for col in df_numeric.columns:
    fig, ax = plt.subplots(figsize=(8, 4))
    observed = df_numeric[col].dropna()  # missing values are left out of the plot
    sns.histplot(observed, kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('빈도')
    fig.tight_layout()
    fig.savefig(f'distribution_{col}.png')
    plt.close(fig)

## 이상치 처리 과정 ##

In [13]:
# Show the 50 largest 감마지티피 (gamma-GTP) values to inspect the upper tail.
columns_to_show = ['감마지티피']  # add more columns to display as needed
ggt_top50 = df[columns_to_show].nlargest(50, '감마지티피')
print(ggt_top50)

         감마지티피
17261   9999.0
227406  9999.0
278666  9999.0
22442   2951.0
206432  2700.0
147081  2680.0
269171  2645.0
2472    2525.0
144549  2520.0
104407  2459.0
202447  2349.0
258065  2315.0
288102  2236.0
175799  2202.0
122735  2199.0
53389   2112.0
323011  2100.0
160951  1966.0
318330  1966.0
327664  1943.0
197683  1785.0
92717   1745.0
242273  1662.0
248667  1657.0
39569   1655.0
140663  1638.0
10875   1633.0
239109  1584.0
332510  1584.0
167028  1560.0
328176  1548.0
259054  1532.0
105553  1526.0
24800   1503.0
231561  1490.0
278247  1484.0
49904   1482.0
31817   1479.0
88996   1472.0
252951  1465.0
179654  1458.0
307824  1441.0
82484   1422.0
152716  1395.0
129354  1392.0
307713  1388.0
74921   1383.0
227254  1362.0
49114   1359.0
260621  1355.0


In [14]:
# Remove rows whose 감마지티피 equals 9999.0. This looks like a placeholder rather than
# a measurement (the next-largest recorded value is 2951.0) - TODO confirm against
# the data dictionary. Rows with a missing value are kept.
df = df.loc[df['감마지티피'].ne(9999.0)]

In [None]:
# Look for outliers in the 감마지티피 (gamma-GTP) column using the IQR rule.
col = '감마지티피'
values = df[col]
Q1 = values.quantile(0.25)
Q3 = values.quantile(0.75)
IQR = Q3 - Q1

# Fences sit 1.5 * IQR below the first quartile and above the third quartile.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# A row is flagged when its value lies strictly outside the fences.
is_outlier = values.lt(lower_bound) | values.gt(upper_bound)
outliers = df[is_outlier]
print(f"{col} 이상치 개수:", len(outliers))
print(lower_bound, upper_bound)
print(outliers[[col]])


감마지티피 이상치 개수: 30157
-18.5 73.5
        감마지티피
20       99.0
25      198.0
46      127.0
71      142.0
83      151.0
...       ...
338564   79.0
338582   91.0
338594  132.0
338595  129.0
338605  160.0

[30157 rows x 1 columns]


In [None]:
# Drop the gamma-GTP outliers found by the IQR rule: keep only rows whose value lies
# within [lower_bound, upper_bound] (both ends inclusive). Relies on `col`,
# `lower_bound` and `upper_bound` from the IQR cell. Rows with a missing value in
# `col` are removed as well, because they do not satisfy the range test.
# NOTE(review): the recorded output of the next cell still lists values far above
# upper_bound, which suggests this cell was not run in the saved session - confirm.
df = df[df[col].between(lower_bound, upper_bound)]

In [15]:
# Re-check the 50 largest 감마지티피 (gamma-GTP) values after the cleanup steps.
columns_to_show = ['감마지티피']  # add more columns to display as needed
ggt_top50 = df[columns_to_show].nlargest(50, '감마지티피')
print(ggt_top50)

         감마지티피
22442   2951.0
206432  2700.0
147081  2680.0
269171  2645.0
2472    2525.0
144549  2520.0
104407  2459.0
202447  2349.0
258065  2315.0
288102  2236.0
175799  2202.0
122735  2199.0
53389   2112.0
323011  2100.0
160951  1966.0
318330  1966.0
327664  1943.0
197683  1785.0
92717   1745.0
242273  1662.0
248667  1657.0
39569   1655.0
140663  1638.0
10875   1633.0
239109  1584.0
332510  1584.0
167028  1560.0
328176  1548.0
259054  1532.0
105553  1526.0
24800   1503.0
231561  1490.0
278247  1484.0
49904   1482.0
31817   1479.0
88996   1472.0
252951  1465.0
179654  1458.0
307824  1441.0
82484   1422.0
152716  1395.0
129354  1392.0
307713  1388.0
74921   1383.0
227254  1362.0
49114   1359.0
260621  1355.0
327946  1347.0
236736  1341.0
312431  1339.0


In [18]:
# Show the 10 rows with the highest 수축기혈압 (systolic pressure), with the
# 이완기혈압 (diastolic pressure) alongside.
columns_to_show = ['수축기혈압', '이완기혈압']  # add more columns to display as needed
high_bp_top10 = df[columns_to_show].nlargest(10, '수축기혈압')
print(high_bp_top10)

        수축기혈압  이완기혈압
274421  255.0  159.0
112699  230.0  120.0
132212  230.0  140.0
145685  228.0  137.0
15777   226.0  136.0
29784   226.0  103.0
62115   223.0   94.0
210374  223.0  110.0
238775  223.0  113.0
216800  222.0  126.0


In [19]:
# Remove the row holding the highest 수축기혈압 (systolic pressure) reading;
# in the recorded run this is the 255.0 / 159.0 record.
max_idx = df['수축기혈압'].idxmax()
df = df.drop(labels=max_idx, axis='index')

In [20]:
# Re-check the 10 highest 수축기혈압 (systolic pressure) rows after dropping the maximum.
columns_to_show = ['수축기혈압', '이완기혈압']  # add more columns to display as needed
high_bp_top10 = df[columns_to_show].nlargest(10, '수축기혈압')
print(high_bp_top10)

        수축기혈압  이완기혈압
112699  230.0  120.0
132212  230.0  140.0
145685  228.0  137.0
15777   226.0  136.0
29784   226.0  103.0
62115   223.0   94.0
210374  223.0  110.0
238775  223.0  113.0
216800  222.0  126.0
254854  222.0  138.0


In [22]:
# Show the 20 rows with the largest 시력(우) (right-eye vision) values,
# with 시력(좌) (left-eye vision) alongside.
columns_to_show = ['시력(우)', '시력(좌)']  # add more columns to display as needed
vision_top20 = df[columns_to_show].nlargest(20, '시력(우)')
print(vision_top20)

      시력(우)  시력(좌)
131     9.9    9.9
992     9.9    0.8
1364    9.9    9.9
1462    9.9    1.0
1849    9.9    0.8
2403    9.9    1.0
2423    9.9    0.8
2633    9.9    0.8
3615    9.9    9.9
4303    9.9    9.9
4731    9.9    9.9
4849    9.9    0.9
5898    9.9    0.7
5941    9.9    1.5
6069    9.9    9.9
6276    9.9    0.5
6581    9.9    9.9
6637    9.9    0.9
6652    9.9    1.5
6888    9.9    0.5


In [None]:
# Vision values of 9.9 are treated as blindness and recoded to 0
# (per the accompanying Hangul document).

for eye_col in ('시력(우)', '시력(좌)'):
    df[eye_col] = df[eye_col].replace(9.9, 0)

In [26]:
# Show the 20 largest 식전혈당(공복혈당) (fasting blood glucose) values.
columns_to_show = ['식전혈당(공복혈당)']  # add more columns to display as needed
glucose_top20 = df[columns_to_show].nlargest(20, '식전혈당(공복혈당)')
print(glucose_top20)

        식전혈당(공복혈당)
120923       873.0
27265        853.0
288180       762.0
181047       760.0
54185        676.0
72500        626.0
46789        619.0
225843       599.0
150583       585.0
324556       579.0
20662        530.0
81810        517.0
83205        513.0
319484       511.0
33413        509.0
127100       509.0
181362       500.0
9634         495.0
186827       487.0
33840        477.0
