In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
# Global matplotlib defaults for every figure in this notebook.
# AppleGothic is a macOS font (presumably chosen so Korean labels render;
# on other platforms matplotlib will fall back with a warning).
# axes.unicode_minus=False makes tick labels use an ASCII hyphen for negative
# numbers instead of the Unicode minus sign.
plt.rcParams.update({
    'font.family': 'AppleGothic',
    'font.size': 12.,
    'xtick.labelsize': 12.,
    'ytick.labelsize': 12.,
    'axes.unicode_minus': False,
})

In [3]:
# Naming pieces, presumably for saved chart images (their use is not visible
# in this part of the notebook): base name, running counter, extension and
# separator.
CHART_NAME = 'seabornWelfare'
cnt = 0
PNG = '.png'
UNDERBAR = '_'

# Raw survey data, relative to the notebook's working directory.
filename = '../data/welfare_python.csv'

In [4]:
# Load the raw survey table.
welfare = pd.read_csv(filename, encoding='utf-8')

#@ data preprocessing
# gender: swap the numeric codes for labels (1 -> '남성', 2 -> '여성').
# Series.replace builds a new column, so no string is written into the
# existing numeric column through .loc. That in-place upcast is deprecated
# since pandas 2.1 and was only silent here because FutureWarning is filtered
# out at the top of the notebook. Values other than 1 and 2 are left as-is,
# exactly as before.
welfare['gender'] = welfare['gender'].replace({1: '남성', 2: '여성'})
print(""); print("<Gender 변경>")
print(welfare)


# age: derived from the birth year against a fixed reference year.
# NOTE(review): the +1 presumably follows Korean age counting (a person is 1
# in their birth year) - confirm this is the intended definition.
thisyear = 2021
welfare['age'] = thisyear - welfare['birth'] + 1
print(""); print("<age 컬럼 추가>")
print(welfare)


# marriage
def setMarriage(x):
    """Translate a marriage code into its label.

    1 -> '결혼', 3 -> '이혼'. Every other value, including missing values,
    is reported as '무응답'.
    """
    for code, label in ((1, '결혼'), (3, '이혼')):
        if x == code:
            return label
    # Any other code, or a missing value, ends up here.
    return '무응답'

# Replace the marriage codes with their labels, element by element.
welfare['marriage'] = welfare['marriage'].map(setMarriage)


# income: report the number of missing values, fill them with the column
# mean (computed over the non-missing values), then report again.
print('\n# 월급 결측치 개수 구하기 before')
print(welfare['income'].isnull().sum())
welfare['income'] = welfare['income'].fillna(welfare['income'].mean())
print('\n# 월급 결측치 개수 구하기 after')
print(welfare['income'].isnull().sum())


# religion
def setReligion_txt(x):
    """Translate a religion code into '있음' (code 1) or '없음' (anything else).

    The argument is converted with int() first, so a value int() cannot
    convert (for example NaN) raises instead of being labelled.
    """
    return '있음' if int(x) == 1 else '없음'
# Show the raw religion codes, then replace them with their labels.
print("welfare['religion'].unique() : ", welfare['religion'].unique())
welfare['religion'] = welfare['religion'].map(setReligion_txt)
print("", "<religion 변경>", sep="\n")
print(welfare)


# code_job: join the job lookup table onto the survey rows by job code.
job_file = '../data/welfare_job.csv'
jobframe = pd.read_csv(job_file, encoding='cp949')

print("welfare['code_job'].unique()", welfare['code_job'].unique())
print('\n# merge()함수와 left_on, rigon_on 사용하기')
# NOTE(review): no `how=` is passed, so pandas uses its default inner join.
# Rows whose code_job has no match in jobframe are dropped from `welfare`
# from here on - confirm that is intended.
welfare = welfare.merge(jobframe, left_on='code_job', right_on='code_job')
print(welfare)


# code_religion: show the raw codes before they are replaced with labels.
print(
    "welfare['code_religion'].unique()",
    welfare['code_religion'].unique(),
)
def setReligion_txt(x):
    """Translate a numeric code (1-7) into its region label.

    The argument is converted with int() before the lookup. Codes outside
    1-7 return None.

    NOTE(review): despite its name this helper produces region labels, and it
    reuses the name of the religion helper defined earlier in this cell,
    replacing it from this point on. Consider giving it its own name.
    """
    region_labels = {
        1: '서울',
        2: '수도권',
        3: '부산/경남/울산',
        4: '대구/경북',
        5: '대전/충남',
        6: '강원/충북',
        7: '광주/전남/전북/제주도',
    }
    return region_labels.get(int(x))
# Replace each code with the label returned by the helper defined just above.
welfare['code_religion'] = welfare['code_religion'].map(setReligion_txt)


# ageg: age-group column
## under 30 -> '청년', 30 to 59 -> '중년', 60 and over -> '노년'
def newAge(x):
    """Classify an age into '청년' (< 30), '중년' (30-59) or '노년' (otherwise).

    Anything that satisfies neither range test, NaN included, falls through
    to '노년'.
    """
    if x < 30:
        return '청년'
    if 30 <= x < 60:
        return '중년'
    return '노년'
# Label every row with its age group.
welfare['ageg'] = welfare['age'].map(newAge)

print(welfare[['age', 'ageg']].head())


# Rename the columns to their Korean display names.
# ('job' is not created in this cell's visible code; presumably it comes from
# the merged job table.)
col_mapping = {
    'gender': '성별',
    'birth': '생일',
    'marriage': '결혼 유무',
    'religion': '종교 유무',
    'code_job': '직업 코드',
    'income': '소득',
    'code_religion': '지역구',
    'age': '나이',
    'job': '직업',
    'ageg': '연령대',
}
welfare = welfare.rename(columns=col_mapping)

# Save the cleaned table as a CSV file (cp949-encoded, without the index).
welfare.to_csv('welfareClean.csv', index=False, encoding='cp949')


<Gender 변경>
      gender  birth  marriage  religion  code_job  income  code_religion
0         여성   1936         2         2       NaN     NaN              1
1         여성   1945         2         2       NaN     NaN              1
2         남성   1948         2         2     942.0   120.0              1
3         남성   1942         3         1     762.0   200.0              1
4         여성   1923         2         1       NaN     NaN              1
...      ...    ...       ...       ...       ...     ...            ...
16659     여성   1967         1         1       NaN     NaN              5
16660     여성   1992         5         1     314.0   302.5              5
16661     남성   1995         5         1       NaN     NaN              5
16662     여성   1998         0         1       NaN     NaN              5
16663     남성   2001         0         1       NaN     NaN              5

[16664 rows x 7 columns]

<age 컬럼 추가>
      gender  birth  marriage  religion  code_job  income  code_religion