# Creating the Dataset

In [48]:
import tabula
import pandas as pd
import numpy as np
import re

In [49]:
hanja_file = 'hanja.pdf'
list_hanja = tabula.read_pdf(hanja_file, pages = "all", multiple_tables = True)

In [50]:
len(list_hanja)

53

In [51]:
type(list_hanja)

list

In [52]:
list_hanja[0]

Unnamed: 0,0,1
0,,한문 교육용 인명용 추가 한자 및 허용 한자
1,한글,기초한자
2,,(2007.8.현재) 별표1 별표2
3,,家佳街可歌 嘉嫁稼賈駕伽迦柯呵哥枷珂痂苛茄袈訶跏軻哿嘏舸
4,가,
5,,加價假架暇 珈
6,,各角脚閣却
7,각,珏恪殼愨 愨(慤)
8,,覺刻
9,,干間看刊肝


In [53]:
type(list_hanja[0])

pandas.core.frame.DataFrame

The list_hanja is a list made up of 53 dataframes, we need to combine these and clean it up.

In [54]:
df_hanja = pd.DataFrame()

In [55]:
for i in range(len(list_hanja)):
    df_hanja = df_hanja.append(list_hanja[i])
df_hanja

Unnamed: 0,0,1,2,3,4,5
0,,한문 교육용 인명용 추가 한자 및 허용 한자,,,,
1,한글,기초한자,,,,
2,,(2007.8.현재) 별표1 별표2,,,,
3,,家佳街可歌 嘉嫁稼賈駕伽迦柯呵哥枷珂痂苛茄袈訶跏軻哿嘏舸,,,,
4,가,,,,,
5,,加價假架暇 珈,,,,
6,,各角脚閣却,,,,
7,각,珏恪殼愨 愨(慤),,,,
8,,覺刻,,,,
9,,干間看刊肝,,,,


There's a lot of superfluous data, so we will remove them.

In [56]:
df_hanja = df_hanja[df_hanja[0] != '한글']
df_hanja = df_hanja[df_hanja[2] != '별표1']
df_hanja = df_hanja[2:]
df_hanja

Unnamed: 0,0,1,2,3,4,5
3,,家佳街可歌 嘉嫁稼賈駕伽迦柯呵哥枷珂痂苛茄袈訶跏軻哿嘏舸,,,,
4,가,,,,,
5,,加價假架暇 珈,,,,
6,,各角脚閣却,,,,
7,각,珏恪殼愨 愨(慤),,,,
8,,覺刻,,,,
9,,干間看刊肝,,,,
10,간,艮侃杆玕竿揀諫墾栞奸柬澗磵稈艱癇忓矸 杆(桿) 癇(癎),,,,
11,,幹簡姦懇,,,,
12,갈,渴 葛乫喝曷碣竭褐蝎鞨,,,,


In [57]:
df_hanja = df_hanja.rename(columns={0:'hangul', 1:'hanja', 2:'hanja2', 3:'hanja3', 4:'hanja4', 5:'hanja5'})

In [58]:
df_hanja.head(25)

Unnamed: 0,hangul,hanja,hanja2,hanja3,hanja4,hanja5
3,,家佳街可歌 嘉嫁稼賈駕伽迦柯呵哥枷珂痂苛茄袈訶跏軻哿嘏舸,,,,
4,가,,,,,
5,,加價假架暇 珈,,,,
6,,各角脚閣却,,,,
7,각,珏恪殼愨 愨(慤),,,,
8,,覺刻,,,,
9,,干間看刊肝,,,,
10,간,艮侃杆玕竿揀諫墾栞奸柬澗磵稈艱癇忓矸 杆(桿) 癇(癎),,,,
11,,幹簡姦懇,,,,
12,갈,渴 葛乫喝曷碣竭褐蝎鞨,,,,


The Hanja is out of step with the corresponding Hangul, we will fix this next. It seems part of the Hanja appears above and below the Hangul resulting in a NaN. We have to do this manually because of the inconsistency

In [59]:
df_hanja.iat[1,1] = str(df_hanja.iat[0,1]) + str(df_hanja.iat[2,1])
df_hanja.iat[4,1] += str(df_hanja.iat[3,1]) + str(df_hanja.iat[5,1])
df_hanja.iat[7,1] += str(df_hanja.iat[6,1]) + str(df_hanja.iat[8,1])
df_hanja.iat[11,1] += str(df_hanja.iat[10,1]) + str(df_hanja.iat[12,1])
df_hanja.iat[15,1] = str(df_hanja.iat[14,1]) + str(df_hanja.iat[16,1])
df_hanja.iat[18,1] += str(df_hanja.iat[17,1]) + str(df_hanja.iat[19,1])
df_hanja

Unnamed: 0,hangul,hanja,hanja2,hanja3,hanja4,hanja5
3,,家佳街可歌 嘉嫁稼賈駕伽迦柯呵哥枷珂痂苛茄袈訶跏軻哿嘏舸,,,,
4,가,家佳街可歌 嘉嫁稼賈駕伽迦柯呵哥枷珂痂苛茄袈訶跏軻哿嘏舸加價假架暇 珈,,,,
5,,加價假架暇 珈,,,,
6,,各角脚閣却,,,,
7,각,珏恪殼愨 愨(慤)各角脚閣却覺刻,,,,
8,,覺刻,,,,
9,,干間看刊肝,,,,
10,간,艮侃杆玕竿揀諫墾栞奸柬澗磵稈艱癇忓矸 杆(桿) 癇(癎)干間看刊肝幹簡姦懇,,,,
11,,幹簡姦懇,,,,
12,갈,渴 葛乫喝曷碣竭褐蝎鞨,,,,


Combine all hanja columns

In [60]:
df_hanja = df_hanja[pd.notnull(df_hanja['hangul'])]
df_hanja = df_hanja.replace(np.nan, '', regex=True)
df_hanja['hanja'] += df_hanja['hanja2'] + df_hanja['hanja3'] + df_hanja['hanja4'] + df_hanja['hanja5']
del df_hanja['hanja2'], df_hanja['hanja3'], df_hanja['hanja4'], df_hanja['hanja5']
df_hanja

Unnamed: 0,hangul,hanja
4,가,家佳街可歌 嘉嫁稼賈駕伽迦柯呵哥枷珂痂苛茄袈訶跏軻哿嘏舸加價假架暇 珈
7,각,珏恪殼愨 愨(慤)各角脚閣却覺刻
10,간,艮侃杆玕竿揀諫墾栞奸柬澗磵稈艱癇忓矸 杆(桿) 癇(癎)干間看刊肝幹簡姦懇
12,갈,渴 葛乫喝曷碣竭褐蝎鞨
14,감,勘堪瞰坎嵌憾戡柑橄疳紺邯龕玪 鑑(鑒)甘減感敢監鑑
16,갑,甲 鉀匣岬胛閘
18,강,江降講強康 杠堈岡姜橿彊慷畺疆糠絳羌腔舡薑鱇嫝跭 玒顜茳 強(强) 鋼(鎠) 岡(崗)剛鋼綱...
21,개,价凱愷漑塏愾疥芥豈鎧玠 個(箇) 蓋(盖)改皆個開介慨槪蓋
2,객,客喀
3,갱,更坑粳羹


Finally we need to reset the index

In [61]:
df_hanja = df_hanja.reset_index()
df_hanja = df_hanja.iloc[:,1:3]
df_hanja

Unnamed: 0,hangul,hanja
0,가,家佳街可歌 嘉嫁稼賈駕伽迦柯呵哥枷珂痂苛茄袈訶跏軻哿嘏舸加價假架暇 珈
1,각,珏恪殼愨 愨(慤)各角脚閣却覺刻
2,간,艮侃杆玕竿揀諫墾栞奸柬澗磵稈艱癇忓矸 杆(桿) 癇(癎)干間看刊肝幹簡姦懇
3,갈,渴 葛乫喝曷碣竭褐蝎鞨
4,감,勘堪瞰坎嵌憾戡柑橄疳紺邯龕玪 鑑(鑒)甘減感敢監鑑
5,갑,甲 鉀匣岬胛閘
6,강,江降講強康 杠堈岡姜橿彊慷畺疆糠絳羌腔舡薑鱇嫝跭 玒顜茳 強(强) 鋼(鎠) 岡(崗)剛鋼綱...
7,개,价凱愷漑塏愾疥芥豈鎧玠 個(箇) 蓋(盖)改皆個開介慨槪蓋
8,객,客喀
9,갱,更坑粳羹


In [62]:
len(df_hanja)

453

We have 453 name parts, from which we can construct over 200,000 names, this is too computationally expensive for testing purposes. We will assume that the number of hanja correlates with how often the syllable appears in names.

Get rid of brackets around characters

In [63]:
df_hanja = df_hanja.replace('\(', '', regex=True)
df_hanja = df_hanja.replace('\)', '', regex=True)

This is a little more managable, we will contruct all possible 2-syllable names from these. It seems like our assumption was relatively sound, there are many common name particles present in this data set.

In [68]:
df_hanja[(df_hanja['hangul'] == '수') | (df_hanja['hangul'] == '현')]

Unnamed: 0,hangul,hanja,hanja_count
212,수,水手受授首\r守收誰須雖\r愁樹壽數修\r秀囚需帥殊\r隨輸獸睡遂\r垂搜洙琇銖粹穗繡隋髓袖...,92
416,현,現賢玄絃縣\r懸顯見峴晛泫炫玹鉉眩昡絢呟俔睍舷衒弦儇譞怰䧋\r鋗㢺琄嬛娊妶灦㭹顯顕,39


In [80]:
df_names = pd.DataFrame(columns=['syllable_1','syllable_2'])

In [82]:
x = 0

for i in range(len(df_subset)):
    for j in range(len(df_subset)):
        df_names.at[x, 'syllable_1'] = df_hanja.at[i,'hangul']
        df_names.at[x, 'syllable_2'] = df_hanja.at[j, 'hangul']
        x+=1

df_names.head(10)

Unnamed: 0,syllable_1,syllable_2
0,가,가
1,가,각
2,가,간
3,가,갈
4,가,감
5,가,갑
6,가,강
7,가,개
8,가,객
9,가,갱


In [83]:
len(df_names)

13456

In [84]:
df_names['name'] = df_names['syllable_1'] + df_names['syllable_2']
cols = df_names.columns.tolist()
cols = cols[-1:] + cols[:-1]
df_names = df_names[cols]
df_names.head(10)

Unnamed: 0,name,syllable_1,syllable_2
0,가가,가,가
1,가각,가,각
2,가간,가,간
3,가갈,가,갈
4,가감,가,감
5,가갑,가,갑
6,가강,가,강
7,가개,가,개
8,가객,가,객
9,가갱,가,갱


# Mining Name Data from the Website

In [85]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

In [86]:
def convert_hangul(name):
    hangul_string = str(name.encode('utf-8'))
    hangul_string = hangul_string.replace("\\", "").replace("x","%").replace("'","").upper()[1:len(hangul_string)]
    return hangul_string

In [87]:
address_templates = ['http://www.erumy.com/nameAnalyze/AnalyzeMyName.aspx?name=%EA%B9%80','http://www.erumy.com/nameclub/NameResultList.aspx?name=','http://www.erumy.com/nameclub/NameResultChartYear.aspx?name=','http://www.erumy.com/nameclub/NameResultChartLoc.aspx?name=']

In [88]:
def construct_address(name):
    name = convert_hangul(name)
    addresses = [x+name for x in address_templates]
    return addresses

In [89]:
website_list = construct_address('시안')
website_list

['http://www.erumy.com/nameAnalyze/AnalyzeMyName.aspx?name=%EA%B9%80%EC%8B%9C%EC%95%88',
 'http://www.erumy.com/nameclub/NameResultList.aspx?name=%EC%8B%9C%EC%95%88',
 'http://www.erumy.com/nameclub/NameResultChartYear.aspx?name=%EC%8B%9C%EC%95%88',
 'http://www.erumy.com/nameclub/NameResultChartLoc.aspx?name=%EC%8B%9C%EC%95%88']

## Preparing Code to Extract Key Data from the Website

### Gender Data

In [91]:
page = urlopen(website_list[0])
soup = BeautifulSoup(page, 'html.parser')
soup_list = soup.find_all('td')
soup_list[3]

<td>
                    남자 <img align="absmidde" alt="" height="10" src="http://common.erumy.com/image/table/bar.gif" width="197px"/><br/>
                    여자 <img align="absmidde" alt="" height="10" src="http://common.erumy.com/image/table/bar_blue.gif" width="252px"/><br/>
                    시안은 56.02% 여성적인 이름입니다.<br/>
                    남자 이름중에서 603번, 여자 이름중에서는 768번 사용되어졌습니다.
                </td>

This is the key line where we extract gender data.

In [92]:
gender_string = str(soup_list[3])
result = re.findall(r"\.?([\d]+)\번", gender_string, re.IGNORECASE | re.MULTILINE)
result

['603', '768']

These are the 2 numbers we need.

### Surname Data

In [93]:
page = urlopen(website_list[1])
soup = BeautifulSoup(page, 'html.parser')
soup_data = soup.find_all('a')
soup_data

[<a href="/nameAnalyze/eDefault.aspx?name=%ea%b0%95%ec%8b%9c%ec%95%88" target="_blank">강시안(8)</a>,
 <a href="/nameAnalyze/eDefault.aspx?name=%ea%b3%a0%ec%8b%9c%ec%95%88" target="_blank">고시안(2)</a>,
 <a href="/nameAnalyze/eDefault.aspx?name=%ea%b6%8c%ec%8b%9c%ec%95%88" target="_blank">권시안(10)</a>,
 <a href="/nameAnalyze/eDefault.aspx?name=%ea%b8%b8%ec%8b%9c%ec%95%88" target="_blank">길시안(1)</a>,
 <a href="/nameAnalyze/eDefault.aspx?name=%ea%b9%80%ec%8b%9c%ec%95%88" target="_blank">김시안(242)</a>,
 <a href="/nameAnalyze/eDefault.aspx?name=%eb%82%a8%ec%8b%9c%ec%95%88" target="_blank">남시안(5)</a>,
 <a href="/nameAnalyze/eDefault.aspx?name=%eb%85%b8%ec%8b%9c%ec%95%88" target="_blank">노시안(1)</a>,
 <a href="/nameAnalyze/eDefault.aspx?name=%eb%a5%98%ec%8b%9c%ec%95%88" target="_blank">류시안(8)</a>,
 <a href="/nameAnalyze/eDefault.aspx?name=%eb%a6%ac%ec%8b%9c%ec%95%88" target="_blank">리시안(1)</a>,
 <a href="/nameAnalyze/eDefault.aspx?name=%eb%ac%b8%ec%8b%9c%ec%95%88" target="_blank">문시안(1)</a>,
 <a hre

We need to extract the surname in front of the name and the number afterward.

In [94]:
surname_data = str(soup_data).split(',')
surname_data = surname_data[:len(surname_data)-1]

In [95]:
pattern = re.compile(r'(?<=\>)(.*?)(?=\<)')
result = [m.group(1) for m in (pattern.search(surname) for surname in surname_data) if m]
result[:5]

['강시안(8)', '고시안(2)', '권시안(10)', '길시안(1)', '김시안(242)']

In [96]:
name = '시안'
sub_surname_dict = {}
surname_dict = {}

for i in range(len(result)):
    s = result[i]
    r = s.split('(')[1]
    sub_surname_dict[s[:1]] = r[:len(r)-1]
    surname_dict[name] = sub_surname_dict

surname_dict

{'시안': {'강': '8',
  '고': '2',
  '권': '10',
  '길': '1',
  '김': '242',
  '남': '5',
  '노': '1',
  '류': '8',
  '리': '1',
  '문': '1',
  '민': '27',
  '박': '147',
  '방': '5',
  '배': '16',
  '백': '29',
  '복': '2',
  '부': '1',
  '서': '15',
  '석': '2',
  '선': '2',
  '설': '18',
  '성': '4',
  '손': '4',
  '송': '11',
  '신': '6',
  '안': '3',
  '양': '12',
  '엄': '3',
  '연': '51',
  '오': '57',
  '옥': '1',
  '우': '5',
  '유': '111',
  '윤': '9',
  '이': '259',
  '임': '13',
  '장': '14',
  '전': '12',
  '정': '83',
  '조': '10',
  '좌': '2',
  '주': '5',
  '지': '1',
  '진': '9',
  '차': '12',
  '채': '4',
  '천': '3',
  '최': '78',
  '편': '1',
  '표': '1',
  '하': '16',
  '한': '18',
  '허': '17',
  '현': '2',
  '형': '11',
  '홍': '9',
  '황': '11'}}

This dictionary compiles all the data we need for surname data.

### Year Data

In [97]:
page = urlopen(website_list[2])
soup = BeautifulSoup(page, 'html.parser')
soup


<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html lang="ko" xml:lang="ko" xmlns="http://www.w3.org/1999/xhtml">
<head id="ctl00_Head1"><title>
	
		
		
		작명No1. 이루미작명 :: 작명,개명,이름풀이
</title><meta content="text/html; charset=utf-8" http-equiv="Content-Type"/><meta content="erumy.com" name="author"/><meta content="no-cache" http-equiv="cache-control"/><meta content="no-cache" http-equiv="pragma"/><meta content="작명, 작명소, 이름짓기, 이름풀이, 무료이름풀이, 개명, 아기이름, 이름, 이쁜이름 " name="keywords"/><meta content="작명No1. 이루미작명 :: 작명,개명,이름풀이 :: NAMESTORY" name="description"/>
<!-- facebook meta tag -->
<meta content="작명No1. 이루미작명 :: 작명,개명,이름풀이" property="og:title"/><meta content="website" property="og:type"/><meta content="http://common.erumy.com/image/logo/erumy_logo_50x50.gif" property="og:image"/><meta content="http://www.erumy.com" property="og:url"/><meta content="작명, 개명, 이름, 이름풀이" property="og:tag name"/><meta content="100000962

The relevant data we need here is on line 78, however it appears within script tags

In [98]:
soup_categories = soup.body('script')[0]

In [99]:
categories = str(soup_categories).split('+')[6]
categories

' "   <categories><category label=\'2019년\' /><category label=\'2018년\' /><category label=\'2017년\' /><category label=\'2016년\' /><category label=\'2015년\' /><category label=\'2014년\' /><category label=\'2013년\' /><category label=\'2012년\' /><category label=\'2011년\' /><category label=\'2010년\' /><category label=\'2009년\' /><category label=\'2008년\' /><category label=\'2007년\' /><category label=\'2006년\' /><category label=\'2005년\' /><category label=\'2004년\' /><category label=\'2003년\' /><category label=\'2002년\' /><category label=\'2001년\' /><category label=\'2000년\' /><category label=\'1999년\' /><category label=\'1998년\' /><category label=\'1997년\' /><category label=\'1996년\' /><category label=\'1995년\' /><category label=\'1994년\' /><category label=\'1993년\' /><category label=\'1992년\' /><category label=\'1991년\' /><category label=\'1990년\' /><category label=\'1989년\' /><category label=\'1988년\' /><category label=\'1987년\' /><category label=\'1986년\' /><category label=\'1985년\' /><c

In [100]:
year_dict = {}

year_result = re.findall(r"\.?([\d]+)\년", categories, re.IGNORECASE | re.MULTILINE)
value_result = re.findall(r"\.?([\d]+)\명", categories, re.IGNORECASE | re.MULTILINE)
zip_result = zip(year_result, value_result)
sub_year_dict = dict(zip_result)
year_dict[name] = sub_year_dict
year_dict

{'시안': {'2019': '22',
  '2018': '160',
  '2017': '88',
  '2016': '158',
  '2015': '74',
  '2014': '53',
  '2013': '49',
  '2012': '87',
  '2011': '61',
  '2010': '39',
  '2009': '30',
  '2008': '8',
  '2007': '10',
  '2006': '40',
  '2005': '10',
  '2004': '3',
  '2003': '5',
  '2002': '3',
  '2001': '7',
  '2000': '5',
  '1999': '16',
  '1998': '5',
  '1997': '8',
  '1996': '58',
  '1995': '10',
  '1994': '12',
  '1993': '8',
  '1992': '27',
  '1991': '9',
  '1990': '38',
  '1989': '25',
  '1988': '104',
  '1987': '16',
  '1986': '13',
  '1985': '9',
  '1984': '3',
  '1983': '5',
  '1982': '13',
  '1981': '6',
  '1980': '12',
  '1979': '5',
  '1978': '3',
  '1977': '35',
  '1976': '5',
  '1975': '5',
  '1974': '1',
  '1973': '3',
  '1972': '10',
  '1971': '3',
  '1970': '4',
  '1969': '2',
  '1968': '2',
  '1967': '3',
  '1965': '1',
  '1964': '3',
  '1963': '3',
  '1962': '1',
  '1961': '2',
  '1960': '1',
  '1959': '7',
  '1957': '1',
  '1956': '1',
  '1941': '1'}}

This dictionary contains all the year data we need.

### Location Data

In [101]:
page = urlopen(website_list[3])
soup = BeautifulSoup(page, 'html.parser')
soup_categories = soup.body('script')[0]
categories = str(soup_categories).split('+')[6]
categories

' "   <categories><category label=\'강원도\' /><category label=\'경기도\' /><category label=\'경상남도\' /><category label=\'경상북도\' /><category label=\'광주광역시\' /><category label=\'대구광역시\' /><category label=\'대전광역시\' /><category label=\'부산광역시\' /><category label=\'서울특별시\' /><category label=\'울산광역시\' /><category label=\'인천광역시\' /><category label=\'전라남도\' /><category label=\'전라북도\' /><category label=\'제주도\' /><category label=\'충청남도\' /><category label=\'충청북도\' /><category label=\'해외\' /></categories><dataset seriesName=\'인원\' color=\'90a5ff\'><set value=\'59\' color=\'90a5ff\' toolText=\'59명\'  /><set value=\'402\' color=\'90a5ff\' toolText=\'402명\'  /><set value=\'63\' color=\'90a5ff\' toolText=\'63명\'  /><set value=\'67\' color=\'90a5ff\' toolText=\'67명\'  /><set value=\'84\' color=\'90a5ff\' toolText=\'84명\'  /><set value=\'75\' color=\'90a5ff\' toolText=\'75명\'  /><set value=\'52\' color=\'90a5ff\' toolText=\'52명\'  /><set value=\'114\' color=\'90a5ff\' toolText=\'114명\'  /><set value=\'300\' c

In [102]:
location_dict = {}


loc_result = re.findall(r"\ label=\'.*?\'", categories, re.IGNORECASE | re.MULTILINE)
loc_result = {x.replace(' label=', '').replace("'", "") for x in loc_result}
value_result = re.findall(r"\.?([\d]+)\명", categories, re.IGNORECASE | re.MULTILINE)
zip_result = zip(loc_result, value_result)
sub_loc_dict = dict(zip_result)
location_dict[name] = sub_loc_dict
location_dict

{'시안': {'대전광역시': '59',
  '경상남도': '402',
  '광주광역시': '63',
  '인천광역시': '67',
  '대구광역시': '84',
  '전라남도': '75',
  '전라북도': '52',
  '충청북도': '114',
  '제주도': '300',
  '서울특별시': '23',
  '경상북도': '52',
  '해외': '27',
  '울산광역시': '6',
  '강원도': '14',
  '충청남도': '29',
  '부산광역시': '17',
  '경기도': '27'}}

This is our location data. Now that we have demonstrated how the data will be extracted from the website, we will continue creating our Korean Name dataset.

### Turning These Processes into Functions

In [144]:
def get_gender_data(x):
    print("Now analysing:", x)
    website_list = construct_address(x)
    page = urlopen(website_list[0])
    soup = BeautifulSoup(page, 'html.parser')
    soup_list = soup.find_all('td')
    gender_string = str(soup_list[3])
    
    g_dict = {}
    
    result1 = re.findall(r"\.?([\d]+)\번", gender_string, re.IGNORECASE | re.MULTILINE)
    g_dict[x] = result1
    
    return g_dict

In [145]:
def get_surname_data(x):
    print("Now analysing:", x)
    website_list = construct_address(x)
    page = urlopen(website_list[1])
    soup = BeautifulSoup(page, 'html.parser')
    soup_data = soup.find_all('a')
    surname_data = str(soup_data).split(',')
    surname_data = surname_data[:len(surname_data)-1]
    pattern = re.compile(r'(?<=\>)(.*?)(?=\<)')
    result = [m.group(1) for m in (pattern.search(surname) for surname in surname_data) if m]
   
    result2 = {}
    s_dict = {}

    for i in range(len(result)):
        s = result[i]
        r = s.split('(')[1]
        result2[s[:1]] = r[:len(r)-1]
    s_dict[x] = result2

    return s_dict

In [152]:
def get_year_data(x):
    print("Now analysing:", x)
    website_list = construct_address(x)
    page = urlopen(website_list[2])
    soup = BeautifulSoup(page, 'html.parser')
    soup_categories = soup.body('script')[0]
    categories = str(soup_categories).split('+')[6]
    
    y_dict = {}

    year_result = re.findall(r"\.?([\d]+)\년", categories, re.IGNORECASE | re.MULTILINE)
    value_result = re.findall(r"\.?([\d]+)\명", categories, re.IGNORECASE | re.MULTILINE)
    zip_result = zip(year_result, value_result)
    result3 = dict(zip_result)
    y_dict[x] = result3
    
    return y_dict

In [147]:
def get_loc_data(x):
    print("Now analysing:", x)
    website_list = construct_address(x)
    page = urlopen(website_list[3])
    soup = BeautifulSoup(page, 'html.parser')
    soup_categories = soup.body('script')[0]
    categories = str(soup_categories).split('+')[6]
    
    l_dict = {}

    loc_result = re.findall(r"\ label=\'.*?\'", categories, re.IGNORECASE | re.MULTILINE)
    loc_result = {x.replace(' label=', '').replace("'", "") for x in loc_result}
    value_result = re.findall(r"\.?([\d]+)\명", categories, re.IGNORECASE | re.MULTILINE)
    zip_result = zip(loc_result, value_result)
    result4 = dict(zip_result)
    l_dict[x] = result4
    
    return l_dict

## Generating the Dataset

In [148]:
df_sample = df_names.head(5)

In [149]:
df_sample['name'].apply(get_gender_data)

Now analysing: 가가
Now analysing: 가각
Now analysing: 가간
Now analysing: 가갈
Now analysing: 가감


0    {'가가': ['67', '24']}
1    {'가각': ['259', '1']}
2     {'가간': ['15', '0']}
3      {'가갈': ['7', '0']}
4      {'가감': ['5', '0']}
Name: name, dtype: object

In [150]:
df_sample['name'].apply(get_surname_data)

Now analysing: 가가
Now analysing: 가각
Now analysing: 가간
Now analysing: 가갈
Now analysing: 가감


0    {'가가': {'가': '6', '고': '1', '김': '6', '박': '1'...
1    {'가각': {'감': '18', '갑': '6', '강': '23', '개': '...
2                        {'가간': {'김': '1', '허': '19'}}
3                                   {'가갈': {'허': '7'}}
4                                   {'가감': {'허': '5'}}
Name: name, dtype: object

In [153]:
df_sample['name'].apply(get_year_data)

Now analysing: 가가
Now analysing: 가각
Now analysing: 가간
Now analysing: 가갈
Now analysing: 가감


0    {'가가': {'2015': '60', '2012': '1', '2010': '5'...
1    {'가각': {'2019': '6', '1994': '1', '1984': '1',...
2                  {'가간': {'2019': '19', '2008': '1'}}
3                                {'가갈': {'2019': '7'}}
4                                {'가감': {'2019': '5'}}
Name: name, dtype: object

In [154]:
df_sample['name'].apply(get_loc_data)

Now analysing: 가가
Now analysing: 가각
Now analysing: 가간
Now analysing: 가갈
Now analysing: 가감


0    {'가가': {'전라남도': '5', '광주광역시': '1', '전라북도': '1'...
1    {'가각': {'서울특별시': '1', '부산광역시': '7', '경기도': '47...
2                  {'가간': {'부산광역시': '1', '경기도': '19'}}
3                               {'가갈': {'부산광역시': '7'}}
4                               {'가감': {'부산광역시': '5'}}
Name: name, dtype: object