# Creating the Dataset

In [1]:
import tabula
import pandas as pd
import numpy as np
import re

In [2]:
hanja_file = 'hanja.pdf'
list_hanja = tabula.read_pdf(hanja_file, pages = "all", multiple_tables = True)

In [3]:
len(list_hanja)

53

In [4]:
type(list_hanja)

list

In [5]:
list_hanja[0]

Unnamed: 0,0,1
0,,한문 교육용 인명용 추가 한자 및 허용 한자
1,한글,기초한자
2,,(2007.8.현재) 별표1 별표2
3,,家佳街可歌 嘉嫁稼賈駕伽迦柯呵哥枷珂痂苛茄袈訶跏軻哿嘏舸
4,가,
5,,加價假架暇 珈
6,,各角脚閣却
7,각,珏恪殼愨 愨(慤)
8,,覺刻
9,,干間看刊肝


In [6]:
type(list_hanja[0])

pandas.core.frame.DataFrame

The list_hanja is a list made up of 53 dataframes, we need to combine these and clean it up.

In [7]:
df_hanja = pd.DataFrame()

In [8]:
for i in range(len(list_hanja)):
    df_hanja = df_hanja.append(list_hanja[i])
df_hanja

Unnamed: 0,0,1,2,3,4,5
0,,한문 교육용 인명용 추가 한자 및 허용 한자,,,,
1,한글,기초한자,,,,
2,,(2007.8.현재) 별표1 별표2,,,,
3,,家佳街可歌 嘉嫁稼賈駕伽迦柯呵哥枷珂痂苛茄袈訶跏軻哿嘏舸,,,,
4,가,,,,,
5,,加價假架暇 珈,,,,
6,,各角脚閣却,,,,
7,각,珏恪殼愨 愨(慤),,,,
8,,覺刻,,,,
9,,干間看刊肝,,,,


There's a lot of superfluous data, so we will remove them.

In [9]:
df_hanja = df_hanja[df_hanja[0] != '한글']
df_hanja = df_hanja[df_hanja[2] != '별표1']
df_hanja = df_hanja[2:]
df_hanja

Unnamed: 0,0,1,2,3,4,5
3,,家佳街可歌 嘉嫁稼賈駕伽迦柯呵哥枷珂痂苛茄袈訶跏軻哿嘏舸,,,,
4,가,,,,,
5,,加價假架暇 珈,,,,
6,,各角脚閣却,,,,
7,각,珏恪殼愨 愨(慤),,,,
8,,覺刻,,,,
9,,干間看刊肝,,,,
10,간,艮侃杆玕竿揀諫墾栞奸柬澗磵稈艱癇忓矸 杆(桿) 癇(癎),,,,
11,,幹簡姦懇,,,,
12,갈,渴 葛乫喝曷碣竭褐蝎鞨,,,,


In [10]:
df_hanja = df_hanja.rename(columns={0:'hangul', 1:'hanja', 2:'hanja2', 3:'hanja3', 4:'hanja4', 5:'hanja5'})

In [11]:
df_hanja.head(25)

Unnamed: 0,hangul,hanja,hanja2,hanja3,hanja4,hanja5
3,,家佳街可歌 嘉嫁稼賈駕伽迦柯呵哥枷珂痂苛茄袈訶跏軻哿嘏舸,,,,
4,가,,,,,
5,,加價假架暇 珈,,,,
6,,各角脚閣却,,,,
7,각,珏恪殼愨 愨(慤),,,,
8,,覺刻,,,,
9,,干間看刊肝,,,,
10,간,艮侃杆玕竿揀諫墾栞奸柬澗磵稈艱癇忓矸 杆(桿) 癇(癎),,,,
11,,幹簡姦懇,,,,
12,갈,渴 葛乫喝曷碣竭褐蝎鞨,,,,


The Hanja is out of step with the corresponding Hangul, we will fix this next. It seems part of the Hanja appears above and below the Hangul resulting in a NaN. We have to do this manually because of the inconsistency

In [12]:
df_hanja.iat[1,1] = str(df_hanja.iat[0,1]) + str(df_hanja.iat[2,1])
df_hanja.iat[4,1] += str(df_hanja.iat[3,1]) + str(df_hanja.iat[5,1])
df_hanja.iat[7,1] += str(df_hanja.iat[6,1]) + str(df_hanja.iat[8,1])
df_hanja.iat[11,1] += str(df_hanja.iat[10,1]) + str(df_hanja.iat[12,1])
df_hanja.iat[15,1] = str(df_hanja.iat[14,1]) + str(df_hanja.iat[16,1])
df_hanja.iat[18,1] += str(df_hanja.iat[17,1]) + str(df_hanja.iat[19,1])
df_hanja

Unnamed: 0,hangul,hanja,hanja2,hanja3,hanja4,hanja5
3,,家佳街可歌 嘉嫁稼賈駕伽迦柯呵哥枷珂痂苛茄袈訶跏軻哿嘏舸,,,,
4,가,家佳街可歌 嘉嫁稼賈駕伽迦柯呵哥枷珂痂苛茄袈訶跏軻哿嘏舸加價假架暇 珈,,,,
5,,加價假架暇 珈,,,,
6,,各角脚閣却,,,,
7,각,珏恪殼愨 愨(慤)各角脚閣却覺刻,,,,
8,,覺刻,,,,
9,,干間看刊肝,,,,
10,간,艮侃杆玕竿揀諫墾栞奸柬澗磵稈艱癇忓矸 杆(桿) 癇(癎)干間看刊肝幹簡姦懇,,,,
11,,幹簡姦懇,,,,
12,갈,渴 葛乫喝曷碣竭褐蝎鞨,,,,


Combine all hanja columns

In [13]:
df_hanja = df_hanja[pd.notnull(df_hanja['hangul'])]
df_hanja = df_hanja.replace(np.nan, '', regex=True)
df_hanja['hanja'] += df_hanja['hanja2'] + df_hanja['hanja3'] + df_hanja['hanja4'] + df_hanja['hanja5']
del df_hanja['hanja2'], df_hanja['hanja3'], df_hanja['hanja4'], df_hanja['hanja5']
df_hanja

Unnamed: 0,hangul,hanja
4,가,家佳街可歌 嘉嫁稼賈駕伽迦柯呵哥枷珂痂苛茄袈訶跏軻哿嘏舸加價假架暇 珈
7,각,珏恪殼愨 愨(慤)各角脚閣却覺刻
10,간,艮侃杆玕竿揀諫墾栞奸柬澗磵稈艱癇忓矸 杆(桿) 癇(癎)干間看刊肝幹簡姦懇
12,갈,渴 葛乫喝曷碣竭褐蝎鞨
14,감,勘堪瞰坎嵌憾戡柑橄疳紺邯龕玪 鑑(鑒)甘減感敢監鑑
16,갑,甲 鉀匣岬胛閘
18,강,江降講強康 杠堈岡姜橿彊慷畺疆糠絳羌腔舡薑鱇嫝跭 玒顜茳 強(强) 鋼(鎠) 岡(崗)剛鋼綱...
21,개,价凱愷漑塏愾疥芥豈鎧玠 個(箇) 蓋(盖)改皆個開介慨槪蓋
2,객,客喀
3,갱,更坑粳羹


Finally we need to reset the index

In [14]:
df_hanja = df_hanja.reset_index()
df_hanja = df_hanja.iloc[:,1:3]
df_hanja

Unnamed: 0,hangul,hanja
0,가,家佳街可歌 嘉嫁稼賈駕伽迦柯呵哥枷珂痂苛茄袈訶跏軻哿嘏舸加價假架暇 珈
1,각,珏恪殼愨 愨(慤)各角脚閣却覺刻
2,간,艮侃杆玕竿揀諫墾栞奸柬澗磵稈艱癇忓矸 杆(桿) 癇(癎)干間看刊肝幹簡姦懇
3,갈,渴 葛乫喝曷碣竭褐蝎鞨
4,감,勘堪瞰坎嵌憾戡柑橄疳紺邯龕玪 鑑(鑒)甘減感敢監鑑
5,갑,甲 鉀匣岬胛閘
6,강,江降講強康 杠堈岡姜橿彊慷畺疆糠絳羌腔舡薑鱇嫝跭 玒顜茳 強(强) 鋼(鎠) 岡(崗)剛鋼綱...
7,개,价凱愷漑塏愾疥芥豈鎧玠 個(箇) 蓋(盖)改皆個開介慨槪蓋
8,객,客喀
9,갱,更坑粳羹


In [15]:
len(df_hanja)

453

We have 453 name parts, from which we can construct over 200,000 names, this is too computationally expensive for testing purposes. We will assume that the number of hanja correlates with how often the syllable appears in names.

Get rid of brackets around characters

In [16]:
df_hanja = df_hanja.replace('\(', '', regex=True)
df_hanja = df_hanja.replace('\)', '', regex=True)

Let's take a subset of this dataset

In [17]:
df_hanja['hanja_count'] = df_hanja['hanja'].apply(lambda x : len(x))
df_hanja

Unnamed: 0,hangul,hanja,hanja_count
0,가,家佳街可歌 嘉嫁稼賈駕伽迦柯呵哥枷珂痂苛茄袈訶跏軻哿嘏舸加價假架暇 珈,35
1,각,珏恪殼愨 愨慤各角脚閣却覺刻,14
2,간,艮侃杆玕竿揀諫墾栞奸柬澗磵稈艱癇忓矸 杆桿 癇癎干間看刊肝幹簡姦懇,33
3,갈,渴 葛乫喝曷碣竭褐蝎鞨,11
4,감,勘堪瞰坎嵌憾戡柑橄疳紺邯龕玪 鑑鑒甘減感敢監鑑,23
5,갑,甲 鉀匣岬胛閘,7
6,강,江降講強康 杠堈岡姜橿彊慷畺疆糠絳羌腔舡薑鱇嫝跭 玒顜茳 強强 鋼鎠 岡崗剛鋼綱 鏹 襁,44
7,개,价凱愷漑塏愾疥芥豈鎧玠 個箇 蓋盖改皆個開介慨槪蓋,25
8,객,客喀,2
9,갱,更坑粳羹,4


In [18]:
df_hanja['hanja_count'].describe()

count    453.000000
mean      12.997792
std       14.257974
min        1.000000
25%        4.000000
50%        7.000000
75%       18.000000
max       92.000000
Name: hanja_count, dtype: float64

In [19]:
df_hanja_2 = df_hanja.loc[(df_hanja['hanja_count']>18)]

In [20]:
df_hanja_subset = df_hanja_2.iloc[:,0:1]
df_hanja_subset

Unnamed: 0,hangul
0,가
2,간
4,감
6,강
7,개
12,건
20,경
21,계
22,고
26,공


Combine all syllables to form all possible names

In [21]:
list_part_1 = []

i = 0
while i < len(df_hanja_subset):
    list_part_1.append(list(df_hanja_subset['hangul']))
    i+=1

flat_list = [item for sublist in list_part_1 for item in sublist]

In [22]:
list_part_2 = list(np.repeat(df_hanja_subset.values,len(df_hanja_subset)))

In [23]:
df_names = pd.DataFrame(list(map(list, zip(flat_list, list_part_2))), columns=['syllable_1','syllable_2'])
df_names

Unnamed: 0,syllable_1,syllable_2
0,가,가
1,간,가
2,감,가
3,강,가
4,개,가
5,건,가
6,경,가
7,계,가
8,고,가
9,공,가


In [24]:
len(df_names)

12321

In [25]:
df_names['name'] = df_names['syllable_1'] + df_names['syllable_2']
cols = df_names.columns.tolist()
cols = cols[-1:] + cols[:-1]
df_names = df_names[cols]
df_names.head(10)

Unnamed: 0,name,syllable_1,syllable_2
0,가가,가,가
1,간가,간,가
2,감가,감,가
3,강가,강,가
4,개가,개,가
5,건가,건,가
6,경가,경,가
7,계가,계,가
8,고가,고,가
9,공가,공,가


# Mining Name Data from the Website

In [36]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

In [33]:
def convert_hangul(name):
    hangul_string = str(name.encode('utf-8'))
    hangul_string = hangul_string.replace("\\", "").replace("x","%").replace("'","").upper()[1:len(hangul_string)]
    return hangul_string

In [34]:
address_templates = ['http://www.erumy.com/nameAnalyze/AnalyzeMyName.aspx?name=%EA%B9%80','http://www.erumy.com/nameclub/NameResultList.aspx?name=','http://www.erumy.com/nameclub/NameResultChartYear.aspx?name=','http://www.erumy.com/nameclub/NameResultChartLoc.aspx?name=']

In [35]:
def construct_address(name):
    name = convert_hangul(name)
    addresses = [x+name for x in address_templates]
    return addresses

In [30]:
website_list = construct_address('시안')
website_list

['http://www.erumy.com/nameAnalyze/AnalyzeMyName.aspx?name=%EA%B9%80%EC%8B%9C%EC%95%88',
 'http://www.erumy.com/nameclub/NameResultList.aspx?name=%EC%8B%9C%EC%95%88',
 'http://www.erumy.com/nameclub/NameResultChartYear.aspx?name=%EC%8B%9C%EC%95%88',
 'http://www.erumy.com/nameclub/NameResultChartLoc.aspx?name=%EC%8B%9C%EC%95%88']

## Preparing Code to Extract Key Data from the Website

### Gender Data

In [31]:
page = urlopen(website_list[0])
soup = BeautifulSoup(page, 'html.parser')
soup_list = soup.find_all('td')
soup_list[3]

<td>
                    남자 <img align="absmidde" alt="" height="10" src="http://common.erumy.com/image/table/bar.gif" width="197px"/><br/>
                    여자 <img align="absmidde" alt="" height="10" src="http://common.erumy.com/image/table/bar_blue.gif" width="252px"/><br/>
                    시안은 56.05% 여성적인 이름입니다.<br/>
                    남자 이름중에서 603번, 여자 이름중에서는 769번 사용되어졌습니다.
                </td>

This is the key line where we extract gender data.

In [32]:
gender_string = str(soup_list[3])
result = re.findall(r"\.?([\d]+)\번", gender_string, re.IGNORECASE | re.MULTILINE)
result

['603', '769']

These are the 2 numbers we need.

### Surname Data

In [33]:
page = urlopen(website_list[1])
soup = BeautifulSoup(page, 'html.parser')
soup_data = soup.find_all('a')
soup_data

[<a href="/nameAnalyze/eDefault.aspx?name=%ea%b0%95%ec%8b%9c%ec%95%88" target="_blank">강시안(8)</a>,
 <a href="/nameAnalyze/eDefault.aspx?name=%ea%b3%a0%ec%8b%9c%ec%95%88" target="_blank">고시안(2)</a>,
 <a href="/nameAnalyze/eDefault.aspx?name=%ea%b6%8c%ec%8b%9c%ec%95%88" target="_blank">권시안(10)</a>,
 <a href="/nameAnalyze/eDefault.aspx?name=%ea%b8%b8%ec%8b%9c%ec%95%88" target="_blank">길시안(1)</a>,
 <a href="/nameAnalyze/eDefault.aspx?name=%ea%b9%80%ec%8b%9c%ec%95%88" target="_blank">김시안(242)</a>,
 <a href="/nameAnalyze/eDefault.aspx?name=%eb%82%a8%ec%8b%9c%ec%95%88" target="_blank">남시안(5)</a>,
 <a href="/nameAnalyze/eDefault.aspx?name=%eb%85%b8%ec%8b%9c%ec%95%88" target="_blank">노시안(1)</a>,
 <a href="/nameAnalyze/eDefault.aspx?name=%eb%a5%98%ec%8b%9c%ec%95%88" target="_blank">류시안(8)</a>,
 <a href="/nameAnalyze/eDefault.aspx?name=%eb%a6%ac%ec%8b%9c%ec%95%88" target="_blank">리시안(1)</a>,
 <a href="/nameAnalyze/eDefault.aspx?name=%eb%ac%b8%ec%8b%9c%ec%95%88" target="_blank">문시안(1)</a>,
 <a hre

We need to extract the surname in front of the name and the number afterward.

In [34]:
surname_data = str(soup_data).split(',')
surname_data = surname_data[:len(surname_data)-1]

In [35]:
pattern = re.compile(r'(?<=\>)(.*?)(?=\<)')
result = [m.group(1) for m in (pattern.search(surname) for surname in surname_data) if m]
result[:5]

['강시안(8)', '고시안(2)', '권시안(10)', '길시안(1)', '김시안(242)']

In [36]:
name = '시안'
sub_surname_dict = {}
surname_dict = {}

for i in range(len(result)):
    s = result[i]
    r = s.split('(')[1]
    sub_surname_dict[s[:1]] = r[:len(r)-1]
    surname_dict[name] = sub_surname_dict

surname_dict

{'시안': {'강': '8',
  '고': '2',
  '권': '10',
  '길': '1',
  '김': '242',
  '남': '5',
  '노': '1',
  '류': '8',
  '리': '1',
  '문': '1',
  '민': '27',
  '박': '147',
  '방': '5',
  '배': '16',
  '백': '29',
  '복': '2',
  '부': '1',
  '서': '15',
  '석': '2',
  '선': '2',
  '설': '18',
  '성': '4',
  '손': '4',
  '송': '11',
  '신': '6',
  '안': '3',
  '양': '12',
  '엄': '3',
  '연': '51',
  '오': '57',
  '옥': '1',
  '우': '5',
  '유': '111',
  '윤': '9',
  '이': '259',
  '임': '14',
  '장': '14',
  '전': '12',
  '정': '83',
  '조': '10',
  '좌': '2',
  '주': '5',
  '지': '1',
  '진': '9',
  '차': '12',
  '채': '4',
  '천': '3',
  '최': '78',
  '편': '1',
  '표': '1',
  '하': '16',
  '한': '18',
  '허': '17',
  '현': '2',
  '형': '11',
  '홍': '9',
  '황': '11'}}

This dictionary compiles all the data we need for surname data.

### Year Data

In [37]:
page = urlopen(website_list[2])
soup = BeautifulSoup(page, 'html.parser')
soup


<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html lang="ko" xml:lang="ko" xmlns="http://www.w3.org/1999/xhtml">
<head id="ctl00_Head1"><title>
	
		
		
		작명No1. 이루미작명 :: 작명,개명,이름풀이
</title><meta content="text/html; charset=utf-8" http-equiv="Content-Type"/><meta content="erumy.com" name="author"/><meta content="no-cache" http-equiv="cache-control"/><meta content="no-cache" http-equiv="pragma"/><meta content="작명, 작명소, 이름짓기, 이름풀이, 무료이름풀이, 개명, 아기이름, 이름, 이쁜이름 " name="keywords"/><meta content="작명No1. 이루미작명 :: 작명,개명,이름풀이 :: NAMESTORY" name="description"/>
<!-- facebook meta tag -->
<meta content="작명No1. 이루미작명 :: 작명,개명,이름풀이" property="og:title"/><meta content="website" property="og:type"/><meta content="http://common.erumy.com/image/logo/erumy_logo_50x50.gif" property="og:image"/><meta content="http://www.erumy.com" property="og:url"/><meta content="작명, 개명, 이름, 이름풀이" property="og:tag name"/><meta content="100000962

The relevant data we need here is on line 78, however it appears within script tags

In [38]:
soup_categories = soup.body('script')[0]

In [39]:
categories = str(soup_categories).split('+')[6]
categories

' "   <categories><category label=\'2019년\' /><category label=\'2018년\' /><category label=\'2017년\' /><category label=\'2016년\' /><category label=\'2015년\' /><category label=\'2014년\' /><category label=\'2013년\' /><category label=\'2012년\' /><category label=\'2011년\' /><category label=\'2010년\' /><category label=\'2009년\' /><category label=\'2008년\' /><category label=\'2007년\' /><category label=\'2006년\' /><category label=\'2005년\' /><category label=\'2004년\' /><category label=\'2003년\' /><category label=\'2002년\' /><category label=\'2001년\' /><category label=\'2000년\' /><category label=\'1999년\' /><category label=\'1998년\' /><category label=\'1997년\' /><category label=\'1996년\' /><category label=\'1995년\' /><category label=\'1994년\' /><category label=\'1993년\' /><category label=\'1992년\' /><category label=\'1991년\' /><category label=\'1990년\' /><category label=\'1989년\' /><category label=\'1988년\' /><category label=\'1987년\' /><category label=\'1986년\' /><category label=\'1985년\' /><c

In [40]:
year_dict = {}

year_result = re.findall(r"\.?([\d]+)\년", categories, re.IGNORECASE | re.MULTILINE)
value_result = re.findall(r"\.?([\d]+)\명", categories, re.IGNORECASE | re.MULTILINE)
zip_result = zip(year_result, value_result)
sub_year_dict = dict(zip_result)
year_dict[name] = sub_year_dict
year_dict

{'시안': {'2019': '23',
  '2018': '160',
  '2017': '88',
  '2016': '158',
  '2015': '74',
  '2014': '53',
  '2013': '49',
  '2012': '87',
  '2011': '61',
  '2010': '39',
  '2009': '30',
  '2008': '8',
  '2007': '10',
  '2006': '40',
  '2005': '10',
  '2004': '3',
  '2003': '5',
  '2002': '3',
  '2001': '7',
  '2000': '5',
  '1999': '16',
  '1998': '5',
  '1997': '8',
  '1996': '58',
  '1995': '10',
  '1994': '12',
  '1993': '8',
  '1992': '27',
  '1991': '9',
  '1990': '38',
  '1989': '25',
  '1988': '104',
  '1987': '16',
  '1986': '13',
  '1985': '9',
  '1984': '3',
  '1983': '5',
  '1982': '13',
  '1981': '6',
  '1980': '12',
  '1979': '5',
  '1978': '3',
  '1977': '35',
  '1976': '5',
  '1975': '5',
  '1974': '1',
  '1973': '3',
  '1972': '10',
  '1971': '3',
  '1970': '4',
  '1969': '2',
  '1968': '2',
  '1967': '3',
  '1965': '1',
  '1964': '3',
  '1963': '3',
  '1962': '1',
  '1961': '2',
  '1960': '1',
  '1959': '7',
  '1957': '1',
  '1956': '1',
  '1941': '1'}}

This dictionary contains all the year data we need.

### Location Data

In [41]:
page = urlopen(website_list[3])
soup = BeautifulSoup(page, 'html.parser')
soup_categories = soup.body('script')[0]
categories = str(soup_categories).split('+')[6]
categories

' "   <categories><category label=\'강원도\' /><category label=\'경기도\' /><category label=\'경상남도\' /><category label=\'경상북도\' /><category label=\'광주광역시\' /><category label=\'대구광역시\' /><category label=\'대전광역시\' /><category label=\'부산광역시\' /><category label=\'서울특별시\' /><category label=\'울산광역시\' /><category label=\'인천광역시\' /><category label=\'전라남도\' /><category label=\'전라북도\' /><category label=\'제주도\' /><category label=\'충청남도\' /><category label=\'충청북도\' /><category label=\'해외\' /></categories><dataset seriesName=\'인원\' color=\'90a5ff\'><set value=\'59\' color=\'90a5ff\' toolText=\'59명\'  /><set value=\'402\' color=\'90a5ff\' toolText=\'402명\'  /><set value=\'63\' color=\'90a5ff\' toolText=\'63명\'  /><set value=\'67\' color=\'90a5ff\' toolText=\'67명\'  /><set value=\'84\' color=\'90a5ff\' toolText=\'84명\'  /><set value=\'75\' color=\'90a5ff\' toolText=\'75명\'  /><set value=\'52\' color=\'90a5ff\' toolText=\'52명\'  /><set value=\'114\' color=\'90a5ff\' toolText=\'114명\'  /><set value=\'300\' c

In [42]:
location_dict = {}


loc_result = re.findall(r"\ label=\'.*?\'", categories, re.IGNORECASE | re.MULTILINE)
loc_result = {x.replace(' label=', '').replace("'", "") for x in loc_result}
value_result = re.findall(r"\.?([\d]+)\명", categories, re.IGNORECASE | re.MULTILINE)
zip_result = zip(loc_result, value_result)
sub_loc_dict = dict(zip_result)
location_dict[name] = sub_loc_dict
location_dict

{'시안': {'대구광역시': '59',
  '부산광역시': '402',
  '서울특별시': '63',
  '경기도': '67',
  '충청북도': '84',
  '충청남도': '75',
  '울산광역시': '52',
  '제주도': '114',
  '경상북도': '300',
  '강원도': '24',
  '전라남도': '52',
  '경상남도': '27',
  '대전광역시': '6',
  '전라북도': '14',
  '해외': '29',
  '광주광역시': '17',
  '인천광역시': '27'}}

This is our location data. Now that we have demonstrated how the data will be extracted from the website, we will continue creating our Korean Name dataset.

### Turning These Processes into Functions

In [38]:
import time

In [39]:
def get_gender_data(x):
    print("Now analysing:", x)
    website_list = construct_address(x)
    page = urlopen(website_list[0])
    soup = BeautifulSoup(page, 'html.parser')
    soup_list = soup.find_all('td')
    gender_string = str(soup_list[3])
    
    time.sleep(3)
    
    g_dict = {}
    
    result1 = re.findall(r"\.?([\d]+)\번", gender_string, re.IGNORECASE | re.MULTILINE)
    
    g_dict['male'] = result1[0]
    g_dict['female'] = result1[1]
    g_dict['name'] = x

    return g_dict

In [27]:
def get_surname_data(x):
    print("Now analysing:", x)
    website_list = construct_address(x)
    page = urlopen(website_list[1])
    soup = BeautifulSoup(page, 'html.parser')
    soup_data = soup.find_all('a')
    surname_data = str(soup_data).split(',')
    surname_data = surname_data[:len(surname_data)-1]
    pattern = re.compile(r'(?<=\>)(.*?)(?=\<)')
    result = [m.group(1) for m in (pattern.search(surname) for surname in surname_data) if m]
   
    s_dict = {}

    for i in range(len(result)):
        s = result[i]
        r = s.split('(')[1]
        s_dict[s[:1]] = r[:len(r)-1]
    s_dict['name'] = x

    return s_dict

In [28]:
def get_year_data(x):
    print("Now analysing:", x)
    website_list = construct_address(x)
    page = urlopen(website_list[2])
    soup = BeautifulSoup(page, 'html.parser')
    soup_categories = soup.body('script')[0]
    categories = str(soup_categories).split('+')[6]

    year_result = re.findall(r"\.?([\d]+)\년", categories, re.IGNORECASE | re.MULTILINE)
    value_result = re.findall(r"\.?([\d]+)\명", categories, re.IGNORECASE | re.MULTILINE)
    zip_result = zip(year_result, value_result)
    y_dict = dict(zip_result)
    y_dict['name'] = x
    
    return y_dict

In [29]:
def get_loc_data(x):
    print("Now analysing:", x)
    website_list = construct_address(x)
    page = urlopen(website_list[3])
    soup = BeautifulSoup(page, 'html.parser')
    soup_categories = soup.body('script')[0]
    categories = str(soup_categories).split('+')[6]
    

    loc_result = re.findall(r"\ label=\'.*?\'", categories, re.IGNORECASE | re.MULTILINE)
    loc_result = {x.replace(' label=', '').replace("'", "") for x in loc_result}
    value_result = re.findall(r"\.?([\d]+)\명", categories, re.IGNORECASE | re.MULTILINE)
    zip_result = zip(loc_result, value_result)
    l_dict = dict(zip_result)
    l_dict['name'] = x
    
    return l_dict

## Generating the Dataset

### Sample Dataset

In [52]:
df_sample = df_names.head(5)

In [73]:
gender_dict = df_sample['name'].apply(get_gender_data)
pd.DataFrame.from_dict(gender_dict.tolist())

Now analysing: 가가
Now analysing: 간가
Now analysing: 감가
Now analysing: 강가
Now analysing: 개가


Unnamed: 0,female,male,name
0,24,67,가가
1,1,0,간가
2,0,0,감가
3,1,0,강가
4,0,0,개가


In [54]:
surname_dict = df_sample['name'].apply(get_surname_data)
pd.DataFrame.from_dict(surname_dict.tolist())

Now analysing: 가가
Now analysing: 간가
Now analysing: 감가
Now analysing: 강가
Now analysing: 개가


Unnamed: 0,name,가,고,김,박,변,손,이,임,장,정,조,최,표,한
0,가가,6.0,1.0,6.0,1.0,1.0,1.0,5.0,,3.0,1.0,60.0,1.0,3.0,2.0
1,간가,,,1.0,,,,,,,,,,,
2,감가,,,,,,,,,,,,,,
3,강가,,,,,,,,1.0,,,,,,
4,개가,,,,,,,,,,,,,,


In [55]:
year_dict = df_sample['name'].apply(get_year_data)
pd.DataFrame.from_dict(year_dict.tolist())

Now analysing: 가가
Now analysing: 간가
Now analysing: 감가
Now analysing: 강가
Now analysing: 개가


Unnamed: 0,1954,1970,1973,1980,1981,1982,1985,1987,1991,2008,2009,2010,2012,2015,name
0,3.0,3.0,1.0,1.0,2.0,1.0,7.0,1.0,1.0,1.0,4.0,5.0,1.0,60.0,가가
1,,,,,,,,,1.0,,,,,,간가
2,,,,,,,,,,,,,,,감가
3,,,,,,1.0,,,,,,,,,강가
4,,,,,,,,,,,,,,,개가


In [56]:
loc_dict = df_sample['name'].apply(get_loc_data)
pd.DataFrame.from_dict(loc_dict.tolist())

Now analysing: 가가
Now analysing: 간가
Now analysing: 감가
Now analysing: 강가
Now analysing: 개가


Unnamed: 0,name,경기도,경상북도,광주광역시,대구광역시,부산광역시,서울특별시,울산광역시,전라남도,전라북도,충청남도,충청북도
0,가가,1.0,2.0,2.0,1.0,1.0,1.0,10.0,1.0,66.0,5.0,1.0
1,간가,,,,,,1.0,,,,,
2,감가,,,,,,,,,,,
3,강가,,,,,,,1.0,,,,
4,개가,,,,,,,,,,,


### Full Dataset Generation

In [52]:
df_names_2 = df_names.head(round(len(df_names)/4))

In [53]:
gender_dict = df_names_2['name'].apply(get_gender_data)
gender_dict

Now analysing: 가가
Now analysing: 간가
Now analysing: 감가
Now analysing: 강가
Now analysing: 개가
Now analysing: 건가
Now analysing: 경가
Now analysing: 계가
Now analysing: 고가
Now analysing: 공가
Now analysing: 관가
Now analysing: 광가
Now analysing: 교가
Now analysing: 구가
Now analysing: 규가
Now analysing: 근가
Now analysing: 기가
Now analysing: 단가
Now analysing: 담가
Now analysing: 대가
Now analysing: 도가
Now analysing: 동가
Now analysing: 람가
Now analysing: 량가
Now analysing: 려가
Now analysing: 령가
Now analysing: 로가
Now analysing: 리가
Now analysing: 린가
Now analysing: 만가
Now analysing: 명가
Now analysing: 모가
Now analysing: 무가
Now analysing: 미가
Now analysing: 민가
Now analysing: 박가
Now analysing: 반가
Now analysing: 방가
Now analysing: 배가
Now analysing: 병가
Now analysing: 보가
Now analysing: 복가
Now analysing: 봉가
Now analysing: 부가
Now analysing: 분가
Now analysing: 빈가
Now analysing: 사가
Now analysing: 상가
Now analysing: 서가
Now analysing: 석가
Now analysing: 설가
Now analysing: 수가
Now analysing: 순가
Now analysing: 시가
Now analysing: 신가
Now analys

Now analysing: 교개
Now analysing: 구개
Now analysing: 규개
Now analysing: 근개
Now analysing: 기개
Now analysing: 단개
Now analysing: 담개
Now analysing: 대개
Now analysing: 도개
Now analysing: 동개
Now analysing: 람개
Now analysing: 량개
Now analysing: 려개
Now analysing: 령개
Now analysing: 로개
Now analysing: 리개
Now analysing: 린개
Now analysing: 만개
Now analysing: 명개
Now analysing: 모개
Now analysing: 무개
Now analysing: 미개
Now analysing: 민개
Now analysing: 박개
Now analysing: 반개
Now analysing: 방개
Now analysing: 배개
Now analysing: 병개
Now analysing: 보개
Now analysing: 복개
Now analysing: 봉개
Now analysing: 부개
Now analysing: 분개
Now analysing: 빈개
Now analysing: 사개
Now analysing: 상개
Now analysing: 서개
Now analysing: 석개
Now analysing: 설개
Now analysing: 수개
Now analysing: 순개
Now analysing: 시개
Now analysing: 신개
Now analysing: 아개
Now analysing: 안개
Now analysing: 양개
Now analysing: 연개
Now analysing: 영개
Now analysing: 예개
Now analysing: 오개
Now analysing: 완개
Now analysing: 요개
Now analysing: 용개
Now analysing: 우개
Now analysing: 운개
Now analys

Now analysing: 려고
Now analysing: 령고
Now analysing: 로고
Now analysing: 리고
Now analysing: 린고
Now analysing: 만고
Now analysing: 명고
Now analysing: 모고
Now analysing: 무고
Now analysing: 미고
Now analysing: 민고
Now analysing: 박고
Now analysing: 반고
Now analysing: 방고
Now analysing: 배고
Now analysing: 병고
Now analysing: 보고
Now analysing: 복고
Now analysing: 봉고
Now analysing: 부고
Now analysing: 분고
Now analysing: 빈고
Now analysing: 사고
Now analysing: 상고
Now analysing: 서고
Now analysing: 석고
Now analysing: 설고
Now analysing: 수고
Now analysing: 순고
Now analysing: 시고
Now analysing: 신고
Now analysing: 아고
Now analysing: 안고
Now analysing: 양고
Now analysing: 연고
Now analysing: 영고
Now analysing: 예고
Now analysing: 오고
Now analysing: 완고
Now analysing: 요고
Now analysing: 용고
Now analysing: 우고
Now analysing: 운고
Now analysing: 원고
Now analysing: 위고
Now analysing: 유고
Now analysing: 윤고
Now analysing: 은고
Now analysing: 의고
Now analysing: 이고
Now analysing: 인고
Now analysing: 자고
Now analysing: 재고
Now analysing: 저고
Now analysing: 적고
Now analys

Now analysing: 반교
Now analysing: 방교
Now analysing: 배교
Now analysing: 병교
Now analysing: 보교
Now analysing: 복교
Now analysing: 봉교
Now analysing: 부교
Now analysing: 분교
Now analysing: 빈교
Now analysing: 사교
Now analysing: 상교
Now analysing: 서교
Now analysing: 석교
Now analysing: 설교
Now analysing: 수교
Now analysing: 순교
Now analysing: 시교
Now analysing: 신교
Now analysing: 아교
Now analysing: 안교
Now analysing: 양교
Now analysing: 연교
Now analysing: 영교
Now analysing: 예교
Now analysing: 오교
Now analysing: 완교
Now analysing: 요교
Now analysing: 용교
Now analysing: 우교
Now analysing: 운교
Now analysing: 원교
Now analysing: 위교
Now analysing: 유교
Now analysing: 윤교
Now analysing: 은교
Now analysing: 의교
Now analysing: 이교
Now analysing: 인교
Now analysing: 자교
Now analysing: 재교
Now analysing: 저교
Now analysing: 적교
Now analysing: 전교
Now analysing: 정교
Now analysing: 제교
Now analysing: 종교
Now analysing: 주교
Now analysing: 준교
Now analysing: 지교
Now analysing: 진교
Now analysing: 차교
Now analysing: 찬교
Now analysing: 창교
Now analysing: 척교
Now analys

Now analysing: 서기
Now analysing: 석기
Now analysing: 설기
Now analysing: 수기
Now analysing: 순기
Now analysing: 시기
Now analysing: 신기
Now analysing: 아기
Now analysing: 안기
Now analysing: 양기
Now analysing: 연기
Now analysing: 영기
Now analysing: 예기
Now analysing: 오기
Now analysing: 완기
Now analysing: 요기
Now analysing: 용기
Now analysing: 우기
Now analysing: 운기
Now analysing: 원기
Now analysing: 위기
Now analysing: 유기
Now analysing: 윤기
Now analysing: 은기
Now analysing: 의기
Now analysing: 이기
Now analysing: 인기
Now analysing: 자기
Now analysing: 재기
Now analysing: 저기
Now analysing: 적기
Now analysing: 전기
Now analysing: 정기
Now analysing: 제기
Now analysing: 종기
Now analysing: 주기
Now analysing: 준기
Now analysing: 지기
Now analysing: 진기
Now analysing: 차기
Now analysing: 찬기
Now analysing: 창기
Now analysing: 척기
Now analysing: 천기
Now analysing: 초기
Now analysing: 추기
Now analysing: 치기
Now analysing: 탁기
Now analysing: 태기
Now analysing: 하기
Now analysing: 한기
Now analysing: 항기
Now analysing: 해기
Now analysing: 현기
Now analysing: 형기
Now analys

Now analysing: 예도
Now analysing: 오도
Now analysing: 완도
Now analysing: 요도
Now analysing: 용도
Now analysing: 우도
Now analysing: 운도
Now analysing: 원도
Now analysing: 위도
Now analysing: 유도
Now analysing: 윤도
Now analysing: 은도
Now analysing: 의도
Now analysing: 이도
Now analysing: 인도
Now analysing: 자도
Now analysing: 재도
Now analysing: 저도
Now analysing: 적도
Now analysing: 전도
Now analysing: 정도
Now analysing: 제도
Now analysing: 종도
Now analysing: 주도
Now analysing: 준도
Now analysing: 지도
Now analysing: 진도
Now analysing: 차도
Now analysing: 찬도
Now analysing: 창도
Now analysing: 척도
Now analysing: 천도
Now analysing: 초도
Now analysing: 추도
Now analysing: 치도
Now analysing: 탁도
Now analysing: 태도
Now analysing: 하도
Now analysing: 한도
Now analysing: 항도
Now analysing: 해도
Now analysing: 현도
Now analysing: 형도
Now analysing: 호도
Now analysing: 환도
Now analysing: 황도
Now analysing: 회도
Now analysing: 효도
Now analysing: 후도
Now analysing: 훈도
Now analysing: 희도
Now analysing: 가동
Now analysing: 간동
Now analysing: 감동
Now analysing: 강동
Now analys

Now analysing: 의려
Now analysing: 이려
Now analysing: 인려
Now analysing: 자려
Now analysing: 재려
Now analysing: 저려
Now analysing: 적려
Now analysing: 전려
Now analysing: 정려
Now analysing: 제려
Now analysing: 종려
Now analysing: 주려
Now analysing: 준려
Now analysing: 지려
Now analysing: 진려
Now analysing: 차려
Now analysing: 찬려
Now analysing: 창려
Now analysing: 척려
Now analysing: 천려
Now analysing: 초려
Now analysing: 추려
Now analysing: 치려
Now analysing: 탁려
Now analysing: 태려
Now analysing: 하려
Now analysing: 한려
Now analysing: 항려
Now analysing: 해려
Now analysing: 현려
Now analysing: 형려
Now analysing: 호려
Now analysing: 환려
Now analysing: 황려
Now analysing: 회려
Now analysing: 효려
Now analysing: 후려
Now analysing: 훈려
Now analysing: 희려
Now analysing: 가령
Now analysing: 간령
Now analysing: 감령
Now analysing: 강령
Now analysing: 개령
Now analysing: 건령
Now analysing: 경령
Now analysing: 계령
Now analysing: 고령
Now analysing: 공령
Now analysing: 관령
Now analysing: 광령
Now analysing: 교령
Now analysing: 구령
Now analysing: 규령
Now analysing: 근령
Now analys

0         {'male': '67', 'female': '24', 'name': '가가'}
1           {'male': '0', 'female': '1', 'name': '간가'}
2           {'male': '0', 'female': '0', 'name': '감가'}
3           {'male': '0', 'female': '1', 'name': '강가'}
4           {'male': '0', 'female': '0', 'name': '개가'}
5          {'male': '92', 'female': '0', 'name': '건가'}
6         {'male': '183', 'female': '5', 'name': '경가'}
7         {'male': '122', 'female': '0', 'name': '계가'}
8         {'male': '109', 'female': '0', 'name': '고가'}
9           {'male': '1', 'female': '0', 'name': '공가'}
10          {'male': '0', 'female': '0', 'name': '관가'}
11         {'male': '49', 'female': '0', 'name': '광가'}
12         {'male': '62', 'female': '4', 'name': '교가'}
13          {'male': '2', 'female': '0', 'name': '구가'}
14          {'male': '1', 'female': '0', 'name': '규가'}
15         {'male': '38', 'female': '0', 'name': '근가'}
16        {'male': '113', 'female': '2', 'name': '기가'}
17          {'male': '4', 'female': '5', 'name': '단가'}
18        

In [54]:
gender_df_1 = pd.DataFrame.from_dict(gender_dict.tolist())

In [55]:
gender_df_1.to_excel('output1.xlsx')

In [56]:
df_names_3 = df_names.head(round(len(df_names)/2)).tail(round(len(df_names)/4))

In [57]:
gender_dict_2 = df_names_3['name'].apply(get_gender_data)

Now analysing: 주리
Now analysing: 준리
Now analysing: 지리
Now analysing: 진리
Now analysing: 차리
Now analysing: 찬리
Now analysing: 창리
Now analysing: 척리
Now analysing: 천리
Now analysing: 초리
Now analysing: 추리
Now analysing: 치리
Now analysing: 탁리
Now analysing: 태리
Now analysing: 하리
Now analysing: 한리
Now analysing: 항리
Now analysing: 해리
Now analysing: 현리
Now analysing: 형리
Now analysing: 호리
Now analysing: 환리
Now analysing: 황리
Now analysing: 회리
Now analysing: 효리
Now analysing: 후리
Now analysing: 훈리
Now analysing: 희리
Now analysing: 가린
Now analysing: 간린
Now analysing: 감린
Now analysing: 강린
Now analysing: 개린
Now analysing: 건린
Now analysing: 경린
Now analysing: 계린
Now analysing: 고린
Now analysing: 공린
Now analysing: 관린
Now analysing: 광린
Now analysing: 교린
Now analysing: 구린
Now analysing: 규린
Now analysing: 근린
Now analysing: 기린
Now analysing: 단린
Now analysing: 담린
Now analysing: 대린
Now analysing: 도린
Now analysing: 동린
Now analysing: 람린
Now analysing: 량린
Now analysing: 려린
Now analysing: 령린
Now analysing: 로린
Now analys

Now analysing: 탁모
Now analysing: 태모
Now analysing: 하모
Now analysing: 한모
Now analysing: 항모
Now analysing: 해모
Now analysing: 현모
Now analysing: 형모
Now analysing: 호모
Now analysing: 환모
Now analysing: 황모
Now analysing: 회모
Now analysing: 효모
Now analysing: 후모
Now analysing: 훈모
Now analysing: 희모
Now analysing: 가무
Now analysing: 간무
Now analysing: 감무
Now analysing: 강무
Now analysing: 개무
Now analysing: 건무
Now analysing: 경무
Now analysing: 계무
Now analysing: 고무
Now analysing: 공무
Now analysing: 관무
Now analysing: 광무
Now analysing: 교무
Now analysing: 구무
Now analysing: 규무
Now analysing: 근무
Now analysing: 기무
Now analysing: 단무
Now analysing: 담무
Now analysing: 대무
Now analysing: 도무
Now analysing: 동무
Now analysing: 람무
Now analysing: 량무
Now analysing: 려무
Now analysing: 령무
Now analysing: 로무
Now analysing: 리무
Now analysing: 린무
Now analysing: 만무
Now analysing: 명무
Now analysing: 모무
Now analysing: 무무
Now analysing: 미무
Now analysing: 민무
Now analysing: 박무
Now analysing: 반무
Now analysing: 방무
Now analysing: 배무
Now analys

Now analysing: 효박
Now analysing: 후박
Now analysing: 훈박
Now analysing: 희박
Now analysing: 가반
Now analysing: 간반
Now analysing: 감반
Now analysing: 강반
Now analysing: 개반
Now analysing: 건반
Now analysing: 경반
Now analysing: 계반
Now analysing: 고반
Now analysing: 공반
Now analysing: 관반
Now analysing: 광반
Now analysing: 교반
Now analysing: 구반
Now analysing: 규반
Now analysing: 근반
Now analysing: 기반
Now analysing: 단반
Now analysing: 담반
Now analysing: 대반
Now analysing: 도반
Now analysing: 동반
Now analysing: 람반
Now analysing: 량반
Now analysing: 려반
Now analysing: 령반
Now analysing: 로반
Now analysing: 리반
Now analysing: 린반
Now analysing: 만반
Now analysing: 명반
Now analysing: 모반
Now analysing: 무반
Now analysing: 미반
Now analysing: 민반
Now analysing: 박반
Now analysing: 반반
Now analysing: 방반
Now analysing: 배반
Now analysing: 병반
Now analysing: 보반
Now analysing: 복반
Now analysing: 봉반
Now analysing: 부반
Now analysing: 분반
Now analysing: 빈반
Now analysing: 사반
Now analysing: 상반
Now analysing: 서반
Now analysing: 석반
Now analysing: 설반
Now analys

Now analysing: 고보
Now analysing: 공보
Now analysing: 관보
Now analysing: 광보
Now analysing: 교보
Now analysing: 구보
Now analysing: 규보
Now analysing: 근보
Now analysing: 기보
Now analysing: 단보
Now analysing: 담보
Now analysing: 대보
Now analysing: 도보
Now analysing: 동보
Now analysing: 람보
Now analysing: 량보
Now analysing: 려보
Now analysing: 령보
Now analysing: 로보
Now analysing: 리보
Now analysing: 린보
Now analysing: 만보
Now analysing: 명보
Now analysing: 모보
Now analysing: 무보
Now analysing: 미보
Now analysing: 민보
Now analysing: 박보
Now analysing: 반보
Now analysing: 방보
Now analysing: 배보
Now analysing: 병보
Now analysing: 보보
Now analysing: 복보
Now analysing: 봉보
Now analysing: 부보
Now analysing: 분보
Now analysing: 빈보
Now analysing: 사보
Now analysing: 상보
Now analysing: 서보
Now analysing: 석보
Now analysing: 설보
Now analysing: 수보
Now analysing: 순보
Now analysing: 시보
Now analysing: 신보
Now analysing: 아보
Now analysing: 안보
Now analysing: 양보
Now analysing: 연보
Now analysing: 영보
Now analysing: 예보
Now analysing: 오보
Now analysing: 완보
Now analys

Now analysing: 도분
Now analysing: 동분
Now analysing: 람분
Now analysing: 량분
Now analysing: 려분
Now analysing: 령분
Now analysing: 로분
Now analysing: 리분
Now analysing: 린분
Now analysing: 만분
Now analysing: 명분
Now analysing: 모분
Now analysing: 무분
Now analysing: 미분
Now analysing: 민분
Now analysing: 박분
Now analysing: 반분
Now analysing: 방분
Now analysing: 배분
Now analysing: 병분
Now analysing: 보분
Now analysing: 복분
Now analysing: 봉분
Now analysing: 부분
Now analysing: 분분
Now analysing: 빈분
Now analysing: 사분
Now analysing: 상분
Now analysing: 서분
Now analysing: 석분
Now analysing: 설분
Now analysing: 수분
Now analysing: 순분
Now analysing: 시분
Now analysing: 신분
Now analysing: 아분
Now analysing: 안분
Now analysing: 양분
Now analysing: 연분
Now analysing: 영분
Now analysing: 예분
Now analysing: 오분
Now analysing: 완분
Now analysing: 요분
Now analysing: 용분
Now analysing: 우분
Now analysing: 운분
Now analysing: 원분
Now analysing: 위분
Now analysing: 유분
Now analysing: 윤분
Now analysing: 은분
Now analysing: 의분
Now analysing: 이분
Now analysing: 인분
Now analys

Now analysing: 무서
Now analysing: 미서
Now analysing: 민서
Now analysing: 박서
Now analysing: 반서
Now analysing: 방서
Now analysing: 배서
Now analysing: 병서
Now analysing: 보서
Now analysing: 복서
Now analysing: 봉서
Now analysing: 부서
Now analysing: 분서
Now analysing: 빈서
Now analysing: 사서
Now analysing: 상서
Now analysing: 서서
Now analysing: 석서
Now analysing: 설서
Now analysing: 수서
Now analysing: 순서
Now analysing: 시서
Now analysing: 신서
Now analysing: 아서
Now analysing: 안서
Now analysing: 양서
Now analysing: 연서
Now analysing: 영서
Now analysing: 예서
Now analysing: 오서
Now analysing: 완서
Now analysing: 요서
Now analysing: 용서
Now analysing: 우서
Now analysing: 운서
Now analysing: 원서
Now analysing: 위서
Now analysing: 유서
Now analysing: 윤서
Now analysing: 은서
Now analysing: 의서
Now analysing: 이서
Now analysing: 인서
Now analysing: 자서
Now analysing: 재서
Now analysing: 저서
Now analysing: 적서
Now analysing: 전서
Now analysing: 정서
Now analysing: 제서
Now analysing: 종서
Now analysing: 주서
Now analysing: 준서
Now analysing: 지서
Now analysing: 진서
Now analys

Now analysing: 분순
Now analysing: 빈순
Now analysing: 사순
Now analysing: 상순
Now analysing: 서순
Now analysing: 석순
Now analysing: 설순
Now analysing: 수순
Now analysing: 순순
Now analysing: 시순
Now analysing: 신순
Now analysing: 아순
Now analysing: 안순
Now analysing: 양순
Now analysing: 연순
Now analysing: 영순
Now analysing: 예순
Now analysing: 오순
Now analysing: 완순
Now analysing: 요순
Now analysing: 용순
Now analysing: 우순
Now analysing: 운순
Now analysing: 원순
Now analysing: 위순
Now analysing: 유순
Now analysing: 윤순
Now analysing: 은순
Now analysing: 의순
Now analysing: 이순
Now analysing: 인순
Now analysing: 자순
Now analysing: 재순
Now analysing: 저순
Now analysing: 적순
Now analysing: 전순
Now analysing: 정순
Now analysing: 제순
Now analysing: 종순
Now analysing: 주순
Now analysing: 준순
Now analysing: 지순
Now analysing: 진순
Now analysing: 차순
Now analysing: 찬순
Now analysing: 창순
Now analysing: 척순
Now analysing: 천순
Now analysing: 초순
Now analysing: 추순
Now analysing: 치순
Now analysing: 탁순
Now analysing: 태순
Now analysing: 하순
Now analysing: 한순
Now analys

In [58]:
gender_df_2 = pd.DataFrame.from_dict(gender_dict_2.tolist())

In [59]:
gender_df_2.to_excel('output2.xlsx')

In [60]:
gender_df_2

Unnamed: 0,female,male,name
0,352,96,주리
1,2,38,준리
2,19,22,지리
3,328,106,진리
4,17,26,차리
5,5,7,찬리
6,1,2,창리
7,0,10,척리
8,2,1,천리
9,13,3,초리


In [31]:
df_names_4 = df_names.head(round(len(df_names)*(3/4))).tail(round(len(df_names)/4))

In [40]:
gender_dict_3 = df_names_4['name'].apply(get_gender_data)

Now analysing: 안아
Now analysing: 양아
Now analysing: 연아
Now analysing: 영아
Now analysing: 예아
Now analysing: 오아
Now analysing: 완아
Now analysing: 요아
Now analysing: 용아
Now analysing: 우아
Now analysing: 운아
Now analysing: 원아
Now analysing: 위아
Now analysing: 유아
Now analysing: 윤아
Now analysing: 은아
Now analysing: 의아
Now analysing: 이아
Now analysing: 인아
Now analysing: 자아
Now analysing: 재아
Now analysing: 저아
Now analysing: 적아
Now analysing: 전아
Now analysing: 정아
Now analysing: 제아
Now analysing: 종아
Now analysing: 주아
Now analysing: 준아
Now analysing: 지아
Now analysing: 진아
Now analysing: 차아
Now analysing: 찬아
Now analysing: 창아
Now analysing: 척아
Now analysing: 천아
Now analysing: 초아
Now analysing: 추아
Now analysing: 치아
Now analysing: 탁아
Now analysing: 태아
Now analysing: 하아
Now analysing: 한아
Now analysing: 항아
Now analysing: 해아
Now analysing: 현아
Now analysing: 형아
Now analysing: 호아
Now analysing: 환아
Now analysing: 황아
Now analysing: 회아
Now analysing: 효아
Now analysing: 후아
Now analysing: 훈아
Now analysing: 희아
Now analys

Now analysing: 위영
Now analysing: 유영
Now analysing: 윤영
Now analysing: 은영
Now analysing: 의영
Now analysing: 이영
Now analysing: 인영
Now analysing: 자영
Now analysing: 재영
Now analysing: 저영
Now analysing: 적영
Now analysing: 전영
Now analysing: 정영
Now analysing: 제영
Now analysing: 종영
Now analysing: 주영
Now analysing: 준영
Now analysing: 지영
Now analysing: 진영
Now analysing: 차영
Now analysing: 찬영
Now analysing: 창영
Now analysing: 척영
Now analysing: 천영
Now analysing: 초영
Now analysing: 추영
Now analysing: 치영
Now analysing: 탁영
Now analysing: 태영
Now analysing: 하영
Now analysing: 한영
Now analysing: 항영
Now analysing: 해영
Now analysing: 현영
Now analysing: 형영
Now analysing: 호영
Now analysing: 환영
Now analysing: 황영
Now analysing: 회영
Now analysing: 효영
Now analysing: 후영
Now analysing: 훈영
Now analysing: 희영
Now analysing: 가예
Now analysing: 간예
Now analysing: 감예
Now analysing: 강예
Now analysing: 개예
Now analysing: 건예
Now analysing: 경예
Now analysing: 계예
Now analysing: 고예
Now analysing: 공예
Now analysing: 관예
Now analysing: 광예
Now analys

Now analysing: 정요
Now analysing: 제요
Now analysing: 종요
Now analysing: 주요
Now analysing: 준요
Now analysing: 지요
Now analysing: 진요
Now analysing: 차요
Now analysing: 찬요
Now analysing: 창요
Now analysing: 척요
Now analysing: 천요
Now analysing: 초요
Now analysing: 추요
Now analysing: 치요
Now analysing: 탁요
Now analysing: 태요
Now analysing: 하요
Now analysing: 한요
Now analysing: 항요
Now analysing: 해요
Now analysing: 현요
Now analysing: 형요
Now analysing: 호요
Now analysing: 환요
Now analysing: 황요
Now analysing: 회요
Now analysing: 효요
Now analysing: 후요
Now analysing: 훈요
Now analysing: 희요
Now analysing: 가용
Now analysing: 간용
Now analysing: 감용
Now analysing: 강용
Now analysing: 개용
Now analysing: 건용
Now analysing: 경용
Now analysing: 계용
Now analysing: 고용
Now analysing: 공용
Now analysing: 관용
Now analysing: 광용
Now analysing: 교용
Now analysing: 구용
Now analysing: 규용
Now analysing: 근용
Now analysing: 기용
Now analysing: 단용
Now analysing: 담용
Now analysing: 대용
Now analysing: 도용
Now analysing: 동용
Now analysing: 람용
Now analysing: 량용
Now analys

Now analysing: 초원
Now analysing: 추원
Now analysing: 치원
Now analysing: 탁원
Now analysing: 태원
Now analysing: 하원
Now analysing: 한원
Now analysing: 항원
Now analysing: 해원
Now analysing: 현원
Now analysing: 형원
Now analysing: 호원
Now analysing: 환원
Now analysing: 황원
Now analysing: 회원
Now analysing: 효원
Now analysing: 후원
Now analysing: 훈원
Now analysing: 희원
Now analysing: 가위
Now analysing: 간위
Now analysing: 감위
Now analysing: 강위
Now analysing: 개위
Now analysing: 건위
Now analysing: 경위
Now analysing: 계위
Now analysing: 고위
Now analysing: 공위
Now analysing: 관위
Now analysing: 광위
Now analysing: 교위
Now analysing: 구위
Now analysing: 규위
Now analysing: 근위
Now analysing: 기위
Now analysing: 단위
Now analysing: 담위
Now analysing: 대위
Now analysing: 도위
Now analysing: 동위
Now analysing: 람위
Now analysing: 량위
Now analysing: 려위
Now analysing: 령위
Now analysing: 로위
Now analysing: 리위
Now analysing: 린위
Now analysing: 만위
Now analysing: 명위
Now analysing: 모위
Now analysing: 무위
Now analysing: 미위
Now analysing: 민위
Now analysing: 박위
Now analys

Now analysing: 환은
Now analysing: 황은
Now analysing: 회은
Now analysing: 효은
Now analysing: 후은
Now analysing: 훈은
Now analysing: 희은
Now analysing: 가의
Now analysing: 간의
Now analysing: 감의
Now analysing: 강의
Now analysing: 개의
Now analysing: 건의
Now analysing: 경의
Now analysing: 계의
Now analysing: 고의
Now analysing: 공의
Now analysing: 관의
Now analysing: 광의
Now analysing: 교의
Now analysing: 구의
Now analysing: 규의
Now analysing: 근의
Now analysing: 기의
Now analysing: 단의
Now analysing: 담의
Now analysing: 대의
Now analysing: 도의
Now analysing: 동의
Now analysing: 람의
Now analysing: 량의
Now analysing: 려의
Now analysing: 령의
Now analysing: 로의
Now analysing: 리의
Now analysing: 린의
Now analysing: 만의
Now analysing: 명의
Now analysing: 모의
Now analysing: 무의
Now analysing: 미의
Now analysing: 민의
Now analysing: 박의
Now analysing: 반의
Now analysing: 방의
Now analysing: 배의
Now analysing: 병의
Now analysing: 보의
Now analysing: 복의
Now analysing: 봉의
Now analysing: 부의
Now analysing: 분의
Now analysing: 빈의
Now analysing: 사의
Now analysing: 상의
Now analys

Now analysing: 건재
Now analysing: 경재
Now analysing: 계재
Now analysing: 고재
Now analysing: 공재
Now analysing: 관재
Now analysing: 광재
Now analysing: 교재
Now analysing: 구재
Now analysing: 규재
Now analysing: 근재
Now analysing: 기재
Now analysing: 단재
Now analysing: 담재
Now analysing: 대재
Now analysing: 도재
Now analysing: 동재
Now analysing: 람재
Now analysing: 량재
Now analysing: 려재
Now analysing: 령재
Now analysing: 로재
Now analysing: 리재
Now analysing: 린재
Now analysing: 만재
Now analysing: 명재
Now analysing: 모재
Now analysing: 무재
Now analysing: 미재
Now analysing: 민재
Now analysing: 박재
Now analysing: 반재
Now analysing: 방재
Now analysing: 배재
Now analysing: 병재
Now analysing: 보재
Now analysing: 복재
Now analysing: 봉재
Now analysing: 부재
Now analysing: 분재
Now analysing: 빈재
Now analysing: 사재
Now analysing: 상재
Now analysing: 서재
Now analysing: 석재
Now analysing: 설재
Now analysing: 수재
Now analysing: 순재
Now analysing: 시재
Now analysing: 신재
Now analysing: 아재
Now analysing: 안재
Now analysing: 양재
Now analysing: 연재
Now analysing: 영재
Now analys

Now analysing: 단정
Now analysing: 담정
Now analysing: 대정
Now analysing: 도정
Now analysing: 동정
Now analysing: 람정
Now analysing: 량정
Now analysing: 려정
Now analysing: 령정
Now analysing: 로정
Now analysing: 리정
Now analysing: 린정
Now analysing: 만정
Now analysing: 명정
Now analysing: 모정
Now analysing: 무정
Now analysing: 미정
Now analysing: 민정
Now analysing: 박정
Now analysing: 반정
Now analysing: 방정
Now analysing: 배정
Now analysing: 병정
Now analysing: 보정
Now analysing: 복정
Now analysing: 봉정
Now analysing: 부정
Now analysing: 분정
Now analysing: 빈정
Now analysing: 사정
Now analysing: 상정
Now analysing: 서정
Now analysing: 석정
Now analysing: 설정
Now analysing: 수정
Now analysing: 순정
Now analysing: 시정
Now analysing: 신정
Now analysing: 아정
Now analysing: 안정
Now analysing: 양정
Now analysing: 연정
Now analysing: 영정
Now analysing: 예정
Now analysing: 오정
Now analysing: 완정
Now analysing: 요정
Now analysing: 용정
Now analysing: 우정
Now analysing: 운정
Now analysing: 원정
Now analysing: 위정
Now analysing: 유정
Now analysing: 윤정
Now analysing: 은정
Now analys

In [41]:
gender_df_3 = pd.DataFrame.from_dict(gender_dict_3.tolist())

In [42]:
gender_df_3.to_excel('output3.xlsx')