# Creating the Dataset

In [92]:
import re
from urllib.parse import quote

import numpy as np
import pandas as pd
import tabula

In [4]:
# Path of the source PDF, relative to the notebook's working directory.
hanja_file = 'hanja.pdf'

# Let tabula extract every table on every page; the result is a list with
# one DataFrame per detected table.
list_hanja = tabula.read_pdf(
    hanja_file,
    pages="all",
    multiple_tables=True,
)
list_hanja

[      0                                            1
 0   NaN                     한문 교육용 인명용 추가 한자 및 허용 한자
 1    한글                                         기초한자
 2   NaN                          (2007.8.현재) 별표1 별표2
 3   NaN                 家佳街可歌 嘉嫁稼賈駕伽迦柯呵哥枷珂痂苛茄袈訶跏軻哿嘏舸
 4     가                                          NaN
 5   NaN                                      加價假架暇 珈
 6   NaN                                        各角脚閣却
 7     각                                    珏恪殼愨 愨(慤)
 8   NaN                                           覺刻
 9   NaN                                        干間看刊肝
 10    간                 艮侃杆玕竿揀諫墾栞奸柬澗磵稈艱癇忓矸 杆(桿) 癇(癎)
 11  NaN                                         幹簡姦懇
 12    갈                                  渴 葛乫喝曷碣竭褐蝎鞨
 13  NaN                                        甘減感敢監
 14    감                          勘堪瞰坎嵌憾戡柑橄疳紺邯龕玪 鑑(鑒)
 15  NaN                                            鑑
 16    갑                                      甲 鉀匣岬胛閘
 17  NaN  江降講強康 杠堈岡姜橿彊慷畺疆糠絳羌

In [5]:
# Number of tables that tabula extracted from the PDF.
len(list_hanja)

53

In [6]:
# Check the container type returned by read_pdf.
type(list_hanja)

list

In [7]:
# Inspect the first extracted table.
list_hanja[0]

Unnamed: 0,0,1
0,,한문 교육용 인명용 추가 한자 및 허용 한자
1,한글,기초한자
2,,(2007.8.현재) 별표1 별표2
3,,家佳街可歌 嘉嫁稼賈駕伽迦柯呵哥枷珂痂苛茄袈訶跏軻哿嘏舸
4,가,
5,,加價假架暇 珈
6,,各角脚閣却
7,각,珏恪殼愨 愨(慤)
8,,覺刻
9,,干間看刊肝


In [8]:
# Check the type of a single extracted table.
type(list_hanja[0])

pandas.core.frame.DataFrame

`list_hanja` is a list of 53 DataFrames; we need to combine them into one and clean it up.

In [11]:
# Initialise df_hanja as an empty DataFrame.
df_hanja = pd.DataFrame()

In [12]:
# Stack the per-table frames into one DataFrame in a single pass.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
# calling it in a loop re-copies the accumulated frame on every iteration.
# Building from list_hanja directly also makes this cell safe to re-run
# (the append loop would stack the tables a second time).
# Row labels are kept as extracted, so they repeat across tables.
df_hanja = pd.concat(list_hanja)
df_hanja

Unnamed: 0,0,1,2,3,4,5
0,,한문 교육용 인명용 추가 한자 및 허용 한자,,,,
1,한글,기초한자,,,,
2,,(2007.8.현재) 별표1 별표2,,,,
3,,家佳街可歌 嘉嫁稼賈駕伽迦柯呵哥枷珂痂苛茄袈訶跏軻哿嘏舸,,,,
4,가,,,,,
5,,加價假架暇 珈,,,,
6,,各角脚閣却,,,,
7,각,珏恪殼愨 愨(慤),,,,
8,,覺刻,,,,
9,,干間看刊肝,,,,


There is a lot of superfluous data, so we will remove it.

In [13]:
# Rows that repeat the table header: '한글' in column 0 or '별표1' in column 2.
is_header_row = (df_hanja[0] == '한글') | (df_hanja[2] == '별표1')

# Drop the header rows, then skip the first two remaining rows (positional).
df_hanja = df_hanja[~is_header_row].iloc[2:]
df_hanja

Unnamed: 0,0,1,2,3,4,5
3,,家佳街可歌 嘉嫁稼賈駕伽迦柯呵哥枷珂痂苛茄袈訶跏軻哿嘏舸,,,,
4,가,,,,,
5,,加價假架暇 珈,,,,
6,,各角脚閣却,,,,
7,각,珏恪殼愨 愨(慤),,,,
8,,覺刻,,,,
9,,干間看刊肝,,,,
10,간,艮侃杆玕竿揀諫墾栞奸柬澗磵稈艱癇忓矸 杆(桿) 癇(癎),,,,
11,,幹簡姦懇,,,,
12,갈,渴 葛乫喝曷碣竭褐蝎鞨,,,,


In [14]:
# Give the positional tabula columns descriptive names.
hanja_column_names = {
    0: 'hangul',
    1: 'hanja',
    2: 'hanja2',
    3: 'hanja3',
    4: 'hanja4',
    5: 'hanja5',
}
df_hanja = df_hanja.rename(columns=hanja_column_names)

In [15]:
# Preview the first 25 rows.
df_hanja.head(25)

Unnamed: 0,hangul,hanja,hanja2,hanja3,hanja4,hanja5
3,,家佳街可歌 嘉嫁稼賈駕伽迦柯呵哥枷珂痂苛茄袈訶跏軻哿嘏舸,,,,
4,가,,,,,
5,,加價假架暇 珈,,,,
6,,各角脚閣却,,,,
7,각,珏恪殼愨 愨(慤),,,,
8,,覺刻,,,,
9,,干間看刊肝,,,,
10,간,艮侃杆玕竿揀諫墾栞奸柬澗磵稈艱癇忓矸 杆(桿) 癇(癎),,,,
11,,幹簡姦懇,,,,
12,갈,渴 葛乫喝曷碣竭褐蝎鞨,,,,


The hanja are out of step with the corresponding hangul; we will fix this next. It seems that part of each syllable's hanja appears in the rows above and below the hangul row, resulting in NaN values. Because the layout is inconsistent, we have to merge these rows manually.

In [16]:
# Stitch hanja fragments back onto their hangul rows. In the extracted table
# the hanja for one syllable are split across the hangul row and the rows
# directly above and below it. All indices are positional (.iat); column 1 is
# 'hanja'.
#   `=`  : the hangul row's own hanja cell is overwritten with above + below
#          (the first target row, 가, has no hanja of its own in the preview;
#          presumably the same holds for position 15 — confirm).
#   `+=` : the hangul row already holds text, so above + below is appended
#          after it (own text, then above, then below).
# NOTE(review): str() would turn a NaN neighbour into the literal 'nan'; this
# assumes every referenced neighbour cell holds text — confirm.
# NOTE(review): re-running this cell appends the fragments again on the `+=`
# rows, so it must be executed exactly once per fresh df_hanja.
df_hanja.iat[1,1] = str(df_hanja.iat[0,1]) + str(df_hanja.iat[2,1])
df_hanja.iat[4,1] += str(df_hanja.iat[3,1]) + str(df_hanja.iat[5,1])
df_hanja.iat[7,1] += str(df_hanja.iat[6,1]) + str(df_hanja.iat[8,1])
df_hanja.iat[11,1] += str(df_hanja.iat[10,1]) + str(df_hanja.iat[12,1])
df_hanja.iat[15,1] = str(df_hanja.iat[14,1]) + str(df_hanja.iat[16,1])
df_hanja.iat[18,1] += str(df_hanja.iat[17,1]) + str(df_hanja.iat[19,1])
df_hanja

Unnamed: 0,hangul,hanja,hanja2,hanja3,hanja4,hanja5
3,,家佳街可歌 嘉嫁稼賈駕伽迦柯呵哥枷珂痂苛茄袈訶跏軻哿嘏舸,,,,
4,가,家佳街可歌 嘉嫁稼賈駕伽迦柯呵哥枷珂痂苛茄袈訶跏軻哿嘏舸加價假架暇 珈,,,,
5,,加價假架暇 珈,,,,
6,,各角脚閣却,,,,
7,각,珏恪殼愨 愨(慤)各角脚閣却覺刻,,,,
8,,覺刻,,,,
9,,干間看刊肝,,,,
10,간,艮侃杆玕竿揀諫墾栞奸柬澗磵稈艱癇忓矸 杆(桿) 癇(癎)干間看刊肝幹簡姦懇,,,,
11,,幹簡姦懇,,,,
12,갈,渴 葛乫喝曷碣竭褐蝎鞨,,,,


Combine all hanja columns

In [17]:
# Keep only the rows that carry a hangul syllable.
has_hangul = df_hanja['hangul'].notna()
df_hanja = df_hanja[has_hangul]

# Blank out missing cells so the string concatenation below never meets NaN.
df_hanja = df_hanja.replace(np.nan, '', regex=True)

# Fold the overflow columns into 'hanja', left to right, then discard them.
overflow_columns = ['hanja2', 'hanja3', 'hanja4', 'hanja5']
df_hanja['hanja'] = (
    df_hanja['hanja']
    + df_hanja['hanja2']
    + df_hanja['hanja3']
    + df_hanja['hanja4']
    + df_hanja['hanja5']
)
df_hanja = df_hanja.drop(columns=overflow_columns)
df_hanja

Unnamed: 0,hangul,hanja
4,가,家佳街可歌 嘉嫁稼賈駕伽迦柯呵哥枷珂痂苛茄袈訶跏軻哿嘏舸加價假架暇 珈
7,각,珏恪殼愨 愨(慤)各角脚閣却覺刻
10,간,艮侃杆玕竿揀諫墾栞奸柬澗磵稈艱癇忓矸 杆(桿) 癇(癎)干間看刊肝幹簡姦懇
12,갈,渴 葛乫喝曷碣竭褐蝎鞨
14,감,勘堪瞰坎嵌憾戡柑橄疳紺邯龕玪 鑑(鑒)甘減感敢監鑑
16,갑,甲 鉀匣岬胛閘
18,강,江降講強康 杠堈岡姜橿彊慷畺疆糠絳羌腔舡薑鱇嫝跭 玒顜茳 強(强) 鋼(鎠) 岡(崗)剛鋼綱...
21,개,价凱愷漑塏愾疥芥豈鎧玠 個(箇) 蓋(盖)改皆個開介慨槪蓋
2,객,客喀
3,갱,更坑粳羹


Finally we need to reset the index

In [18]:
# Renumber the rows 0..n-1, discarding the old labels, and keep the first
# two columns (hangul, hanja).
df_hanja = df_hanja.reset_index(drop=True).iloc[:, 0:2]
df_hanja

Unnamed: 0,hangul,hanja
0,가,家佳街可歌 嘉嫁稼賈駕伽迦柯呵哥枷珂痂苛茄袈訶跏軻哿嘏舸加價假架暇 珈
1,각,珏恪殼愨 愨(慤)各角脚閣却覺刻
2,간,艮侃杆玕竿揀諫墾栞奸柬澗磵稈艱癇忓矸 杆(桿) 癇(癎)干間看刊肝幹簡姦懇
3,갈,渴 葛乫喝曷碣竭褐蝎鞨
4,감,勘堪瞰坎嵌憾戡柑橄疳紺邯龕玪 鑑(鑒)甘減感敢監鑑
5,갑,甲 鉀匣岬胛閘
6,강,江降講強康 杠堈岡姜橿彊慷畺疆糠絳羌腔舡薑鱇嫝跭 玒顜茳 強(强) 鋼(鎠) 岡(崗)剛鋼綱...
7,개,价凱愷漑塏愾疥芥豈鎧玠 個(箇) 蓋(盖)改皆個開介慨槪蓋
8,객,客喀
9,갱,更坑粳羹


In [22]:
# Number of rows remaining in the cleaned table.
len(df_hanja)

453

We have 453 name parts, from which we can construct over 200,000 two-syllable names (453² = 205,209). This is too computationally expensive for testing purposes, so we will work with a subset. We will assume that the number of hanja listed for a syllable correlates with how often that syllable appears in names.

Get rid of brackets around characters

In [35]:
# Remove the parentheses that wrap variant characters, e.g. 愨(慤) -> 愨慤.
# A raw-string character class avoids the invalid '\(' / '\)' escape
# sequences (a SyntaxWarning on recent Python versions) and strips both
# bracket kinds in one pass over the frame.
df_hanja = df_hanja.replace(r'[()]', '', regex=True)

In [38]:
# Length of each hanja string, used as a proxy for how common the syllable is.
# NOTE(review): len() counts every character in the cell, including the
# spaces and '\r' separators left over from the PDF, so the count is somewhat
# higher than the number of hanja. The cut-off used further down was read
# from these values.
df_hanja = df_hanja.assign(hanja_count=df_hanja['hanja'].map(len))
df_hanja

Unnamed: 0,hangul,hanja,hanja_count
0,가,家佳街可歌 嘉嫁稼賈駕伽迦柯呵哥枷珂痂苛茄袈訶跏軻哿嘏舸加價假架暇 珈,35
1,각,珏恪殼愨 愨慤各角脚閣却覺刻,14
2,간,艮侃杆玕竿揀諫墾栞奸柬澗磵稈艱癇忓矸 杆桿 癇癎干間看刊肝幹簡姦懇,33
3,갈,渴 葛乫喝曷碣竭褐蝎鞨,11
4,감,勘堪瞰坎嵌憾戡柑橄疳紺邯龕玪 鑑鑒甘減感敢監鑑,23
5,갑,甲 鉀匣岬胛閘,7
6,강,江降講強康 杠堈岡姜橿彊慷畺疆糠絳羌腔舡薑鱇嫝跭 玒顜茳 強强 鋼鎠 岡崗剛鋼綱 鏹 襁,44
7,개,价凱愷漑塏愾疥芥豈鎧玠 個箇 蓋盖改皆個開介慨槪蓋,25
8,객,客喀,2
9,갱,更坑粳羹,4


In [40]:
# Summary statistics of the hanja_count column.
df_hanja['hanja_count'].describe()

count    453.000000
mean      12.997792
std       14.257974
min        1.000000
25%        4.000000
50%        7.000000
75%       18.000000
max       92.000000
Name: hanja_count, dtype: float64

Let's take the top 25% of syllables by hanja count, i.e. those at or above the 75th percentile (`hanja_count >= 18`).

In [47]:
# Initialise df_subset as an empty DataFrame.
df_subset = pd.DataFrame()

In [48]:
# Keep the syllables whose hanja_count is at or above the 75th percentile.
# The cut-off is derived from the data (it is the "75%" row of describe(),
# i.e. 18 for this dataset) rather than hard-coded, so it follows the table
# if the source PDF or the cleaning steps change.
count_cutoff = df_hanja['hanja_count'].quantile(0.75)

# Select the columns by name and renumber the rows 0..n-1; later cells index
# df_subset by those consecutive labels.
df_subset = (
    df_hanja.loc[
        df_hanja['hanja_count'] >= count_cutoff,
        ['hangul', 'hanja', 'hanja_count'],
    ]
    .reset_index(drop=True)
)
df_subset

Unnamed: 0,hangul,hanja,hanja_count
0,가,家佳街可歌 嘉嫁稼賈駕伽迦柯呵哥枷珂痂苛茄袈訶跏軻哿嘏舸加價假架暇 珈,35
1,간,艮侃杆玕竿揀諫墾栞奸柬澗磵稈艱癇忓矸 杆桿 癇癎干間看刊肝幹簡姦懇,33
2,감,勘堪瞰坎嵌憾戡柑橄疳紺邯龕玪 鑑鑒甘減感敢監鑑,23
3,강,江降講強康 杠堈岡姜橿彊慷畺疆糠絳羌腔舡薑鱇嫝跭 玒顜茳 強强 鋼鎠 岡崗剛鋼綱 鏹 襁,44
4,개,价凱愷漑塏愾疥芥豈鎧玠 個箇 蓋盖改皆個開介慨槪蓋,25
5,거,去巨居車擧\r距拒據渠遽鉅炬倨据祛踞鋸,18
6,건,建乾件健巾虔楗鍵愆腱蹇騫搴湕踺建䢖 乾漧,20
7,경,京景經庚耕\r敬輕驚慶競\r竟境鏡頃傾\r硬警徑卿倞鯨坰耿炅更梗憬璟瓊擎儆俓涇莖勁逕熲冏勍\...,74
8,계,癸季界計溪\r鷄系係戒械\r繼契桂啓階\r繫誡烓屆悸棨稽谿界堺 谿磎,31
9,고,古故固苦高\r考告枯姑庫\r孤鼓稿顧叩敲皐暠呱尻拷槁沽痼睾羔股膏苽菰藁蠱袴誥\r賈辜錮雇杲鼔...,48


In [49]:
# Number of rows in the subset.
len(df_subset)

116

This is a little more manageable; we will construct all possible 2-syllable names from these. Our assumption seems relatively sound: many common name particles are present in this data set.

In [94]:
# Sanity check: are the syllables 수 and 현 part of the subset?
df_subset[df_subset['hangul'].isin(['수', '현'])]

Unnamed: 0,hangul,hanja,hanja_count
52,수,水手受授首\r守收誰須雖\r愁樹壽數修\r秀囚需帥殊\r隨輸獸睡遂\r垂搜洙琇銖粹穗繡隋髓袖...,92
104,현,現賢玄絃縣\r懸顯見峴晛泫炫玹鉉眩昡絢呟俔睍舷衒弦儇譞怰䧋\r鋗㢺琄嬛娊妶灦㭹顯顕,39


In [84]:
# Empty frame with one column per syllable position of a two-syllable name.
df_names = pd.DataFrame(columns=['syllable_1','syllable_2'])

In [85]:
# Build every ordered pair of subset syllables (first syllable varies
# slowest), one row per pair, numbered 0..n*n-1.
# The frame is created in one call from a list of tuples; writing cell by
# cell with .at enlarges the DataFrame on every new row, which is very slow
# for the n*n rows produced here.
syllables = df_subset['hangul'].tolist()
df_names = pd.DataFrame(
    [(first, second) for first in syllables for second in syllables],
    columns=['syllable_1', 'syllable_2'],
)

df_names.head(10)

Unnamed: 0,syllable_1,syllable_2
0,가,가
1,가,간
2,가,감
3,가,강
4,가,개
5,가,거
6,가,건
7,가,경
8,가,계
9,가,고


In [86]:
# Number of generated syllable pairs.
len(df_names)

13456

In [91]:
# Full name = first syllable followed by second syllable.
df_names['name'] = df_names['syllable_1'] + df_names['syllable_2']

# Put 'name' first using an explicit column order. Rotating the column list
# (last column to the front) gives the same result on the first run but
# scrambles the order each time the cell is re-executed.
cols = ['name', 'syllable_1', 'syllable_2']
df_names = df_names[cols]
df_names.head(10)

Unnamed: 0,name,syllable_1,syllable_2
0,가가,가,가
1,가간,가,간
2,가감,가,감
3,가강,가,강
4,가개,가,개
5,가거,가,거
6,가건,가,건
7,가경,가,경
8,가계,가,계
9,가고,가,고


# Mining Name Data from the Website

In [125]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

In [123]:
def convert_address(example_char: str) -> str:
    """Return the koreanname.me lookup URL for a name.

    Parameters
    ----------
    example_char : str
        The name to look up, e.g. '수현'.

    Returns
    -------
    str
        'https://koreanname.me/name/' followed by the UTF-8 percent-encoded
        name (upper-case hex digits), e.g. '.../%EC%88%98%ED%98%84'.
    """
    # Percent-encode with the standard library. Rewriting the repr() of the
    # encoded bytes ("b'\\xec...'") by string replacement corrupts any input
    # that contains 'x', a quote or a backslash, and upper-cases ASCII letters.
    # safe='' also escapes '/', because the name is a single path segment.
    return 'https://koreanname.me/name/' + quote(example_char, safe='')

In [127]:
# Example: build the lookup URL for the name '수현'.
convert_address('수현')

'https://koreanname.me/name/%EC%88%98%ED%98%84'

Needed websites (erumy)
- http://www.erumy.com/nameAnalyze/AnalyzeMyName.aspx?name=송시안
- http://www.erumy.com/nameclub/NameResultList.aspx?name=%EC%8B%9C%EC%95%88
- http://www.erumy.com/nameclub/NameSearch.aspx?searchname=%EC%8B%9C%EC%95%88
- http://www.erumy.com/nameclub/NameResultChartYear.aspx?name=%EC%8B%9C%EC%95%88
- http://www.erumy.com/nameclub/NameResultChartLoc.aspx?name=%EC%8B%9C%EC%95%88 [line 78]

In [128]:
# Fetch the analysis page and parse it. The response is opened in a `with`
# block so the HTTP connection is closed as soon as the HTML has been read,
# instead of staying open until garbage collection.
with urlopen('http://www.erumy.com/nameAnalyze/AnalyzeMyName.aspx?name=%ec%86%a1%ec%84%a0%ec%9a%b0') as page:
    soup = BeautifulSoup(page, 'html.parser')
soup

<!DOCTYPE html>
<html lang="ko"><head><meta charset="utf-8"/><meta content="width=device-width, initial-scale=1, shrink-to-fit=no" name="viewport"/><meta content="이름,선호이름,신생아이름,이름통계,아기이름,작명,선호하는,아기,개명,한국이름" name="keywords"/><meta content="kimkkikki" name="author"/><meta content="https://koreanname.me/static/home.jpg" name="image"/><meta content="한국인의 이름 통계 서비스" name="description"/><meta content="한국인의 이름 통계 서비스" property="og:description"/><meta content="https://koreanname.me/static/home.jpg" property="og:image"/><meta content="a9df23b43bfd8a4b2706a8fc4ec01218e8427938" name="naver-site-verification"/><meta content="54A117C9AC83B1E74B48C93F9A779987" name="msvalidate.01"/><meta content="4DkPnpJFXVbFpfcLWF8cpznQifPoJXx9OE7sVN81MH4" name="google-site-verification"/><link href="/static/favicon.jpg" rel="icon" type="image/x-icon"/><script async="" src="//pagead2.googlesyndication.com/pagead/js/adsbygoogle.js"></script><script src="//developers.kakao.com/sdk/js/kakao.min.js"></script><script as