In [None]:
from bs4 import BeautifulSoup
import requests as r
import pickle
import re

### Scrape English and Korean addresses
- The website used is jusoga.com

In [None]:
# Download the jusoga.com front page and parse it for the province links.
url = "https://www.jusoga.com"
# A timeout keeps the cell from hanging forever if the site stops responding.
response = r.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page.
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, "html.parser")
print(soup)

In [None]:
# Every anchor whose href contains "jusoga.com/" is treated as a candidate province link.
provinces_tags = soup.find_all(
    "a",
    attrs={"href": re.compile(r'jusoga\.com/')},
)
# The last matching anchor is not a province, so it is left out.
province = [tag.text for tag in provinces_tags[:-1]]

In [None]:
# Show the scraped province names for a quick visual check.
province

In [None]:
# Visit every province page and collect the anchor tags that link to its cities.
cities_tags = []
for name in province:
    new_path = f"{url}/{name}"
    # A timeout keeps one unresponsive page from stalling the whole crawl.
    response = r.get(new_path, timeout=30)
    if response.status_code != 200:
        # Best effort: report the page that was skipped and carry on.
        print(f"Skipping {new_path}: HTTP {response.status_code}")
        continue
    html = response.text
    soup = BeautifulSoup(html, "html.parser")
    # Escape the name so it is matched literally rather than as a regex.
    city_tags = soup.find_all("a", {"href": re.compile(re.escape(name))})
    # The last match is not kept (presumably it is not a city link — confirm).
    cities_tags.extend(city_tags[:-1])
# get urls of each city
cities_urls = [tag.get("href") for tag in cities_tags]

In [None]:
# cities_urls

In [None]:
# Visit every city page and collect the anchor tags that link to its regions.
regions_tags = []
# The loop variable is deliberately not called `url`: that name holds the base
# site address used to build the province paths, and must not be overwritten.
for city_url in cities_urls:
    # A timeout keeps one unresponsive page from stalling the whole crawl.
    response = r.get(city_url, timeout=30)
    if response.status_code != 200:
        # Best effort: report the page that was skipped and carry on.
        print(f"Skipping {city_url}: HTTP {response.status_code}")
        continue
    html = response.text
    soup = BeautifulSoup(html, "html.parser")
    # Escape the URL so "." and other metacharacters are matched literally.
    region_tags = soup.find_all("a", {"href": re.compile(re.escape(city_url))})
    # The last match is not kept (presumably it is not a region link — confirm).
    regions_tags.extend(region_tags[:-1])
# get urls of each region
regions_urls = [tag.get("href") for tag in regions_tags]

In [None]:
# Persist the scraped region URLs to disk.
with open("url_jusoga", "wb") as url_file:
    pickle.dump(regions_urls, url_file)

In [None]:
# Reload the saved region URLs. Only unpickle files you created yourself.
with open("url_jusoga", "rb") as url_file:
    regions_urls = pickle.load(url_file)

In [None]:
# regions_urls

## Multiprocessing
- Refer to `webscraping.py`

## Preprocessing:
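- English address components are listed in the reverse order of the Korean ones, so they are reversed to line up with the Korean tokens
- Create a dictionary in which the keys are the Korean address tokens and the values are the corresponding English tokens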

In [None]:
# Process English tokens:
# Assume the first field is always the address number, which is removed
def preprocess_en(address):
    """Split a comma-separated English address into tokens in reversed order.

    The first comma-separated field is assumed to be the address number and
    is discarded. The remaining fields are stripped of surrounding whitespace
    and returned last-to-first.
    """
    _number, *parts = address.split(',')
    return [part.strip() for part in reversed(parts)]

In [None]:
# Sanity check: the leading "310-2" is dropped and the remaining parts are reversed,
# giving ['Gyeongsangnam-do', 'Changwon-si', 'Masanhappo-gu', '3·15-daero'].
preprocess_en('310-2, 3·15-daero, Masanhappo-gu, Changwon-si, Gyeongsangnam-do')

In [None]:
# Preprocess Korean tokens:
# Remove whatever is inside the parentheses, and remove the trailing number
def preprocess_kr(address):
    """Split a Korean address into tokens, dropping the parts with no English match.

    Removed before splitting:
      * everything from the first "(" to the last ")" (greedy on purpose, so a
        parenthesised group that itself contains parentheses goes in one piece),
      * the marker "지하 ",
      * "광역시" and "특별시", which are not implemented in English,
      * "특별자치", so [제주|세종]특별자치[시|도] lines up with Sejong-si / Jeju-do.

    The text is then split on any run of whitespace, so repeated spaces never
    produce empty tokens. The last token is assumed to be the address number
    and is discarded.
    """
    substituted = re.sub(r'\(.+\)', '', address)
    for unwanted in ('지하 ', '광역시', '특별시', '특별자치'):
        substituted = substituted.replace(unwanted, '')
    # split() with no argument collapses repeated whitespace and trims both ends.
    return substituted.split()[:-1]

In [None]:
# Sanity check: the parenthesised part, "지하 " and the trailing number are removed,
# giving ['경기도', '과천시', '별양로'].
preprocess_kr('경기도 과천시 별양로 지하 177 (별양동)')

In [None]:
def generate_dict(list_of_pairs):
    """Build a Korean-token -> English-token mapping from address pairs.

    Each pair is (Korean address, English address). Both sides are tokenised
    with preprocess_kr / preprocess_en and matched position by position.
    When a Korean token occurs more than once, the most recent value wins.
    An AssertionError is raised if the two token lists differ in length.
    """
    mapping = {}
    for kor_addr, en_addr in list_of_pairs:
        kor_tokens = preprocess_kr(kor_addr)
        en_tokens = preprocess_en(en_addr)
        assert len(kor_tokens) == len(en_tokens)
        mapping.update(zip(kor_tokens, en_tokens))
    return mapping

In [None]:
# NOTE(review): `pairs` is not defined anywhere in this notebook; presumably it is the
# list of (Korean address, English address) pairs produced by `webscraping.py` —
# confirm, and load it before running this cell.
result_dict = generate_dict(pairs)

In [None]:
# result_dict

In [None]:
# Number of entries right after the dictionary is built.
len(result_dict)

In [None]:
# Change of plan: collect the keys that contain a digit. They are re-added to
# the dictionary in trimmed form, and the originals are removed afterwards.
remove_list = [
    key
    for key in result_dict
    if any(map(str.isdigit, key))
]

In [4]:
# remove_list

In [None]:
def edit_and_append(addr, result_dict):
    """Re-add the entry stored under *addr* with the numbered tail cut off.

    Everything from the first ASCII digit onwards is removed from *addr*; the
    stripped remainder becomes the new key. Its value is the first
    whitespace-separated word of the English text stored under *addr*.
    The entry under *addr* itself is left in place.

    Nothing is added when the trimmed key is empty (the key started with a
    digit, e.g. "3·15대로") or when the English text has no words, so the
    dictionary never gains an empty-string key.

    Raises:
        KeyError: if *addr* is not a key of *result_dict*.
    """
    trimmed_kr = re.sub(r'[0-9]+.*', '', addr).strip()
    en_words = result_dict[addr].split()
    if not trimmed_kr or not en_words:
        return
    result_dict[trimmed_kr] = en_words[0]

In [None]:
# Re-add every numbered key in its trimmed form.
for numbered_key in remove_list:
    edit_and_append(numbered_key, result_dict)

In [None]:
# Entry count after the trimmed keys have been added (the numbered keys are still present).
len(result_dict)

In [None]:
# Remove the old (numbered) keys
for numbered_key in remove_list:
    del result_dict[numbered_key]

In [None]:
# Entry count after the numbered keys have been deleted.
len(result_dict)

In [None]:
# Persist the cleaned dictionary to disk.
with open("result_dict.pkl", "wb") as dict_file:
    pickle.dump(result_dict, dict_file)

In [None]:
# Reload the cleaned dictionary. Only unpickle files you created yourself.
with open("result_dict.pkl", "rb") as dict_file:
    result_dict = pickle.load(dict_file)

In [None]:
# Display the reloaded dictionary for inspection.
result_dict

### Old Data

In [None]:
# Old Data
# NOTE: the path is absolute and specific to one machine; adjust it before running elsewhere.
with open("/home/tyson/Private/Confirmed/coding/romanization/kor_list.pkl", "rb") as kor_file:
    kor_list = pickle.load(kor_file)

In [None]:
# NOTE: the path is absolute and specific to one machine; adjust it before running elsewhere.
with open("/home/tyson/Private/Confirmed/coding/romanization/eng_list.pkl", "rb") as eng_file:
    eng_list = pickle.load(eng_file)

In [None]:
# Strip "특별자치" first, then "특별시" / "광역시", from every old Korean entry.
kor_list_pro = [
    re.sub(r'특별시|광역시', '', re.sub(r'특별자치', '', entry))
    for entry in kor_list
]

In [None]:
# The Korean and English lists are paired by position, so their lengths must agree.
assert len(kor_list_pro) == len(eng_list)

In [None]:
# Pair each cleaned Korean entry with the English entry at the same position.
result_dict2 = {}
for position, kor_entry in enumerate(kor_list_pro):
    result_dict2[kor_entry] = eng_list[position]

In [None]:
# Number of entries built from the old data.
len(result_dict2.keys())

# Merge the two dicts

In [None]:
# Combine both dictionaries; where a key exists in both, the value from result_dict2 wins.
result_dict_final = {**result_dict, **result_dict2}

In [None]:
# Number of entries after the merge.
len(result_dict_final.keys())

In [None]:
# List every key containing a character whose code point lies in 65..122
# ("A" to "z"): the Latin letters plus the six punctuation marks between "Z" and "a".
# Each such key is printed once, however many matching characters it holds.
for key in result_dict_final:
    if any(65 <= ord(char) <= 122 for char in key):
        print(key)

In [None]:
# List every key containing a character whose code point lies in 0..64:
# control characters, space, digits and the punctuation that precedes "A".
# Each such key is printed once, however many matching characters it holds.
for key in result_dict_final:
    if any(0 <= ord(char) <= 64 for char in key):
        print(key)

In [None]:
# Drop the entry whose key contains Latin letters (presumably the one reported by
# the ASCII checks above — confirm). Raises KeyError if the key is already gone.
del result_dict_final["APEC로"]

In [None]:
# Persist the merged dictionary to disk.
with open("final_dict.pkl", "wb") as final_file:
    pickle.dump(result_dict_final, final_file)

In [None]:
# Reload the merged dictionary. Only unpickle files you created yourself.
with open("final_dict.pkl", "rb") as final_file:
    result_dict_final = pickle.load(final_file)

In [None]:
# Re-run the check on the reloaded dictionary: list every key containing a
# character whose code point lies in 65..122 ("A" to "z", which also covers the
# six punctuation marks between "Z" and "a"). Each such key is printed once.
for key in result_dict_final:
    if any(65 <= ord(char) <= 122 for char in key):
        print(key)