# 라이브러리 임포트 & 함수 정의

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import pandas as pd

import time
import copy

In [2]:
# For sites whose URL does not change between pages: a real browser window
# has to be opened and the menus clicked through.
def _is_external_link(href):
    """Return True when *href* leaves the in-place content area (crawling is skipped for it)."""
    return href.startswith(("/easyone", "https://"))


def _focus_section(driver, url, section_num, seconds):
    """Reload the main page and put focus on top-level section *section_num*."""
    driver.get(url)
    time.sleep(seconds)

    section_selector = f"#kebWrapper > div.block0{section_num}"
    section_link = driver.find_element(By.CSS_SELECTOR, f"{section_selector} > a")
    ActionChains(driver).move_to_element(section_link).perform()

    # Trigger focus directly through JS; the sub-category list is looked up
    # afterwards under the section's `.onFocus` class.
    driver.execute_script(f'document.querySelector("{section_selector}").focus();')
    time.sleep(seconds)


def _read_page(driver, row):
    """Return a copy of *row* with title/contents taken from the page currently displayed."""
    return {
        **row,
        'title': driver.find_element(By.TAG_NAME, "h4").text,
        'contents': driver.find_element(By.ID, "HANA_CONTENTS_DIV").text,
    }


def _crawl_menu_entry(driver, menu_item, base_row, seconds, rows):
    """Crawl one `#lnb > li` entry (and its sub pages / tabs), appending results to *rows*."""
    # Imported locally so the helper does not depend on a new file-level import.
    from selenium.common.exceptions import WebDriverException

    link = menu_item.find_element(By.TAG_NAME, "a")
    # Read the href once, before clicking; an anchor without href yields None.
    href = link.get_attribute("href") or ""
    if _is_external_link(href):
        print("## URL 이동 -> 크롤링 생략 ##")
        return
    link.click()

    # Does this entry have sub pages (a nested <ul>)?
    try:
        content_pages = menu_item.find_element(By.TAG_NAME, "ul").find_elements(By.TAG_NAME, "li")
    except WebDriverException:
        # Any Selenium lookup failure is treated as "no nested list".
        print('## No sub page ##')
        # Wait after the click, as the sub-page branch does, before reading the content.
        time.sleep(seconds)
        row = _read_page(driver, {**base_row, 'url': href})
        print('## row : ', row)
        rows.append(row)
        return

    # Sub pages exist: visit each of them.
    print('## sub page exists ##')
    for page in content_pages:
        try:
            time.sleep(seconds)
            page_link = page.find_element(By.TAG_NAME, "a")
            page_href = page_link.get_attribute("href") or ""

            # A sub page that navigates to a different URL is not clicked.
            if _is_external_link(page_href):
                print("## URL 이동 -> 크롤링 생략 ##")
                continue

            page_link.click()
            row = {**base_row, 'url': page_href}
            time.sleep(seconds)

            # Content may be split across tabs.
            tabs = driver.find_elements(By.CSS_SELECTOR, "#tabMenuDiv > li")
            if not tabs:
                print('## no tab ##')
                filled = _read_page(driver, row)
                print('## row : ', filled)
                rows.append(filled)
            else:
                print('## tab exists ##')
                for tab in tabs:
                    tab.click()
                    filled = _read_page(driver, row)
                    print('## row : ', filled)
                    rows.append(filled)

        except Exception as e:
            # Best effort: a failing sub page is reported and skipped.
            print(f"Error processing sub_page : {e}")


def crawling_df(url: str, seconds: float = 1) -> pd.DataFrame:
    """Crawl sections 2-4 of the page at *url* with Chrome and return the collected pages.

    For every sub category of each section (except 'Security Center' and
    'Branch Guide') the left-hand menu (`#lnb`) is walked; each content page,
    sub page and tab produces one row.

    Args:
        url: Address of the main page; it is reloaded before every section
            and sub category.
        seconds: Pause, in seconds, passed to ``time.sleep`` after page loads
            and clicks.

    Returns:
        DataFrame with columns ``category``, ``sub_category``, ``title``,
        ``contents`` and ``url``.

    Raises:
        Selenium exceptions raised outside the per-sub-page handling propagate
        to the caller; the browser is closed in every case.
    """
    options = Options()
    options.add_argument("--no-sandbox")

    columns = ['category', 'sub_category', 'title', 'contents', 'url']
    rows = []

    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        time.sleep(seconds)

        category_name_list = [
            heading.text
            for heading in driver.find_elements(By.TAG_NAME, "h2")
            if heading.text != ''
        ]
        print('category_name : ', category_name_list)

        # Sections 2 through 4 only.
        # Section 1 is banking and requires a security program to be installed.
        for section_num in range(2, 5):
            category_name = category_name_list[section_num - 1]
            print(f"### {category_name} section crawling start, section_num : {section_num} ###")

            # Back to the main page, then into the section.
            _focus_section(driver, url, section_num, seconds)

            # Find out how many sub categories the section has.
            sub_categories = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, f"#kebWrapper > div.block0{section_num}.onFocus > div > ul")
                )
            ).find_elements(By.TAG_NAME, "li")
            print('### sub category elements : ', len(sub_categories))

            # Iterate by index: the page is reloaded for each sub category, so
            # the element is looked up again via :nth-child every time.
            for idx in range(1, len(sub_categories) + 1):
                _focus_section(driver, url, section_num, seconds)

                sub_category = driver.find_element(
                    By.CSS_SELECTOR,
                    f"#kebWrapper > div.block0{section_num}.onFocus > div > ul > li:nth-child({idx})",
                )
                sub_category_name = sub_category.find_element(By.TAG_NAME, "img").get_attribute("alt")
                print('### sub category name : ', sub_category_name)

                # Security Center and Branch Guide are deliberately skipped.
                if sub_category_name in ('Security Center', 'Branch Guide'):
                    print("## 예외처리 ##")
                    continue

                time.sleep(seconds)
                sub_category.find_element(By.TAG_NAME, "a").click()
                time.sleep(seconds)

                # Entries of the left-hand menu of the sub category page.
                sub_list_elements = driver.find_elements(By.CSS_SELECTOR, "#lnb > li")
                print('### list length : ', len(sub_list_elements))

                base_row = {
                    'category': category_name,
                    'sub_category': sub_category_name,
                    'title': '',
                    'contents': '',
                    'url': '',
                }
                for sub_list in sub_list_elements:
                    _crawl_menu_entry(driver, sub_list, base_row, seconds, rows)
    finally:
        # Always release the browser process, even when the crawl fails midway.
        driver.quit()

    # Build the frame once instead of growing it row by row.
    return pd.DataFrame(rows, columns=columns)


In [3]:
# Run the crawler against the page below, pausing 1 second between steps,
# then display the resulting DataFrame.
hana_members = crawling_df(
    url="https://www.kebhana.com/easyone_index_en.html",
    seconds=1,
)
hana_members

category_name :  ['Banking', 'Product/Service', 'Useful Information', 'User Guide']
### Product/Service section crawling start, section_num : 2 ###
### sub category elements :  2
### sub category name :  Product Guide
### list length :  2
## sub page exists ##
## no tab ##
## row :  {'category': 'Product/Service', 'sub_category': 'Product Guide', 'title': 'Deposit Account Product Guide', 'contents': 'HOME\n> Product/Service > Deposit Account > Deposit Account Product Guide\nDeposit Account Product Guide\nTake a glance at the variety of deposit/savings accounts offered by KEB Hana Bank.\nProduct Product Features Details/Sign-up\nEasy-One Pack Account\nMoney market deposit account (MMDA) for foreign customers\nElectronic banking and ATM fees are waived for customers\nwho use a check card or salary transfers.\nView Details\nEasy-One Pack Savings Account\nInstallment product specially for foreign customers\nPreferential interest rates offered based on transaction history\nAll foreign curre

Unnamed: 0,category,sub_category,title,contents,url
0,Product/Service,Product Guide,Deposit Account Product Guide,HOME\n> Product/Service > Deposit Account > De...,javascript:pbk.web.util.goAjaxMenu('/easyone/c...
1,Product/Service,Product Guide,Easy-One Pack Account,HOME\n> Product/Service > Deposit Account > Ea...,javascript:pbk.web.util.goAjaxMenu('/easyone/c...
2,Product/Service,Product Guide,Easy-One Pack Savings Account,HOME\n> Product/Service > Deposit Account > Ea...,javascript:pbk.web.util.goAjaxMenu('/easyone/c...
3,Product/Service,Product Guide,Easy-One Pack Savings Account,HOME\n> Product/Service > Deposit Account > Ea...,javascript:pbk.web.util.goAjaxMenu('/easyone/c...
4,Product/Service,Product Guide,Easy-One Pack Time Deposit Account,HOME\n> Product/Service > Deposit Account > Ea...,javascript:pbk.web.util.goAjaxMenu('/easyone/c...
...,...,...,...,...,...
155,User Guide,Service Hours,Guide to Use Procedures,HOME\n> Use Guide > Guide to Use Procedures\nG...,javascript:pbk.web.util.goAjaxMenu('/easyone/c...
156,User Guide,Service Hours,Guide to the issuance of digital certificate,HOME\n> Use Guide > Certificate Use Guide > Gu...,javascript:pbk.web.util.goAjaxMenu('/easyone/c...
157,User Guide,Service Hours,Smart Banking Certificate Issuance Guide,HOME\n> Use Guide > Certificate Use Guide > Sm...,javascript:pbk.web.util.goAjaxMenu('/easyone/c...
158,User Guide,Service Hours,Smart Banking Certificate Issuance Guide,HOME\n> Use Guide > Certificate Use Guide > Sm...,javascript:pbk.web.util.goAjaxMenu('/easyone/c...


In [4]:
# Display the crawled DataFrame again.
hana_members

Unnamed: 0,category,sub_category,title,contents,url
0,Product/Service,Product Guide,Deposit Account Product Guide,HOME\n> Product/Service > Deposit Account > De...,javascript:pbk.web.util.goAjaxMenu('/easyone/c...
1,Product/Service,Product Guide,Easy-One Pack Account,HOME\n> Product/Service > Deposit Account > Ea...,javascript:pbk.web.util.goAjaxMenu('/easyone/c...
2,Product/Service,Product Guide,Easy-One Pack Savings Account,HOME\n> Product/Service > Deposit Account > Ea...,javascript:pbk.web.util.goAjaxMenu('/easyone/c...
3,Product/Service,Product Guide,Easy-One Pack Savings Account,HOME\n> Product/Service > Deposit Account > Ea...,javascript:pbk.web.util.goAjaxMenu('/easyone/c...
4,Product/Service,Product Guide,Easy-One Pack Time Deposit Account,HOME\n> Product/Service > Deposit Account > Ea...,javascript:pbk.web.util.goAjaxMenu('/easyone/c...
...,...,...,...,...,...
155,User Guide,Service Hours,Guide to Use Procedures,HOME\n> Use Guide > Guide to Use Procedures\nG...,javascript:pbk.web.util.goAjaxMenu('/easyone/c...
156,User Guide,Service Hours,Guide to the issuance of digital certificate,HOME\n> Use Guide > Certificate Use Guide > Gu...,javascript:pbk.web.util.goAjaxMenu('/easyone/c...
157,User Guide,Service Hours,Smart Banking Certificate Issuance Guide,HOME\n> Use Guide > Certificate Use Guide > Sm...,javascript:pbk.web.util.goAjaxMenu('/easyone/c...
158,User Guide,Service Hours,Smart Banking Certificate Issuance Guide,HOME\n> Use Guide > Certificate Use Guide > Sm...,javascript:pbk.web.util.goAjaxMenu('/easyone/c...


In [5]:
# Save the crawl result as CSV without the index column. The target directory
# is created first: to_csv fails when the parent directory does not exist.
from pathlib import Path

output_path = Path('./data/hana_members.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
hana_members.to_csv(output_path, index=False)