# Job planet 채용 공고 scrapping 스크립트

In [12]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from webdriver_manager.chrome import ChromeDriverManager

from bs4 import BeautifulSoup

import pandas as pd

import time
from tqdm.notebook import tqdm

In [2]:
baseUrl = 'https://www.jobplanet.co.kr'
scroll_num = 3
major_classification = '개발'
minor_classification = '개발 전체'

options = webdriver.ChromeOptions()
options.add_argument('headless')
options.add_argument('window-size=1920x1080')
options.add_argument("disable-gpu")

In [3]:
# Function for scrolling

def scroll(driver, scroll_limit):
    '''
    Each scroll gets 9 additional references and it takes 1 second of timeout.
    '''
    n = 1
    prev_height = driver.execute_script('return document.body.scrollHeight')
    while n <= scroll_limit:
        print('scrolling', n, '/', scroll_limit)
        n += 1
        driver.execute_script('window.scrollBy(0, 1000)')
        time.sleep(1)
        curr_height = driver.execute_script('return document.body.scrollHeight')
        if curr_height == prev_height:
            break
        prev_height = curr_height
    time.sleep(5)

In [16]:
# Making a list of target hrefs (job_descriptions)

with webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options) as driver:
  driver.implicitly_wait(5)
  driver.get(baseUrl + '/job')

  WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.CLASS_NAME, 'jply_checkbox_item')))
  driver.find_element(By.XPATH, "//*[text()='직종']").click()
  driver.find_element(By.XPATH, f"//*[text()='{major_classification}']").click()
  driver.find_element(By.XPATH, f"//*[text()='{minor_classification}']").click()
  driver.find_element(By.XPATH, "//*[text()='적용']").click()

  time.sleep(5)

  scroll(driver, scroll_num) # get (1 + 3) * 9 = 36 job descriptions

  print('collecting', (1 + scroll_num) * 9, 'of hrefs')
  job_descriptions = [a.get_attribute('href') for a in driver.find_elements(By.TAG_NAME, 'a') if a.get_attribute('href') and baseUrl + '/job/search' in a.get_attribute('href')]

print(len(job_descriptions), job_descriptions)

scrolling 1 / 3
scrolling 2 / 3
scrolling 3 / 3
collecting 36 of hrefs
36 ['https://www.jobplanet.co.kr/job/search?posting_ids%5B%5D=1250674', 'https://www.jobplanet.co.kr/job/search?posting_ids%5B%5D=1250673', 'https://www.jobplanet.co.kr/job/search?posting_ids%5B%5D=1250648', 'https://www.jobplanet.co.kr/job/search?posting_ids%5B%5D=1250507', 'https://www.jobplanet.co.kr/job/search?posting_ids%5B%5D=1250499', 'https://www.jobplanet.co.kr/job/search?posting_ids%5B%5D=1250425', 'https://www.jobplanet.co.kr/job/search?posting_ids%5B%5D=1250542', 'https://www.jobplanet.co.kr/job/search?posting_ids%5B%5D=1250588', 'https://www.jobplanet.co.kr/job/search?posting_ids%5B%5D=1250654', 'https://www.jobplanet.co.kr/job/search?posting_ids%5B%5D=1250653', 'https://www.jobplanet.co.kr/job/search?posting_ids%5B%5D=1250651', 'https://www.jobplanet.co.kr/job/search?posting_ids%5B%5D=1250575', 'https://www.jobplanet.co.kr/job/search?posting_ids%5B%5D=1250574', 'https://www.jobplanet.co.kr/job/search?p

In [17]:
# Scrapping recruitment data from all hrefs collected

extracted_data = {
    'href': [],
    'title': [],
    'company_name': [],
    'job_location': [],
    'job_classification': [],
    'due_date': [],
    'work_experience': [],
    'skills': [],
    'company_intro': [],
    'main_task': [],
    'qualification': [],
    'preference': [],
    'detail_description': [],
}


with webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options) as driver:
    driver.implicitly_wait(5)
    for i in tqdm(range(len(job_descriptions))):
      driver.get(job_descriptions[i])
      WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.CLASS_NAME, 'job_apply_section')))

      soup = BeautifulSoup(driver.page_source, 'html.parser')
      apply = soup.find('div', 'job_apply_section')
      detail = soup.find('section', 'recruitment-detail')
      summary = detail.find('dl', 'recruitment-summary__dl')

      extracted_data['href'].append(job_descriptions[i])
      extracted_data['title'].append(apply.find('h1', 'ttl').text)
      extracted_data['company_name'].append(apply.find('span', 'company_name').text)
      extracted_data['job_location'].append(apply.find('span', 'job_location').text)
      extracted_data['job_classification'].append(summary.find(text='직무').find_parent('dt').find_next_sibling().text)
      extracted_data['due_date'].append(summary.find(text='마감일').find_parent('dt').findNextSibling().text)
      extracted_data['work_experience'].append(summary.find(text='경력').find_parent('dt').findNextSibling().text)
      extracted_data['skills'].append(summary.find(text='스킬').find_parent('dt').findNextSibling().text)
      extracted_data['company_intro'].append(detail.find('h3', text='기업 소개').find_next_sibling().text)
      extracted_data['main_task'].append(detail.find('h3', text='주요 업무').find_next_sibling().text)
      extracted_data['qualification'].append(detail.find('h3', text='자격 요건').find_next_sibling().text)
      preference = detail.find('h3', text='우대사항')
      if preference:
          extracted_data['preference'].append(preference.find_next_sibling().text)
      else:
          extracted_data['preference'].append('')
      extracted_data['detail_description'].append(detail)


  0%|          | 0/36 [00:00<?, ?it/s]

In [20]:
extracted_df = pd.DataFrame(extracted_data)
extracted_df.head()

Unnamed: 0,href,title,company_name,job_location,job_classification,due_date,work_experience,skills,company_intro,main_task,qualification,preference,detail_description
0,https://www.jobplanet.co.kr/job/search?posting...,ASP.NET 신입 개발자,오엠티아이코리아(주),서울,웹개발,2023.07.09 D-13,"신입, 경력무관","Javascript, ASP.NET, 웹개발, 프론트엔드, 웹서버",OMTI Inc.는 1985년에 설립하여 미국 법조 관련 분야에 ERP 솔루션을 개...,[주요업무]\n• 웹 어플리케이션 개발 및 유지보수\n\n[개발 환경]\n• 프론트...,• 초대졸이상 혹은 졸업예정자\n• 해외여행에 결격사유가 없으신 분\n• 관련 학과...,• ASP.NET MVC 유경험자,"[[[요약], [<dt class=""recruitment-summary__dt""><..."
1,https://www.jobplanet.co.kr/job/search?posting...,ASP.NET 중급개발자,오엠티아이코리아(주),서울,웹개발,2023.07.09 D-13,2년 이상,"Javascript, ASP.NET, 웹개발, 프론트엔드, 웹서버",OMTI Inc.는 1985년에 설립하여 미국 법조 관련 분야에 ERP 솔루션을 개...,[주요업무]\n• 웹 어플리케이션 개발 및 유지보수\n\n[개발 환경]\n• 프론트...,• 초대졸이상 졸업자\n• 해외여행에 결격사유가 없으신 분\n• 동일 직무에 대해 ...,• Kendo UI 유경험자\n• Microsoft Azure 유경험자,"[[[요약], [<dt class=""recruitment-summary__dt""><..."
2,https://www.jobplanet.co.kr/job/search?posting...,인프라 관리 (카드IS 3팀),롯데정보통신(주),서울,네트워크/보안/운영,2023.07.03 D-7,8년 이상,"네트워크, 서버, 보안, 인프라, 모니터링","오늘을 새롭게, 내일을 이롭게. 롯데\n\n롯데와 함께 비전을 공유할 수 있는 유능...","ㆍ인프라 아키텍쳐 설계 및 검토\nㆍ서버, 스토리지, SAN 스위치 운영 및 유지관...","ㆍ경력 8년 이상\nㆍ네트워크, 보안 기본 이해\nㆍ인프라(서버/스토리지) 운영 경...",ㆍOS 및 H/W Trouble shooting 가능\nㆍRedhat Linux 엔...,"[[[요약], [<dt class=""recruitment-summary__dt""><..."
3,https://www.jobplanet.co.kr/job/search?posting...,클라우드 서비스 개발자,(주)클루닉스,서울,클라우드 개발,2023.07.20 D-24 (채용시 마감),4년 이상,"python, aws, Linux, Cloud, shell script, 클라우드,...","""Create Innovation Together, Dream Tomorrow To...","Public Cloud 기반 HPC클라우드 서비스 개발 (PaaS, SaaS)\n\...",[기본]\n대졸(4년) 이상\n\n[우대]\n- 관련 분야 전공자\n- 리눅스 및 ...,,"[[[요약], [<dt class=""recruitment-summary__dt""><..."
4,https://www.jobplanet.co.kr/job/search?posting...,프론트엔드 플랫폼 개발자,(주)클루닉스,서울,백엔드 개발,2023.07.20 D-24 (채용시 마감),3년 이상,"jsp, Javascript, HTML, CSS, Linux, jquery, Php...","""Create Innovation Together, Dream Tomorrow To...",ㆍ클라우드 솔루션 개발\nㆍR&D 및 DeepLearning 플랫폼 개발\nㆍ반응형...,[기본]\n-대졸 (4년) 이상\n\n[우대]\n-해당 직무 경험자 우대\n-관련 ...,,"[[[요약], [<dt class=""recruitment-summary__dt""><..."


In [21]:
extracted_df.to_csv('./extracted.csv', index=False)