In [None]:
import requests, bs4
from bs4 import BeautifulSoup
import json, time, asyncio, aiohttp
from concurrent.futures import ThreadPoolExecutor

In [2]:
# HTTP headers passed to fetch() by the request helpers in this notebook;
# the only entry is a desktop-Chrome style User-Agent string.
req_header = { "user-agent" : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36'}

In [None]:
async def fetch(session, url, headers=None):
    """GET ``url`` through ``session`` and return the body as text.

    Args:
        session: Object whose ``get(url, headers=...)`` returns an async
            context manager yielding a response with ``status`` and an
            awaitable ``read()`` (used here like an aiohttp client session).
        url: Address to request.
        headers: Optional header mapping forwarded to ``session.get``.

    Returns:
        The decoded body when the status is 200, otherwise ``None``.
        ``None`` is also returned (after printing a message) when any
        exception is raised while requesting or reading.
    """
    try:
        async with session.get(url, headers=headers) as response:
            # Guard clause: anything other than 200 is reported and dropped.
            if response.status != 200:
                print(f"에러 코드 = {response.status}, URL: {url}")
                return None

            raw = await response.read()

            # Strict attempts first, in the same order as before:
            # EUC-KR, then UTF-8.
            for codec in ('euc-kr', 'utf-8'):
                try:
                    return raw.decode(codec)
                except UnicodeDecodeError:
                    continue

            # Last resort: CP949 with undecodable bytes replaced, so this
            # step never raises UnicodeDecodeError.
            return raw.decode('cp949', errors='replace')
    except Exception as e:
        print(f"요청 오류: {e}, URL: {url}")
        return None

In [None]:
async def process_page_async(id2, page_url, occ1_id_dict, occ2_id_dict, session):
    """Scrape one listing page and return a record for every job posting on it.

    Args:
        id2: Sub-category (occ2) id the page belongs to.
        page_url: URL of the listing page to download.
        occ1_id_dict: Mapping of occ1 id -> top-level category name.
        occ2_id_dict: Mapping of occ2 id -> ``(occ1 id, sub-category name)``.
        session: Client session passed through to ``fetch`` and
            ``get_employ_info_async``.

    Returns:
        A list of dicts with the keys "회사명", "채용공고", "URL", "대분류",
        "소분류" plus whatever ``get_employ_info_async`` returned for that
        posting. The list is empty when ``id2`` is unknown or the page could
        not be downloaded. If an exception occurs, a message is printed and
        whatever was collected so far is returned.
    """
    result = []

    try:
        occ1_id, occ2_name = occ2_id_dict.get(id2, (None, "알 수 없는 직업"))
        if occ1_id is None:
            return []

        occ1_name = occ1_id_dict.get(occ1_id, "알 수 없는 대분류")

        html = await fetch(session, page_url, req_header)
        if not html:
            return result

        soup = BeautifulSoup(html, "html.parser")

        # Parse every list item exactly once and keep only those that carry a
        # title link. The detail requests below are built from this same list,
        # so posting k is always paired with detail result k.
        postings = []
        for li_tag in soup.select("li.c_col"):
            mid_cell = li_tag.find('div', class_='cell_mid')
            title_tag = mid_cell.find('a') if mid_cell else None
            if not (title_tag and title_tag.get("href")):
                continue

            first_cell = li_tag.find('div', class_='cell_first')
            company_tag = first_cell.find('a') if first_cell else None
            company_name = company_tag.text if company_tag else ""

            postings.append((company_name, title_tag.text, title_tag["href"]))

        # gather() returns results in the order the awaitables were passed,
        # which is the order of `postings`.
        employ_info_list = await asyncio.gather(
            *(get_employ_info_async(id2, title_url, session)
              for _, _, title_url in postings)
        )

        # Combine the listing-page fields with the detail-page fields.
        for (company_name, title, title_url), employ_info in zip(postings, employ_info_list):
            result.append({
                "회사명": company_name,
                "채용공고": title,
                "URL": title_url,
                "대분류": occ1_name,
                "소분류": occ2_name,
                **employ_info
            })
    except Exception as e:
        print(f"페이지 처리 오류: {e}, occ2_id: {id2}, URL: {page_url}")

    return result

In [None]:
def select_occ1_id(soup):
    """Build a mapping of top-level category id -> category name.

    Every ``button[data-item-val]`` under ``ul#occ1_div li`` is inspected; the
    id comes from its ``data-item-val`` attribute and the name from the text
    of the next ``span`` found after the button. Buttons with an empty id or
    an empty name are ignored.

    Args:
        soup: Parsed document exposing ``select``.

    Returns:
        dict mapping id string to name string.
    """
    mapping = {}
    for btn in soup.select("ul#occ1_div li button[data-item-val]"):
        key = btn.get("data-item-val")
        span = btn.find_next("span")
        label = span.text if span else ""
        if not key or not label:
            continue
        mapping[key] = label
    return mapping


def select_occ2_id(soup, id_list):
    """Build a mapping of sub-category id -> ``(parent id, sub-category name)``.

    Buttons are taken from ``ul[id^='occ2_ul_'] li button[data-item-val]``.
    Each ``data-item-val`` is split on ``"_"`` into a parent (occ1) id and a
    sub-category (occ2) id; values without an underscore are skipped, and so
    are entries whose parent id is not in ``id_list``. The name is the text of
    the next ``span`` after the button, or ``""`` when there is none.

    Args:
        soup: Parsed document exposing ``select``.
        id_list: Container of accepted parent ids (membership is tested
            with ``in``).

    Returns:
        dict mapping occ2 id to a ``(occ1 id, name)`` tuple.
    """
    collected = {}
    for btn in soup.select("ul[id^='occ2_ul_'] li button[data-item-val]"):
        raw = btn.get("data-item-val")
        if not raw or "_" not in raw:
            continue

        # Unpacking into exactly two names: a value containing more than one
        # underscore raises ValueError here.
        occ1_id, occ2_id = raw.split("_")
        if occ1_id not in id_list:
            continue

        span = btn.find_next("span")
        collected[occ2_id] = (occ1_id, span.text if span else "")

    return collected

In [6]:
import time

def calculate_page_count(count_str):
    """Convert a posting-count string into the number of 30-item pages.

    Args:
        count_str: Count as text; thousands separators (",") are removed
            before conversion with ``int``.

    Returns:
        The count divided by 30, rounded up (0 for a count of 0).

    Raises:
        ValueError: If the text is not an integer once commas are removed.
    """
    total = int(count_str.replace(",", ""))
    full_pages, leftover = divmod(total, 30)
    # A partially filled last page still counts as a page.
    return full_pages + 1 if leftover else full_pages

async def get_page_url_list_async(occ2_id_dict):
    """Build the list of listing-page URLs for every sub-category.

    For each occ2 id, the count endpoint is requested (all requests are
    issued together through one session), the returned count is turned into a
    page count with ``calculate_page_count``, and one URL per page is
    generated.

    Args:
        occ2_id_dict: Mapping whose keys are occ2 id strings.

    Returns:
        A list of ``(occ2_id, page_url)`` tuples, grouped by occ2 id in the
        mapping's key order, pages ascending from 1. Sub-categories whose
        count request failed, or whose response is not a parseable number,
        contribute no entries.
    """
    count_url = "https://job.incruit.com/s_common/searchjob/v3/searchjob_getcount_ajax.asp?occ2="
    base_url = "https://job.incruit.com/jobdb_list/searchjob.asp?articlecount=30&occ2="

    # Snapshot the ids once so requests and responses are paired from the
    # same sequence even if the mapping is modified while awaiting.
    occ2_ids = list(occ2_id_dict)

    async with aiohttp.ClientSession() as session:
        responses = await asyncio.gather(
            *(fetch(session, count_url + occ2_id, req_header) for occ2_id in occ2_ids)
        )

    page_url_list = []
    for occ2_id, response in zip(occ2_ids, responses):
        if not response:
            continue

        try:
            page_num = calculate_page_count(response)
        except ValueError as e:
            # A body that is not a plain number (e.g. an error page served
            # with status 200) must not abort the whole crawl; report it and
            # move on, like the other helpers in this notebook do.
            print(f"공고 수 파싱 오류: {e}, occ2_id: {occ2_id}")
            continue

        page_url_list.extend(
            (occ2_id, f"{base_url}{occ2_id}&page={page_index}")
            for page_index in range(1, page_num + 1)
        )

    return page_url_list

def first_cell_parser(li_tag):
    """Return the text of the first link inside the item's ``cell_first`` div.

    Args:
        li_tag: List-item element exposing ``find``.

    Returns:
        The ``text`` of the first ``a`` element in ``div.cell_first``
        (the cell holding the company name and tags).

    Raises:
        AttributeError: If the div or the link is missing (``find`` returned
            ``None``).
    """
    # cell_first: company name and tags
    return li_tag.find('div', class_='cell_first').find('a').text

def mid_cell_parser(li_tag):
    """Return ``(title text, href)`` of the first link in the ``cell_mid`` div.

    Args:
        li_tag: List-item element exposing ``find``.

    Returns:
        A 2-tuple: the link's ``text`` and the value of its ``href`` item
        (the cell holding the job posting).
    """
    # cell_mid: job posting
    link = li_tag.find('div', class_='cell_mid').find('a')
    # Read href before text so a missing link/attribute fails on the same
    # operation as before.
    target = link["href"]
    return link.text, target

async def get_employ_info_async(id, link, session):
    """Download a posting's detail page and extract its summary fields.

    The ``em`` elements matched by ``ul.jc_list li div.txt em`` are paired, in
    document order, with the labels '고용형태', '경력', '근무지역', '학력',
    '급여조건'. Extra elements or extra labels are ignored.

    Args:
        id: Not used by this function.
        link: URL of the posting detail page.
        session: Client session forwarded to ``fetch``.

    Returns:
        dict mapping label -> element text; empty when the page could not be
        fetched or when an exception occurred (a message is printed then).
    """
    labels = ('고용형태', '경력', '근무지역', '학력', '급여조건')
    try:
        html = await fetch(session, link, req_header)
        if not html:
            return {}

        soup = BeautifulSoup(html, "html.parser")
        info_items = soup.select('ul.jc_list li div.txt em')

        # zip stops at the shorter side, so only the first len(labels)
        # elements (or fewer) are read.
        return dict(zip(labels, (item.text for item in info_items)))
    except Exception as e:
        print(f"정보 추출 중 오류: {e}, URL: {link}")

    return {}

In [None]:
async def classify_jobs_async(page_url_list, occ1_id_dict, occ2_id_dict, batch_size=10):
    """Scrape all listing pages in batches and group postings by category.

    Pages are processed ``batch_size`` at a time; each batch opens its own
    client session, runs ``process_page_async`` for every page in the batch
    together, prints a progress line, and then sleeps for one second.

    Args:
        page_url_list: Sequence of ``(occ2_id, page_url)`` pairs.
        occ1_id_dict: Passed through to ``process_page_async``.
        occ2_id_dict: Passed through to ``process_page_async``.
        batch_size: Number of pages handled per batch.

    Returns:
        Nested dict ``{top-level name: {sub-category name: [job, ...]}}``.
        The "대분류" and "소분류" keys are removed from each job dict because
        they become the grouping keys.
    """
    all_results = []

    # Batching caps how many listing pages are requested at the same time.
    for i in range(0, len(page_url_list), batch_size):
        async with aiohttp.ClientSession() as session:
            batch_results = await asyncio.gather(*(
                process_page_async(id2, page_url, occ1_id_dict, occ2_id_dict, session)
                for id2, page_url in page_url_list[i:i+batch_size]
            ))

            # Flatten the per-page lists into the running total.
            all_results.extend(job for page_jobs in batch_results for job in page_jobs)

        print(f"배치 {i//batch_size + 1}/{(len(page_url_list) + batch_size - 1)//batch_size} 완료. 현재까지 {len(all_results)}개 항목 수집")

        await asyncio.sleep(1)

    job_data = {}
    for job in all_results:
        occ1_name = job.pop("대분류", "알 수 없는 대분류")
        occ2_name = job.pop("소분류", "알 수 없는 직업")
        job_data.setdefault(occ1_name, {}).setdefault(occ2_name, []).append(job)

    return job_data

In [8]:
def save_to_json(data, filename="incruit_jobs.json"):
    """Write ``data`` to ``filename`` as UTF-8 JSON and print a confirmation.

    Non-ASCII characters are written as-is (not escaped) and the output is
    indented by two spaces.

    Args:
        data: JSON-serialisable object.
        filename: Destination path; an existing file is overwritten.
    """
    with open(filename, mode='w', encoding='utf-8') as out_file:
        json.dump(data, out_file, indent=2, ensure_ascii=False)
    print(f"{filename}에 데이터가 저장되었습니다.")

In [None]:
async def async_main():
    """Run the whole crawl: categories -> page URLs -> postings -> JSON file.

    Downloads the search page, extracts the occ1/occ2 category maps from it,
    builds the listing-page URL list, collects every posting with a batch
    size of 20, and saves the grouped result with ``save_to_json``. If the
    initial page cannot be fetched, an error message is printed and nothing
    else happens.
    """
    url = "https://job.incruit.com/jobdb_list/searchjob.asp?occ1=100&occ1=101&occ1=102&occ1=150&occ1=104&occ1=160&occ1=110&occ1=106&occ1=140&occ1=120&occ1=170&occ1=103&occ1=107&occ1=190&occ1=200&occ1=210&occ1=130"

    async with aiohttp.ClientSession() as session:
        html = await fetch(session, url, req_header)

        # Nothing can be done without the category page.
        if not html:
            print("초기 페이지 접근 오류")
            return

        soup = BeautifulSoup(html, "html.parser")

        occ1_id_dict = select_occ1_id(soup)
        print(f"대분류(occ1) 개수: {len(occ1_id_dict)}")

        occ2_id_dict = select_occ2_id(soup, occ1_id_dict.keys())
        print(f"소분류(occ2) 개수: {len(occ2_id_dict)}")

        page_url_list = await get_page_url_list_async(occ2_id_dict)
        print(f"총 페이지 수: {len(page_url_list)}")

        print("전체 데이터 수집을 시작합니다...")
        job_data = await classify_jobs_async(page_url_list, occ1_id_dict, occ2_id_dict, batch_size=20)

        save_to_json(job_data)

In [None]:
# How to run the async code from inside a Jupyter notebook
import nest_asyncio
nest_asyncio.apply()  # allow nested event loops

# asyncio.run() can now be used
asyncio.run(async_main())