In [1]:
# 相關套件

import requests
import pandas as pd
from tqdm import tqdm 
from datetime import datetime
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor

# Date stamp (YYYYMMDD) taken from the current local time; the bare name on
# the last line echoes the value as the cell output.
today = f"{datetime.now():%Y%m%d}"
today

'20250609'

## 獲取職業技能資料

In [None]:
## 取得網站所有職業總覽

# 1. Download the JobCat category tree as JSON
url_JobCat = "https://static.104.com.tw/category-tool/json/JobCat.json"

try:
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(url_JobCat, timeout=30)
    response.raise_for_status()
    # Decode the response body straight into Python list/dict structures
    jobcat_data = response.json()
except requests.RequestException as e:
    print(f"下載失敗: {e}")
    # Re-raise rather than calling exit(): in a notebook exit() shuts the
    # kernel down, whereas an exception stops execution at this cell and
    # keeps the traceback visible. Later cells need `jobcat_data`, so there
    # is nothing useful to continue with.
    raise


# 2. 直接將 requests 取得的資料傳入遞迴函式
def flatten_jobcat_recursive(node_list, parent_des=None, parent_no=None):
    """Flatten a nested category tree into a depth-first list of row dicts.

    Each node is read through ``.get('no')`` / ``.get('des')``; a truthy
    ``'n'`` entry is treated as the node's list of children.

    Args:
        node_list: Iterable of node mappings at the current level.
        parent_des: ``des`` of the parent node (``None`` at the top level).
        parent_no: ``no`` of the parent node (``None`` at the top level).

    Returns:
        A list of dicts with keys ``parent_code``, ``parent_name``,
        ``job_code`` and ``job_name``. Every node is emitted before its
        own descendants.
    """
    rows = []
    for entry in node_list:
        code = entry.get('no')
        label = entry.get('des')
        rows.append({
            'parent_code': parent_no,
            'parent_name': parent_des,
            'job_code': code,
            'job_name': label,
        })

        children = entry.get('n')
        if not children:
            # Missing, None or empty child list: nothing to descend into.
            continue

        # The current node becomes the parent of everything beneath it.
        rows += flatten_jobcat_recursive(
            node_list=children,
            parent_des=label,
            parent_no=code,
        )
    return rows

# 3. Flatten the tree into a DataFrame, drop rows whose parent_code is null,
#    then sort by job_code. The trailing bare name displays the result.
flattened_data = flatten_jobcat_recursive(jobcat_data)
df_jobcat = pd.DataFrame(flattened_data).loc[
    lambda frame: frame['parent_code'].notna()
]
df_jobcat_sorted = df_jobcat.sort_values('job_code')
df_jobcat_sorted

Unnamed: 0,parent_code,parent_name,job_code,job_name
1,2001000000,經營／人資類,2001001000,經營／幕僚類人員
3,2001001000,經營／幕僚類人員,2001001001,經營管理主管
2,2001001000,經營／幕僚類人員,2001001002,儲備幹部
4,2001001000,經營／幕僚類人員,2001001003,主管特別助理
5,2001001000,經營／幕僚類人員,2001001004,副總經理
...,...,...,...,...
648,2018002000,其他類人員,2018002011,派報生／傳單派送
639,2018002000,其他類人員,2018002012,清潔工／資源回收人員
641,2018002000,其他類人員,2018002013,家事服務人員
646,2018002000,其他類人員,2018002014,汽車美容人員


In [None]:
## 取得網站所有職業技能/工具


# 4. 定義函數來獲取職位資訊

def fetch_job_info(jobCode, timeout=10):
    """Fetch the job card and its tool/skill/certificate lists for one job.

    Two endpoints are queried with ``jobCode`` as a query parameter
    (``.../jobCard/job`` and ``.../jobCard/cert``). The ``job`` payload is
    merged with the ``hardToolList``, ``hardSkillList`` and ``hardCertList``
    entries of the ``cert`` payload.

    Args:
        jobCode: Job code interpolated into both request URLs.
        timeout: Seconds passed to ``requests.get`` for each request, so a
            stalled connection cannot block a worker forever.

    Returns:
        The DataFrame produced by ``pd.json_normalize`` over the merged
        payload. An empty DataFrame is returned when a request fails or
        either payload does not have the expected shape.
    """
    try:
        url_job = f"https://be.guide.104.com.tw/wow/jobCard/job?jobCode={jobCode}"
        response_job = requests.get(url_job, timeout=timeout)
        response_job.raise_for_status()  # surface HTTP error statuses
        job_info = response_job.json()

        # Tool, skill and certificate lists come from a second endpoint
        url_cert = f"https://be.guide.104.com.tw/wow/jobCard/cert?jobCode={jobCode}"
        response_cert = requests.get(url_cert, timeout=timeout)
        response_cert.raise_for_status()  # surface HTTP error statuses
        cert_info = response_cert.json()

        # Merge both JSON objects into one flat mapping
        combined_info = {
            **job_info,  # dict unpacking
            'hardToolList': cert_info['hardToolList'],
            'hardSkillList': cert_info['hardSkillList'],
            'hardCertList': cert_info['hardCertList']
        }

        return pd.json_normalize(combined_info)

    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return pd.DataFrame()  # empty frame signals "no data" to the caller
    except (KeyError, TypeError, ValueError) as e:
        # KeyError: an expected hard*List key is absent from the cert payload.
        # TypeError: a payload is not a mapping.
        # ValueError: the body is not valid JSON (on requests versions whose
        # decode error is not a RequestException).
        # Treat these like a failed request so one odd job code does not
        # raise out of a worker thread.
        print(f"Unexpected response for jobCode={jobCode}: {e!r}")
        return pd.DataFrame()

# 測試函數獲取資料
# jobCode = '2001001002'
# job_data_df = fetch_job_info(jobCode)
# job_data_df

In [None]:

# 5. 使用 ThreadPoolExecutor 並行獲取所有職業資訊
def fetch_all_job_info(job_codes, max_workers=10):
    """Fetch job info for many job codes concurrently and stack the results.

    Each code is submitted to ``fetch_job_info`` on a thread pool; results
    are collected in completion order, so row order is not tied to the order
    of ``job_codes``.

    Args:
        job_codes: Iterable of job codes to fetch.
        max_workers: Size of the thread pool.

    Returns:
        A single DataFrame concatenating every non-empty result with a fresh
        integer index, or an empty DataFrame when nothing was retrieved.
    """
    all_job_data = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Map each future back to its job code so failures can be attributed.
        futures = {executor.submit(fetch_job_info, job_code): job_code for job_code in job_codes}
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
            try:
                job_data = future.result()
            except Exception as e:
                # One failing worker should not discard results already
                # collected; report it and keep going.
                print(f"jobCode={futures[future]} failed: {e!r}")
                continue
            if not job_data.empty:
                all_job_data.append(job_data)
    if not all_job_data:
        # pd.concat raises ValueError on an empty list.
        return pd.DataFrame()
    return pd.concat(all_job_data, ignore_index=True)



# 6. Fetch details for every distinct job code and save them as a CSV file
job_codes = df_jobcat_sorted.loc[:, 'job_code'].unique()
job_data_df = fetch_all_job_info(job_codes)
output_file = f"104人力銀行_職業探索_({today}).csv"
# utf-8-sig writes a BOM at the start of the file
# (presumably so spreadsheet tools detect UTF-8 — confirm).
job_data_df.to_csv(path_or_buf=output_file, encoding='utf-8-sig', index=False)
print(f"資料已儲存至 {output_file}")

100%|██████████| 636/636 [00:31<00:00, 20.40it/s]

資料已儲存至 104人力銀行_職業探索_(20250609).csv



