In [2]:
import os
import time
from datetime import datetime

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.options import Options
from selenium.webdriver.edge.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.microsoft import EdgeChromiumDriverManager

# -*- coding: utf-8 -*-
# @Time    : 2025.04.05
# @Author  : Lin
# @FileName: pachong.py

'''
使用selenium是为了自动控制浏览器进行操作，模拟人的操作，适用于那些通过传统手段（如 requests、BeautifulSoup）难以抓取的网站。
由于 JavaScript 动态加载内容，传统爬虫请求后拿到的只是“骨架”，数据还没渲染，使用传统的工具没法获取信息，因此使用selenium
'''

# Keep webdriver_manager's downloads in a "drivers" folder under the current
# working directory, so the Edge WebDriver does not have to be downloaded
# again on every run.
current_dir = os.getcwd()
cache_path = os.path.join(current_dir, "drivers")

# Create the folder if it is missing. exist_ok=True replaces the separate
# exists() check, which could race with another process creating the folder.
os.makedirs(cache_path, exist_ok=True)

# Environment variables meant to point webdriver_manager at the local cache.
# NOTE(review): confirm that the installed webdriver_manager version actually
# reads WDM_CACHE_PATH; if it does not, the cache location has to be passed to
# the manager explicitly.
os.environ['WDM_LOCAL'] = '1'
os.environ['WDM_CACHE_PATH'] = cache_path

# Configure Selenium to drive Edge without opening a visible window.
edge_options = Options()
edge_options.add_argument("--headless")
edge_options.add_argument("--disable-gpu")
edge_options.add_argument("--no-sandbox")

# EdgeChromiumDriverManager().install() returns the path of a driver binary,
# which is handed to the Selenium service that launches it.
service = Service(EdgeChromiumDriverManager().install())
driver = webdriver.Edge(service=service, options=edge_options)

# Listing-page URL without the page number; the page number is appended to it.
base_url = "http://job.mohrss.gov.cn/cjobs/jobinfolist/listJobinfolist?pageNo="

# Number of the last listing page to crawl.
max_pages: int = 10_000

# Rows collected since the last save, and the running total of rows scraped.
data: list = []
record_count: int = 0

# Helper that writes the collected rows to a timestamped CSV file
def save_data(data_list, filename_prefix="job_data") -> None:
    """Write the collected job rows to ``<filename_prefix>_<timestamp>.csv``.

    Args:
        data_list: Rows to save; each row supplies the 14 column values in
            the order of the header defined below.
        filename_prefix: Leading part of the output file name.

    Prints a notice and returns without writing anything when ``data_list``
    is empty. The file is written to the current working directory, encoded
    as UTF-8 with a BOM.
    """
    if not data_list:
        print("没有数据需要保存")
        return

    column_names = [
        "岗位名称", "月薪", "地区", "招聘单位", "学历要求", "提供住宿", "发布机构",
        "工作性质", "工作地点", "岗位描述", "单位简介", "联系人", "联系电话", "电子邮箱",
    ]

    # The timestamp keeps a re-run from overwriting the files of an earlier run.
    filename = f"{filename_prefix}_{datetime.now():%Y%m%d_%H%M%S}.csv"

    frame = pd.DataFrame(data_list, columns=column_names)
    frame.to_csv(filename, index=False, encoding="utf-8-sig")
    print(f"已保存 {len(data_list)} 条数据到 {filename}")

# Crawl the listing pages one by one. For each page, first collect the basic
# data and detail link of every job card, then visit each detail page and
# append one full row to `data`.

# First listing page to crawl; the loop runs up to and including max_pages.
start_page = 9701

# Placeholder stored for any field that cannot be read from the page.
_MISSING_VALUE = "-"

# XPath template for "label span followed by value span" pairs on the detail page.
_LABEL_VALUE_XPATH = '//span[contains(text(), "{label}")]/following-sibling::span'


def _text_or(context, by, locator, default=_MISSING_VALUE):
    """Return the stripped text of the first match under `context`, or `default` on any error."""
    try:
        return context.find_element(by, locator).text.strip()
    except Exception:
        # Best effort: a missing or unreadable field must not abort the record.
        # Exception (not a bare except) lets Ctrl-C / SystemExit propagate.
        return default


def _labelled_value(browser, label):
    """Return the text of the span that follows the span containing `label`, or "-"."""
    return _text_or(browser, By.XPATH, _LABEL_VALUE_XPATH.format(label=label))


def _collect_listing(browser):
    """Return name, detail link, salary and region for each job card on the loaded listing page."""
    listing = []
    for card in browser.find_elements(By.CLASS_NAME, "list_show"):
        try:
            title_link = card.find_element(By.CLASS_NAME, "list_con_tit").find_element(By.TAG_NAME, "a")
            listing.append({
                "job_name": title_link.text.strip(),
                "job_link": title_link.get_attribute("href"),
                "salary": card.find_element(By.CLASS_NAME, "jobs_pay").text.strip(),
                "location": card.find_element(By.CLASS_NAME, "josbs_usetime").text.strip(),
            })
        except Exception:
            # Skip a card that lacks any of the expected elements.
            continue
    return listing


def _publisher(browser):
    """Return the publishing organisation shown on the detail page, or "-"."""
    try:
        for label_span in browser.find_elements(By.XPATH, '//span[contains(text(), "发布机构")]'):
            # The value is part of the parent's text; strip the label prefix from it.
            parent_text = label_span.find_element(By.XPATH, './..').text.strip()
            if "发布机构：" in parent_text:
                return parent_text.replace("发布机构：", "").strip()
    except Exception:
        pass
    return _MISSING_VALUE


def _company_intro(browser):
    """Return the text of the block that follows the "单位简介" heading, or "-"."""
    try:
        for heading in browser.find_elements(By.CLASS_NAME, "span-title"):
            if "单位简介" in heading.text:
                intro_div = heading.find_element(By.XPATH, "following-sibling::div[@class='gwmsDiv']")
                return intro_div.text.strip()
    except Exception:
        pass
    return _MISSING_VALUE


def _contacts(browser):
    """Return [contact name, phone, e-mail] read from the "phone" block; "-" for each unreadable item."""
    try:
        contact_div = browser.find_element(By.CLASS_NAME, "phone")
    except Exception:
        return [_MISSING_VALUE] * 3

    values = []
    # The three values are taken by position: span 1, span 2 and span 3.
    for position in (1, 2, 3):
        try:
            raw_text = contact_div.find_element(By.XPATH, f'.//span[{position}]').text
            # Keep only what follows the last full-width colon of "label：value".
            values.append(raw_text.split("：")[-1].strip())
        except Exception:
            values.append(_MISSING_VALUE)
    return values


def _detail_row(browser, job_info):
    """Build one output row from the loaded detail page plus the listing-page data in `job_info`."""
    # Salary from the detail page; fall back to the listing-page value.
    actual_salary = _text_or(browser, By.CLASS_NAME, "money", job_info["salary"])

    # The employer name is read from a link first, then from the enclosing span.
    company = _text_or(browser, By.XPATH, '//div[@class="job_name_bottom"]/span[1]/span[2]/a', None)
    if company is None:
        company = _text_or(browser, By.XPATH, '//div[@class="job_name_bottom"]/span[1]/span[2]')

    education = _labelled_value(browser, "学历要求")
    housing = _labelled_value(browser, "提供住宿")
    publisher = _publisher(browser)
    job_type = _labelled_value(browser, "工作性质")
    work_location = _labelled_value(browser, "工作地点")
    job_description = _text_or(browser, By.ID, "gwms")
    company_intro = _company_intro(browser)
    contact_name, contact_phone, contact_email = _contacts(browser)

    return [
        job_info["job_name"],
        actual_salary,
        job_info["location"],
        company,
        education,
        housing,
        publisher,
        job_type,
        work_location,
        job_description,
        company_intro,
        contact_name,
        contact_phone,
        contact_email,
    ]


for page in range(start_page, max_pages + 1):
    url = f"{base_url}{page}"
    if page % 100 == 0:
        print(f"正在爬取第 {page} 页: {url}")

    try:
        driver.get(url)
        time.sleep(0.1)  # short pause after loading the listing page

        # Collect every card before navigating away: the detail visits below
        # replace the listing page in the same browser window.
        jobs_data = _collect_listing(driver)

        for job_info in jobs_data:
            try:
                driver.get(job_info["job_link"])

                # Wait at most 1 second for the detail page's job title element.
                WebDriverWait(driver, 1).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "job_name"))
                )

                data.append(_detail_row(driver, job_info))
                record_count += 1

                # Write a part file every 2000 records, then clear the buffer
                # to save memory.
                if record_count % 2000 == 0:
                    save_data(data, f"job_information_part_{record_count // 2000}")
                    print(f"已完成 {record_count} 条数据爬取")
                    data = []

                time.sleep(0.1)  # short pause between detail pages

            except Exception as e:
                # The record is skipped, but the skip is now reported.
                print(f"详情页处理失败 ({type(e).__name__}): {job_info['job_link']}")
                continue

    except Exception as e:
        print(f"处理页面 {page} 时出错: {type(e).__name__}: {e}")
        # Save what has been collected so far so it is not lost.
        if data:
            save_data(data, f"job_information_error_recovery_page_{page}")
            data = []

# Save the remaining rows, merge all part files into one CSV, and always shut
# the browser down, even if saving or merging fails.
try:
    # Rows left over after the last full 2000-record part.
    if data:
        save_data(data, "job_information_final_part")

    # Merge every part file found in the current working directory. This also
    # picks up matching files left there by earlier runs.
    part_prefixes = (
        "job_information_part_",
        "job_information_final_part",
        "job_information_error_recovery_page_",
    )
    # Sorted so the merge order is deterministic (lexicographic by file name);
    # restricted to .csv so unrelated files sharing a prefix are ignored.
    all_files = sorted(
        f for f in os.listdir()
        if f.startswith(part_prefixes) and f.endswith(".csv")
    )
    if all_files:
        # dtype=str and keep_default_na=False read every cell back as the text
        # that was written: no numeric conversion of phone numbers and no
        # NA-like strings turned into NaN.
        all_data = [
            pd.read_csv(file, encoding="utf-8-sig", dtype=str, keep_default_na=False)
            for file in all_files
        ]
        combined_df = pd.concat(all_data, ignore_index=True)
        combined_df.to_csv("job_information_combined.csv", index=False, encoding="utf-8-sig")
        print(f"已合并所有数据，总计 {len(combined_df)} 条记录")
finally:
    # Close the browser
    driver.quit()

正在爬取第 9800 页: http://job.mohrss.gov.cn/cjobs/jobinfolist/listJobinfolist?pageNo=9800
已保存 2000 条数据到 job_information_part_1_20250407_224102.csv
已完成 2000 条数据爬取
正在爬取第 9900 页: http://job.mohrss.gov.cn/cjobs/jobinfolist/listJobinfolist?pageNo=9900
已保存 2000 条数据到 job_information_part_2_20250407_230728.csv
已完成 4000 条数据爬取
正在爬取第 10000 页: http://job.mohrss.gov.cn/cjobs/jobinfolist/listJobinfolist?pageNo=10000
已保存 1939 条数据到 job_information_final_part_20250407_233334.csv
已合并所有数据，总计 199939 条记录
