In [1]:
import pandas as pd
import numpy as np
import re
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
# Full fund list. dtype=str reads every column as text (presumably so fund
# codes keep their leading zeros -- confirm against the workbook).
fund_df = pd.read_excel(r'D:\ruijian\基金数据\混合型开放式基金\混合型基金名单.xlsx', dtype=str)

# ---------------------- Configuration ----------------------
# The fund list is processed in N_PARTS roughly equal slices, one slice per run.
# PART_INDEX (0 .. N_PARTS - 1) selects the slice for this run AND is embedded
# in the output file name, so the two can never get out of sync and one run
# cannot overwrite another slice's checkpoint file.
N_PARTS = 10
PART_INDEX = 0

# Split row positions rather than the DataFrame itself: np.array_split on a
# DataFrame relies on the deprecated DataFrame.swapaxes, whereas positional
# indexing with iloc yields the same row partition on every pandas version.
fund_chunks = [
    fund_df.iloc[row_positions]
    for row_positions in np.array_split(np.arange(len(fund_df)), N_PARTS)
]

# Slice of the fund list handled by this run.
fund_df_part = fund_chunks[PART_INDEX]

# Checkpoint workbook for this slice (rewritten after every scraped fund).
output_path = rf'D:\ruijian\基金数据\混合型开放式基金\临时文件[{PART_INDEX}].xlsx'

In [None]:
# ASCII control characters 0x00-0x08, 0x0B, 0x0C and 0x0E-0x1F, i.e. every
# control character below 0x20 except tab (0x09), newline (0x0A) and carriage
# return (0x0D). Compiled once here instead of on every call, because the
# function is applied to every scraped cell.
_ILLEGAL_CHARACTERS_RE = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f]')


def clean_illegal_chars(text):
    """Strip disallowed control characters from a string value.

    Args:
        text: Any value. Only ``str`` instances are modified.

    Returns:
        ``text`` with every character matched by ``_ILLEGAL_CHARACTERS_RE``
        removed when ``text`` is a ``str``; otherwise ``text`` unchanged
        (e.g. ``None``, NaN or numbers pass straight through).
    """
    if not isinstance(text, str):
        return text
    return _ILLEGAL_CHARACTERS_RE.sub('', text)

# ---------------------- Setup ----------------------
# Fund codes of the slice handled by this run, plus their count for the
# progress messages.
fund_codes = fund_df_part["基金代码"]
total = len(fund_codes)

# Accumulator for everything scraped so far (fund code + announcement text).
df_total = pd.DataFrame(columns=["基金代码", "报告内容"])

# Resolve the chromedriver binary via webdriver_manager and start a Chrome
# session that uses it.
driver_path = ChromeDriverManager().install()
service = Service(driver_path)
driver = webdriver.Chrome(service=service)

# ---------------------- Main loop ----------------------
# For every fund code: open its announcement list page, walk through all result
# pages, click each announcement link, read the text of the element with id
# "jjggzwcontent" in the browser window that the click opens, and collect it.
# After each fund the accumulated data is written to `output_path`, so an
# aborted run keeps everything scraped up to the previous fund.
#
# The loop runs inside try/finally so the Chrome session is always shut down,
# even when the run is interrupted (e.g. KeyboardInterrupt) or an unexpected
# error escapes the handlers below.
try:
    for idx, code in enumerate(fund_codes, start=1):
        print(f"正在爬取第 {idx}/{total} 只基金：{code}")

        url = f"https://fundf10.eastmoney.com/jjgg_{code}_3.html"
        driver.get(url)

        all_contents = []
        main_window = driver.current_window_handle

        while True:
            # Fixed pause before reading the table (presumably to let the page,
            # or the next result page after a click, finish rendering).
            time.sleep(1.5)

            try:
                links = driver.find_elements(By.XPATH, "//*[@id='ggtable']/table/tbody/tr/td[1]/a[1]")

                for link in links:
                    try:
                        # Snapshot the open windows so the one created by this
                        # click can be identified unambiguously afterwards.
                        handles_before = set(driver.window_handles)
                        link.click()
                        time.sleep(2.5)

                        # Only a handle that did not exist before the click is
                        # the announcement window; a stale leftover window is
                        # never picked up by mistake. If the click opened no
                        # window, the IndexError is reported by the handler
                        # below and the link is skipped.
                        new_window = [h for h in driver.window_handles if h not in handles_before][0]
                        driver.switch_to.window(new_window)

                        try:
                            content = driver.find_element(By.ID, "jjggzwcontent").text
                            all_contents.append(content)
                        finally:
                            # Always close the announcement window and return
                            # to the list, even if the content was not found.
                            driver.close()
                            driver.switch_to.window(main_window)

                    except Exception as e:
                        # Best effort: report the failing link and carry on
                        # with the next one from the list window.
                        print(f"[{code}] 链接处理出错: {e}")
                        driver.switch_to.window(main_window)
                        continue

                # Pagination: a "next page" label whose class does not contain
                # "end" means there is another page; otherwise this fund is done.
                try:
                    next_page = driver.find_element(By.XPATH, "//label[text()='下一页' and not(contains(@class, 'end'))]")
                    next_page.click()
                except NoSuchElementException:
                    print(f"基金 {code} 提取完成")
                    break

            except Exception as e:
                # Any other failure on this page ends the fund; whatever was
                # collected so far for it is still saved below.
                print(f"[{code}] 页面提取出错: {e}")
                break

        # ---------------------- Merge the current fund's data ----------------------
        if all_contents:
            # Clean values while building the frame; this replaces the
            # deprecated DataFrame.applymap and works on every pandas version.
            df_fund = pd.DataFrame({
                "基金代码": [clean_illegal_chars(code)] * len(all_contents),
                "报告内容": [clean_illegal_chars(text) for text in all_contents],
            })

            # Append to the running total. Concatenating onto an empty frame is
            # skipped because that pandas behaviour is deprecated.
            if df_total.empty:
                df_total = df_fund
            else:
                df_total = pd.concat([df_total, df_fund], ignore_index=True)

            # Checkpoint: rewrite the whole output workbook with the data so far.
            df_total.to_excel(output_path, index=False)
        else:
            print(f"基金 {code} 无内容，跳过写入")
finally:
    # Close the browser.
    driver.quit()