In [None]:
import requests
from bs4 import BeautifulSoup
import os
import time
from urllib.parse import urljoin

def download_pdf(url, save_dir='knowledge_base'):
    """下载PDF文件到指定目录"""
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    
    filename = os.path.join(save_dir, url.split('/')[-1])
    
    if os.path.exists(filename):
        print(f"文件已存在，跳过: {filename}")
        return filename
    
    try:
        response = requests.get(url, stream=True, timeout=30)
        response.raise_for_status()  # 如果请求失败，抛出异常
        
        with open(filename, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)
        
        print(f"成功下载: {filename}")
        return filename
    except Exception as e:
        print(f"下载 {url} 失败: {e}")
        return None

def crawl_pdfs_from_topsec():
    """从TopSec网站爬取所有PDF文件"""
    base_url = "https://www.topsec.com.cn"
    download_page = "https://www.topsec.com.cn/download.html"
    
    session = requests.Session()
    
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    
    try:
        print(f"正在访问 {download_page}")
        response = session.get(download_page, headers=headers)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.text, 'html.parser')
        
        pdf_links = []
        
        print(soup)
        # 将soup内容保存到本地文件
        with open('topsec_soup.txt', 'w', encoding='utf-8') as f:
            f.write(str(soup))
        print("已将网页内容保存到 topsec_soup.txt")

        # 1.直接查找href属性包含.pdf的链接
        pdf_anchors = soup.find_all('a', href=lambda href: href and href.endswith('.pdf'))
        for anchor in pdf_anchors:
            pdf_url = urljoin(base_url, anchor['href'])
            pdf_links.append(pdf_url)
        
        # 2.查找可能包含PDF链接的下载按钮或特定区域
        download_sections = soup.find_all('div', class_='download-item')  # 根据网站实际结构调整
        for section in download_sections:
            anchors = section.find_all('a')
            for anchor in anchors:
                href = anchor.get('href')
                if href and href.endswith('.pdf'):
                    pdf_url = urljoin(base_url, href)
                    pdf_links.append(pdf_url)
        
        pdf_links = list(set(pdf_links))
        
        print(f"找到 {len(pdf_links)} 个PDF文件")
        
        for i, pdf_url in enumerate(pdf_links):
            print(f"下载进度: {i+1}/{len(pdf_links)}")
            download_pdf(pdf_url)
            time.sleep(1)
        
        print("所有PDF文件下载完成")
    
    except Exception as e:
        print(f"爬取过程中出错: {e}")

if __name__ == "__main__":
    crawl_pdfs_from_topsec()

正在访问 https://www.topsec.com.cn/download.html
<!-- @extends 是继承 -->
<!DOCTYPE html>

<html lang="zh">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge, chrome=1" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, height=device-height, initial-scale=1, maximum-scale=1, minimum-scale=1, user-scalable=no, minimal-ui" name="viewport"/>
<link href="/static/favicon.ico" rel="shortcut icon" type="image/x-icon"/>
<meta content="webkit" name="renderer"/>
<meta content="strict-origin-when-cross-origin" name="referrer"/>
<meta content="telephone=no, email=no" name="format-detection"/>
<meta content="portrait" name="oupengmobile"/>
<meta content="portrait" name="screen-orientation"/>
<meta content="portrait" name="x5-orientation"/>
<meta content="true" name="x5-fullscreen"/>
<meta content="app" name="x5-page-mode"/>
<meta content="dark light" name="color-scheme"/>
<meta content="yes" name="apple-touch-fullscreen">
<meta content="yes" name="apple-mobile-web-app-capable"/>
<meta c

In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time
from urllib.parse import urljoin
import os
import requests

def download_pdf(url, save_dir='../knowledge_base'):
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    filename = os.path.join(save_dir, url.split('/')[-1])
    if os.path.exists(filename):
        print(f"文件已存在，跳过: {filename}")
        return filename
    try:
        response = requests.get(url, stream=True, timeout=30)
        response.raise_for_status()
        with open(filename, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)
        print(f"成功下载: {filename}")
        return filename
    except Exception as e:
        print(f"下载 {url} 失败: {e}")
        return None

def crawl_pdfs_from_topsec():
    base_url = "https://www.topsec.com.cn"
    download_page = "https://www.topsec.com.cn/download.html"
    # chrome_options = Options()
    # chrome_options.add_argument('--headless')
    # chrome_options.add_argument('--disable-gpu')
    # driver = webdriver.Chrome(options=chrome_options)
    # driver.get(download_page)
    time.sleep(5)  # 等待页面渲染
    with open('topsec_soup.txt', 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')
    # driver.quit()
    pdf_links = []
    for div in soup.find_all('div', class_='app-download-file'):
        a_tag = div.find('a', href=True)
        if a_tag and a_tag['href'].endswith('.pdf'):
            pdf_url = urljoin(base_url, a_tag['href'])
            pdf_links.append(pdf_url)
    pdf_links = list(set(pdf_links))
    print(f"找到 {len(pdf_links)} 个PDF文件")
    for i, pdf_url in enumerate(pdf_links):
        print(f"下载进度: {i+1}/{len(pdf_links)}")
        download_pdf(pdf_url)
        time.sleep(1)
    print("所有PDF文件下载完成")

crawl_pdfs_from_topsec()

找到 165 个PDF文件
下载进度: 1/165
成功下载: knowledge_base\4f9c6801-393d-4704-9962-e4bea5f39f921642578495073.pdf
下载进度: 2/165
成功下载: knowledge_base\f0b40d7b-3157-4199-9561-118a4058cd0c1642486018939.pdf
下载进度: 3/165
成功下载: knowledge_base\e6058d8a-d196-49cd-b963-821c43c4b50b1642044044720.pdf
下载进度: 4/165
下载 https://www.topsec.com.cn/uploads/2022-01-13/b9b2b992-eb42-4bb5-9b24-7db20a7224571642053653920.pdf 失败: HTTPSConnectionPool(host='www.topsec.com.cn', port=443): Max retries exceeded with url: /uploads/2022-01-13/b9b2b992-eb42-4bb5-9b24-7db20a7224571642053653920.pdf (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x0000024A357B1CD0>, 'Connection to www.topsec.com.cn timed out. (connect timeout=30)'))
下载进度: 5/165
下载 https://www.topsec.com.cn/uploads/2022-01-14/798d58fa-9477-41be-b95f-08741404c3161642147294912.pdf 失败: HTTPSConnectionPool(host='www.topsec.com.cn', port=443): Max retries exceeded with url: /uploads/2022-01-14/798d58fa-9477-41be-b95f-08741404c3161642147294912.pdf