In [4]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time
import hashlib

In [5]:
# 全局集合，用于存储已访问过的 URL
visited = set()
# 用于存储成功访问的 URL
successful_urls = []
# 用于存储每个页面内容的哈希值，去重使用
content_hashes = set()

# 计算页面内容的哈希值，避免重复
def get_content_hash(content):
    return hashlib.md5(content.encode('utf-8')).hexdigest()

In [6]:
# 定义递归函数，爬取每个页面
def crawl(url, base_url):
    # 如果已经访问过此URL，则跳过
    if url in visited:
        return
    visited.add(url)  # 记录访问过的URL

    # 请求页面内容
    try:
        response = requests.get(url)
        response.raise_for_status()  # 检查请求是否成功
    except requests.RequestException as e:
        print(f"Error accessing {url}: {e}")
        return

    # 计算页面内容的哈希值
    content_hash = get_content_hash(response.text)

    # 如果该页面内容的哈希值已经存在，说明内容重复，跳过该页面
    if content_hash in content_hashes:
        print(f"Duplicate content found at: {url}, skipping.")
        return

    # 保存该页面内容的哈希值
    content_hashes.add(content_hash)

    # 如果访问成功且内容不重复，保存这个 URL
    successful_urls.append(url)
    print(f"Successfully visited: {url}")

    # 解析页面
    soup = BeautifulSoup(response.text, 'html.parser')

    # 查找所有的链接，使用 'a' 标签
    for link in soup.find_all('a', href=True):
        # 获取链接的完整URL
        href = link.get('href')
        new_url = urljoin(url, href)  # 使用当前页面的 URL 作为基准

        # 检查是否为相同域名的链接
        if urlparse(new_url).netloc == urlparse(base_url).netloc:
            # 避免爬取无效的锚点链接和邮件链接
            if not new_url.endswith('#') and not new_url.startswith('mailto:'):
                # 去除 URL 中的片段（#及其后内容）
                new_url = new_url.split('#')[0]
                # 去除 URL 中的查询参数（?及其后内容），如果不需要可以保留
                new_url = new_url.split('?')[0]
                # 递归爬取新的页面
                crawl(new_url, base_url)

    # 爬取结束后，稍作休眠
    time.sleep(1)  # 避免爬虫过快爬取


In [11]:
start_url = "https://tugraph-db.readthedocs.io/zh-cn/latest/"
crawl(start_url, start_url)

# 将成功访问的 URL 保存到文件
with open("successful_urls.txt", "a", encoding='utf-8') as f:
    for url in successful_urls:
        f.write(url + "\n")

print(f"Successfully saved {len(successful_urls)} unique URLs to 'successful_urls.txt'.")

Successfully saved 206 unique URLs to 'successful_urls.txt'.


In [10]:
start_url = "https://www.oceanbase.com/docs/tugraph-doc-cn"

crawl(start_url, start_url)

# 将成功访问的 URL 保存到文件
with open("successful_urls.txt", "a", encoding='utf-8') as f:
    for url in successful_urls:
        f.write(url + "\n")

print(f"Successfully saved {len(successful_urls)} unique URLs to 'successful_urls.txt'.")

Successfully saved 206 unique URLs to 'successful_urls.txt'.
