In [1]:
from bs4 import BeautifulSoup as bs
import requests as req
from pprint import pprint
import re #正規表達式模組
import os, json
from time import sleep




In [2]:
pip list

Package                      Version
---------------------------- -----------
annotated-types              0.7.0
anthropic                    0.49.0
anyio                        4.9.0
argon2-cffi                  21.3.0
argon2-cffi-bindings         21.2.0
asttokens                    3.0.0
async-lru                    2.0.4
attrs                        24.3.0
babel                        2.16.0
backoff                      2.2.1
beautifulsoup4               4.12.3
bleach                       6.2.0
branca                       0.8.1
Brotli                       1.0.9
browser-use                  0.1.40
cachetools                   5.5.2
certifi                      2025.1.31
cffi                         1.17.1
charset-normalizer           3.3.2
colorama                     0.4.6
comm                         0.2.1
debugpy                      1.8.11
decorator                    5.1.1
defusedxml                   0.7.1
distro                       1.9.0
executing                    0.8.3

In [3]:
folderPath = 'project_gutenberg'
if not os.path.exists(folderPath):
    os.makedirs(folderPath)

url = "https://www.gutenberg.org/browse/languages/zh#a32292"

res = req.get(url)

soup = bs(res.text, "lxml")

base = 'https://www.gutenberg.org/'

list_chinese_books = []



In [4]:
list_chinese_books.clear()

pattern = re.compile(r'[\u4E00-\u9FFF]')

for a in soup.select('body > div.container > div.page_content > div.pgdbbylanguage > ul > li.pgdbetext > a[href^="/ebooks"]'):
    text = a.get_text(strip=True)
    if not pattern.search(text):
        continue       

    list_chinese_books.append({'title': a.get_text(),
                           'link': 'https://www.gutenberg.org' + a['href']
                           })

print(list_chinese_books)
print(f'總共抓到了 {len(list_chinese_books)} 本書')


[{'title': '豆棚閒話', 'link': 'https://www.gutenberg.org/ebooks/25328'}, {'title': '戲中戲', 'link': 'https://www.gutenberg.org/ebooks/24225'}, {'title': '比目魚', 'link': 'https://www.gutenberg.org/ebooks/24185'}, {'title': '比目魚', 'link': 'https://www.gutenberg.org/ebooks/27119'}, {'title': '三字經', 'link': 'https://www.gutenberg.org/ebooks/12479'}, {'title': '山水情', 'link': 'https://www.gutenberg.org/ebooks/25146'}, {'title': '山海經', 'link': 'https://www.gutenberg.org/ebooks/25288'}, {'title': '施公案', 'link': 'https://www.gutenberg.org/ebooks/23825'}, {'title': '施公案', 'link': 'https://www.gutenberg.org/ebooks/25393'}, {'title': '易經', 'link': 'https://www.gutenberg.org/ebooks/25501'}, {'title': '木蘭奇女傳', 'link': 'https://www.gutenberg.org/ebooks/23938'}, {'title': '海公案', 'link': 'https://www.gutenberg.org/ebooks/54494'}, {'title': '燕丹子', 'link': 'https://www.gutenberg.org/ebooks/24068'}, {'title': '狄公案', 'link': 'https://www.gutenberg.org/ebooks/27686'}, {'title': '百家姓', 'link': 'https://www.gutenbe

In [5]:
def clean_title(raw_title: str) -> str:
    # 1) 移除換行、跳脫字元
    s = raw_title.replace('\r', '').replace('\n', '').strip()
    # 2) 再把 Windows 檔名不允許的字元也去掉
    return re.sub(r'[<>:"/\\|?*]', '', s)

for idx, book in enumerate(list_chinese_books):
    # 1. 先抓 ebook 主頁面
    resp = req.get(book['link'])
    resp.encoding = resp.apparent_encoding
    soup = bs(resp.text, "lxml")
    
    # 2. 找到那個 Read now! 的連結
    read_content = soup.select_one('a.link.read_html')
    if not read_content:
        print(f"{book['title']} 沒有線上閱讀連結")
        continue
    # 3. 建立完整的線上閱讀 URL
    read_url = base + read_content['href']
    list_chinese_books[idx]['read_url'] = read_url 
    
    # 再從這個 dict 取出 read_url
    # 簡單去掉控制字元（換行、跳脫字元等）和 Windows 禁用字元

    # 最後才對那個 URL 發 GET 請求
    read_url = book['read_url']
    resp2 = req.get(read_url)
    resp2.raise_for_status() # 如果有 HTTP 錯誤會拋例外
    resp2.encoding = resp2.apparent_encoding # 設定正確編碼，避免亂碼
    html = resp2.text
    soup2 = bs(html, 'lxml')

    for head in soup2.select('head'):
        head.decompose()

    for body in soup2.select('body>section.pg-boilerplate'):
        body.decompose()

    for p in soup2.select('body > p[id="id00000"]'):
        p.decompose()

    content = str(soup2.select('body>p'))

    chinese_only = re.sub(r"[^\u4E00-\u9FFF]+", "", content)

    # 取出乾淨的標題
    raw_title = book['title']
    safe_title = clean_title(raw_title)
    filename = os.path.join('project_gutenberg', f"{safe_title}.txt")
    # 寫入檔案 (這裡示範把 chinese_only 寫進去)
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(chinese_only)
    
    print(f"已寫入：{filename}")

   
# print(list_chinese_books)
print(len(list_chinese_books))



已寫入：project_gutenberg\豆棚閒話.txt
已寫入：project_gutenberg\戲中戲.txt
已寫入：project_gutenberg\比目魚.txt
已寫入：project_gutenberg\比目魚.txt
已寫入：project_gutenberg\三字經.txt
已寫入：project_gutenberg\山水情.txt
已寫入：project_gutenberg\山海經.txt
已寫入：project_gutenberg\施公案.txt
已寫入：project_gutenberg\施公案.txt
已寫入：project_gutenberg\易經.txt
已寫入：project_gutenberg\木蘭奇女傳.txt
已寫入：project_gutenberg\海公案.txt
已寫入：project_gutenberg\燕丹子.txt
已寫入：project_gutenberg\狄公案.txt
已寫入：project_gutenberg\百家姓.txt
已寫入：project_gutenberg\禮記.txt
已寫入：project_gutenberg\綠牡丹.txt
已寫入：project_gutenberg\詩經.txt
已寫入：project_gutenberg\麟兒報.txt
已寫入：project_gutenberg\天豹圖.txt
已寫入：project_gutenberg\梁公九諫.txt
已寫入：project_gutenberg\長恨歌.txt
已寫入：project_gutenberg\李娃傳.txt
已寫入：project_gutenberg\玉樓春.txt
已寫入：project_gutenberg\漢書.txt
已寫入：project_gutenberg\引鳳蕭.txt
已寫入：project_gutenberg\今古奇觀.txt
已寫入：project_gutenberg\後西游記.txt
已寫入：project_gutenberg\飛跎全傳.txt
已寫入：project_gutenberg\佛說四十二章經.txt
已寫入：project_gutenberg\紅樓夢.txt
已寫入：project_gutenberg\洛神賦.txt
已寫入：project_gutenberg\晁氏儒言 一卷.txt