diff --git a/novel_crawler/README.md b/novel_crawler/README.md index ae604b0..c4ff608 100644 --- a/novel_crawler/README.md +++ b/novel_crawler/README.md @@ -7,7 +7,7 @@ 爬取小说章节内容页面, 保存到本地 最后去阅读小说了 -最新版:[novel_crawler_v.25.06.05.py][5] +最新版:[novel_crawler_v.25.07.06][6] ## 说明 @@ -77,7 +77,7 @@ rules: novel_chapter_content_div: 'div.content' # 小说单段规则(相对于小说章节内容区域) - novel_chapter_content_p: 'p' # 可以是 p, div, span 等 + novel_chapter_content_p: 'p' # 可以是 br, p, div, span 等 # 小说净化内容配置 purify: @@ -121,6 +121,12 @@ python ***.py config.yml ## 日志 +- [novel_crawler_v.25.07.06][6] + - 使用 rich 库增强输出 + - 增加对 br 标签的特殊处理逻辑 + - 排除目录 JS 链接 + - 若干优化 + - [novel_crawler_v.25.06.05.py][5] - 更新 epub 章节渲染模板 @@ -139,8 +145,15 @@ python ***.py config.yml - [novel_crawler_v.24.12.01.py][2] - 第一个版本 -[1]: https://github.com/God-2077/python-code/tree/main/novel_crawler/novel_crawler_v.24.12.01.py -[2]: https://github.com/God-2077/python-code/tree/main/ftp_server/novel_crawler/novel_crawler_v.24.12.01.py -[3]: https://github.com/God-2077/python-code/tree/main/ftp_server/novel_crawler/novel_crawler_v.25.02.03.py -[4]: https://github.com/God-2077/python-code/tree/main/ftp_server/novel_crawler/novel_crawler_v.25.06.02.py -[5]: https://github.com/God-2077/python-code/tree/main/ftp_server/novel_crawler/novel_crawler_v.25.06.05.py + +[1]: novel_crawler_v.24.12.01.py +[2]: novel_crawler_v.24.12.01.py +[3]: novel_crawler_v.25.02.03.py +[4]: novel_crawler_v.25.06.02.py +[5]: novel_crawler_v.25.06.05.py +[6]: novel_crawler_v.25.07.06.py \ No newline at end of file diff --git a/novel_crawler/config.yml b/novel_crawler/config.yml new file mode 100644 index 0000000..b5bdd32 --- /dev/null +++ b/novel_crawler/config.yml @@ -0,0 +1,90 @@ +# 小说爬虫配置(示例) +# 使用方式: python text.py config.yml + +# 基本配置 +basic: + # 小说详情页面的 URL + novel_detail_url: 'https://www.31ec1.lol/read/45159/' + + # 小说章节列表页面的 URL 列表 + # 如果章节列表在小说详情页面,可以直接使用 novel_detail_url + novel_chapter_url: + - 'https://www.31ec1.lol/read/45159/' + + # 
小说保存路径,默认为当前目录 + download_path: './downloads' + + # 小说文件的编码(TXT) (utf-8, gbk, gb2312等) + novel_file_encoding: 'utf-8' + + # 输出格式 (txt, epub) + output_format: 'txt' + + # 缩进字符串 + indent_string: ' ' + + # 是否启用调试模式 + debug: True + +# 选择器规则配置 +rules: + # 小说名称 CSS 选择器 + novel_name: 'body > div.book > div.info > h1' + + # 小说作者 CSS 选择器 + novel_author: 'body > div.book > div.info > div.small > span:nth-child(1)' + + # 小说简介 CSS 选择器 + novel_intro: 'body > div.book > div.info > div.intro > dl > dd' + + # 小说章节区域 CSS 选择器 + novel_chapter_div: 'body > div.listmain' + + # 小说单个章节的区域(相对于小说章节区域) + novel_chapter_div_only: 'dd' + + # 小说章节名称(相对于小说章节区域) + novel_chapter_name: 'a' + + # 小说章节 URL(相对于小说章节区域) + novel_chapter_url: 'a' + + # 小说章节内容区域 CSS 选择器 + novel_chapter_content_div: '#chaptercontent' + + # 小说单段规则(相对于小说章节内容区域) + novel_chapter_content_p: 'br' # 可以是 br, p, div, span 等 + + # 小说净化内容配置 + purify: + # 需要净化的文本列表 + text: + - '广告内容1' + - '广告内容2' + + # 需要净化的正则表达式列表 + re: + - '[\d]{4}-[\d]{2}-[\d]{2}' # 去除日期格式 + - '本章节.*更新' # 去除更新提示 + +# 网络请求配置 +network: + # 请求头设置 + headers: + User-Agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3' + Referer: 'https://www.31ec1.lol/' + Accept-Language: 'zh-CN,zh;q=0.9' + + # Cookies 设置 + cookies: + # session_id: 'abc123' + # token: 'xyz456' + + # 请求超时时间(秒) + timeout: 5 + + # 失败重试次数 + max_retries: 5 + + # 请求间隔时间(毫秒) + request_interval_ms: 0 \ No newline at end of file diff --git a/novel_crawler/novel_crawler_v.25.07.06.py b/novel_crawler/novel_crawler_v.25.07.06.py new file mode 100644 index 0000000..23f3153 --- /dev/null +++ b/novel_crawler/novel_crawler_v.25.07.06.py @@ -0,0 +1,740 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# @Date : 2025/02/03 +# @Author : Kissablecho +# @Software: Visual Studio Code +# @Blog : https://blog.ksable.top/ +# @Github : https://github.com/God-2077/ + +import os +import requests +from bs4 import BeautifulSoup +import sys +import time +import re 
import signal
import uuid
from ebooklib import epub
import traceback
import yaml
# Rich provides the styled console output, progress bars and panels.
from rich.console import Console
from rich.progress import Progress, BarColumn, TextColumn, TimeRemainingColumn
from rich.table import Table
from rich.panel import Panel
from rich.text import Text
from rich.style import Style
from rich.prompt import Confirm, Prompt

# ---------------------------------------------------------------------------
# Global configuration, loaded from the YAML file named on the command line.
# ---------------------------------------------------------------------------
if len(sys.argv) < 2:
    print("Usage: python script.py config.yml")
    sys.exit(1)

with open(sys.argv[1], 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# Map the configuration onto the module-level names used by the crawler.
novel_detail_url = config['basic']['novel_detail_url']
novel_chapter_url = config['basic']['novel_chapter_url']

rule_novel_name = config['rules']['novel_name']
rule_novel_author = config['rules']['novel_author']
rule_novel_intro = config['rules']['novel_intro']
rule_novel_chapter_div = config['rules']['novel_chapter_div']
rule_novel_chapter_div_only = config['rules']['novel_chapter_div_only']
rule_novel_chapter_name = config['rules']['novel_chapter_name']
rule_novel_chapter_url = config['rules']['novel_chapter_url']
rule_novel_chapter_content_div = config['rules']['novel_chapter_content_div']
rule_novel_chapter_content_p = config['rules']['novel_chapter_content_p']

rule_novel_chapter_content_purify_text = config['rules']['purify']['text']
rule_novel_chapter_content_purify_re = config['rules']['purify']['re']

download_path = config['basic']['download_path']
novel_file_encoding = config['basic']['novel_file_encoding']
output_format = config['basic']['output_format']
indent_string = config['basic']['indent_string']

headers = config['network']['headers']
cookies = config['network']['cookies']
timeout = config['network']['timeout']
max_retries = config['network']['max_retries']
request_interval_ms = config['network']['request_interval_ms']

debug = config['basic']['debug']

# Shared Rich console used for all output.
console = Console()
# Millisecond timestamp of the most recent HTTP request (None before the first).
last_request_time = None

# Style applied to the label of each known log level.
_LOG_STYLES = {
    'log': Style(color="blue", bold=False),
    'info': Style(color="green", bold=False),
    'warn': Style(color="yellow", bold=True),
    'error': Style(color="red", bold=True),
    'debug': Style(color="magenta", bold=False),
}

# Label printed for each known log level.
_LOG_LABELS = {
    'log': '[LOG]',
    'info': '[INFO]',
    'warn': '[WARN]',
    'error': '[ERROR]',
    'debug': '[DEBUG]',
}


# Function definitions

def get_unique_file_path(file_path):
    """Return *file_path*, or a "name(N).ext" variant that does not exist yet.

    The counter N starts at 1 and is increased until ``os.path.exists``
    reports that the candidate path is free.
    """
    base_path, ext = os.path.splitext(file_path)
    counter = 1
    while os.path.exists(file_path):
        file_path = f"{base_path}({counter}){ext}"
        counter += 1
    return file_path


def console_log(text, level='log', end='\n'):
    """Print a timestamped, level-tagged log line through the Rich console.

    Args:
        text: Message to print.
        level: One of 'log', 'info', 'warn', 'error', 'debug'. Any other
            value is printed as an unstyled ``[LEVEL]`` label.
        end: Line terminator passed through to ``console.print``.
    """
    localtime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

    # Always build a Text object so that square brackets inside the message
    # are never parsed as Rich markup, whatever the level is.
    output_text = Text(f"{localtime} ")
    if level in _LOG_STYLES:
        output_text.append(_LOG_LABELS[level].ljust(8), style=_LOG_STYLES[level])
    else:
        output_text.append(f"[{level.upper()}]".ljust(8))
    output_text.append(f": {text}")
    console.print(output_text, end=end)


# Network requests
def get_url(url):
    """GET *url* with the configured headers, cookies, timeout and retries.

    Requests are spaced at least ``request_interval_ms`` milliseconds apart.
    A failed attempt is logged and retried after one second, for up to
    ``max_retries`` attempts in total.

    Returns:
        The ``requests`` response of the first successful attempt, or None
        when every attempt failed.
    """
    global last_request_time

    # Throttle: wait for whatever is left of the interval since the previous
    # request (not for the time that has already elapsed).
    if last_request_time is not None and request_interval_ms:
        elapsed_ms = int(time.time() * 1000) - last_request_time
        remaining_ms = request_interval_ms - elapsed_ms
        if remaining_ms > 0:
            time.sleep(remaining_ms / 1000)

    request_kwargs = {'headers': headers, 'timeout': timeout}
    if cookies:  # only send cookies when some are configured
        request_kwargs['cookies'] = cookies

    retries = 0
    while retries < max_retries:
        # Record when this attempt was issued so the next call can throttle.
        last_request_time = int(time.time() * 1000)
        try:
            response = requests.get(url, **request_kwargs)
        except requests.exceptions.RequestException as e:
            # RequestException is the base class and already covers Timeout.
            console_log(f"请求失败,正在重试 ({retries + 1}/{max_retries})...", level='warn')
            if debug:
                console_log(f"错误信息: URL({url}) {e}", level='debug')
            time.sleep(1)
            retries += 1
        else:
            if debug:
                console_log(f"请求成功[{response.status_code}]: {url}", level='debug')
            return response

    console_log("请求失败,已达到最大重试次数。", level='error')
    return None


# Parse page content and return a BeautifulSoup object
def parse_html(html_content):
    """Parse *html_content* with the lxml parser and return the soup."""
    return BeautifulSoup(html_content, 'lxml')


# CSS selector helper, returns the matched elements
def css_select(soup, rule):
    """Return ``soup.select(rule)``; log an error and return [] on AttributeError."""
    try:
        return soup.select(rule)
    except AttributeError:
        console_log(f"找不到符合规则 {rule} 的内容", level='error')
        return []


# Exit the program
def exit_program(status_code=0):
    """Log the exit message and terminate with *status_code*."""
    console_log("程序退出", level='info')
    sys.exit(status_code)


# Safe file name
def safe_filename(filename):
    """Remove the characters ``\\ / : * ? " < > |`` from *filename*."""
    return re.sub(r'[\\/:*?"<>|]', '', filename)


# Append to a file
def write_file_a(text, path, encoding='utf-8'):
    """Append *text* to the file at *path*; log and exit with status 1 on failure."""
    try:
        # The context manager closes the file; no explicit close() is needed.
        with open(path, 'a', encoding=encoding) as file_a:
            file_a.write(text)
    except Exception as e:
        # Deliberately broad: any write failure aborts the run.
        console_log(f"写入文件失败: {e}", level='error')
        exit_program(1)


# Purify content
def purify_content(content):
    """Strip the configured literal texts and regex matches from *content*.

    Returns an empty string for falsy input. Afterwards, any run of
    whitespace that starts and ends with a newline is collapsed to exactly
    one empty line.
    """
    if not content:
        return ''
    # An empty YAML key loads as None, so treat a missing list as empty.
    for text in rule_novel_chapter_content_purify_text or ():
        content = content.replace(text, '')
    for pattern in rule_novel_chapter_content_purify_re or ():
        content = re.sub(pattern, '', content)
    # Collapse blank / whitespace-only paragraphs.
    content = re.sub(r'\n\s*\n', '\n\n', content)
    return content


# Base URL of a page
def get_base_url(url):
    """Return *url* with everything after its last '/' removed.

    A bare ``scheme://host`` URL has no path to strip, so it just gets a
    trailing slash; cutting at the last '/' there would leave ``scheme://``.
    """
    if re.fullmatch(r'https?://[^/]+', url):
        return url + '/'
    return re.sub(r'/[^/]*$', '/', url)


# Root URL
def get_root_url(url):
    """Return the ``http(s)://host`` prefix of *url*, or None if it has none."""
    match = re.match(r'(https?://[^/]+)', url)
    if match:
        return match.group(1)
    return None


# Friendlier time format
def format_time(seconds):
    """Format a duration in seconds as hours / minutes / seconds text.

    Leading units that are zero are omitted; fractions of a second are
    truncated.
    """
    hours, remainder = divmod(seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    hours, minutes, secs = int(hours), int(minutes), int(secs)
    if hours > 0:
        return f"{hours} 小时 {minutes} 分钟 {secs} 秒"
    if minutes > 0:
        return f"{minutes} 分钟 {secs} 秒"
    return f"{secs} 秒"


# Signal handler
def signal_handler(sig, frame):
    """Log that the program was interrupted and exit with status 1."""
    console_log("\n程序被中断,正在退出...", level='info')
    exit_program(1)


# EPUB chapter HTML rendering
def epub_chapter_html_render(chapter_name, chapter_content):
    """
    Render the chapter content as HTML.
    """
+ return f""" + + + +
+ +书名:{novel_name}
+来源:{novel_detail_url}
+作者:{novel_author}
+简介:
+ {indent_string}{novel_intro}
{chapter_content_p}
\n" + elif output_format == 'txt': + # 对于 txt 格式,使用纯文本 + chapter_content += f"{indent_string}{chapter_content_p}\n" + + if chapter_content == '': + console_log("章节内容为空", level='warn') + else: + # 净化章节内容 + chapter_content = purify_content(chapter_content) + + # 写入文件 + if output_format == 'txt': + write_file_a(f"{chapter_name}\n\n{chapter_content}\n\n", novel_file_path, encoding=novel_file_encoding) + elif output_format == 'epub': + epub_book_chapter_count = int(epub_book_chapter_count) + 1 + epub_book_chapter_count = "{:0>4d}".format(epub_book_chapter_count) + epub_chapter = epub.EpubHtml( + title=chapter_name, + file_name=f"chap_{epub_book_chapter_count}.xhtml", + lang="zh" + ) + epub_chapter.content = epub_chapter_html_render(chapter_name, chapter_content) + epub_book_chapter_listtoc.append((epub_book_chapter_count,chapter_name)) + # 添加到书籍 + epub_book.add_item(epub_chapter) + # 添加到章节列表 + epub_chapter_items.append(epub_chapter) + + # 更新进度条 + progress.update(task_id, advance=1) + + except Exception as e: + console_log(f"章节处理失败: {chapter_name} - {str(e)}", level='error') + console_log("错误详情:", level='debug') + console_log(traceback.format_exc(), level='debug') + progress.update(task_id, advance=1) + continue + + if output_format == 'epub': + # 创建CSS样式 + epub_css_content = """ + body { + font-family: "Microsoft YaHei", "STXihei", sans-serif; + font-size: 1.0em; + line-height: 1.6; + margin: 1em auto; + max-width: 800px; + padding: 0 1em; + text-align: justify; + } + h1 { + text-align: center; + font-size: 1.8em; + margin-top: 2em; + margin-bottom: 1.5em; + border-bottom: 1px solid #ccc; + padding-bottom: 0.5em; + } + h2 { + font-size: 1.4em; + margin-top: 1.8em; + } + p { + text-indent: 2em; + margin: 0.8em 0; + } + ul { + padding-left: 3em; + } + li { + margin: 0.5em 0; + } + """ + + # 创建CSS项目 + epub_style = epub.EpubItem( + uid="style_default", + file_name="style/default.css", + media_type="text/css", + content=epub_css_content + ) + epub_book.add_item(epub_style) + 
+ # 目录 + epub_book.toc = [] + epub_book.toc.append(epub.Link("intro.xhtml", "详情", "intro")) + for (i,title) in epub_book_chapter_listtoc: + epub_book.toc.append(epub.Link(f"chap_{i}.xhtml", title, f"chap_{i}")) + + #添加导航 + epub_book.add_item(epub.EpubNcx()) + epub_book.add_item(epub.EpubNav()) + + # 设置阅读顺序 + # 书脊(阅读顺序):封面、导航、介绍、各章节 + epub_book.spine = ["nav", *epub_chapter_items] + + # 生成文件 + with console.status("[bold green]正在生成EPUB文件..."): + epub.write_epub(novel_file_path,epub_book,{}) + + # 下载用时 + end_time = time.time() + download_time = end_time - start_time + console.print(Panel( + f"[bold green]下载完成![/]\n" + f"小说名称: [bold]{novel_name}[/]\n" + f"作者: [bold]{novel_author}[/]\n" + f"章节数: [bold]{chapter_count}[/]\n" + f"保存路径: [bold]{novel_file_path}[/]\n" + f"用时: [bold]{format_time(download_time)}[/]", + title="下载完成", + border_style="bold green", + expand=False + )) + +if __name__ == '__main__': + # 监听程序退出信号 + signal.signal(signal.SIGINT, signal_handler) + # 运行主函数 + try: + main() + except Exception as e: + console_log(f"程序发生错误: {e}", level='error') + if debug: + console_log("错误详情:", level='debug') + console_log(traceback.format_exc(), level='debug') + exit_program(1) \ No newline at end of file diff --git a/package/config.yml b/package/config.yml index c3cff87..48428c3 100644 --- a/package/config.yml +++ b/package/config.yml @@ -1,10 +1,11 @@ - name: 'novel_crawler' - version: 'v.25.06.05' - python-file: 'novel_crawler\novel_crawler_v.25.06.05.py' + version: 'v.25.07.06' + python-file: 'novel_crawler\novel_crawler_v.25.07.06.py' install-requirements: [ 'beautifulsoup4', 'requests', - 'ebooklib' + 'ebooklib', + 'rich' ] upx: true onefile: 2 # 0:文件夹 1:单文件 2:两者