# 初始化

In [1]:
import os
import time
import threading
from http.server import HTTPServer, SimpleHTTPRequestHandler
from local_packages import *
from dotenv import load_dotenv
from queue import Queue
import concurrent.futures
import random
import json
import openai
from concurrent.futures import ThreadPoolExecutor, as_completed

# Path of the .env file, expected in the current working directory.
dotenv_path = os.path.join(os.getcwd(), '.env')
model_path = r'D:\Joining\Models\Text2Vec_base_zh'
# The project root is the folder that contains the .env file.
project_root = os.path.dirname(dotenv_path)

# Build the agent wrapper for the 'qwen' backend and keep handles to the
# chat service and the JSON helper used by the cells below.
agentopener = AgentOpener(service_type='qwen')
service = agentopener.service
js = JSProcessor()

服务初始化成功


# PDF识别

In [None]:
import os
import time
import json
import shutil
from concurrent.futures import ThreadPoolExecutor
from PyPDF2 import PdfReader, PdfWriter
from datetime import datetime

# Create (if needed) the temporary folder that stores the per-page PDF files.
def create_temp_dir(directory="temp_pdf_pages"):
    """Ensure *directory* exists and return its path.

    Args:
        directory: Path of the folder to create. Missing parent folders are
            created as well.

    Returns:
        The ``directory`` value that was passed in, unchanged.

    Raises:
        FileExistsError: If ``directory`` already exists but is not a
            directory.
    """
    # exist_ok=True replaces the separate os.path.exists() test, which was
    # open to a check-then-create race and silently accepted a plain file.
    os.makedirs(directory, exist_ok=True)
    return directory

# Split a PDF into one single-page PDF file per page.
def split_pdf_by_pages(pdf_file_path, temp_dir):
    """Write every page of a PDF to its own file inside *temp_dir*.

    Args:
        pdf_file_path: Path of the source PDF.
        temp_dir: Existing folder that receives the ``page_<n>.pdf`` files.

    Returns:
        A list of ``(page_pdf_path, page_num)`` tuples in page order, where
        ``page_num`` is the zero-based page index.
    """
    extracted_files = []

    # The source handle is closed deterministically by the context manager.
    # The loop stays inside it because the reader is given the open stream
    # and each page is written out while that stream is still available.
    with open(pdf_file_path, "rb") as source_file:
        pdf_reader = PdfReader(source_file)

        for page_num, page in enumerate(pdf_reader.pages):
            pdf_writer = PdfWriter()
            pdf_writer.add_page(page)

            page_pdf_path = os.path.join(temp_dir, f'page_{page_num}.pdf')
            with open(page_pdf_path, "wb") as page_file:
                pdf_writer.write(page_file)

            extracted_files.append((page_pdf_path, page_num))

    return extracted_files

# Recognize the text in a PDF file and convert it to Markdown.
def recognize_text_and_convert_to_markdown(pdf_file_path, output_dir):
    """Ask the chat service for a Markdown transcription of a PDF, with retries.

    The file is sent through the module-level ``service`` together with a
    prompt requesting a JSON dict with a ``markdown`` key; the reply is parsed
    with the module-level ``js`` helper. Up to three attempts are made.

    Args:
        pdf_file_path: Path of the PDF to send.
        output_dir: Not used by this function; kept so existing callers that
            pass it keep working.

    Returns:
        The value stored under the ``markdown`` key of the parsed reply.

    Raises:
        Exception: When all attempts fail. The error of the last attempt is
            attached as ``__cause__``.
    """
    print(f"{datetime.now()} - 开始处理: {pdf_file_path}")  # log the start time
    prompt = f'''
    请识别文件中的文字，转化为中文markdown格式并尽量保留其原始格式，以如下json dict结构输出：
    {{
        "markdown": "converted_markdown_here"
    }}
    注意，键名必须是markdown，务必不要重复输出多次同一段解析
    '''
    max_attempts = 3
    last_error = None

    for attempt in range(max_attempts):
        will_retry = attempt < max_attempts - 1
        try:
            result = service.chat_with_file(pdf_file_path, prompt)
            if result.strip() == "":
                raise ValueError("返回结果为空")
            result_dict = js.parse_dict(result)
            print(result_dict)
            if 'markdown' not in result_dict:
                raise ValueError("返回的字典中缺少期望的键")
            print(f"{datetime.now()} - 完成处理: {pdf_file_path}")  # log the finish time
            return result_dict['markdown']
        except json.JSONDecodeError as e:
            last_error = e
            print(f"处理 {pdf_file_path} 时出错: 无效的JSON响应 {e}, 尝试 {attempt + 1}")
        except Exception as e:
            last_error = e
            rate_limited = "429" in str(e) or 'Throttling.RateQuota' in str(e)
            if rate_limited and will_retry:
                # Linear back-off: 10 s, then 20 s. Skipped after the final
                # attempt, where waiting would only delay the failure.
                delay = 10 * (attempt + 1)
                print(f"429 或 Throttling.RateQuota 错误 {e}: {pdf_file_path}，将在 {delay} 秒后重试... (尝试 {attempt + 1})")
                time.sleep(delay)
            else:
                print(f"处理 {pdf_file_path} 时出错: {e}, 尝试 {attempt + 1}")

    # Chain the last failure so the traceback shows why the file was given up on.
    raise Exception(f"无法处理文件 {pdf_file_path}") from last_error

# Process every PDF page on a thread pool.
def process_all_pages(extracted_files, output_dir, thread_count):
    """Convert each page to Markdown concurrently and save one file per page.

    Args:
        extracted_files: Iterable of ``(page_pdf_path, page_index)`` pairs.
        output_dir: Folder that receives the ``page_<index>.md`` files.
        thread_count: Number of worker threads for the pool.

    A page whose conversion or write fails is reported on stdout and skipped;
    the remaining pages are still processed.
    """
    with ThreadPoolExecutor(max_workers=thread_count) as pool:
        # Map each submitted task to the page index it belongs to.
        page_of_task = {}
        for page_path, page_index in extracted_files:
            task = pool.submit(
                recognize_text_and_convert_to_markdown, page_path, output_dir
            )
            page_of_task[task] = page_index

        # Collect results in submission order.
        for task, page_index in page_of_task.items():
            try:
                page_markdown = task.result()
                target_path = os.path.join(output_dir, f'page_{page_index}.md')
                with open(target_path, 'w', encoding='utf-8') as md_file:
                    md_file.write(page_markdown)
            except Exception as e:
                print(f"处理页 {page_index} 时出错: {e}")

# Merge all per-page Markdown files into one document.
def merge_markdown_blocks(output_dir, markdown_output_path):
    """Concatenate the ``.md`` files of *output_dir* in page-number order.

    Args:
        output_dir: Folder containing files named like ``page_<n>.md``.
        markdown_output_path: Path of the merged Markdown file to write.

    Pages are joined with one blank line between them. Every ``.md`` file in
    the folder must carry an integer between the first ``_`` and the
    following ``.`` of its name.
    """
    def page_number(file_name):
        # 'page_12.md' -> 12
        return int(file_name.split('_')[1].split('.')[0])

    page_files = [name for name in os.listdir(output_dir) if name.endswith('.md')]
    page_files.sort(key=page_number)

    pages = []
    for name in page_files:
        with open(os.path.join(output_dir, name), 'r', encoding='utf-8') as page_file:
            pages.append(page_file.read())

    merged_text = "\n\n".join(pages)
    with open(markdown_output_path, 'w', encoding='utf-8') as md_file:
        md_file.write(merged_text)

# Remove a temporary folder and everything inside it.
def clear_temp_dir(directory):
    """Delete *directory* recursively; do nothing when the path is absent.

    Args:
        directory: Path of the folder to remove.
    """
    if not os.path.exists(directory):
        return
    shutil.rmtree(directory)

# Entry point: PDF -> per-page PDFs -> per-page Markdown -> merged Markdown.
def main(pdf_file_path, markdown_output_path, thread_count=5):
    """Convert a PDF into a single Markdown file.

    The PDF is split into single-page files, each page is converted on a
    thread pool, and the per-page Markdown files are merged in page order.

    Args:
        pdf_file_path: Path of the source PDF.
        markdown_output_path: Path of the merged Markdown file to write.
        thread_count: Number of worker threads used for the conversion.

    The working folders ``temp_pdf_pages`` and ``temp_markdown_pages`` are
    created relative to the current working directory and removed again when
    the function exits, whether it succeeds or raises.
    """
    markdown_dir = "temp_markdown_pages"
    temp_dir = create_temp_dir()
    try:
        # Start from an empty Markdown folder: the merge step picks up every
        # .md file it finds, so leftovers from an interrupted run would
        # otherwise end up in the new output.
        clear_temp_dir(markdown_dir)
        output_dir = create_temp_dir(markdown_dir)

        extracted_files = split_pdf_by_pages(pdf_file_path, temp_dir)
        process_all_pages(extracted_files, output_dir, thread_count)
        merge_markdown_blocks(output_dir, markdown_output_path)
    finally:
        # Always remove the working folders, even when a step above failed.
        clear_temp_dir(temp_dir)
        clear_temp_dir(markdown_dir)

# Example file paths. The raw string keeps the Windows backslashes literal;
# without the r prefix, "\J" and "\P" are invalid escape sequences.
pdf_file_path = r"D:\Joining\Joining-Agents0626_Full\Python编程：从入门到实践.pdf"
markdown_output_path = "output_markdown.md"

# Run the conversion.
main(pdf_file_path, markdown_output_path, thread_count=50)