# 初始化

In [None]:
import os
import time
import threading
from http.server import HTTPServer, SimpleHTTPRequestHandler
from local_packages import *
from dotenv import load_dotenv
from queue import Queue
import concurrent.futures
import random
import json
import openai
from concurrent.futures import ThreadPoolExecutor, as_completed

# Path of the .env file, expected in the current working directory.
# NOTE(review): load_dotenv is imported above but never called in the visible
# code, so this path is only used to derive project_root — confirm whether the
# environment is loaded elsewhere (e.g. inside local_packages).
dotenv_path = os.path.join(os.getcwd(), '.env')
# Local path of an embedding model; not referenced by the code visible here.
model_path = r'D:\Joining\Models\Text2Vec_base_zh'
# Project root = directory that contains the .env file (i.e. the working directory).
project_root = os.path.dirname(dotenv_path)

# AgentOpener / JSProcessor presumably come from the `local_packages` star
# import — TODO confirm. `service` and `js` are used by the cells below to
# send files to the model and to parse its JSON reply.
agentopener=AgentOpener(service_type='qwen',version='long')
service=agentopener.service
js=JSProcessor()

# PDF识别

In [None]:
import fitz  # PyMuPDF
import os
import shutil

class PDFProcessor:
    """Locate text blocks in a PDF, merge neighbouring ones and export page parts.

    Every entry of ``blocks_info`` is a dict with two keys: ``"页数"`` (1-based
    page number) and ``"坐标"`` (bounding box ``(x_min, y_min, x_max, y_max)``).
    """

    def __init__(self, input_path, temp_folder, output_path="output.pdf"):
        """
        :param input_path: path of the PDF to analyse
        :param temp_folder: folder that receives the exported ``pdf_part_N.pdf``
            files; :meth:`process` deletes and recreates it
        :param output_path: where :meth:`process` saves a copy of the PDF with
            the final boxes drawn in red (default: ``output.pdf`` in the
            current working directory)
        """
        self.input_path = input_path
        self.temp_folder = temp_folder
        self.output_path = output_path
        self.blocks_info = []

    def extract_blocks_info(self):
        """Fill ``blocks_info`` with the page number and bbox of every text block."""
        self.blocks_info = []
        # The context manager closes the document even if a page fails to parse.
        with fitz.open(self.input_path) as doc:
            for page_num in range(len(doc)):
                page = doc.load_page(page_num)
                for block in page.get_text("dict")["blocks"]:
                    if block["type"] == 0:  # type 0 is a text block
                        self.blocks_info.append({
                            "页数": page_num + 1,
                            "坐标": block["bbox"]
                        })

    @staticmethod
    def is_overlapping(bbox1, bbox2, gap=10):
        """
        Tell whether two rectangles overlap or lie within ``gap`` of each other.

        :param bbox1: first bounding box (x1_min, y1_min, x1_max, y1_max)
        :param bbox2: second bounding box (x2_min, y2_min, x2_max, y2_max)
        :param gap: tolerance added around the boxes; boxes whose distance on
            both axes is at most ``gap`` count as overlapping
        :return: True if the projections on both axes overlap, False otherwise
        """
        x1_min, y1_min, x1_max, y1_max = bbox1
        x2_min, y2_min, x2_max, y2_max = bbox2

        x_overlap = x1_min <= x2_max + gap and x1_max >= x2_min - gap
        y_overlap = y1_min <= y2_max + gap and y1_max >= y2_min - gap
        return x_overlap and y_overlap

    def _find_overlapping_index_groups(self):
        """Return lists of ``blocks_info`` indices whose boxes should be merged.

        A group is seeded by the first block that was not absorbed by an
        earlier group and collects every later block on the same page that
        overlaps the seed. Only direct overlap with the seed is tested here;
        chains of overlaps are resolved by repeating the merge pass.
        """
        blocks = self.blocks_info
        index_groups = []
        absorbed = set()

        for i, seed in enumerate(blocks):
            if i in absorbed:
                continue
            group = [i]
            for j in range(i + 1, len(blocks)):
                other = blocks[j]
                if seed["页数"] == other["页数"] and self.is_overlapping(seed["坐标"], other["坐标"]):
                    group.append(j)
                    absorbed.add(j)
            if len(group) > 1:
                index_groups.append(group)

        return index_groups

    def find_overlapping_blocks(self):
        """Return the groups of overlapping blocks (lists of block dicts)."""
        return [
            [self.blocks_info[k] for k in indices]
            for indices in self._find_overlapping_index_groups()
        ]

    @staticmethod
    def merge_bounding_boxes(blocks):
        """Return the smallest bbox ``(x_min, y_min, x_max, y_max)`` enclosing all blocks."""
        x_min = min(block["坐标"][0] for block in blocks)
        y_min = min(block["坐标"][1] for block in blocks)
        x_max = max(block["坐标"][2] for block in blocks)
        y_max = max(block["坐标"][3] for block in blocks)
        return (x_min, y_min, x_max, y_max)

    def update_blocks_info(self):
        """Run one merge pass: replace each overlapping group by its merged box.

        Blocks that belong to no group keep their position; merged boxes are
        appended after them.
        """
        merged_groups = []
        merged_indices = set()

        # Work with positions rather than looking blocks up by equality:
        # identical blocks (same page and bbox) would otherwise all resolve to
        # the first occurrence and the duplicates would survive the merge.
        for indices in self._find_overlapping_index_groups():
            group = [self.blocks_info[k] for k in indices]
            merged_bbox = self.merge_bounding_boxes(group)
            merged_groups.append({
                "页数": group[0]["页数"],
                "坐标": merged_bbox
            })
            merged_indices.update(indices)
            print(f"合并事件: {group} 合并为 {merged_bbox}")

        remaining = [
            block for i, block in enumerate(self.blocks_info)
            if i not in merged_indices
        ]
        self.blocks_info = remaining + merged_groups

    def _draw_preview(self):
        """Save a copy of the input PDF with every final block outlined in red."""
        with fitz.open(self.input_path) as doc:
            for block in self.blocks_info:
                page = doc.load_page(block["页数"] - 1)
                page.draw_rect(block["坐标"], color=(1, 0, 0))
            doc.save(self.output_path)

    def _reset_temp_folder(self):
        """Delete ``temp_folder`` with all its content and recreate it empty."""
        if os.path.exists(self.temp_folder):
            shutil.rmtree(self.temp_folder)
        os.makedirs(self.temp_folder)

    def _export_page_parts(self):
        """Write ``pdf_part_N.pdf`` files into ``temp_folder``.

        For each block the whole page that contains it is copied (a block is
        assumed to live on a single page). A part is skipped when a part with
        the same extracted page text was already written, so each distinct
        page is exported once, numbered after the first block found on it.
        """
        seen_texts = set()
        with fitz.open(self.input_path) as doc:
            for i, block in enumerate(self.blocks_info):
                page_index = block["页数"] - 1
                with fitz.open() as part_doc:
                    part_doc.insert_pdf(doc, from_page=page_index, to_page=page_index)
                    part_text = part_doc.load_page(0).get_text("text")
                    if part_text in seen_texts:
                        continue
                    seen_texts.add(part_text)
                    part_path = os.path.join(self.temp_folder, f"pdf_part_{i + 1}.pdf")
                    part_doc.save(part_path)

    def process(self):
        """Extract, merge, preview and export the blocks of the input PDF.

        :return: the final ``blocks_info`` list, ordered by page and then by
            the top-left corner of each box
        """
        self.extract_blocks_info()
        previous_length = len(self.blocks_info)

        # Repeat the merge pass until it no longer shrinks the list; every
        # pass that merges something strictly reduces the number of blocks.
        while True:
            self.update_blocks_info()
            current_length = len(self.blocks_info)
            print(f"当前 blocks_info 长度: {current_length}")

            if current_length == previous_length:
                break

            previous_length = current_length

        # Merged boxes are appended after the untouched ones, which scrambles
        # the page order. Restore reading order so the numbering of the
        # exported parts follows the pages of the document.
        self.blocks_info.sort(
            key=lambda block: (block["页数"], block["坐标"][1], block["坐标"][0])
        )

        self._draw_preview()
        self._reset_temp_folder()
        self._export_page_parts()

        return self.blocks_info

# Example run: detect and merge the text blocks of one PDF, write the preview
# and the per-page parts, then print how many blocks remain and their details.
# NOTE: the paths below are machine-specific absolute Windows paths.
input_path = r"D:\Joining\Joining-Agents0626_Full\Poly_Optimisation_2024-V01.pdf"
temp_folder = r"D:\Joining\Joining-Agents0626_Full\temp_folder"
processor = PDFProcessor(input_path, temp_folder)
updated_blocks_info = processor.process()
print(len(updated_blocks_info), updated_blocks_info)


In [None]:
import os
import concurrent.futures
import time
from datetime import datetime
import re

def clean_markdown(markdown):
    """Split *markdown* at ``[<digits>]`` markers and drop repeated segments.

    Each segment is stripped of surrounding whitespace; empty segments are
    discarded and only the first occurrence of a segment is kept. The
    surviving segments are joined with a blank line between them.
    """
    stripped_segments = (piece.strip() for piece in re.split(r'\[\d+\]', markdown))
    # dict.fromkeys keeps first-seen order while removing later duplicates.
    ordered_unique = dict.fromkeys(
        segment for segment in stripped_segments if segment
    )
    return '\n\n'.join(ordered_unique)

def recognize_text_and_convert_to_markdown(pdf_file_path):
    """Ask the model to transcribe one PDF part and return the cleaned markdown.

    The file is sent with a fixed prompt via ``service.chat_with_file``; the
    reply is parsed with ``js.parse_dict`` and must contain a ``markdown``
    key, whose value is passed through ``clean_markdown``. Up to three
    attempts are made. Errors whose text mentions ``RequestTimeOut``, ``429``
    or ``Throttling.RateQuota`` wait ``10 * attempt_number`` seconds before
    the next attempt; other errors are retried immediately.

    :param pdf_file_path: path of the PDF file to send
    :return: the cleaned markdown text
    :raises Exception: when every attempt failed; the last underlying error
        is attached as ``__cause__``
    """
    print(f"{datetime.now()} - 开始处理: {pdf_file_path}")
    prompt = '''
    请识别文件中的文字，翻译成中文，转化为markdown格式并尽量保留其原始格式，数学公式需要使用latex语法，以如下json dict结构输出：
    {
        "markdown": "converted_markdown_here"
    }
    注意，键名必须是markdown，数学或代码一定要使用latex语法
    '''
    max_attempts = 3
    last_error = None
    for attempt in range(max_attempts):
        try:
            result = service.chat_with_file(pdf_file_path, prompt)
            if result.strip() == "":
                raise ValueError("返回结果为空")
            result_dict = js.parse_dict(result)
            if 'markdown' not in result_dict:
                raise ValueError("返回的字典中缺少期望的键")
            cleaned_markdown = clean_markdown(result_dict['markdown'])
            print(cleaned_markdown)
            return cleaned_markdown

        except Exception as e:
            # Broad on purpose: the service's exception types are not visible
            # here, so failures are classified by their message text.
            last_error = e
            error_text = str(e)
            delay = 10 * (attempt + 1)
            if attempt == max_attempts - 1:
                # No retry follows the final attempt, so do not announce one
                # and do not sleep before giving up.
                print(f"处理 {pdf_file_path} 时出错: {e}, 尝试 {attempt + 1}")
            elif "RequestTimeOut" in error_text:
                print(f"RequestTimeOut 错误 {e}: {pdf_file_path}，将在 {delay} 秒后重试... (尝试 {attempt + 1})")
                time.sleep(delay)
            elif "429" in error_text or 'Throttling.RateQuota' in error_text:
                print(f"429 或 Throttling.RateQuota 错误 {e}: {pdf_file_path}，将在 {delay} 秒后重试... (尝试 {attempt + 1})")
                time.sleep(delay)
            else:
                print(f"处理 {pdf_file_path} 时出错: {e}, 尝试 {attempt + 1}")
    raise Exception(f"无法处理文件 {pdf_file_path}") from last_error


def process_pdf_parts(temp_folder, temp_markdown_folder, worker_num=50):
    """Convert every PDF part in *temp_folder* to markdown, in parallel.

    *temp_markdown_folder* is deleted and recreated. Each ``.pdf`` file in
    *temp_folder* is handed to ``recognize_text_and_convert_to_markdown`` on a
    thread pool; each successful result is written to
    ``temp_markdown_<position>.md`` and all successful results, in part order,
    are joined with newlines into ``final_markdown.md``. A part that fails is
    reported on stdout and left out of the final document.

    No per-part timeout is applied here: a running thread cannot be
    interrupted from outside, so any time limit has to be enforced by the
    service call itself.

    :param temp_folder: folder that contains the ``pdf_part_N.pdf`` files
    :param temp_markdown_folder: output folder (wiped before use)
    :param worker_num: maximum number of worker threads
    """
    # Start from an empty output folder.
    if os.path.exists(temp_markdown_folder):
        shutil.rmtree(temp_markdown_folder)
    os.makedirs(temp_markdown_folder)

    def part_order(path):
        # Order by the trailing number of the file name. Only the base name is
        # inspected, and files without such a number sort last by name instead
        # of aborting the whole run with a ValueError.
        match = re.search(r'_(\d+)\.pdf$', os.path.basename(path))
        if match:
            return (0, int(match.group(1)), path)
        return (1, 0, path)

    pdf_files = sorted(
        (
            os.path.join(temp_folder, name)
            for name in os.listdir(temp_folder)
            if name.endswith('.pdf')
        ),
        key=part_order,
    )

    results = [None] * len(pdf_files)
    with concurrent.futures.ThreadPoolExecutor(max_workers=worker_num) as executor:
        futures = {
            executor.submit(recognize_text_and_convert_to_markdown, pdf_file): (pdf_file, index)
            for index, pdf_file in enumerate(pdf_files)
        }
        for future in concurrent.futures.as_completed(futures):
            pdf_file, index = futures[future]
            try:
                # The future is already finished when as_completed yields it,
                # so result() returns (or re-raises) immediately.
                result = future.result()
                results[index] = result
                part_markdown_path = os.path.join(
                    temp_markdown_folder, f"temp_markdown_{index + 1}.md"
                )
                with open(part_markdown_path, "w", encoding="utf-8") as f:
                    f.write(result)
            except Exception as e:
                print(f"处理 {pdf_file} 时出错: {e}")

    # Failed parts are still None and are skipped here.
    final_markdown = "\n".join(filter(None, results))
    with open(os.path.join(temp_markdown_folder, "final_markdown.md"), "w", encoding="utf-8") as f:
        f.write(final_markdown)
        
# Example run: convert the PDF parts written by the previous cell to markdown.
# NOTE: the paths below are machine-specific absolute Windows paths.
temp_folder = r"D:\Joining\Joining-Agents0626_Full\temp_folder"
temp_markdown_folder = r"D:\Joining\Joining-Agents0626_Full\temp_markdown_folder"

# Redundant with process_pdf_parts, which deletes and recreates this folder
# itself; kept as a harmless safeguard.
if not os.path.exists(temp_markdown_folder):
    os.makedirs(temp_markdown_folder)

# The third positional argument is worker_num (number of worker threads).
process_pdf_parts(temp_folder, temp_markdown_folder,40)
