# 初始化

In [None]:
import os
import time
import threading
from http.server import HTTPServer, SimpleHTTPRequestHandler
from local_packages import *
from dotenv import load_dotenv
from queue import Queue
import concurrent.futures
import random
import json
import openai
from concurrent.futures import ThreadPoolExecutor, as_completed

# Path of the .env file in the current working directory.
# NOTE(review): only the path is built here; load_dotenv is imported above but
# is not called in this cell — confirm the environment is loaded elsewhere
# (e.g. inside local_packages).
dotenv_path = os.path.join(os.getcwd(), '.env')
# Local model directory; not referenced again in the visible cells.
model_path = r'D:\Joining\Models\Text2Vec_base_zh'
# Project root = the directory that contains the .env file, i.e. the current
# working directory.
project_root = os.path.dirname(dotenv_path)

# Service whose ask_pic() is used later to read text out of page crops.
agentopener_glm=AgentOpener(service_type='zhipu',version='glm-4v')
service_glm=agentopener_glm.service
# Service whose ask_once() is used later to repair the extracted markdown.
# NOTE(review): the variable is named "qwen" but the service is configured as
# service_type='deepseek' — confirm which backend is intended.
agentopener_qwen=AgentOpener(service_type='deepseek',version='coder')
service_qwen=agentopener_qwen.service
# Helper whose parse_dict() is used later to turn a model reply into a dict.
processor=JSProcessor()
# NOTE(review): this rebinds the name PDFRecog from the class (presumably
# provided by local_packages) to an instance, so re-running this cell would
# call the instance instead of the class.
PDFRecog=PDFRecog('TempFolder03')



In [None]:
# Run the basic recognition pass on the input PDF.
# Presumably this renders the pages into the 'TempFolder03' folder that the
# PDFRecog instance was created with — TODO confirm against local_packages.
PDFRecog.simp_recog('Demidovich.pdf')

# 高级版本解析

In [None]:
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt

def calculate_distance(rect1, rect2):
    """Return the shortest Euclidean distance between two axis-aligned rectangles.

    Each rectangle is a sequence whose first four items are ``(x, y, w, h)``;
    any further items (for example a trailing area) are ignored.  On each axis
    the gap is 0 when the two extents overlap or touch, so overlapping or
    touching rectangles have distance 0.
    """

    def axis_gap(start_a, end_a, start_b, end_b):
        # Separation of two 1-D intervals; 0 when they overlap or touch.
        if end_a < start_b:
            return start_b - end_a
        if end_b < start_a:
            return start_a - end_b
        return 0

    ax, ay, aw, ah = rect1[:4]
    bx, by, bw, bh = rect2[:4]

    gap_x = axis_gap(ax, ax + aw, bx, bx + bw)
    gap_y = axis_gap(ay, ay + ah, by, by + bh)

    return np.sqrt(gap_x**2 + gap_y**2)

def is_overlapping(rect1, rect2):
    """Return True when two axis-aligned rectangles share interior area.

    Each rectangle is a sequence whose first four items are ``(x, y, w, h)``.
    The comparisons are strict, so rectangles that merely touch along an edge
    or at a corner are not considered overlapping.
    """
    ax, ay, aw, ah = rect1[:4]
    bx, by, bw, bh = rect2[:4]

    # Horizontal extents must overlap strictly...
    if not (ax < bx + bw and ax + aw > bx):
        return False
    # ...and so must the vertical extents.
    if not (ay < by + bh and ay + ah > by):
        return False
    return True

def merge_rectangles(rectangles):
    """Merge overlapping rectangles in place until no two of them overlap.

    The list is scanned for the lowest-indexed overlapping pair; that pair is
    replaced by its common bounding box ``(x, y, w, h, area)`` stored at the
    lower index, the higher-indexed entry is removed, and the scan restarts.

    Returns the same list object that was passed in.
    """

    def first_overlap():
        # Lowest (i, j) pair with i < j whose rectangles overlap, else None.
        total = len(rectangles)
        for first in range(total):
            for second in range(first + 1, total):
                if is_overlapping(rectangles[first], rectangles[second]):
                    return first, second
        return None

    while True:
        pair = first_overlap()
        if pair is None:
            return rectangles

        keep, drop = pair
        a, b = rectangles[keep], rectangles[drop]
        left = min(a[0], b[0])
        top = min(a[1], b[1])
        right = max(a[0] + a[2], b[0] + b[2])
        bottom = max(a[1] + a[3], b[1] + b[3])
        width, height = right - left, bottom - top

        rectangles[keep] = (left, top, width, height, width * height)
        del rectangles[drop]

def find_bounding_rectangles(image_path):
    """Find the small contour boxes on a page image, ignoring large regions.

    The image is binarised with a fixed threshold of 128, its external contours
    are extracted, and each contour's bounding box is classified by area:
    boxes with ``w * h > 10000`` are "large" (red) regions, all others are
    candidate (blue) boxes.  Overlapping large boxes are merged, and every
    candidate that overlaps a large box is discarded.

    Args:
        image_path: Path of the image file to analyse.

    Returns:
        A tuple ``(output_image, rectangles)``: ``output_image`` is the
        binarised image converted to 3-channel BGR, and ``rectangles`` is the
        list of surviving candidate boxes as ``(x, y, w, h, area)`` tuples.

    Raises:
        OSError: If the image cannot be read.
    """
    image = cv2.imread(image_path)
    # cv2.imread reports failure by returning None instead of raising; stop
    # here with a clear message rather than failing inside cvtColor.
    if image is None:
        raise OSError(f"Could not read image: {image_path}")
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Binarise. The original note expects white content on a black background.
    # NOTE(review): THRESH_BINARY maps bright pixels to 255 — confirm the
    # input pages really are white-on-black.
    _, binary = cv2.threshold(gray, 128, 255, cv2.THRESH_BINARY)

    contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Colour version of the binary image, used later for cropping and drawing.
    output_image = cv2.cvtColor(binary, cv2.COLOR_GRAY2BGR)

    # Classify every contour's bounding box by area (nothing is drawn here).
    rectangles = []
    red_rectangles = []
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        area = w * h
        if area > 10000:
            red_rectangles.append((x, y, w, h, area))
        else:
            rectangles.append((x, y, w, h, area))

    red_rectangles = merge_rectangles(red_rectangles)

    # Keep only the candidates that do not overlap any large region.
    filtered_rectangles = [
        rect for rect in rectangles
        if not any(is_overlapping(rect, red_rect) for red_rect in red_rectangles)
    ]

    return output_image, filtered_rectangles

def find_adjacent_classes(rectangles, distance_threshold=30):
    """Group rectangle indices into classes of mutually close rectangles.

    Two rectangles are linked when ``calculate_distance`` between them is
    below ``distance_threshold``.  Linked indices are collected into sets, and
    sets that share an index are merged until all sets are disjoint.

    Rectangles that are not close to any other rectangle do not appear in any
    class.

    Returns:
        A list of sets of indices into ``rectangles``.
    """
    groups = []
    count = len(rectangles)

    # Phase 1: attach each close pair to the first group that already holds
    # one of its members, or open a new group for it.
    for a in range(count):
        for b in range(a + 1, count):
            if not (calculate_distance(rectangles[a], rectangles[b]) < distance_threshold):
                continue
            home = next((g for g in groups if a in g or b in g), None)
            if home is None:
                groups.append({a, b})
            else:
                home.update((a, b))

    # Phase 2: repeatedly fold the first pair of groups that share an index.
    while True:
        pair = None
        for i in range(len(groups)):
            for j in range(i + 1, len(groups)):
                if not groups[i].isdisjoint(groups[j]):
                    pair = (i, j)
                    break
            if pair is not None:
                break

        if pair is None:
            return groups

        keep, drop = pair
        absorbed = groups.pop(drop)
        groups[keep].update(absorbed)

def find_min_bounding_rect(rectangles, indices):
    """Return the smallest ``(x, y, w, h)`` box enclosing the selected rectangles.

    ``indices`` selects entries of ``rectangles``; only the first four items
    ``(x, y, w, h)`` of each selected entry are used.
    """
    chosen = [rectangles[i] for i in indices]

    left = min(r[0] for r in chosen)
    top = min(r[1] for r in chosen)
    right = max(r[0] + r[2] for r in chosen)
    bottom = max(r[1] + r[3] for r in chosen)

    return (left, top, right - left, bottom - top)

def second_round_merge(min_rects, distance_threshold=1):
    """Merge, in place, boxes closer to each other than ``distance_threshold``.

    The list is scanned for the lowest-indexed pair whose
    ``calculate_distance`` is below the threshold; the pair is replaced by its
    common ``(x, y, w, h)`` bounding box at the lower index, the other entry
    is removed, and the scan restarts until no such pair is left.

    Returns the same list object that was passed in.
    """

    def first_close_pair():
        # Lowest (i, j) pair with i < j that is closer than the threshold.
        total = len(min_rects)
        for first in range(total):
            for second in range(first + 1, total):
                if calculate_distance(min_rects[first], min_rects[second]) < distance_threshold:
                    return first, second
        return None

    while True:
        pair = first_close_pair()
        if pair is None:
            return min_rects

        keep, drop = pair
        a, b = min_rects[keep], min_rects[drop]
        left = min(a[0], b[0])
        top = min(a[1], b[1])
        right = max(a[0] + a[2], b[0] + b[2])
        bottom = max(a[1] + a[3], b[1] + b[3])

        min_rects[keep] = (left, top, right - left, bottom - top)
        del min_rects[drop]

def filter_small_rectangles(min_rects, min_width=30, min_height=30, min_area=3000):
    """Return the rectangles that are large enough to be worth keeping.

    A rectangle is kept when its width, height and area (``w * h``) all reach
    the given minimums.  Only items 2 and 3 (``w`` and ``h``) of each entry are
    read, so both ``(x, y, w, h)`` and ``(x, y, w, h, area)`` tuples are
    accepted, like the other rectangle helpers in this file.

    Args:
        min_rects: Iterable of rectangles.
        min_width: Minimum width, inclusive.
        min_height: Minimum height, inclusive.
        min_area: Minimum ``w * h``, inclusive.

    Returns:
        A new list with the kept rectangles, in their original order.
    """
    return [
        rect for rect in min_rects
        if rect[2] >= min_width
        and rect[3] >= min_height
        and rect[2] * rect[3] >= min_area
    ]

def display_bounding_rectangles(output_image, rectangles, adjacent_classes, image_path):
    """Crop, save and plot the grouped regions of one page.

    Each class in ``adjacent_classes`` is turned into its bounding box; boxes
    are merged by ``second_round_merge`` and small ones are dropped by
    ``filter_small_rectangles``.  Every remaining box is saved as
    ``temppngfolder/cropped_<page>_box_<i>.png`` (``<page>`` is the image file
    name up to its first dot), outlined in blue on ``output_image``, and the
    annotated image is shown with matplotlib.

    Args:
        output_image: BGR image array to crop from and draw on (modified).
        rectangles: Candidate rectangles indexed by ``adjacent_classes``.
        adjacent_classes: Iterable of index collections, one per group.
        image_path: Path of the source image; only its file name is used.
    """
    min_rects = [find_min_bounding_rect(rectangles, cls) for cls in adjacent_classes]

    min_rects = second_round_merge(min_rects)
    filtered_min_rects = filter_small_rectangles(min_rects)

    # Output directory for the crops; exist_ok avoids a check-then-create race.
    temp_folder = "temppngfolder"
    os.makedirs(temp_folder, exist_ok=True)

    # Page label = file name up to its first dot.
    page_info = os.path.basename(image_path).split('.')[0]

    # Crop from an untouched copy so outlines drawn for earlier boxes can
    # never end up inside a later crop.
    clean_image = output_image.copy()

    for i, rect in enumerate(filtered_min_rects):
        x, y, w, h = rect[:4]

        cropped_image = clean_image[y:y + h, x:x + w]
        crop_path = os.path.join(temp_folder, f"cropped_{page_info}_box_{i}.png")
        # cv2.imwrite reports failure through its return value.
        if not cv2.imwrite(crop_path, cropped_image):
            print(f"Warning: could not write {crop_path}")

        # Outline the box in blue (BGR) on the image that gets displayed.
        cv2.rectangle(output_image, (x, y), (x + w, y + h), (255, 0, 0), 2)

    plt.imshow(cv2.cvtColor(output_image, cv2.COLOR_BGR2RGB))
    plt.title('Bounding Rectangles')
    plt.axis('off')
    plt.show()

def process_image(image_path):
    """Run detection, grouping and crop/plot output for one page image."""
    page_image, candidate_boxes = find_bounding_rectangles(image_path)
    groups = find_adjacent_classes(candidate_boxes)
    display_bounding_rectangles(page_image, candidate_boxes, groups, image_path)

def main(folder_path="D:\\Joining\\Joining-Agents0710_Light\\TempFolder03"):
    """Process every ``.png`` / ``.jpg`` image found directly in a folder.

    Args:
        folder_path: Directory to scan.  Defaults to the folder this notebook
            was originally written against.
    """
    for filename in os.listdir(folder_path):
        if not filename.endswith((".png", ".jpg")):
            continue
        image_path = os.path.join(folder_path, filename)
        print(f"Processing {image_path}")
        process_image(image_path)


if __name__ == "__main__":
    main()

In [None]:
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt

def calculate_distance(rect1, rect2):
    """Return the shortest Euclidean distance between two axis-aligned rectangles.

    Each rectangle is a sequence whose first four items are ``(x, y, w, h)``;
    any further items (for example a trailing area) are ignored.  On each axis
    the gap is 0 when the two extents overlap or touch, so overlapping or
    touching rectangles have distance 0.
    """

    def axis_gap(start_a, end_a, start_b, end_b):
        # Separation of two 1-D intervals; 0 when they overlap or touch.
        if end_a < start_b:
            return start_b - end_a
        if end_b < start_a:
            return start_a - end_b
        return 0

    ax, ay, aw, ah = rect1[:4]
    bx, by, bw, bh = rect2[:4]

    gap_x = axis_gap(ax, ax + aw, bx, bx + bw)
    gap_y = axis_gap(ay, ay + ah, by, by + bh)

    return np.sqrt(gap_x**2 + gap_y**2)

def is_overlapping(rect1, rect2):
    """Return True when two axis-aligned rectangles share interior area.

    Each rectangle is a sequence whose first four items are ``(x, y, w, h)``.
    The comparisons are strict, so rectangles that merely touch along an edge
    or at a corner are not considered overlapping.
    """
    ax, ay, aw, ah = rect1[:4]
    bx, by, bw, bh = rect2[:4]

    # Horizontal extents must overlap strictly...
    if not (ax < bx + bw and ax + aw > bx):
        return False
    # ...and so must the vertical extents.
    if not (ay < by + bh and ay + ah > by):
        return False
    return True

def merge_rectangles(rectangles):
    """Merge overlapping rectangles in place until no two of them overlap.

    The list is scanned for the lowest-indexed overlapping pair; that pair is
    replaced by its common bounding box ``(x, y, w, h, area)`` stored at the
    lower index, the higher-indexed entry is removed, and the scan restarts.

    Returns the same list object that was passed in.
    """

    def first_overlap():
        # Lowest (i, j) pair with i < j whose rectangles overlap, else None.
        total = len(rectangles)
        for first in range(total):
            for second in range(first + 1, total):
                if is_overlapping(rectangles[first], rectangles[second]):
                    return first, second
        return None

    while True:
        pair = first_overlap()
        if pair is None:
            return rectangles

        keep, drop = pair
        a, b = rectangles[keep], rectangles[drop]
        left = min(a[0], b[0])
        top = min(a[1], b[1])
        right = max(a[0] + a[2], b[0] + b[2])
        bottom = max(a[1] + a[3], b[1] + b[3])
        width, height = right - left, bottom - top

        rectangles[keep] = (left, top, width, height, width * height)
        del rectangles[drop]

def find_bounding_rectangles(image_path):
    """Find the small contour boxes on a page image and blank out large regions.

    The image is binarised with a fixed threshold of 128, its external contours
    are extracted, and each contour's bounding box is classified by area:
    boxes with ``w * h > 10000`` are "large" (red) regions, all others are
    candidate (blue) boxes.  Overlapping large boxes are merged, their pixels
    are set to black in the returned image, and every candidate that overlaps
    a large box is discarded.

    Args:
        image_path: Path of the image file to analyse.

    Returns:
        A tuple ``(output_image, rectangles)``: ``output_image`` is the
        binarised image converted to 3-channel BGR with the large regions
        blacked out, and ``rectangles`` is the list of surviving candidate
        boxes as ``(x, y, w, h, area)`` tuples.

    Raises:
        OSError: If the image cannot be read.
    """
    image = cv2.imread(image_path)
    # cv2.imread reports failure by returning None instead of raising; stop
    # here with a clear message rather than failing inside cvtColor.
    if image is None:
        raise OSError(f"Could not read image: {image_path}")
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Binarise. The original note expects white content on a black background.
    # NOTE(review): THRESH_BINARY maps bright pixels to 255 — confirm the
    # input pages really are white-on-black.
    _, binary = cv2.threshold(gray, 128, 255, cv2.THRESH_BINARY)

    contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Colour version of the binary image, used later for cropping and drawing.
    output_image = cv2.cvtColor(binary, cv2.COLOR_GRAY2BGR)

    # Classify every contour's bounding box by area (nothing is drawn here).
    rectangles = []
    red_rectangles = []
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        area = w * h
        if area > 10000:
            red_rectangles.append((x, y, w, h, area))
        else:
            rectangles.append((x, y, w, h, area))

    red_rectangles = merge_rectangles(red_rectangles)

    # Erase everything inside the large regions by painting them black.
    for red_rect in red_rectangles:
        x, y, w, h = red_rect[:4]
        output_image[y:y + h, x:x + w] = 0

    # Keep only the candidates that do not overlap any large region.
    filtered_rectangles = [
        rect for rect in rectangles
        if not any(is_overlapping(rect, red_rect) for red_rect in red_rectangles)
    ]

    return output_image, filtered_rectangles

def find_adjacent_classes(rectangles, distance_threshold=30):
    """Group rectangle indices into classes of mutually close rectangles.

    Two rectangles are linked when ``calculate_distance`` between them is
    below ``distance_threshold``.  Linked indices are collected into sets, and
    sets that share an index are merged until all sets are disjoint.

    Rectangles that are not close to any other rectangle do not appear in any
    class.

    Returns:
        A list of sets of indices into ``rectangles``.
    """
    groups = []
    count = len(rectangles)

    # Phase 1: attach each close pair to the first group that already holds
    # one of its members, or open a new group for it.
    for a in range(count):
        for b in range(a + 1, count):
            if not (calculate_distance(rectangles[a], rectangles[b]) < distance_threshold):
                continue
            home = next((g for g in groups if a in g or b in g), None)
            if home is None:
                groups.append({a, b})
            else:
                home.update((a, b))

    # Phase 2: repeatedly fold the first pair of groups that share an index.
    while True:
        pair = None
        for i in range(len(groups)):
            for j in range(i + 1, len(groups)):
                if not groups[i].isdisjoint(groups[j]):
                    pair = (i, j)
                    break
            if pair is not None:
                break

        if pair is None:
            return groups

        keep, drop = pair
        absorbed = groups.pop(drop)
        groups[keep].update(absorbed)

def find_min_bounding_rect(rectangles, indices):
    """Return the smallest ``(x, y, w, h)`` box enclosing the selected rectangles.

    ``indices`` selects entries of ``rectangles``; only the first four items
    ``(x, y, w, h)`` of each selected entry are used.
    """
    chosen = [rectangles[i] for i in indices]

    left = min(r[0] for r in chosen)
    top = min(r[1] for r in chosen)
    right = max(r[0] + r[2] for r in chosen)
    bottom = max(r[1] + r[3] for r in chosen)

    return (left, top, right - left, bottom - top)

def second_round_merge(min_rects, distance_threshold=1):
    """Merge, in place, boxes closer to each other than ``distance_threshold``.

    The list is scanned for the lowest-indexed pair whose
    ``calculate_distance`` is below the threshold; the pair is replaced by its
    common ``(x, y, w, h)`` bounding box at the lower index, the other entry
    is removed, and the scan restarts until no such pair is left.

    Returns the same list object that was passed in.
    """

    def first_close_pair():
        # Lowest (i, j) pair with i < j that is closer than the threshold.
        total = len(min_rects)
        for first in range(total):
            for second in range(first + 1, total):
                if calculate_distance(min_rects[first], min_rects[second]) < distance_threshold:
                    return first, second
        return None

    while True:
        pair = first_close_pair()
        if pair is None:
            return min_rects

        keep, drop = pair
        a, b = min_rects[keep], min_rects[drop]
        left = min(a[0], b[0])
        top = min(a[1], b[1])
        right = max(a[0] + a[2], b[0] + b[2])
        bottom = max(a[1] + a[3], b[1] + b[3])

        min_rects[keep] = (left, top, right - left, bottom - top)
        del min_rects[drop]

def filter_small_rectangles(min_rects, min_width=30, min_height=30, min_area=3000):
    """Return the rectangles that are large enough to be worth keeping.

    A rectangle is kept when its width, height and area (``w * h``) all reach
    the given minimums.  Only items 2 and 3 (``w`` and ``h``) of each entry are
    read, so both ``(x, y, w, h)`` and ``(x, y, w, h, area)`` tuples are
    accepted, like the other rectangle helpers in this file.

    Args:
        min_rects: Iterable of rectangles.
        min_width: Minimum width, inclusive.
        min_height: Minimum height, inclusive.
        min_area: Minimum ``w * h``, inclusive.

    Returns:
        A new list with the kept rectangles, in their original order.
    """
    return [
        rect for rect in min_rects
        if rect[2] >= min_width
        and rect[3] >= min_height
        and rect[2] * rect[3] >= min_area
    ]

def display_bounding_rectangles(output_image, rectangles, adjacent_classes, image_path):
    """Crop, save and plot the grouped regions of one page.

    Each class in ``adjacent_classes`` is turned into its bounding box; boxes
    are merged by ``second_round_merge`` and small ones are dropped by
    ``filter_small_rectangles``.  Every remaining box is saved as
    ``temppngfolder/cropped_<page>_box_<i>.png`` (``<page>`` is the image file
    name up to its first dot), outlined in blue on ``output_image``, and the
    annotated image is shown with matplotlib.

    Args:
        output_image: BGR image array to crop from and draw on (modified).
        rectangles: Candidate rectangles indexed by ``adjacent_classes``.
        adjacent_classes: Iterable of index collections, one per group.
        image_path: Path of the source image; only its file name is used.
    """
    min_rects = [find_min_bounding_rect(rectangles, cls) for cls in adjacent_classes]

    min_rects = second_round_merge(min_rects)
    filtered_min_rects = filter_small_rectangles(min_rects)

    # Output directory for the crops; exist_ok avoids a check-then-create race.
    temp_folder = "temppngfolder"
    os.makedirs(temp_folder, exist_ok=True)

    # Page label = file name up to its first dot.
    page_info = os.path.basename(image_path).split('.')[0]

    # Crop from an untouched copy so outlines drawn for earlier boxes can
    # never end up inside a later crop.
    clean_image = output_image.copy()

    for i, rect in enumerate(filtered_min_rects):
        x, y, w, h = rect[:4]

        cropped_image = clean_image[y:y + h, x:x + w]
        crop_path = os.path.join(temp_folder, f"cropped_{page_info}_box_{i}.png")
        # cv2.imwrite reports failure through its return value.
        if not cv2.imwrite(crop_path, cropped_image):
            print(f"Warning: could not write {crop_path}")

        # Outline the box in blue (BGR) on the image that gets displayed.
        cv2.rectangle(output_image, (x, y), (x + w, y + h), (255, 0, 0), 2)

    plt.imshow(cv2.cvtColor(output_image, cv2.COLOR_BGR2RGB))
    plt.title('Bounding Rectangles')
    plt.axis('off')
    plt.show()

def process_image(image_path):
    """Run detection, grouping and crop/plot output for one page image."""
    page_image, candidate_boxes = find_bounding_rectangles(image_path)
    groups = find_adjacent_classes(candidate_boxes)
    display_bounding_rectangles(page_image, candidate_boxes, groups, image_path)

def main(folder_path="D:\\Joining\\Joining-Agents0710_Light\\TempFolder03"):
    """Process every ``.png`` / ``.jpg`` image found directly in a folder.

    Args:
        folder_path: Directory to scan.  Defaults to the folder this notebook
            was originally written against.
    """
    for filename in os.listdir(folder_path):
        if not filename.endswith((".png", ".jpg")):
            continue
        image_path = os.path.join(folder_path, filename)
        print(f"Processing {image_path}")
        process_image(image_path)


if __name__ == "__main__":
    main()

In [None]:
import os
import concurrent.futures
import time
from datetime import datetime
import re
import shutil

def check_markdown_field(returned_dict):
    """
    Check whether the returned dict carries a usable ``markdown`` entry.

    Args:
        returned_dict (dict): The dict to inspect.

    Returns:
        bool: True when the key ``'markdown'`` is present and its value is a
        string, otherwise False.
    """
    if 'markdown' not in returned_dict:
        return False
    return isinstance(returned_dict['markdown'], str)

def task_processor(pic_path, max_retries=3):
    """
    Turn one image into markdown using two model calls, with retries.

    The image is first sent to ``service_glm.ask_pic`` to extract text and
    formulas as markdown; the answer is then sent to ``service_qwen.ask_once``
    to repair the math syntax and wrap the result in a JSON dict, which is
    parsed with ``processor.parse_dict``.

    Args:
        pic_path (str): Path of the image file.
        max_retries (int): Maximum number of attempts.

    Returns:
        str: The markdown string from the parsed reply.

    Raises:
        ValueError: If no attempt produced a dict with a string ``markdown``
            field.  When an attempt failed with an exception, the most recent
            one is attached as the cause.
    """
    prompt_glm = '''
    请识别文件中的文字与数学公式，翻译成中文，全部完整转化为markdown格式代码，并直接返回解析结果，注意不要遗漏等号等数学符号，要保证数学或代码一定保证在markdown环境中可以正常渲染出来，不许在开头或结尾添加任何额外信息：
    '''

    prompt_qwen_template = '''
    这里有一段markdown文本，但是其中存在着一些数学语法的错误，比如数学环境$...$不成对，各种括号不成对，现在我请你修复错误，基于原始表达式猜测正确表达式：
    {answer_1}

    以如下json dict结构输出：
        {{
            "markdown": "converted_markdown_here"
        }}
        注意，键名必须是markdown，也稍微注意一下可读性，适当换行和设置格式
    '''

    last_error = None
    for attempt in range(max_retries):
        try:
            answer_1 = service_glm.ask_pic(prompt_glm, pic_path)
            print('ans1',answer_1)
            prompt_qwen = prompt_qwen_template.format(answer_1=answer_1)
            answer_2 = service_qwen.ask_once(prompt_qwen)
            print('ans2',answer_2)
            answer_dict = processor.parse_dict(answer_2)
            print('ans3',answer_dict)

            if check_markdown_field(answer_dict):
                return answer_dict["markdown"]
            print(f"尝试第 {attempt + 1} 次失败，重试中...")

        except Exception as e:
            last_error = e
            print(f"处理 {pic_path} 时出错: {e}, 尝试 {attempt + 1}")
            # Linear back-off before the next attempt.  There is nothing to
            # wait for after the final attempt, so skip the sleep there.
            if attempt + 1 < max_retries:
                time.sleep(10 * (attempt + 1))

    raise ValueError(f"最大重试次数已达到，但仍未能获取包含markdown字段的有效响应: {pic_path}") from last_error

def process_images(temp_folder, temp_markdown_folder, worker_num=10):
    """Convert every image in a folder to markdown, in parallel.

    ``temp_markdown_folder`` is deleted and recreated first.  Each
    ``.png`` / ``.jpg`` / ``.jpeg`` file in ``temp_folder`` is passed to
    ``task_processor`` on a thread pool; each result is written to
    ``<image name>.md`` in ``temp_markdown_folder``, and all successful
    results are joined, in file order, into ``final_markdown.md``.

    Files are ordered by the integers that appear in their names, read left
    to right (for ``cropped_page_3_img_1_box_0.png`` that is ``(3, 1, 0)``),
    so boxes stay grouped by page.

    Args:
        temp_folder: Directory containing the input images.
        temp_markdown_folder: Output directory (wiped on every call).
        worker_num: Number of worker threads.
    """
    # Start from an empty output folder.
    if os.path.exists(temp_markdown_folder):
        shutil.rmtree(temp_markdown_folder)
    os.makedirs(temp_markdown_folder)

    def numeric_key(path):
        # All integers of the file name, in order of appearance.
        return tuple(int(n) for n in re.findall(r'\d+', os.path.basename(path)))

    image_files = [
        os.path.join(temp_folder, f)
        for f in os.listdir(temp_folder)
        if f.endswith(('.png', '.jpg', '.jpeg'))
    ]
    image_files.sort(key=numeric_key)

    results = [None] * len(image_files)
    with concurrent.futures.ThreadPoolExecutor(max_workers=worker_num) as executor:
        futures = {
            executor.submit(task_processor, image_file): (image_file, i)
            for i, image_file in enumerate(image_files)
        }
        for future in concurrent.futures.as_completed(futures):
            image_file, index = futures[future]
            try:
                # as_completed only yields finished futures, so result()
                # returns (or raises the task's error) immediately.
                result = future.result()
                results[index] = result
                # Swap whatever image extension the file has for ".md".
                stem = os.path.splitext(os.path.basename(image_file))[0]
                md_file_path = os.path.join(temp_markdown_folder, stem + '.md')
                with open(md_file_path, "w", encoding="utf-8") as f:
                    f.write(result)
            except Exception as e:
                print(f"处理 {image_file} 时出错: {e}")

    # Failed (None) and empty results are left out of the combined file.
    final_markdown = "\n".join(filter(None, results))
    with open(os.path.join(temp_markdown_folder, "final_markdown.md"), "w", encoding="utf-8") as f:
        f.write(final_markdown)
        
# Example invocation: convert the cropped page images to markdown.
temp_folder = r"D:\Joining\Joining-Agents0710_Light\TempPngFolder"
temp_markdown_folder = r"D:\Joining\Joining-Agents0710_Light\TempMarkdownFolder03"

# NOTE: process_images() removes and recreates this folder itself, so this
# pre-creation only matters if the call below is changed.
if not os.path.exists(temp_markdown_folder):
    os.makedirs(temp_markdown_folder)

# Runs with 30 worker threads instead of the function's default of 10.
process_images(temp_folder, temp_markdown_folder, worker_num=30)


In [None]:
import os
import re

def extract_page_and_box(filename):
    """Parse the page and box numbers out of a cropped-page markdown file name.

    The name must start with ``cropped_page_<page>_img_1_box_<box>.md``.

    Returns:
        ``(page, box)`` as ints, or ``(None, None)`` when the name does not
        match.
    """
    found = re.match(r'cropped_page_(\d+)_img_1_box_(\d+)\.md', filename)
    if found is None:
        return None, None
    page_text, box_text = found.groups()
    return int(page_text), int(box_text)

def merge_markdown_files(folder_path, output_filename):
    """Concatenate the per-box markdown files of a folder in page/box order.

    Only ``.md`` files whose names are recognised by ``extract_page_and_box``
    are used.  They are sorted by ``(page, box)`` and written to
    ``output_filename``, each followed by a blank line.
    """
    # Collect (file name, (page, box)) for every recognised markdown file.
    recognised = []
    for name in os.listdir(folder_path):
        if not name.endswith('.md'):
            continue
        info = extract_page_and_box(name)
        if info[0] is None:
            continue
        recognised.append((name, info))

    recognised.sort(key=lambda entry: (entry[1][0], entry[1][1]))

    # Read every file in order, appending the separator after each one.
    pieces = []
    for name, _ in recognised:
        with open(os.path.join(folder_path, name), 'r', encoding='utf-8') as source:
            pieces.append(source.read() + "\n\n")

    with open(output_filename, 'w', encoding='utf-8') as target:
        target.write("".join(pieces))

# Example usage: merge the per-box markdown files into a single document.
folder_path = r'D:\Joining\Joining-Agents0710_Light\TempMarkdownFolder03'
# Relative path, so the merged file is written to the current working directory.
output_filename = 'merged_markdown.md'
merge_markdown_files(folder_path, output_filename)

In [None]:

def replace_symbol_in_markdown(markdown_file_path, new_markdown_file_path, old_symbol, new_symbol):
    """
    Read a markdown file, replace a symbol and save the result to a new file.

    Errors are reported by printing a message; nothing is raised to the
    caller.  An empty ``old_symbol`` is rejected without touching any file,
    because ``str.replace('', new_symbol)`` would insert ``new_symbol``
    between every character of the document.

    Args:
        markdown_file_path (str): Path of the original markdown file.
        new_markdown_file_path (str): Path of the markdown file to write (may
            be the same as the original).
        old_symbol (str): Text to replace; must not be empty.
        new_symbol (str): Replacement text.
    """
    if old_symbol == '':
        print("处理文件时出错: old_symbol 不能为空")
        return

    try:
        with open(markdown_file_path, 'r', encoding='utf-8') as file:
            md_content = file.read()

        new_md_content = md_content.replace(old_symbol, new_symbol)

        # The source is fully read before this point, so writing back to the
        # same path is safe.
        with open(new_markdown_file_path, 'w', encoding='utf-8') as file:
            file.write(new_md_content)

        print(f"符号替换完成，新文件已保存到 {new_markdown_file_path}")
    except Exception as e:
        print(f"处理文件时出错: {e}")

# Example invocation: rewrites new_file.md in place.
# NOTE(review): old_symbol is an empty string here. Python's str.replace with
# an empty search string inserts the replacement between every character, so
# the intended symbol was probably lost (e.g. a non-printable character) —
# confirm the real symbol before running this.
replace_symbol_in_markdown('new_file.md', 'new_file.md', '', 'a')