# 初始化

In [None]:
import os
import time
import threading
from http.server import HTTPServer, SimpleHTTPRequestHandler
from local_packages import *
from dotenv import load_dotenv
from queue import Queue
import concurrent.futures
import random
import json
import openai
from concurrent.futures import ThreadPoolExecutor, as_completed

# Build the path of the .env file expected in the current working directory.
# NOTE(review): load_dotenv is imported above but never called in the visible
# code, so nothing here actually loads the file — confirm whether
# local_packages does it or whether a load_dotenv(dotenv_path) call is missing.
dotenv_path = os.path.join(os.getcwd(), '.env')

# Project root: the directory that contains the .env file.
project_root = os.path.dirname(dotenv_path)

# Backend selector passed to initialize_service().
service_type ='qwen'

def initialize_service(service_type):
    """Build the chat-service client selected by *service_type*.

    Args:
        service_type: One of 'zhipu' (or None), 'qwen', 'kimi', 'deepseek',
            'huida' or 'sensetime'.

    Returns:
        The service object constructed for the matching backend.

    Raises:
        ValueError: If *service_type* matches none of the known backends.
    """
    # Each entry pairs the accepted selectors with a lazy factory, so only the
    # class of the chosen backend is ever looked up.
    backends = (
        # Other GLM versions: 'glm-4', 'glm-4v'.
        (['zhipu', None], lambda: GLMService('glm-3-turbo')),
        (['qwen'], lambda: QwenService('long')),
        # Pricing note kept from the original:
        # '8k' 1M/12￥, '32k' 1M/24￥, '128k' 1M/60￥.
        (['kimi'], lambda: KimiService('32k')),
        (['deepseek'], lambda: DeepSeekService('chat')),
        (['huida'], lambda: HuidaService('gpt-4o')),
        # Other SenseChat versions: SenseChat-32K, SenseChat-128K,
        # SenseChat-Turbo, SenseChat-FunctionCall.
        (['sensetime'], lambda: SenseService(version='SenseChat')),
    )

    for selectors, build in backends:
        if service_type in selectors:
            return build()

    raise ValueError('未知的服务类型')

# Instantiate the backend chosen by service_type; the cells below share it.
service = initialize_service(service_type)

# JSON helper (presumably provided by the local_packages star import); the
# cells below call its read_json / write_json / parse_dict methods.
js=JSProcessor()

class ParseError(Exception):
    """Parsing failure that carries an error code next to its message.

    Attributes are set in ``__init__``: ``code`` is the caller-supplied code
    and ``message`` is the text also used as the exception's string form.
    """

    def __init__(self, code, message="解析失败"):
        super().__init__(message)
        self.message = message
        self.code = code


In [None]:
# Smoke test: send a single prompt to the selected backend.
service.ask_once("你谁")

# Wrap API

In [None]:
!pip install pyngrok

In [None]:
from flask import Flask, request, jsonify
from threading import Thread
from pyngrok import ngrok

# Flask application object that the /process route is registered on.
app = Flask(__name__)

# Example handler: reverse the string
def process_string(data_str):
    """Return *data_str* with its elements in reverse order."""
    backwards = slice(None, None, -1)
    return data_str[backwards]

@app.route('/process', methods=['POST'])
def process():
    """Handle POST /process: reverse the ``data_str`` field of a JSON body.

    Returns:
        A JSON response ``{"processed_str": ...}`` on success, or
        ``{"error": ...}`` with HTTP status 400 when the body is not a JSON
        object or ``data_str`` is missing, empty, or not a string.
    """
    # silent=True yields None for a missing or invalid JSON body instead of
    # aborting, so every malformed request gets the same JSON 400 answer.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        # Covers None, lists and scalars, which have no usable .get().
        return jsonify({"error": "No data_str provided"}), 400

    data_str = data.get('data_str')
    if not data_str:
        return jsonify({"error": "No data_str provided"}), 400
    if not isinstance(data_str, str):
        # Slicing a number would raise and surface as an HTTP 500.
        return jsonify({"error": "data_str must be a string"}), 400

    return jsonify({"processed_str": process_string(data_str)})

def run_app():
    """Start the Flask app via ``app.run()`` with its default arguments."""
    app.run()

# Run the Flask app on a background thread
thread = Thread(target=run_app)
thread.start()

# Expose the local server to the public internet through ngrok
# (port 5000 — presumably the port app.run() listens on by default; confirm).
public_url = ngrok.connect(5000)
print(f"Public URL: {public_url}")


# 建立工作目录

In [None]:
import os

# Root folder for this task's working files, and the path of a local model
# directory (model_path is not used by the visible code).
root_folder = 'Task0_0622_MCM_KG_analyser'
model_path = r'D:\Joining\Models\Text2Vec_base_zh'

# JSON file that holds the list of PDF paths to analyse
pdf_file_path=os.path.join(root_folder,'pdf_files.json')

# Actually create the working directory, so the confirmation printed below is
# true and later writes into root_folder cannot fail on a missing folder.
os.makedirs(root_folder, exist_ok=True)

print("所有目录和文件已成功创建。")


# 提取知识库的各级目录

In [None]:
import json
menu_structure_path=os.path.join(root_folder,"menu_structure.json")
# Read the knowledge-base menu structure from the JSON file
mcm_kg_base = js.read_json(menu_structure_path)

def extract_keys(data, level=0, parent_key=""):
    """Collect the key paths of a nested dict, grouped by nesting depth.

    Args:
        data: Nested dict to walk; only dict values are descended into.
        level: Depth number assigned to the keys of *data* itself.
        parent_key: Prefix put in front of the keys of *data*.

    Returns:
        A dict mapping each depth to the list of key paths found there. A
        path joins the keys from the top down with " > ". A depth whose dicts
        are all empty still gets an (empty) list.
    """
    paths_by_depth = {}
    depth = level
    # Dicts waiting to be scanned at the current depth, each with its prefix.
    frontier = [(data, parent_key)]

    while frontier:
        bucket = paths_by_depth.setdefault(depth, [])
        deeper = []
        for node, prefix in frontier:
            for name, child in node.items():
                path = prefix + name if prefix else name
                bucket.append(path)
                if isinstance(child, dict):
                    deeper.append((child, path + " > "))
        frontier = deeper
        depth += 1

    return paths_by_depth

# Collect the key paths under the "模型" branch, grouped by nesting level
keys_dict = extract_keys(mcm_kg_base["模型"])

# Print the levels in ascending order, one key path per line, with a blank
# line after each level
for level in sorted(keys_dict.keys()):
    print(f"Level {level}:")
    for key in keys_dict[level]:
        print(f"  {key}")
    print()


# 提取优秀论文目录

In [None]:
import os
import json

def list_pdf_files(directory):
    """Recursively collect the paths of all PDF files under *directory*.

    The extension check is case-insensitive, so ``paper.PDF`` is found as
    well as ``paper.pdf``.

    Args:
        directory: Root directory to walk.

    Returns:
        A list of file paths, each joined onto the directory it was found in,
        in ``os.walk`` order. Empty when nothing matches.
    """
    pdf_files = []
    for root, _dirs, files in os.walk(directory):
        for file_name in files:
            if file_name.lower().endswith(".pdf"):
                pdf_files.append(os.path.join(root, file_name))
    return pdf_files

def save_to_json(data, filename):
    """Write *data* to *filename* as UTF-8 JSON.

    Non-ASCII characters are kept as-is and the output is indented by four
    spaces.
    """
    dump_options = {"ensure_ascii": False, "indent": 4}
    with open(filename, mode='w', encoding='utf-8') as out:
        json.dump(data, out, **dump_options)

def main(directory="D:\\CUMCM2024\\CUMCM_2012_2023_Best_Papers", output_path='pdf_files.json'):
    """Index every PDF under *directory* and save the path list as JSON.

    Args:
        directory: Folder scanned recursively by list_pdf_files. Defaults to
            the best-papers folder that used to be hard-coded here.
        output_path: JSON file that receives the path list. Defaults to
            'pdf_files.json' in the current working directory, as before.
    """
    # NOTE(review): a later cell reads the list from
    # os.path.join(root_folder, 'pdf_files.json'), not from the working
    # directory — pass that path as output_path if both are meant to be the
    # same file.
    pdf_files = list_pdf_files(directory)
    save_to_json(pdf_files, output_path)
    print(f"PDF文件路径已保存到 {output_path}")

# Run the PDF indexing step when this code executes as the main module
if __name__ == "__main__":
    main()


# 文件解析

In [None]:
# Load the JSON content stored at pdf_file_path (the list of PDF paths)
file_path_list=js.read_json(pdf_file_path)

In [None]:
import concurrent.futures
import json


# Process a single PDF file, retrying transient failures with exponential backoff
def process_pdf(file_path, max_retries=3, retry_delay=10):
    """Ask the service for the knowledge points covered by one PDF paper.

    Args:
        file_path: Path of the PDF handed to ``service.chat_with_file``.
        max_retries: Maximum number of attempts (default 3, as before).
        retry_delay: Seconds to wait after the first retryable failure; the
            wait doubles after each further one (default 10, as before).

    Returns:
        The dict parsed from the answer, with an extra "文件目录" entry holding
        *file_path*; or None when a non-retryable error occurred or every
        attempt failed.
    """
    prompt_1 = f'''
    请你查看输入的这份pdf论文，给出它涉及到的主要学科知识点，用键值对的形式表示，其键必须为["知识点xx"]，值为知识点，每一条知识点可能有多个层级，各层之间用-连接，例如:是lv1-lv2-lv3的格式，以json dict的格式输出，例如:
    {{
        "知识点1":"控制论-控制技巧-根轨迹分析",
        "知识点2":"最优化方法-约束优化",
        "知识点3":"农林环材",
        ...
    }}
    请你注意，lv1必须出自以下这个列表的主题中:[运筹学,  最优化方法,  机器学习,  计算与模拟,  概率统计,  经济管理,  预测主题,  评价主题,  数据科学,  物理学,  生物医学,  物理化学,  信号理论,  农林环材,  控制论]
不许自由发挥
    '''
    # Substrings of an error message that mark a failure as worth retrying.
    retryable_markers = ('rate_limit_reached_error', 'File parsing in progress', 'RequestTimeOut')

    for attempt in range(max_retries):
        try:
            answer = service.chat_with_file(file_path, prompt_1)
            # Some failures come back inside the answer text instead of being
            # raised; turn them into an exception so they are retried below.
            if any(error in answer for error in ['invalid_parameter_error', 'RequestTimeOut']) and 'please try again later' in answer:
                raise Exception('File parsing in progress or request timed out, please try again later.')
            answer_dict = js.parse_dict(answer)
            answer_dict["文件目录"] = file_path
            print(answer_dict)
            return answer_dict
        except Exception as e:
            error_message = str(e)
            if not any(marker in error_message for marker in retryable_markers):
                print(f"Error processing {file_path}: {e}")
                return None
            # Only wait when another attempt follows; sleeping after the
            # last one would just delay the failure report.
            if attempt + 1 < max_retries:
                print(f"Rate limit reached or file parsing/request timeout for {file_path}. Retrying in {retry_delay} seconds.")
                time.sleep(retry_delay)
                retry_delay *= 2  # exponential backoff

    print(f"Failed to process {file_path} after {max_retries} attempts.")
    return None

# Main function
def main(pdf_file_path):
    """Run process_pdf over every path listed in *pdf_file_path*, in parallel.

    Args:
        pdf_file_path: File loaded with ``js.read_json``; its content is
            iterated as the list of PDF paths.

    Returns:
        The truthy results of process_pdf, in the order the jobs finished.
    """
    pdf_paths = js.read_json(pdf_file_path)
    collected = []

    with concurrent.futures.ThreadPoolExecutor(max_workers=40) as pool:
        pending = [pool.submit(process_pdf, pdf_path) for pdf_path in pdf_paths]
        for finished in concurrent.futures.as_completed(pending):
            outcome = finished.result()
            if not outcome:
                # process_pdf returns None for papers it could not handle.
                continue
            collected.append(outcome)
    return collected
# Analyse every listed PDF, then write the collected dicts to
# knowledge_dict_list.json inside root_folder and echo them
dict_list = main(pdf_file_path)
list_path=os.path.join(root_folder,"knowledge_dict_list.json")
js.write_json(dict_list,list_path)
print(dict_list)


# 文件内容重建

In [None]:
import json
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

def process_sub_dict(sub_dict, service, js, max_attempts=3, base_delay=10):
    """Extract the write-up of every modelling method listed for one paper.

    For each value whose key starts with "方法", the service is asked to quote
    the matching modelling process from the paper. The primary prompt asks for
    a JSON dict; if that attempt fails with a non-retryable error (for
    example the answer cannot be parsed), a fallback prompt is used and the
    raw answer is stored instead. Retryable failures (rate limit, file still
    being parsed, request timeout) are retried with exponential backoff and
    keep the prompt that was in use.

    Args:
        sub_dict: Dict with a "文件目录" entry (the paper's path) and any
            number of "方法..." entries naming the methods to extract.
        service: Object exposing ``chat_with_file(path, prompt)``.
        js: Object exposing ``parse_dict(text)``.
        max_attempts: Maximum number of service calls per method (default 3).
        base_delay: Seconds waited after the first retryable failure; doubled
            for each later one (default 10).

    Returns:
        A list with one dict per successfully processed method, each holding
        the method's content plus a "文件目录" entry.

    Raises:
        KeyError: If *sub_dict* has no "文件目录" entry.
    """
    json_file_path = sub_dict["文件目录"]
    # Only entries whose key starts with "方法" name a method to look up.
    parts = {key: value for key, value in sub_dict.items() if key.startswith("方法")}
    # 'RequestTimeOut' is the marker process_pdf retries on; it was missing
    # here, so a raised timeout was treated as a permanent error.
    transient_markers = ('rate_limit_reached_error', 'File parsing in progress', 'Request timed out', 'RequestTimeOut')
    results = []
    for part_value in parts.values():
        prompt_1 = f'''
        请你从这篇pdf论文中，找到这个数模方法{part_value}对应的完整建模与使用过程,对应解释并全部输出，其中的数学公式使用latex语法，要求你输出的格式是json dict格式：
        {{
            "{part_value}":"original_markdown_str..."            
        }}
        请original_markdown_str尽量调整得在一行，尽量不换行
        '''
        prompt_2 = f'''
        请你从这篇pdf论文中，找到这个数模方法{part_value}对应的完整建模与使用过程,对应解释并全部输出，不许添加任何无关内容，第一个字就必须是此方法在原文中的对应第一个字
        '''
        attempt = 0
        # Tracked separately from the attempt counter so that a transient
        # failure does not silently downgrade to the fallback prompt.
        use_fallback = False
        while attempt < max_attempts:
            try:
                prompt = prompt_2 if use_fallback else prompt_1
                answer = service.chat_with_file(json_file_path, prompt)

                # Some failures come back inside the answer text instead of
                # being raised; convert them so they are retried below.
                if any(error in answer for error in ['invalid_parameter_error', 'RequestTimeOut']) and 'please try again later' in answer:
                    raise Exception('File parsing in progress or request timed out, please try again later.')

                if use_fallback:
                    # Fallback prompt returns plain text: store it as-is.
                    answer_dict = {part_value: answer}
                else:
                    answer_dict = js.parse_dict(answer)
                answer_dict["文件目录"] = json_file_path
                print(json_file_path, part_value, answer_dict, attempt)
                results.append(answer_dict)
                break
            except Exception as e:
                if any(marker in str(e) for marker in transient_markers):
                    wait_time = (2 ** attempt) * base_delay  # exponential backoff
                    attempt += 1
                    if attempt < max_attempts:
                        print(f"处理 {json_file_path} 中的 {part_value} 时遇到速率限制、文件解析未完成或请求超时，等待 {wait_time} 秒后重试...")
                        time.sleep(wait_time)
                    else:
                        # No attempt left: report instead of sleeping for nothing.
                        print(f"处理 {json_file_path} 中的 {part_value} 时重试 {max_attempts} 次后仍失败，已放弃。")
                else:
                    print(f"处理 {json_file_path} 中的 {part_value} 时出错: {e}")
                    if not use_fallback and attempt + 1 < max_attempts:
                        print(f"尝试使用替代提示词和处理逻辑处理 {json_file_path} 中的 {part_value} ...")
                        use_fallback = True
                        attempt += 1
                    else:
                        break
    return results

def merge_dicts(dicts):
    """Group result dicts by their "文件目录" entry and merge each group.

    The input dicts are left untouched; the previous implementation popped
    "文件目录" out of them, silently altering the caller's data.

    Args:
        dicts: Iterable of dicts, each with a "文件目录" entry.

    Returns:
        A dict mapping every "文件目录" value to the union of the remaining
        entries of all dicts sharing it; later dicts win on duplicate keys.

    Raises:
        KeyError: If a dict has no "文件目录" entry.
    """
    merged = {}
    for d in dicts:
        file_path = d["文件目录"]
        fields = {key: value for key, value in d.items() if key != "文件目录"}
        merged.setdefault(file_path, {}).update(fields)
    return merged

def read_and_process_json(file_path, service, js):
    """Load the per-paper method dicts and extract every method in parallel.

    Args:
        file_path: File loaded with ``js.read_json``; iterated as a list of
            dicts that each carry a "文件目录" entry.
        service: Passed through to process_sub_dict.
        js: Provides ``read_json`` here and is passed to process_sub_dict.

    Returns:
        The output of merge_dicts over all per-method results.
    """
    entries = js.read_json(file_path)

    # Merge the sub-dicts that share the same "文件目录".
    # NOTE(review): update() lets a later entry overwrite equal keys of an
    # earlier one for the same paper — confirm such duplicates cannot occur.
    per_paper = {}
    for entry in entries:
        per_paper.setdefault(entry["文件目录"], {}).update(entry)

    collected = []
    with ThreadPoolExecutor(max_workers=40) as pool:
        pending = [
            pool.submit(process_sub_dict, paper, service, js)
            for paper in per_paper.values()
        ]
        for finished in as_completed(pending):
            try:
                collected.extend(finished.result())
            except Exception as e:
                # One failing paper must not abort the others.
                print(f"处理子字典时出错: {e}")

    return merge_dicts(collected)

# Path of the JSON file that lists the methods per paper
file_path = "D:\\Joining\\Joining-Agents0622\\Task0_0622_MCM_KG_analyser\\method_dict_list.json"

# Call the function and collect the merged results
merged_results = read_and_process_json(file_path, service, js)

# Print the final result
print(json.dumps(merged_results, ensure_ascii=False, indent=4))


In [None]:
# Persist the merged results.
# NOTE(review): unlike the other outputs, this target has no directory and no
# ".json" extension — confirm that is intended.
js.write_json(merged_results,'detailed_method_dict_list')