# 初始化

In [1]:
import os
import time
import threading
from http.server import HTTPServer, SimpleHTTPRequestHandler
from LLM_API import GLMService, SenseService, KimiService
from Json_Processor import JSProcessor
from dotenv import load_dotenv
from queue import Queue
import concurrent.futures
import random
import json
import openai
from concurrent.futures import ThreadPoolExecutor, as_completed

# Path of the .env file expected in the current working directory.
# NOTE(review): load_dotenv is imported above but is not called in the visible
# code — confirm the service classes load the environment themselves.
dotenv_path = os.path.join(os.getcwd(), '.env')

# Project root: the directory that contains the .env file.
project_root = os.path.dirname(dotenv_path)

# Backend selector consumed by initialize_service(): 'zhipu', 'kimi' or 'sensetime'.
service_type = 'kimi'

def initialize_service(service_type):
    """Build the LLM service client selected by ``service_type``.

    Args:
        service_type: 'zhipu' (or None) for GLMService, 'kimi' for
            KimiService, 'sensetime' for SenseService.

    Returns:
        The service instance created with the version hard-coded below.

    Raises:
        ValueError: if ``service_type`` is none of the known values.
    """
    if service_type in ('zhipu', None):
        # Versions listed by the author: 'glm-4', 'glm-4v', 'glm-3-turbo'
        return GLMService('glm-3-turbo')

    if service_type in ('kimi',):
        # Author's note: '8k' 1M/12￥, '32k' 1M/24￥, '128k' 1M/60￥
        # (presumably the price per 1M tokens — confirm)
        return KimiService('8k')

    if service_type in ('sensetime',):
        # Versions listed by the author: SenseChat, SenseChat-32K,
        # SenseChat-128K, SenseChat-Turbo, SenseChat-FunctionCall
        return SenseService(version='SenseChat')

    raise ValueError('未知的服务类型')

# Shared LLM client used by the request functions in this notebook.
service = initialize_service(service_type)

# Shared JSON helper (used below for read_json/write_json and for parsing model replies).
js=JSProcessor()

class ParseError(Exception):
    """Failure while requesting or parsing a model reply.

    Attributes:
        code: numeric error code chosen by the raiser (e.g. 1001, 429).
        message: human-readable description; also used as the exception text.
    """

    def __init__(self, code, message="解析失败"):
        super().__init__(message)
        self.message = message
        self.code = code


## 设置工作目录
期待输出的json能够在这个目录下遵循以下结构：
- /root_folder
    - Raw_Books.json
    - /step_1_books
        - /step_1_processed_books
            - lv1_lv2_lv3_results.json
        - /step_1_unprocessed_books
            - Remain_Books.json
            - /json_lists
                - lv1_unprocessed_books.json
    - /step_2_books
        - /step_2_processed_books
            - transformed_results.json
        - /step_2_mappings
            - mapping_dict.json
            - mapping_embedding_dict.json

### 设置地址变量：

In [2]:
# Root folder that holds every intermediate and final JSON file.
root_folder = 'Edu_KG'

# Step 1: parsed results, and the per-category lists of books still to parse.
step_1_processed_path = os.path.join(root_folder, 'step_1_books', 'step_1_processed_books')
step_1_unprocessed_path = os.path.join(root_folder, 'step_1_books', 'step_1_unprocessed_books', 'json_lists')
# Step 2: aggregated results and the key/embedding mapping files.
step_2_processed_path = os.path.join(root_folder, 'step_2_books', 'step_2_processed_books')
step_2_mapping_path = os.path.join(root_folder, 'step_2_books', 'step_2_mappings')

# Local path of the embedding model and tokenizer.
# Raw string: the backslashes are path separators, not escape sequences
# ('\J', '\M' and '\T' are invalid escapes and warn on recent Python versions).
model_path = r"D:\Joining\Models\Text2Vec_base_zh"
catalog_file_path = os.path.join(root_folder, 'Raw_Books.json')
step_1_remain_path = os.path.join(root_folder, 'step_1_books', 'step_1_unprocessed_books', 'Remain_Books.json')

# 书本目录处理为知识点

现在结果以增量方式写入文件，不会覆写已有内容。
已实现带重试机制的多线程处理。

In [None]:
from threading import Thread, Lock
from queue import Queue, Empty

import traceback
import urllib.error
#签名注释用
from typing import Generator

#原子任务处理函数
def parse_single_file(original_single_dict: dict, target_keys: list) -> dict:
    """Ask the LLM to turn one book's table of contents into knowledge points.

    Args:
        original_single_dict: One book record. The keys '书名' (title) and
            '目录' (table of contents) are read; each defaults to '未提供'.
        target_keys: Category path of the book. Only the last element is
            used, as the subject area named in the prompt.

    Returns:
        ``{book_name: parsed_list}`` on success, or an empty dict when the
        service returned an empty reply.

    Raises:
        ParseError: code 1001 when the reply cannot be parsed into a list;
            code 400 or 429 when the service error text contains
            'Error code: 400' / 'Error code: 429'; code 1000 for any other
            failure. A 400 error used to make this function return None,
            which violated the declared return type and crashed the caller
            when it merged the result; it is now reported like every other
            failure.
    """
    book_name = str(original_single_dict.get('书名', '未提供'))
    catalog = str(original_single_dict.get('目录', '未提供'))

    prompt = f'''
    以下内容是{target_keys[-1]}领域的书籍目录，书名{book_name}，目录内容为：{catalog}，我要求你输出一个python列表，其中的值是知识点，必须是如下结构的一个python列表，由中括号所表示：
    ['知识点1','知识点2','知识点3',...]
    注意：务必精炼，省略一切无关内容
    '''
    try:
        msg = service.ask_once(prompt)
        parsed = js.parse_list(msg) if msg else None
    except ParseError:
        # Already classified further down the stack; pass it through untouched.
        raise
    except Exception as e:
        # The service reports HTTP failures only through the exception text,
        # so the status code is recognised by substring.
        error_text = str(e)
        if 'Error code: 400' in error_text:
            raise ParseError(400, "发生400错误，跳过当前处理") from e
        if 'Error code: 429' in error_text:
            raise ParseError(429, "发生429错误，等待30秒后继续执行") from e
        raise ParseError(1000, f"发生未知错误{e}") from e

    if not msg:
        # Empty reply: nothing to parse and nothing to report.
        return {}

    if not parsed:
        # 1001 is the custom code for "reply could not be parsed".
        raise ParseError(1001, f"解析失败：书名 {book_name}")

    print(f"成功解析：{book_name}")
    return {book_name: parsed}
# 辅助函数：收集字典键的路径
def collect_key_paths(current_dict: dict, current_path: "list | None" = None) -> Generator[list, None, None]:
    """Recursively yield the key path to every non-dict value of a nested dict.

    Args:
        current_dict: The (sub)dictionary being walked.
        current_path: Keys leading to ``current_dict``. Defaults to a fresh
            empty list for each call (a shared ``[]`` default could be
            mutated by a caller that receives it back).

    Yields:
        One list ``[lv1, lv2, ..., leaf]`` per value that is not a dict.
        If ``current_dict`` itself is not a dict, ``current_path`` is yielded
        once. Empty sub-dictionaries produce no path.
    """
    if current_path is None:
        current_path = []

    if not isinstance(current_dict, dict):
        yield current_path
        return

    for key, value in current_dict.items():
        new_path = current_path + [key]
        if isinstance(value, dict):
            # Descend: only non-dict values terminate a path.
            yield from collect_key_paths(value, new_path)
        else:
            yield new_path

#—————————————————————————————————————————————————————————————————————————————————————
#多线程处理部分
#—————————————————————————————————————————————————————————————————————————————————————
#任务生成函数
def task_generator(original_dict: dict, target_categories: list):
    """Yield one parsing task per book found under the selected categories.

    Args:
        original_dict: Nested data shaped lv1 -> lv2 -> lv3 -> list of book dicts.
        target_categories: First-level keys to process; other first-level
            keys are skipped.

    Yields:
        ``(book_dict, [lv1_key, lv2_key, lv3_key])`` for every non-empty dict
        found in a non-empty list at the third level.
    """
    for lv1_key in original_dict:
        if lv1_key not in target_categories:
            continue
        lv1_branch = original_dict[lv1_key]
        for lv2_key in lv1_branch:
            lv2_branch = lv1_branch[lv2_key]
            for lv3_key in lv2_branch:
                books = lv2_branch[lv3_key]
                # Only non-empty lists hold books.
                if not (isinstance(books, list) and books):
                    continue
                for book in books:
                    # Skip anything that is not a non-empty dict.
                    if isinstance(book, dict) and book:
                        yield (book, [lv1_key, lv2_key, lv3_key])

#单线程工作函数
def worker(task_queue: Queue, result_dict: dict, lock: Lock):
    """Worker-thread loop: drain ``task_queue`` and merge parsed books into ``result_dict``.

    Args:
        task_queue: Queue of ``(book_dict, [lv1, lv2, lv3])`` tasks.
        result_dict: Shared nested result, filled as lv1 -> lv2 -> lv3 -> {book: [...]}.
        lock: Guards every write to ``result_dict``.

    Behaviour per task:
        * ParseError 1001 (unparseable reply): retried, at most two retries.
        * ParseError 429 (rate limit): sleeps 20 seconds and tries again;
          these waits do not count as retries.
        * Any other ParseError, or a ``None`` result: the task is skipped.
        * Any other exception: the traceback is printed and the task is skipped.

    ``task_done()`` is called for every task taken from the queue, so
    ``Queue.join()`` cannot hang on a failed or malformed task. The loop ends
    when the queue is empty.
    """
    while True:
        try:
            task = task_queue.get(block=False)
        except Empty:
            break

        try:
            # Unpack inside the guarded section: a malformed task must still
            # reach task_done() below.
            original_single_dict, target_keys = task
            retry_count = 0
            while retry_count <= 2:  # first attempt + at most two retries
                try:
                    parsed_result = parse_single_file(original_single_dict, target_keys)
                except ParseError as e:
                    if e.code == 1001:
                        retry_count += 1
                        if retry_count > 2:
                            print(f"解析错误: {e.message}, 已达最大重试次数")
                            break
                    elif e.code == 429:
                        time.sleep(20)  # back off, then try again
                    else:
                        print(f"解析错误: {e.message}, 跳过")
                        break
                    continue

                if parsed_result is None:
                    # Nothing to merge; unpacking None below would raise TypeError.
                    print("解析结果为空，跳过")
                    break

                with lock:
                    # Walk/create the nested levels, then merge at the last key.
                    current_dict = result_dict
                    for key in target_keys[:-1]:
                        current_dict = current_dict.setdefault(key, {})
                    current_dict[target_keys[-1]] = {**current_dict.get(target_keys[-1], {}), **parsed_result}
                break
        except Exception:
            print(f"未预料的错误: {traceback.format_exc()}")
        finally:
            task_queue.task_done()

#这个函数很需要扔到类里面
def _merge_nested(base: dict, incoming: dict) -> dict:
    """Recursively merge ``incoming`` into ``base`` in place and return ``base``."""
    for key, value in incoming.items():
        if isinstance(value, dict) and isinstance(base.get(key), dict):
            _merge_nested(base[key], value)
        else:
            base[key] = value
    return base


def save_to_file(folder_path, file_name, data):
    """Incrementally save ``data`` into ``folder_path/file_name`` as JSON.

    Existing file content is loaded and merged with ``data`` level by level,
    so results saved earlier under the same category keys are kept. A
    top-level-only merge would replace a whole category with the newest
    batch. When both sides hold a non-dict value for the same key, the new
    value wins.

    Args:
        folder_path: Target folder; created if the file does not exist yet.
        file_name: Name of the JSON file inside ``folder_path``.
        data: Dictionary to merge into the file.
    """
    file_path = os.path.join(folder_path, file_name)

    existing_data = {}
    if os.path.exists(file_path):
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                loaded = json.load(file)
        except json.JSONDecodeError:
            # Unreadable content is treated as "no previous data".
            loaded = None
        if isinstance(loaded, dict):
            existing_data = loaded
    elif folder_path:
        os.makedirs(folder_path, exist_ok=True)

    merged_data = _merge_nested(existing_data, data)

    with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(merged_data, file, ensure_ascii=False, indent=4)

#—————————————————————————————————————————————————————————————————————————————————————
#执行处理部分
#—————————————————————————————————————————————————————————————————————————————————————
#执行函数
def multi_thread_parse(original_dict: dict, target_categories: list, thread_number: int, folder_path: str, file_name: str = 'step_1_processed.json') -> dict:
    """Parse every book of the selected categories with a pool of worker threads.

    Args:
        original_dict: Nested source data (lv1 -> lv2 -> lv3 -> list of books).
        target_categories: First-level keys to process.
        thread_number: Number of worker threads; must be at least 1.
        folder_path: Folder the result file is saved into.
        file_name: Name of the result file (defaults to the name used so far).

    Returns:
        The nested result dictionary produced by this run. It is also merged
        into ``folder_path/file_name`` through ``save_to_file``.

    Raises:
        ValueError: if ``thread_number`` is below 1. With no worker thread the
            queue would never drain and ``join()`` would block forever.
    """
    if thread_number < 1:
        raise ValueError('thread_number 必须大于等于 1')

    task_queue = Queue()
    result_dict = {}
    lock = Lock()

    # Enqueue everything before starting the workers: a worker exits as soon
    # as it finds the queue empty.
    for task in task_generator(original_dict, target_categories):
        task_queue.put(task)

    threads = [
        Thread(target=worker, args=(task_queue, result_dict, lock))
        for _ in range(thread_number)
    ]
    for thread in threads:
        thread.start()

    # Wait until every task is marked done, then until every thread has exited.
    task_queue.join()
    for thread in threads:
        thread.join()

    save_to_file(folder_path, file_name, result_dict)

    return result_dict

# **主处理功能块1：**
- /Edu_KG
    - Raw_Books.json
    - /step_1_books
        - /step_1_processed_books
            - step_1_processed.json
        - /step_1_unprocessed_books
            - Remain_Books.json
            - /json_lists
                - lv1_unprocessed_books.json

# **执行功能块1_1**

In [None]:
# Load the books that still have to be parsed.
original_dict=js.read_json(step_1_remain_path)
# Every top-level key of that file is treated as a category to process.
target_categories=[]
for key, item in original_dict.items():
    target_categories.append(key)

# Parse with 50 worker threads; results are saved under step_1_processed_path.
multi_thread_parse(original_dict,target_categories,thread_number=50,folder_path=step_1_processed_path)

#  对比处理完成量

In [5]:
def simplify_original_dict(d, simplified=None):
    """Recursively reduce the source structure to book titles only.

    Lists are assumed to contain dicts with a '书名' key and are replaced by
    ``[{'书名': title}, ...]``; nested dicts are simplified recursively;
    values of any other type are dropped.

    Args:
        d: Nested source dictionary.
        simplified: Optional dictionary to fill; a new one is created if omitted.

    Returns:
        The filled dictionary.
    """
    out = {} if simplified is None else simplified
    for name, node in d.items():
        if isinstance(node, dict):
            out[name] = simplify_original_dict(node, {})
        elif isinstance(node, list):
            titles = []
            for book in node:
                titles.append({'书名': book['书名']})
            out[name] = titles
    return out
def simplify_processed_dict(d, simplified=None):
    """Recursively reduce the processed structure to its keys.

    A dict whose values are all lists is taken to be the lv3 level and is
    replaced by ``{key: None, ...}``. Any other dict is simplified
    recursively. Non-dict values are dropped.

    Args:
        d: Nested processed dictionary.
        simplified: Optional dictionary to fill; a new one is created if omitted.

    Returns:
        The filled dictionary.
    """
    out = {} if simplified is None else simplified
    for name, node in d.items():
        if not isinstance(node, dict):
            continue
        has_non_list = any(not isinstance(child, list) for child in node.values())
        if has_non_list:
            out[name] = simplify_processed_dict(node, {})
        else:
            out[name] = dict.fromkeys(node)
    return out



In [19]:
def _count_books(node):
    """Return the number of list items found anywhere below ``node``."""
    if isinstance(node, list):
        return len(node)
    if isinstance(node, dict):
        return sum(_count_books(child) for child in node.values())
    return 0


def subtract_dicts(original_dict, processed_dict, count_dict=None):
    """Return the part of ``original_dict`` that is absent from ``processed_dict``.

    Args:
        original_dict: Nested source data whose leaves are lists of book dicts.
        processed_dict: Nested processed data; at the level matching a book
            list it maps book titles to their results.
        count_dict: Optional counters with keys 'original', 'processed' and
            'remaining', updated in place. 'original' always equals
            'processed' + 'remaining'.

    Returns:
        A dictionary with the same nesting that holds only the books still to
        be processed. Lists that become empty are omitted.

    Notes:
        * Books under a branch that is entirely missing from
          ``processed_dict`` are counted as remaining at any depth; they
          used to be left out of the statistics.
        * A book without a '书名' key is counted as processed and dropped.
    """
    if count_dict is None:
        count_dict = {'original': 0, 'processed': 0, 'remaining': 0}

    result_dict = {}
    for key, value in original_dict.items():
        if key not in processed_dict:
            # Whole branch untouched: keep it and count every book in it.
            result_dict[key] = value
            untouched = _count_books(value)
            count_dict['original'] += untouched
            count_dict['remaining'] += untouched
        elif isinstance(value, dict):
            sub_processed = processed_dict.get(key)
            if not isinstance(sub_processed, dict):
                # e.g. None in a simplified structure: nothing processed below.
                sub_processed = {}
            result_dict[key] = subtract_dicts(value, sub_processed, count_dict)
        elif isinstance(value, list) and all(isinstance(item, dict) for item in value):
            processed_books = processed_dict.get(key) or {}
            result_list = []
            for book in value:
                count_dict['original'] += 1
                if "书名" in book and book["书名"] not in processed_books:
                    result_list.append(book)
                    count_dict['remaining'] += 1
                else:
                    count_dict['processed'] += 1
            if result_list:
                result_dict[key] = result_list

    return result_dict

def find_unprocessed_books(original_json_path, step_1_processed_path, export_json_folder, top_level_keys):
    """Compute per-category progress and export the books that remain.

    Args:
        original_json_path: JSON file with the source books.
        step_1_processed_path: JSON file with the processed results.
        export_json_folder: Folder receiving one
            ``<category>_unprocessed_books.json`` file per category.
        top_level_keys: Categories to inspect; keys missing from the source
            file are ignored.

    Returns:
        ``{category: {...}}`` with the processed count, the remaining count,
        their total and the processed ratio (0 when the total is 0).
    """
    with open(original_json_path, 'r', encoding='utf-8') as src:
        original_data = json.load(src)

    with open(step_1_processed_path, 'r', encoding='utf-8') as src:
        processed_data = json.load(src)

    results = {}
    for category in top_level_keys:
        if category not in original_data:
            continue

        counts = {'original': 0, 'processed': 0, 'remaining': 0}
        leftover = subtract_dicts(original_data[category], processed_data.get(category, {}), counts)

        done = counts['processed']
        pending = counts['remaining']
        total = done + pending
        results[category] = {
            "已处理的书籍数量": done,
            "未处理的书籍数量": pending,
            "总书籍数量": total,
            "处理比例": done / total if total else 0,
        }

        target = os.path.join(export_json_folder, f"{category}_unprocessed_books.json")
        with open(target, 'w', encoding='utf-8') as dst:
            json.dump({category: leftover}, dst, ensure_ascii=False, indent=4)

    return results


In [20]:
# Compare the remaining-books file against the processed results.
original_json_path=step_1_remain_path
data=js.read_json(original_json_path)
# Collect every top-level key of the remaining-books file.
data_list=[]
for k,v in data.items():
    data_list.append(k)
# Returns the per-category statistics and writes one "<key>_unprocessed_books.json" per key.
find_unprocessed_books(original_json_path,os.path.join(step_1_processed_path,'step_1_processed.json'),export_json_folder=step_1_unprocessed_path,top_level_keys=data_list)

{'工商管理类': {'已处理的书籍数量': 69,
  '未处理的书籍数量': 9,
  '总书籍数量': 78,
  '处理比例': 0.8846153846153846}}

In [None]:
def process_data_and_export_excel(catalog_file_path, step_1_processed_path, step_1_unprocessed_path, step_1_remain_path):
    """Refresh the remaining-books file and export progress statistics to Excel.

    Steps:
        1. Compare the catalog with the processed results
           (``find_unprocessed_books``), which writes one JSON file per
           category into ``step_1_unprocessed_path``.
        2. Merge every ``.json`` file of that folder into ``step_1_remain_path``.
        3. Write the per-category statistics to '处理情况.xlsx' in the
           current working directory.

    Args:
        catalog_file_path: JSON file with the full book catalog.
        step_1_processed_path: Processed-results JSON file, or the folder
            that contains 'step_1_processed.json'.
        step_1_unprocessed_path: Folder for the per-category remaining lists.
        step_1_remain_path: Output JSON file with all remaining books.
    """
    # pandas is not imported at file level, so import it where it is needed.
    import pandas as pd

    # The notebook passes the results folder here, while the comparison opens
    # a file; accept both forms.
    processed_file = step_1_processed_path
    if os.path.isdir(processed_file):
        processed_file = os.path.join(processed_file, 'step_1_processed.json')

    lv1_key_names = list(js.read_json(catalog_file_path))

    unprocessed_data = find_unprocessed_books(catalog_file_path, processed_file, step_1_unprocessed_path, lv1_key_names)

    # NOTE(review): every .json file in the folder is merged, including files
    # left over from earlier runs — confirm that this is intended.
    all_data = {}
    for filename in os.listdir(step_1_unprocessed_path):
        if not filename.endswith('.json'):
            continue
        filepath = os.path.join(step_1_unprocessed_path, filename)
        with open(filepath, 'r', encoding='utf-8') as file:
            content = json.load(file)
        for key, value in content.items():
            if key not in all_data:
                all_data[key] = value
            elif isinstance(all_data[key], dict) and isinstance(value, dict):
                # The exported files hold dicts per category; dicts have no extend().
                all_data[key].update(value)
            else:
                all_data[key].extend(value)

    with open(step_1_remain_path, 'w', encoding='utf-8') as file:
        json.dump(all_data, file, ensure_ascii=False, indent=4)

    # One row per category, columns in the original order.
    stat_columns = ['已处理的书籍数量', '未处理的书籍数量', '总书籍数量', '处理比例']
    table = {'类别': list(unprocessed_data)}
    for column in stat_columns:
        table[column] = [info[column] for info in unprocessed_data.values()]

    df = pd.DataFrame(table)

    excel_path = '处理情况.xlsx'
    df.to_excel(excel_path, index=False)

    print(f"数据已保存到 {excel_path}")


## **辅助处理：功能块1_2**

In [None]:
# Recompute the progress statistics, rewrite the remaining-books file and export the Excel summary.
# NOTE(review): step_1_processed_path is a folder here, while find_unprocessed_books
# opens its second argument as a JSON file — confirm the intended argument.
process_data_and_export_excel(catalog_file_path, step_1_processed_path, step_1_unprocessed_path, step_1_remain_path)

# 知识点第二层聚合

In [None]:
# Shared state for the step-2 functions below: lock, task queue and merged result.
lock = threading.Lock()
tasks_queue = Queue()  # queue of segments waiting to be picked up
merged_results={}

#单片任务执行
def transform_function(input_dict, NotSuccess_count=0) -> "tuple | None":
    """Ask the LLM to merge and cluster the knowledge points of one segment.

    Args:
        input_dict: One segment. Its '学科名' entry holds the source file
            name, which must split on '_' into exactly four parts
            (lv1, lv2, lv3 and a trailing part that is ignored).
        NotSuccess_count: Number of unparseable replies already seen; the
            segment is given up once this reaches 2.

    Returns:
        ``(result_dict, lv1, lv2, lv3)`` on success, otherwise ``None``
        (400 error, any unexpected error, or two unparseable replies).
        The previous ``-> dict`` annotation did not match either case.

    Raises:
        ValueError: if the file name does not split into exactly four parts.
    """
    file_name = input_dict['学科名']
    lv1, lv2, lv3, _ = file_name.split('_')
    pre_prompt = f'这是{file_name}一些教材的知识点：{input_dict}我希望你帮我融合以上知识点列表，对于属于此学科的知识点，相同的部分合并精炼，不同的部分互相补充，并自行聚类总结知识点，对于无关内容直接摒弃，输出的结果最好是如下结构：'
    pro_prompt = f'恰当地设置聚类标准，使得一个类别下的内容相似度尽可能高，同时一个类别下的内容数量尽可能少。并且，这些类别确实能够组织起这个学科的知识体系。请注意，只要两层结构'
    prompt = js.generate_prompt(level=3, pre_prompt=pre_prompt, pro_prompt=pro_prompt, words='知识点')

    while True:
        try:
            response = service.ask_once(prompt)
            result = js.parse_dict(response)
        except Exception as e:
            print(f"发生异常: {e}")
            if 'Error code: 400' in str(e):
                # Bad request: retrying the same prompt cannot succeed.
                print("发生400错误，跳过当前处理")
                return None
            if 'Error code: 429' in str(e):
                # Rate limited: wait, then send the same prompt again.
                print("发生429错误，等待30秒后继续执行")
                time.sleep(30)
                continue
            return None

        if result:
            print(type(result), result)
            return result, lv1, lv2, lv3

        print('无结果')
        NotSuccess_count += 1
        if NotSuccess_count >= 2:
            print("已经连续2次未成功解析，跳过当前处理")
            return None

#切片
def segment_dict(input_dict, file_name, window_length=1800):
    """Split ``input_dict`` into smaller dicts of bounded textual size.

    Every segment starts with ``{'学科名': file_name}`` followed by a run of
    consecutive items from ``input_dict``. An item's size is estimated as
    ``len(str({key: value}))``.

    Args:
        input_dict: Dictionary to split.
        file_name: Value stored under '学科名' in every segment.
        window_length: Maximum accumulated item size per segment. An item
            larger than this gets a segment of its own.

    Returns:
        List of segment dictionaries, in input order. Header-only segments
        are never emitted, so an empty ``input_dict`` yields ``[]``.
    """
    segments = []
    current_segment = {'学科名': file_name}
    current_length = 0  # accumulated size of the items in current_segment

    for key, value in input_dict.items():
        item = {key: value}
        item_length = len(str(item))

        if item_length > window_length:
            # Flush what has been collected, then emit the oversized item alone.
            if current_length:
                segments.append(current_segment)
            oversized = {'学科名': file_name}
            oversized.update(item)
            segments.append(oversized)
            # Start from a fresh dict: the emitted segment must not be
            # extended by later items or appended a second time.
            current_segment = {'学科名': file_name}
            current_length = 0
            continue

        if current_length + item_length > window_length:
            segments.append(current_segment)
            current_segment = {'学科名': file_name}
            current_length = 0

        current_segment.update(item)
        current_length += item_length

    if current_length:
        segments.append(current_segment)

    return segments

#输出格式处理函数
def convert_sets_to_lists(data):
    """Replace every set found in nested dicts and lists with a list.

    Dicts and lists are updated in place and returned; a set passed directly
    is returned as a new list; any other value is returned unchanged.
    """
    if isinstance(data, set):
        return list(data)

    if isinstance(data, dict):
        # Reassigning existing keys keeps the dict size stable while iterating.
        for key in data:
            data[key] = convert_sets_to_lists(data[key])
    elif isinstance(data, list):
        # Slice assignment keeps the same list object.
        data[:] = [convert_sets_to_lists(element) for element in data]

    return data

def process_task(task):
    """Transform one segment and merge its clusters into ``merged_results``.

    ``transform_function`` returns ``None`` when a segment is skipped; such
    segments are reported and ignored instead of failing on tuple unpacking
    (an error the thread pool would otherwise swallow silently).

    Merging happens under the module-level ``lock``. New cluster keys are
    stored as they are; for an existing key, list values are extended and
    dict values are updated. Other type combinations are reported and skipped.

    Args:
        task: Segment dictionary produced by ``segment_dict``.
    """
    outcome = transform_function(task)
    if outcome is None:
        print(f"任务未产生结果，已跳过: {task.get('学科名')}")
        return

    result, lv1, lv2, lv3 = outcome

    with lock:
        # Create the nested levels on demand.
        bucket = merged_results.setdefault(lv1, {}).setdefault(lv2, {}).setdefault(lv3, {})

        for key, value in result.items():
            if key not in bucket:
                bucket[key] = value
                continue
            existing = bucket[key]
            if isinstance(existing, list) and isinstance(value, (list, tuple, set)):
                existing.extend(value)
            elif isinstance(existing, dict) and isinstance(value, dict):
                existing.update(value)
            else:
                print(f"无法合并键 {key}: 类型不一致，已跳过")

def process_file_task(file_name, input_directory):
    """Load one JSON file, split it into segments and queue every segment.

    Args:
        file_name: Name of the file inside ``input_directory``; also stored
            in each segment by ``segment_dict``.
        input_directory: Folder containing the file.
    """
    full_file_path = os.path.join(input_directory, file_name)
    print(f"正在处理文件: {full_file_path}")

    with open(full_file_path, 'r', encoding='utf-8') as handle:
        content = json.load(handle)

    # Each segment becomes one task on the shared queue.
    for piece in segment_dict(content, file_name):
        tasks_queue.put(piece)

def process_all_json_files(input_directory, output_directory, num_threads=4):
    """Run step 2 on every '*_results.json' file and save the merged result.

    Args:
        input_directory: Folder scanned for files ending in '_results.json'.
        output_directory: Folder receiving 'transformed_results.json';
            created if missing.
        num_threads: Size of the thread pool.

    Side effects:
        Fills the module-level ``tasks_queue`` and ``merged_results``.
        Failures inside individual tasks are printed rather than dropped.
    """
    # Queue the segments of every matching file.
    for file_name in os.listdir(input_directory):
        if file_name.endswith('_results.json'):
            process_file_task(file_name, input_directory)

    futures = []
    # Leaving the with-block waits for all submitted tasks, so no separate
    # shutdown call is needed.
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
        # Only this thread consumes the queue here, so empty() is reliable.
        while not tasks_queue.empty():
            task = tasks_queue.get_nowait()
            futures.append(executor.submit(process_task, task))

    # An exception raised inside a task is stored on its future; surface it.
    for future in futures:
        error = future.exception()
        if error is not None:
            print(f"任务处理失败: {error!r}")

    final_result = convert_sets_to_lists(merged_results)

    if output_directory:
        os.makedirs(output_directory, exist_ok=True)
    output_file_path = os.path.join(output_directory, 'transformed_results.json')
    with open(output_file_path, 'w', encoding='utf-8') as json_file:
        json.dump(final_result, json_file, ensure_ascii=False, indent=4)


# **主处理功能块2：**

In [None]:
# Input folder, output folder and thread count for step 2.

input_directory = step_1_processed_path
output_directory = step_2_processed_path
num_threads = 50

process_all_json_files(input_directory, output_directory, num_threads)

# 期待对数据结构做如下处理：
- 统一最末端为{key:null}
- 建立key_path_mapping
- 添加embedding_mapping并寄存

In [None]:
# Path of the step-2 result file ('transformed_results.json').
step_2_processed_json_path=os.path.join(step_2_processed_path,'transformed_results.json')

# 格式统一化

In [None]:
import json

def convert_list_items_to_dict(data):
    """Turn every list value of a nested dict into ``{item: None, ...}``.

    The structure is modified in place and returned. Nested dicts are
    processed recursively. A ``data`` that is not a dict is returned
    unchanged. List items must be hashable, since they become keys.
    """
    if not isinstance(data, dict):
        return data

    for key in data:
        child = data[key]
        if isinstance(child, list):
            # Each list item becomes a key with a null value.
            data[key] = dict.fromkeys(child)
        elif isinstance(child, dict):
            convert_list_items_to_dict(child)

    return data

def process_json_file(input_file_path, output_file_path):
    """Normalise a JSON file so that every list becomes ``{item: null, ...}``.

    The input is read and converted before the output is opened, so both
    paths may point at the same file.

    Args:
        input_file_path: JSON file to read.
        output_file_path: JSON file to write.
    """
    with open(input_file_path, 'r', encoding='utf-8') as source:
        payload = json.load(source)

    # Convert first: opening the output truncates it.
    normalised = convert_list_items_to_dict(payload)

    with open(output_file_path, 'w', encoding='utf-8') as target:
        json.dump(normalised, target, ensure_ascii=False, indent=4)


## **执行块3_1**

In [None]:
# Normalise the step-2 result file in place (input and output are the same path).
process_json_file(step_2_processed_json_path,step_2_processed_json_path)

# 建立键和路径映射
mapping_dict=  
{  
  key_path: {  
    'key_name': key,  
    'key_path': new_prefix.strip('_'),  
    'depth': depth  
  },  
  ...  
}

In [None]:
def build_key_path_mapping(data, prefix='', depth=0, folder_path=''):
    """Map every non-leaf key of a nested dict to its name, path and depth.

    A dict that contains no dict value is treated as the deepest level and
    contributes no entries. Paths are the keys joined with '_'. Dicts found
    inside list values are visited with the list index added to the path.

    Args:
        data: Nested dictionary to walk.
        prefix: Path of the parent key ('' at the top level).
        depth: Depth of the keys of ``data`` (0 at the top level).
        folder_path: If given, the top-level call writes the mapping to
            ``folder_path/mapping_dict.json``.

    Returns:
        ``{key_path: {'key_name', 'key_path', 'depth'}}``.
    """
    mapping_dict = {}

    if not isinstance(data, dict):
        return mapping_dict

    # Deepest level reached: no dict value means nothing to record.
    if not any(isinstance(child, dict) for child in data.values()):
        return mapping_dict

    for key, child in data.items():
        path = f"{prefix}_{key}" if prefix else key
        clean_path = path.strip('_')
        mapping_dict[clean_path] = {
            'key_name': key,
            'key_path': clean_path,
            'depth': depth
        }

        # Collect the nested dicts to visit, with the prefix each one gets.
        if isinstance(child, dict):
            nested = [(child, path)]
        elif isinstance(child, list):
            nested = [
                (element, f"{path}_{index}")
                for index, element in enumerate(child)
                if isinstance(element, dict)
            ]
        else:
            nested = []

        for sub_dict, sub_prefix in nested:
            mapping_dict.update(
                build_key_path_mapping(sub_dict, prefix=sub_prefix, depth=depth + 1, folder_path=folder_path)
            )

    # Only the outermost call writes the file.
    if folder_path and depth == 0:
        file_path = os.path.join(folder_path, 'mapping_dict.json')
        with open(file_path, 'w', encoding='utf-8') as file:
            json.dump(mapping_dict, file, ensure_ascii=False, indent=4)

    return mapping_dict

# **执行块3_2**

In [None]:
# Build the key/path mapping from the normalised step-2 results;
# the mapping is also written to mapping_dict.json inside folder_path.
data=js.read_json(step_2_processed_json_path)
folder_path=step_2_mapping_path
mapping_dict=build_key_path_mapping(data,folder_path=folder_path)

# 建立包含embedding的映射

In [None]:
import json
import time
from transformers import AutoModel, AutoTokenizer
import torch

def embed_and_export_dict_batch(given_dict, model_path, json_file_path, batch_size=32, stats_interval=5):
    """Add an embedding to every entry of ``given_dict`` and dump the result as JSON.

    Each entry's 'key_name' is encoded with the model loaded from
    ``model_path``. The embedding is the mean of the token vectors weighted
    by the attention mask, so padding added to shorter texts in a batch does
    not affect the result. (A plain mean over the sequence axis makes a
    text's embedding depend on what else is in its batch.)

    Args:
        given_dict: ``{key_path: {'key_name': str, ...}}``; each entry gets
            an 'embedding' list added in place.
        model_path: Local path of the model and tokenizer.
        json_file_path: Output JSON file, written once at the end.
        batch_size: Number of key names encoded per forward pass.
        stats_interval: Print throughput every this many batches.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModel.from_pretrained(model_path)
    model.eval()  # inference mode

    start_time = time.time()

    items = list(given_dict.items())
    total_keys = len(items)
    batches_processed = 0

    for i in range(0, total_keys, batch_size):
        batch_items = items[i:i+batch_size]
        batch_key_names = [entry["key_name"] for _, entry in batch_items]
        inputs = tokenizer(batch_key_names, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)

        # Masked mean pooling. Assumes last_hidden_state is
        # (batch, sequence, hidden) and attention_mask is (batch, sequence)
        # with 1 for real tokens — the usual layout for these model outputs.
        hidden = outputs.last_hidden_state
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1e-9)  # avoid division by zero
        embeddings = ((hidden * mask).sum(dim=1) / token_counts).detach().cpu().numpy()

        for (key, _), vector in zip(batch_items, embeddings):
            given_dict[key]["embedding"] = vector.tolist()

        batches_processed += 1

        # Report throughput periodically and after the last batch.
        if batches_processed % stats_interval == 0 or (i + batch_size) >= total_keys:
            elapsed_time = max(time.time() - start_time, 1e-9)
            keys_processed = min(batches_processed * batch_size, total_keys)
            print(f"已处理 {keys_processed}/{total_keys} 个key_name，耗时 {elapsed_time:.2f}秒，速度：{keys_processed / elapsed_time:.2f}个key_name/秒")

    with open(json_file_path, 'w', encoding='utf-8') as file:
        json.dump(given_dict, file, ensure_ascii=False, indent=4)

    print("完成，已将更新后的字典导出到指定的JSON文件中。")


# **执行块3_3**
在这里我们得到包含着embedding的键

In [None]:
# Embed every key name of the mapping and export it to mapping_embedding_dict.json.
given_dict=mapping_dict
json_file_path=os.path.join(step_2_mapping_path,'mapping_embedding_dict.json')
embed_and_export_dict_batch(given_dict,model_path,json_file_path,batch_size=64)

# 逐层比对融合
还回末端字段

In [None]:
def add_null_values_back(original_dict, given_dict, current_path=None, export_folder=None):
    """Attach the deepest-level keys of ``original_dict`` to the flat mapping.

    For every dict in ``original_dict`` whose values are all ``None``, the
    entry of ``given_dict`` stored under the '_'-joined path receives a
    'value' field ``{sub_key: None, ...}``. Paths without an entry in
    ``given_dict`` are ignored.

    Args:
        original_dict: Nested dictionary with ``{key: None}`` leaves.
        given_dict: Flat mapping ``{key_path: info}``; updated in place.
        current_path: Keys leading to ``original_dict``; defaults to a fresh
            empty list (no mutable default shared between calls).
        export_folder: If given, the updated mapping is written to
            ``export_folder/mapping_with_embedding_value.json``. Recursive
            calls never export.
    """
    if current_path is None:
        current_path = []

    for key, value in original_dict.items():
        new_path = current_path + [key]

        if not isinstance(value, dict):
            continue

        if all(v is None for v in value.values()):
            # Deepest dict: hand its keys to the matching mapping entry.
            given_dict_item = given_dict.get("_".join(new_path))
            if given_dict_item is not None:
                given_dict_item['value'] = {sub_key: None for sub_key in value}
        else:
            add_null_values_back(value, given_dict, new_path, export_folder=None)

    if export_folder:
        export_file_path = os.path.join(export_folder, "mapping_with_embedding_value.json")
        with open(export_file_path, 'w', encoding='utf-8') as f:
            json.dump(given_dict, f, ensure_ascii=False, indent=4)

### **执行块3_4** 存入指定文件'mapping_with_embedding_value.json'

In [None]:
# Reload the step-2 results and the mapping with embeddings from disk.
step_2_processed_json_path=os.path.join(step_2_processed_path,'transformed_results.json')
json_file_path=os.path.join(step_2_mapping_path,'mapping_embedding_dict.json')
original_dict=js.read_json(step_2_processed_json_path)
given_dict=js.read_json(json_file_path)
export_folder=step_2_mapping_path
# Attach the leaf keys and export the result into export_folder.
add_null_values_back(original_dict, given_dict,export_folder=export_folder)

# 逐层深入，迭代构建并聚合

In [None]:
import numpy as np

def cosine_similarity_matrix(vectors):
    """Return the pairwise cosine-similarity matrix of the row vectors.

    Rows with zero norm are left as zero vectors (their norm is replaced by 1
    before dividing), so their similarity to every row is 0.
    """
    lengths = np.linalg.norm(vectors, axis=1, keepdims=True)
    safe_lengths = np.where(lengths == 0, 1, lengths)  # avoid division by zero
    unit_rows = vectors / safe_lengths
    return np.dot(unit_rows, unit_rows.T)

def build_mapping_relation(data, depth, similarity_threshold=0.8):
    """Find, among keys of one depth, which keys should absorb which others.

    Keys at ``depth`` are grouped by the first three '_'-separated parts of
    their path. Within a group, a key maps to every other key whose embedding
    has cosine similarity above ``similarity_threshold``, unless the key
    itself was already absorbed by an earlier one.

    NOTE(review): the group prefix always uses three path parts, whatever
    ``depth`` is — confirm this is intended for depths other than 3.

    Args:
        data: Flat mapping ``{key_path: {'depth': int, 'embedding': [...]}}``.
        depth: Depth of the keys to compare.
        similarity_threshold: Strict lower bound for "similar".

    Returns:
        ``{target_key: [similar_key, ...]}``.
    """
    # Group the keys of the requested depth.
    groups = {}
    for key, item in data.items():
        if item['depth'] != depth:
            continue
        group_name = "_".join(key.split("_")[:3])
        if group_name not in groups:
            groups[group_name] = []
        groups[group_name].append(key)

    mapping_relation = {}

    for keys in groups.values():
        if len(keys) < 2:  # nothing to compare against
            continue

        vectors = np.array([data[member]['embedding'] for member in keys])
        sim_matrix = cosine_similarity_matrix(vectors)

        for i, key in enumerate(keys):
            # Every other key of the group above the threshold, in group order.
            similar_keys = [
                keys[j]
                for j in range(len(keys))
                if j != i and sim_matrix[i][j] > similarity_threshold
            ]
            if not similar_keys:
                continue

            # A key already absorbed by another one does not become a target.
            already_absorbed = any(key in members for members in mapping_relation.values())
            if not already_absorbed:
                mapping_relation[key] = similar_keys

    return mapping_relation


In [None]:
def remove_embeddings(data,depth):
    """Drop the 'embedding' field of every entry shallower than ``depth``.

    Entries are modified in place; nothing is returned.
    """
    for item in data.values():
        if item['depth'] < depth:
            item.pop('embedding', None)

def merge_dicts(data, mapping_relation):
    """Merge the 'value' of every source entry into its target entry.

    Args:
        data: Flat mapping ``{key_path: info}``; target entries are updated
            in place.
        mapping_relation: ``{target_key: [source_key, ...]}``.

    A target without a 'value' dict receives a copy of the source's value.
    Sharing the source's dict object would let later merges into the target
    silently change the source, and any other target that took the same
    source. Missing or empty entries are skipped.
    """
    for target_key, source_keys in mapping_relation.items():
        target_item = data.get(target_key)
        if not target_item:
            continue
        for source_key in source_keys:
            source_item = data.get(source_key)
            if not source_item:
                continue
            source_value = source_item.get('value') or {}
            if isinstance(target_item.get('value'), dict):
                target_item['value'].update(source_value)
            else:
                target_item['value'] = dict(source_value)

def update_keys(data, mapping_relation):
    """Return a copy of ``data`` without the entries merged into a target.

    Args:
        data: Flat mapping ``{key_path: info}``.
        mapping_relation: ``{target_key: [source_key, ...]}``.

    Returns:
        A new dictionary. Entries that are not sources are kept under their
        own key. A source whose target exists in ``data`` is dropped, since
        its content was already merged into the target; storing it under the
        target's key would overwrite the merged entry. A source whose target
        is absent from ``data`` is kept under the target's key.
    """
    # Reverse lookup: source key -> target key.
    replacement_map = {
        source_key: target_key
        for target_key, source_keys in mapping_relation.items()
        for source_key in source_keys
    }

    new_data = {}
    for key, item in data.items():
        if key not in replacement_map:
            new_data[key] = item
            continue
        target_key = replacement_map[key]
        if target_key not in data:
            new_data.setdefault(target_key, item)
    return new_data

In [None]:
import numpy as np

def process_data(data, start_depth=3, similarity_threshold=0.8):
    """Merge similar keys level by level, from ``start_depth`` to the deepest level.

    For each depth: build the similarity mapping, drop the embeddings of
    shallower entries, merge the values of similar entries, then rebuild the
    key set. All remaining embeddings are removed at the end.

    Args:
        data: Flat mapping ``{key_path: {'depth': int, 'embedding': [...], ...}}``.
        start_depth: First depth to aggregate.
        similarity_threshold: Cosine-similarity threshold for merging.

    Returns:
        The aggregated flat mapping without embeddings. Empty input is
        returned unchanged (``max()`` over no entries would raise).
    """
    if not data:
        return data

    max_depth = max(item['depth'] for item in data.values())
    for depth in range(start_depth, max_depth + 1):
        mapping_relation = build_mapping_relation(data, depth, similarity_threshold)
        remove_embeddings(data, depth)
        merge_dicts(data, mapping_relation)
        data = update_keys(data, mapping_relation)

    # Embeddings are only needed during aggregation.
    for item in data.values():
        item.pop('embedding', None)

    return data

### **执行块3_5**

In [None]:
# Aggregate similar keys and store the result as aggregated_dict.json.
data=js.read_json(os.path.join(step_2_mapping_path,'mapping_with_embedding_value.json'))
new_data=process_data(data)
output_path=os.path.join(step_2_mapping_path,'aggregated_dict.json')
js.write_json(new_data,output_path)

In [None]:
# Reload the aggregated mapping from disk.
output_path=os.path.join(step_2_mapping_path,'aggregated_dict.json')
data=js.read_json(output_path)

# 重建树形结构

In [None]:
def rebuild_tree_structure(flat_dict):
    """Rebuild a nested tree from a flat ``{path: item}`` mapping.

    Paths are split on '_'. An item with a non-empty 'value' becomes a node
    whose children are the keys of that value (each mapped to ``None``). An
    item without one becomes ``None`` unless children were already attached.

    Because the flat mapping also contains an entry for every ancestor path,
    the function does not depend on entry order. An ancestor stored earlier
    as ``None`` is turned into a dict when a descendant arrives (descending
    into ``None`` would raise ``TypeError``), and an ancestor processed after
    its descendants keeps the subtree already built.

    Args:
        flat_dict: ``{'lv1_lv2_...': {'value': {...}, ...}}``.

    Returns:
        The nested dictionary.
    """
    root = {}

    for full_path, item in flat_dict.items():
        *ancestors, leaf = full_path.split("_")

        # Walk down, creating (or upgrading None placeholders to) dicts.
        node = root
        for part in ancestors:
            child = node.get(part)
            if not isinstance(child, dict):
                child = {}
                node[part] = child
            node = child

        value = item.get('value') if 'value' in item else None
        if value:
            children = {k: None for k in value}
            existing = node.get(leaf)
            if isinstance(existing, dict):
                # Keep the subtree built by descendants; only add missing keys.
                for name, placeholder in children.items():
                    existing.setdefault(name, placeholder)
            else:
                node[leaf] = children
        else:
            # Do not wipe a subtree that descendants already created.
            node.setdefault(leaf, None)

    return root


In [None]:
def rebuild_tree(dictionary):
    """Rebuild a nested tree from a flat ``{'a_b_c': info}`` mapping.

    Every '_'-separated part of a path becomes one level of nested dicts. If
    ``info`` is a dict with a 'value' entry, that entry's items are merged
    into the node at the end of the path. Finally every empty dict in the
    tree is replaced by ``None``.
    """
    def prune_empty(node):
        # Replace empty dict children by None and descend into the others.
        for name in node:
            child = node[name]
            if not isinstance(child, dict):
                continue
            if child:
                prune_empty(child)
            else:
                node[name] = None

    tree = {}
    for key, value in dictionary.items():
        current_level = tree
        for k in key.split('_'):
            if k not in current_level:
                current_level[k] = {}
            current_level = current_level[k]

        if not (isinstance(value, dict) and "value" in value):
            continue

        if isinstance(current_level, dict):
            current_level.update(value["value"])
        else:
            # Reached when the last path part was inserted earlier as a
            # non-dict value by another entry's 'value'.
            print(f"Unexpected type for current_level: {type(current_level)}",k)

    prune_empty(tree)
    return tree


In [None]:
# Rebuild the nested tree from the flat mapping and write it to test.json.
tree = rebuild_tree(data)
js.write_json(tree,'test.json')

In [None]:
def format_dict_to_hierarchy(dictionary, depth=0):
    """Render a nested dict as an indented bullet list.

    Each key becomes a line ``'- key'`` indented by two spaces per level.
    Dict values are rendered recursively one level deeper. Keys must be
    strings.

    Returns:
        The rendered text; every line ends with a newline.
    """
    pieces = []
    for key, value in dictionary.items():
        pieces.append('  ' * depth + '- ' + key + '\n')
        if isinstance(value, dict):
            pieces.append(format_dict_to_hierarchy(value, depth + 1))
    return ''.join(pieces)

def write_hierarchy_to_file(formatted_hierarchy, file_path):
    """Write the rendered hierarchy text to ``file_path`` as UTF-8, replacing any existing file."""
    with open(file_path, mode='w', encoding='utf-8') as out:
        out.write(formatted_hierarchy)