# 初始化

In [1]:
import os
import time
import threading
from http.server import HTTPServer, SimpleHTTPRequestHandler
from local_packages import *
from dotenv import load_dotenv
from queue import Queue
import concurrent.futures
import random
import json
import openai
from concurrent.futures import ThreadPoolExecutor, as_completed

# Path of the .env file expected in the current working directory.
# NOTE(review): load_dotenv is imported above but is not called anywhere in
# the visible code, so this line only computes the path — confirm whether the
# variables get loaded elsewhere (e.g. inside local_packages).
dotenv_path = os.path.join(os.getcwd(), '.env')

# Project root = the directory containing the .env file (the cwd here).
project_root = os.path.dirname(dotenv_path)

# Backend selector handed to initialize_service() further down.
service_type = 'qwen'

def initialize_service(service_type):
    """Create the chat-service client selected by ``service_type``.

    The service classes are not defined in this file; they presumably come
    from the ``local_packages`` star-import.

    Supported selectors and the model version each client is built with:

    * ``'zhipu'`` or ``None`` -> ``GLMService('glm-3-turbo')``
      (other versions noted by the author: 'glm-4', 'glm-4v').
    * ``'qwen'`` -> ``QwenService('long')``.
    * ``'kimi'`` -> ``KimiService('8k')``
      (author's price notes: '8k' 1M/12￥, '32k' 1M/24￥, '128k' 1M/60￥).
    * ``'sensetime'`` -> ``SenseService(version='SenseChat')``
      (other versions noted by the author: SenseChat-32K, SenseChat-128K,
      SenseChat-Turbo, SenseChat-FunctionCall).

    Args:
        service_type: One of the selectors listed above.

    Returns:
        The constructed service object.

    Raises:
        ValueError: If ``service_type`` is not one of the known selectors.
    """
    if service_type is None or service_type == 'zhipu':
        return GLMService('glm-3-turbo')
    if service_type == 'qwen':
        return QwenService('long')
    if service_type == 'kimi':
        return KimiService('8k')
    if service_type == 'sensetime':
        return SenseService(version='SenseChat')
    raise ValueError('未知的服务类型')

# Module-level LLM client; process_task() below calls service.ask_once().
service = initialize_service(service_type)

# JSON helper (presumably from local_packages); the cells below use its
# read_json / write_json / parse_dict methods.
js=JSProcessor()

class ParseError(Exception):
    """Exception that carries a ``code`` next to its message.

    Attributes are plain instance fields: ``code`` (whatever the caller
    passes) and ``message`` (defaults to "解析失败"); ``str(err)`` is the
    message.

    NOTE(review): nothing in the visible code raises this class.
    """

    def __init__(self, code, message="解析失败"):
        # Let the base class store the message first, then attach the extras.
        super().__init__(message)
        self.code = code
        self.message = message


服务初始化成功


# 地址创建

In [2]:
import os

# Root output folder for this task, plus the path of a local model directory.
# NOTE(review): model_path is not referenced anywhere else in the visible code.
root_folder = 'Task1_xlsx_processor'
model_path = r'D:\Joining\Models\Text2Vec_base_zh'


# Directories that must exist before later cells write their JSON files.
directories = [
    root_folder
]

for directory in directories:
    os.makedirs(directory, exist_ok=True)

print("所有目录和文件已成功创建。")

# Source workbook that the following cells convert and process.
xlsx_file_path = r"D:\Joining\Joining-Agents0529\大模型日报汇编-论文(1).xlsx"


所有目录和文件已成功创建。


# xlsx转换成json

In [None]:
import pandas as pd
import json

def xlsx_to_json(xlsx_file_path, json_file_path):
    """Convert a workbook into a JSON array with one object per row.

    The workbook is read with ``pd.read_excel`` and serialised with
    ``orient='records'`` and ``force_ascii=False``, then written to
    ``json_file_path`` as UTF-8.

    Any exception is caught and reported with ``print``; nothing is raised
    and the function always returns ``None``.

    Args:
        xlsx_file_path: Path of the workbook to read.
        json_file_path: Path of the JSON file to (over)write.
    """
    try:
        # Read and serialise in one go; the frame itself is not needed later.
        records_json = pd.read_excel(xlsx_file_path).to_json(
            orient='records', force_ascii=False
        )

        with open(json_file_path, 'w', encoding='utf-8') as out_file:
            out_file.write(records_json)

        print(f"成功将 {xlsx_file_path} 转换为 {json_file_path}")
    except Exception as err:
        print(f"转换时发生错误: {err}")

# Example usage: convert the source workbook into converted.json.
# NOTE(review): the target lives under Task23_xlsx_opener, a directory this
# notebook never creates (only root_folder is created above); if it is
# missing, xlsx_to_json just prints the error and no file is written.
json_file_path = r'D:\Joining\Joining-Agents0529\Task23_xlsx_opener\converted.json'
xlsx_to_json(xlsx_file_path, json_file_path)


# 添加标记符

In [None]:
def add_recog_index_to_dict_list(json_file_path):
    """Stamp every record of a JSON file with its position, in place.

    The file is expected to hold a list of dicts. Each dict receives a
    ``recog_index`` key equal to its list position (overwriting any existing
    value), and the file is rewritten as UTF-8 with ``indent=4`` and
    ``ensure_ascii=False``.

    Any exception is caught and reported with ``print``; the function always
    returns ``None``.

    Args:
        json_file_path: Path of the JSON file to read and overwrite.
    """
    try:
        with open(json_file_path, 'r', encoding='utf-8') as source:
            records = json.loads(source.read())

        for position, record in enumerate(records):
            record['recog_index'] = position

        # Serialise before reopening the file so a serialisation error
        # cannot leave a truncated file behind.
        serialized = json.dumps(records, ensure_ascii=False, indent=4)

        with open(json_file_path, 'w', encoding='utf-8') as target:
            target.write(serialized)

        print(f"成功在 {json_file_path} 中添加 recog_index")
    except Exception as err:
        print(f"添加 recog_index 时发生错误: {err}")
        
# Stamp each record of the converted JSON file with its position (recog_index).
add_recog_index_to_dict_list(json_file_path)

# 处理json
读出 list 中每个 sub_dict 里所有 value 为 null（或空字符串）的 key，并记录它们所在的行索引

In [None]:
# Load the converted records (same path as json_file_path above).
dict_list=js.read_json(r'D:\Joining\Joining-Agents0529\Task23_xlsx_opener\converted.json')

# Locate the key/value pairs whose value is empty.
def find_empty_values(dict_list):
    """Map each key to the list positions of the records where it is empty.

    A value counts as empty when ``pd.isna`` says so (``None``, NaN, ...) or
    when it equals the empty string.

    Args:
        dict_list: List of dicts to scan.

    Returns:
        dict: ``{key: [index, ...]}`` holding only keys that are empty in at
        least one record; indices appear in ascending order.
    """
    missing_by_key = {}
    for row_number, record in enumerate(dict_list):
        for field, cell in record.items():
            if not (pd.isna(cell) or cell == ""):
                continue
            missing_by_key.setdefault(field, []).append(row_number)

    return missing_by_key

# Index the empty cells and persist the result as empty.json under root_folder.
data=find_empty_values(dict_list)
empty_value_path=os.path.join(root_folder,'empty.json')
js.write_json(data,empty_value_path)
# Author's note: this structure is no longer useful in itself; it is only a
# legacy data structure. (`data` is still passed to classify_keys() below.)

# 找到含有“:”分隔符的键

In [None]:
def classify_keys(data):
    """Split a mapping in two by whether each key contains a colon.

    Args:
        data: Mapping of key -> value (here: column name -> row indices).

    Returns:
        tuple: ``(colon_keys, non_colon_keys)``, two new dicts holding the
        entries whose key does / does not contain ``':'``. Values are passed
        through unchanged, not copied.
    """
    with_colon, without_colon = {}, {}

    for name, row_indices in data.items():
        bucket = with_colon if ':' in name else without_colon
        bucket[name] = row_indices

    return with_colon, without_colon
# Split the empty-cell index into colon-style keys and plain keys.
col,non_col=classify_keys(data)
# Author's note: these two structures are likewise only intermediate results.


# 为池化任务做准备

In [None]:
import json
import os
import pandas as pd

def process_colon_keys(colon_keys, dict_list):
    """Build classification tasks for empty ``分类:<type>:<options>`` columns.

    For every key of the exact form ``"分类:<type>:<options>"`` and every row
    index listed for it, a task dict is produced: a copy of the row with
    ``classification_type`` and ``classifications`` added and every empty
    entry (NaN/None or ``""``) removed. Keys of any other shape are ignored.

    The rows in ``dict_list`` are not modified; each task is built from a
    shallow copy.

    Args:
        colon_keys: Mapping of column name -> list of row indices where that
            column is empty.
        dict_list: The full list of row dicts the indices refer to.

    Returns:
        dict: ``{classification_type: [task_dict, ...]}``.

    Raises:
        IndexError: If an index is outside ``dict_list``.
    """
    col_dict_list_dict = {}

    for key, indices in colon_keys.items():
        parts = key.split(':')
        # Only "分类:<type>:<options>" describes a classification task.
        if len(parts) != 3 or parts[0] != "分类":
            continue
        _, classification_type, classifications = parts

        for index in indices:
            # Tag a copy so the caller's row stays untouched; the same row may
            # be needed again for another empty column.
            task = dict(dict_list[index])
            task["classification_type"] = classification_type
            task["classifications"] = classifications
            # Drop empty entries (this also removes the empty target column).
            task = {k: v for k, v in task.items() if pd.notna(v) and v != ""}

            col_dict_list_dict.setdefault(classification_type, []).append(task)

    return col_dict_list_dict

def process_non_colon_keys(non_colon_keys, dict_list):
    """Build generation tasks for empty plain (colon-free) columns.

    For every key and every row index listed for it, a task dict is produced:
    a copy of the row with ``generation_type`` set to the key and every empty
    entry (NaN/None or ``""``) removed.

    The rows in ``dict_list`` are not modified; each task is built from a
    shallow copy.

    Args:
        non_colon_keys: Mapping of column name -> list of row indices where
            that column is empty.
        dict_list: The full list of row dicts the indices refer to.

    Returns:
        dict: ``{column_name: [task_dict, ...]}``.

    Raises:
        IndexError: If an index is outside ``dict_list``.
    """
    non_col_dict_list_dict = {}

    for key, indices in non_colon_keys.items():
        for index in indices:
            # Tag a copy so the caller's row stays untouched; the same row may
            # be needed again for another empty column.
            task = dict(dict_list[index])
            task["generation_type"] = key
            # Drop empty entries (this also removes the empty target column).
            task = {k: v for k, v in task.items() if pd.notna(v) and v != ""}

            non_col_dict_list_dict.setdefault(key, []).append(task)

    return non_col_dict_list_dict
# Author's note: these two structures are the precursors of the task pool.
# Each pass below works from a freshly loaded copy of the converted rows.
dict_list=js.read_json(r'D:\Joining\Joining-Agents0529\Task23_xlsx_opener\converted.json')
col_dict_list_dict=process_colon_keys(col,dict_list)
dict_list=js.read_json(r'D:\Joining\Joining-Agents0529\Task23_xlsx_opener\converted.json')
non_col_dict_list_dict=process_non_colon_keys(non_col,dict_list)
# Persist both task groupings under root_folder.
col_path=os.path.join(root_folder,"col_dict_list_dict.json")
non_col_path=os.path.join(root_folder,'non_col_dict_list_dict.json')
js.write_json(col_dict_list_dict,col_path)
js.write_json(non_col_dict_list_dict,non_col_path)

# 任务池化

In [None]:
def extract_all_sub_dicts(col_dict_list_dict, non_col_dict_list_dict):
    """Flatten both task groupings into one list.

    Args:
        col_dict_list_dict: ``{classification_type: [task, ...]}``.
        non_col_dict_list_dict: ``{generation_type: [task, ...]}``.

    Returns:
        list: All tasks from the first mapping followed by all tasks from the
        second, in mapping order. The task dicts are the same objects, not
        copies.
    """
    pooled = []

    for grouping in (col_dict_list_dict, non_col_dict_list_dict):
        for tasks in grouping.values():
            pooled.extend(tasks)

    return pooled
# Collect every task dict into one flat pool.
new_dict_list = extract_all_sub_dicts(col_dict_list_dict, non_col_dict_list_dict)

# Print the pool for inspection.
print("提取的sub_dict列表:")
print(json.dumps(new_dict_list, ensure_ascii=False, indent=4))

# Write the pool to a JSON file under root_folder.
output_path = os.path.join(root_folder,'new_dict_list.json')
js.write_json(new_dict_list, output_path)

# 多线程处理任务

In [None]:
import json
import os
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed

def process_task(sub_dict, timeout=20, max_retries=3):
    """Ask the LLM service to fill in the one missing field of a task dict.

    The prompt starts with one ``<key>是<value>`` line for every entry of
    ``sub_dict`` except the bookkeeping keys. Then:

    * if both ``classification_type`` and ``classifications`` are present,
      the model is asked to pick one of the listed options;
    * otherwise, if ``generation_type`` is present, the model is asked to
      answer that item freely;
    * otherwise no request is made.

    The reply is parsed with ``js.parse_dict`` and must contain the expected
    key. A truthy result is stored in ``sub_dict`` under that key and
    "success" is printed; in every other case "failed" is printed.

    Args:
        sub_dict: Task dict; it is modified in place on success.
        timeout: Accepted but not used by this function.
        max_retries: Number of attempts per request.

    Returns:
        The same ``sub_dict`` object, whether or not a value was added.
    """
    bookkeeping_keys = {"recog_index", "classification_type", "classifications", "generation_type"}
    # Context part of the prompt: one line per real data field.
    context_lines = [
        str(field) + '是' + str(content) + '\n'
        for field, content in sub_dict.items()
        if field not in bookkeeping_keys
    ]
    prompt = '知道现在有一则如下信息:\n' + ''.join(context_lines)

    def make_request(prompt, key_name):
        # Try up to max_retries times; give up with None when all attempts fail.
        for attempt in range(max_retries):
            try:
                reply = service.ask_once(prompt)
                parsed = js.parse_dict(reply)
                if key_name not in parsed:
                    raise ValueError("返回的字典中缺少期望的键")
                return parsed[key_name]
            except Exception as e:
                rate_limited = "429" in str(e) or 'Throttling.RateQuota' in str(e)
                if not rate_limited:
                    # Other errors are retried immediately.
                    print(f"处理 {key_name} 时出错: {e}, 尝试 {attempt + 1}")
                    continue
                # Rate limiting: wait longer after each failed attempt.
                print(f"429 或 Throttling.RateQuota 错误 {e}: {key_name}，将在 {10 * (attempt + 1)} 秒后重试... (尝试 {attempt + 1})")
                time.sleep(10 * (attempt + 1))
        return None

    if "classification_type" in sub_dict and "classifications" in sub_dict:
        prompt += f'''请你在如下具体分类中选择最合适的一个: {sub_dict['classifications']}。\n
        请按照json dict格式返回，键名必须为 {sub_dict['classification_type']}，例如：
        {{ "{sub_dict['classification_type']}":"partition_str"}}
        '''
        key_name = sub_dict['classification_type']
    elif "generation_type" in sub_dict:
        prompt += f'''请你依据上述信息回答：{sub_dict['generation_type']}。\n
        请按照json dict格式返回，键名必须为 {sub_dict['generation_type']}，例如：
        {{ "{sub_dict['generation_type']}":"generation_answer"}}'''
        key_name = sub_dict['generation_type']
    else:
        # Neither task marker is present, so there is nothing to ask.
        print("failed")
        return sub_dict

    result = make_request(prompt, key_name)
    if result:
        sub_dict[key_name] = result
        print("success")
    else:
        print("failed")
    return sub_dict

def process_all_tasks(new_dict_list, max_workers=60):
    """Run ``process_task`` on every task dict using a thread pool.

    Args:
        new_dict_list: Task dicts to process.
        max_workers: Size of the thread pool.

    Returns:
        list: The ``process_task`` return values in completion order, which
        is not necessarily input order. A task whose future raised is
        reported with ``print`` and left out of the list.
    """
    completed = []

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Remember which task each future belongs to, for error reporting.
        pending = {}
        for task in new_dict_list:
            pending[pool.submit(process_task, task)] = task

        for done in as_completed(pending):
            try:
                completed.append(done.result())
            except Exception as e:
                print(f"处理 {pending[done]} 时出错: {e}")

    return completed

# Run the whole task pool; note that dict_list now holds the processed task
# dicts (in completion order), no longer the raw converted rows.
dict_list=process_all_tasks(new_dict_list)

In [None]:
import pandas as pd

def merge_dicts_by_recog_index(dict_list):
    """Merge dicts that share the same ``recog_index`` into one dict each.

    For each ``recog_index`` the first dict seen provides the base; later
    dicts with the same index only contribute keys the merged dict does not
    have yet (first value wins). Dicts without a ``recog_index`` key are all
    grouped together under ``None``.

    The input dicts are not modified; merging happens on copies.

    Args:
        dict_list: Dicts to merge.

    Returns:
        list: One merged dict per distinct ``recog_index``, ordered by first
        appearance in ``dict_list`` (not sorted by index).
    """
    merged_dict = {}

    for sub_dict in dict_list:
        recog_index = sub_dict.get('recog_index')
        if recog_index not in merged_dict:
            # Copy so later merges never write into the caller's dict.
            merged_dict[recog_index] = dict(sub_dict)
            continue

        target = merged_dict[recog_index]
        for k, v in sub_dict.items():
            # Keep the value that was seen first.
            target.setdefault(k, v)

    return list(merged_dict.values())

def filter_keys(dict_list, keys_to_remove):
    """Return copies of the dicts with the unwanted keys left out.

    Args:
        dict_list: Dicts to filter; they are not modified.
        keys_to_remove: Container of keys to drop (tested with ``in``).

    Returns:
        list: One new dict per input dict, in the same order.
    """
    return [
        {name: value for name, value in record.items() if name not in keys_to_remove}
        for record in dict_list
    ]

def save_to_excel(dict_list, file_path):
    """Write a list of dicts to an Excel file, one row per dict.

    Args:
        dict_list: Row dicts; they are turned into a ``pandas.DataFrame``.
        file_path: Destination path of the workbook. The DataFrame index is
            not written.
    """
    pd.DataFrame(dict_list).to_excel(file_path, index=False)

# Recombine the per-field task results into one dict per source row, strip the
# bookkeeping keys, and export the result to Excel.
# NOTE(review): only rows that had at least one empty cell entered the task
# pool, so rows that were already complete appear to be missing from this
# output; row order also follows task completion, not the source sheet —
# confirm whether that is intended.
merged_list = merge_dicts_by_recog_index(dict_list)
keys_to_remove = {'recog_index', 'classification_type', 'classifications', 'generation_type'}
filtered_list = filter_keys(merged_list, keys_to_remove)
save_to_excel(filtered_list, 'processed_data.xlsx')

print("处理后的结果已保存到 processed_data.xlsx")
