In [1]:
def read_txt_file(file_path):
    """
    Read a UTF-8 encoded text file and return its whole content as one string.

    :param file_path: path of the text file to read
    :return: the complete file content as a str; for multi-line files the
             newline characters are part of the returned string
    """
    # The context manager guarantees the handle is closed, even if reading fails.
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

In [3]:
def write_txt_file(file_path, data):
    """
    Write a string to a text file as UTF-8, replacing any existing content.

    :param file_path: path of the file to create or overwrite
    :param data: the text to write; it is handed to the file's write() unchanged
    """
    # Mode 'w' truncates an existing file before writing.
    with open(file_path, mode='w', encoding='utf-8') as handle:
        handle.write(data)

In [5]:
import re
def extract_qa_pairs(text):
    """
    Extract numbered question/answer pairs from a marked section of text.

    Only the part between '<|begin_of_questions_and_answers|>' (followed by a
    newline) and '<|end_of_questions_and_answers|>' is scanned. Two item
    layouts are recognised:

        1. Question: ... Answer: ...
        1. **Question:** ... **Answer:** ...

    An answer runs until the next numbered 'Question:' header (either layout)
    or the end of the section, so an answer may itself contain a numbered list.

    Args:
        text (str): Text containing the marked question/answer section.

    Returns:
        list: Two-element lists [question, answer], whitespace-stripped.
        Plain-layout pairs come first, followed by Markdown-layout pairs
        (with every '**' removed).

    Raises:
        ValueError: If the begin marker (with its trailing newline) is absent.
    """
    begin_marker = '<|begin_of_questions_and_answers|>\n'
    end_marker = '<|end_of_questions_and_answers|>'
    try:
        # Keep only what lies after the begin marker and before the end marker.
        # A missing end marker is tolerated: everything after the begin marker is used.
        clean_text = text.split(begin_marker)[1].split(end_marker)[0]
    except IndexError:
        raise ValueError("The text does not contain the expected begin and end markers.")

    # An item ends where the next numbered "Question:" header starts (plain or
    # Markdown) or at the end of the section. Requiring the header, rather than
    # any line starting with "<digits>.", keeps numbered lists inside an answer
    # from cutting that answer short.
    item_end = r'(?=\n\d+\.\s(?:\*\*)?Question:|\Z)'
    pattern_no_markdown = r'\d+\.\sQuestion:\s(.*?)\s+Answer:\s(.*?)' + item_end
    pattern_markdown = r'\d+\.\s\*\*Question:\*\*\s(.*?)\s+\*\*Answer:\*\*\s(.*?)' + item_end

    # DOTALL lets questions and answers span several lines.
    qa_pairs_no_md = re.findall(pattern_no_markdown, clean_text, re.DOTALL)
    qa_pairs_md = re.findall(pattern_markdown, clean_text, re.DOTALL)

    # Plain-layout pairs only need trimming.
    qa_list = [[q.strip(), a.strip()] for q, a in qa_pairs_no_md]
    # Markdown-layout pairs additionally lose any remaining bold markers.
    qa_list += [[q.replace("**", "").strip(), a.replace("**", "").strip()] for q, a in qa_pairs_md]

    return qa_list

In [9]:
def deepseek_qa(txtfile, txtdir):
    """
    Generate one reasoning chain per Q&A pair of an article and save each to disk.

    The article text is read from `txtdir/txtfile`; the matching Q&A text is read
    from a file of the same name inside the module-level `qadir` directory and
    parsed with `extract_qa_pairs`. For every pair, a prompt holding the
    question, the answer and the full article is sent through the module-level
    `client` as a streamed chat completion. The streamed text is concatenated
    and written, together with the question and answer, to
    `<article path without extension>_<n>.txt`, where n counts pairs from 1.

    :param txtfile: file name of the article (also the name of its Q&A file)
    :param txtdir: directory containing the article; outputs are written here too
    :return: the string "<txtfile> is done"
    """
    # os.path.join keeps the path valid on any OS instead of hard-coding "\\".
    file_path = os.path.join(txtdir, txtfile)
    txt = read_txt_file(file_path)
    qa_text = read_txt_file(os.path.join(qadir, txtfile))
    qa_list = extract_qa_pairs(qa_text)

    # Strip the real extension rather than assuming it is exactly four characters.
    output_stem = os.path.splitext(file_path)[0]

    print(f"{txtfile} is running")

    for count, qa in enumerate(qa_list, start=1):
        question, answer = qa[0], qa[1]

        prompt = f"""Based on a comprehensive review in the field of Metal-Organic Frameworks (MOFs) and related questions, generate a detailed and complete chain of scientific reasoning. Ensure that your reasoning process is rigorous and logically coherent, utilizing scientific theories and facts for analysis. The chain of reasoning can be open and flexible, not confined to a rigid structure, but it should clearly indicate the beginning and end of the reasoning.

Please use `<|begin_of_thought|>` to mark the start of the reasoning chain and `<|end_of_thought|>` to mark the end.

Don't mention "this literature show" or "this review show" or anything like that. This is very important. Even if you use the literature, your answer should still give the other person a style of thinking that is all about you.

The thought chain is as detailed as possible.
---

**Example Structure:**

1. **Understanding the Background:** Briefly explain the background information and main questions.
  
2. **Application of Knowledge:** Invoke relevant scientific principles and known facts related to the problem.
  
3. **Analysis Integration:** Integrate key information from the review into the analysis process.
  
4. **Reasoning Expansion:** Use logical reasoning to explore potential paths to a solution.
  
5. **Solution Evaluation:** Assess the plausibility and feasibility of different solutions.
  
6. **Conclusion Formation:** Draw clear scientific conclusions or hypotheses.
  
7. **Open Exploration:** Suggest possible future research directions or applications.
  

**Open Thought Chain Template:**

<|begin_of_thought|>

1. Preliminary Analysis: Clarify the subject and background information.
  
2. Theoretical Application: Identify and apply relevant scientific theories to support the analysis.
  
3. Logical Step-by-Step Reasoning: Gradually expand the reasoning, using review information to deepen the analysis.
  
4. Possibility Discussion: Explore potential conclusions and hypotheses, considering various scientific perspectives.
  
5. Result Summary: Summarize analysis results to form clear scientific conclusions.
  
6. Exploration Directions: Propose possible future research directions or application areas.
  

<|end_of_thought|>

Question:
{question}

Answer:
{answer}

There is Artical:
{txt}"""

        # Streamed chat completion; the model name can be swapped as needed.
        # To receive token usage in a final chunk, additionally pass
        # stream_options={"include_usage": True}.
        stream = client.chat.completions.create(
            model="qwen-turbo-latest",
            messages=[{"role": "user", "content": prompt}],
            stream=True,
        )

        answer_parts = []
        for chunk in stream:
            # A chunk without choices is treated as the usage report.
            if not getattr(chunk, 'choices', None):
                print("\n" + "=" * 20 + "Token 使用情况" + "=" * 20 + "\n")
                print(chunk.usage)
                continue

            delta = chunk.choices[0].delta

            # Collect only chunks that actually carry reply text.
            if getattr(delta, 'content', None):
                answer_parts.append(delta.content)

        answer_content = "".join(answer_parts)

        content = f"""<|begin_of_question|>\n\n{question}\n\n<|end_of_question|>\n\n<|begin_of_answer|>\n\n{answer}\n\n<|end_of_answer|>\n\n{answer_content}"""

        write_txt_file(f"{output_stem}_{count}.txt", content)

    return f"{txtfile} is done"

In [11]:

def main(txtdir):
    """
    Run `deepseek_qa` for every '.txt' file in a directory using 5 worker threads.

    Each file's result string is printed as soon as its task finishes. A task
    that raises does not stop the batch: its exception is printed together with
    the name of the file that caused it.

    :param txtdir: directory whose '.txt' files are processed
    """
    # NOTE(review): deepseek_qa writes its "<name>_<n>.txt" outputs into this
    # same directory, so a later run will list them as inputs too - consider a
    # separate output directory.
    txtfiles = [name for name in os.listdir(txtdir) if name.endswith('.txt')]

    with ThreadPoolExecutor(max_workers=5) as executor:
        # Remember which file each future belongs to so failures can be traced.
        future_to_file = {
            executor.submit(deepseek_qa, name, txtdir): name
            for name in txtfiles
        }

        # Report results in completion order.
        for future in as_completed(future_to_file):
            txtfile = future_to_file[future]
            try:
                print(future.result())
            except Exception as e:
                # Best effort: keep processing the other files, but name the failing one.
                print(f"{txtfile} 生成了一个异常: {e}")

In [13]:

import os
from openai import OpenAI
from concurrent.futures import ThreadPoolExecutor, as_completed

# Chat client pointed at the DashScope OpenAI-compatible endpoint.
client = OpenAI(
    # Read the key from the environment so no secret is stored in the notebook.
    # Set DASHSCOPE_API_KEY before running; how to obtain a key:
    # https://help.aliyun.com/zh/model-studio/developer-reference/get-api-key
    api_key=os.getenv("DASHSCOPE_API_KEY"),
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
)
# Directory holding the Q&A files (one per article, same file name as the article).
qadir = "F:\\Working\\ModelDistillation\\qa"
# Directory holding the review articles; generated outputs are written here as well.
txtdir = "F:\\Working\\ModelDistillation\\review"
main(txtdir)

10.1002-advs.202203889.pdf.txt is running
10.1002-adma.202408416.pdf.txt is running
10.1002-adma.202412708.pdf.txt is running
10.1002-advs.202409290.pdf.txt is running
10.1002-advs.202304424.pdf.txt is running
10.1002-advs.202304424.pdf.txt is done
10.1002-aenm.202303281.pdf.txt is running
10.1002-adma.202412708.pdf.txt is done
10.1002-anie.202218076.pdf.txt is running
10.1002-advs.202409290.pdf.txt is done
10.1002-jssc.202201057.pdf.txt is running
10.1002-adma.202408416.pdf.txt is done
10.1002-marc.202300730.pdf.txt is running
10.1002-advs.202203889.pdf.txt is done
10.1002-smll.202301130.pdf.txt is running
10.1002-jssc.202201057.pdf.txt is done
10.1002-smll.202402783.pdf.txt is running
10.1002-aenm.202303281.pdf.txt is done
10.1002-smll.202404350.pdf.txt is running
10.1002-anie.202218076.pdf.txt is done
10.1007-s10653-024-01936-1.pdf.txt is running
10.1002-smll.202301130.pdf.txt is done
10.1007-s12274-023-5532-2.pdf.txt is running
10.1002-marc.202300730.pdf.txt is done
10.1007-s12274-