In [1]:
# from modelscope.utils.hf_util import AutoModelForCausalLM, AutoTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
# from transformers import pipeline
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from langchain_core.prompts import PromptTemplate
import torch
import json
from tqdm import tqdm
import pandas as pd

2025-01-21 12:05:52.721013: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-21 12:05:53.009857: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Build the local Qwen2.5-7B-Instruct text-generation pipeline and wrap it as a
# LangChain LLM (`hf`) that the experiment cells below pipe prompts into.

# Single source of truth for the checkpoint directory, so the tokenizer and the
# model can never be loaded from two different paths by accident.
MODEL_PATH = "/mnt/workspace/.cache/modelscope/hub/qwen/Qwen2___5-7B-Instruct"
device = torch.device('cuda:0')  # the device to load the model onto

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)

# `device_map={"": device}` assigns the whole model to `device` while loading,
# so a follow-up `model.to(device)` is redundant and has been dropped.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    device_map={"": device},
    torch_dtype="auto",
    trust_remote_code=True,
)

# NOTE(review): `do_sample` is not passed explicitly, so whether temperature /
# top_p / top_k take effect depends on the checkpoint's own generation config
# -- confirm before comparing runs.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    # do_sample=True,
    temperature=0.1,
    top_p=0.95,
    top_k=40,
)
hf = HuggingFacePipeline(pipeline=pipe)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda:0
  hf = HuggingFacePipeline(pipeline=pipe)


### 实验组

In [3]:
# Experimental group: answer every question WITH its context injected into the
# prompt, then save (context, question, answer) rows to an Excel file.

# Read the JSON input file.
file_path = 'aikps_output.json'  # replace with your own file path

with open(file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# The prompt and chain do not depend on the current item, so they are built
# once here instead of on every loop iteration.
# The literal (including its leading whitespace and trailing indent) is kept
# exactly as before so the text sent to the model is unchanged.
template ="""### 这是一道小学三年级人工智能学科的问题，请使用以下上下文来增强你的回答。
        {context}
        ### 问题: {question}
        ### 以{{"答案": ""}}的JSON格式返回最终结果。
    """
prompt = PromptTemplate.from_template(template)
# `hf` is the LLM wrapper expected to be defined by an earlier cell.
chain = prompt | hf

# Collected rows, one per input item.
results = []

# tqdm renders a progress bar over the items.
for item in tqdm(data, desc="处理进度"):
    question = item[0]
    # Only the first entry of item[1] is used as context.
    context = item[1][0]
    # Keys must match the placeholder names used in the template.
    result = chain.invoke({"context": context, "question": question})

    # Store context, question and the model answer together.
    results.append([context, question, result])

# Convert the collected rows into a DataFrame.
df = pd.DataFrame(results, columns=['Context', 'Question', 'Answer'])

# Write the DataFrame to a new Excel file.
excel_path = 'Qwen-实验组.xlsx'  # replace with the Excel path you want to save to
df.to_excel(excel_path, index=False)

处理进度:   7%|▋         | 10/153 [01:48<25:21, 10.64s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
处理进度: 100%|██████████| 153/153 [31:11<00:00, 12.23s/it]


### 对照组

In [3]:
# Control group: answer every question WITHOUT any context in the prompt, then
# save (question, answer) rows to an Excel file.

# Read the JSON input file.
file_path = 'aikps_output.json'  # replace with your own file path

with open(file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# The prompt and chain do not depend on the current item, so they are built
# once here instead of on every loop iteration.
# The literal (including its leading whitespace and trailing indent) is kept
# exactly as before so the text sent to the model is unchanged.
template ="""### 这是一道小学三年级人工智能学科的问题，请给出你的答案。
        ### 问题: {question}
        ### 以{{"答案": ""}}的JSON格式返回最终结果。
    """
prompt = PromptTemplate.from_template(template)
# `hf` is the LLM wrapper expected to be defined by an earlier cell.
chain = prompt | hf

# Collected rows, one per input item.
results = []

# tqdm renders a progress bar over the items.
for item in tqdm(data, desc="处理进度"):
    question = item[0]
    # Keys must match the placeholder names used in the template.
    result = chain.invoke({"question": question})

    # Store the question and the model answer (no context in this group).
    results.append([question, result])

# Convert the collected rows into a DataFrame.
df = pd.DataFrame(results, columns=['Question', 'Answer'])

# Write the DataFrame to a new Excel file.
excel_path = 'Qwen-对照组.xlsx'  # replace with the Excel path you want to save to
df.to_excel(excel_path, index=False)

处理进度:   7%|▋         | 10/153 [01:15<14:14,  5.97s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
处理进度: 100%|██████████| 153/153 [30:23<00:00, 11.92s/it]
