# 1. Convert to Alpaca format: rename `question` -> `instruction` and `answer` -> `output`

In [1]:
import json

def rename_fields(input_file, output_file):
    """
    Rename `question` to `instruction` and `answer` to `output` in a JSON file.

    The input file must contain a JSON list. Every dict entry in that list has
    the two fields renamed (each only when present); entries that are not dicts
    are written through unchanged. The result is saved to a new file. Failures
    are reported on stdout instead of being raised, and no output file is
    written when the input is missing, malformed, or not a list.

    Args:
        input_file (str): Path to the input JSON file.
        output_file (str): Path to the output JSON file.
    """
    try:
        with open(input_file, 'r', encoding='utf-8') as f_in:
            data = json.load(f_in)

        # A top-level object would be iterated key by key and written back
        # untouched while still reporting success, so reject it up front.
        if not isinstance(data, list):
            print(f"Error: Input file '{input_file}' does not contain a JSON list.")
            return

        # Walk every entry and rename its fields in place.
        for entry in data:
            # `in` on a str is a substring test and fails outright on numbers,
            # so only real JSON objects are eligible for renaming.
            if not isinstance(entry, dict):
                continue
            if 'question' in entry:
                entry['instruction'] = entry.pop('question')
            if 'answer' in entry:
                entry['output'] = entry.pop('answer')

        # Persist the modified data; ensure_ascii=False keeps non-ASCII text readable.
        with open(output_file, 'w', encoding='utf-8') as f_out:
            json.dump(data, f_out, indent=4, ensure_ascii=False)

        print(f"Successfully renamed fields and saved to '{output_file}'.")

    except FileNotFoundError:
        print(f"Error: Input file '{input_file}' not found.")
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON format in '{input_file}'.")
    except Exception as e:
        # Notebook-level boundary: report anything else instead of crashing the cell.
        print(f"An unexpected error occurred: {e}")


input_file = '/remote-home/xiaoyili/2025-Medical/Reflection_Tuning/data/huatuo_26m_lite/huatuo_26m_lite_score_5.json'  # Replace with the path to your input JSON file
output_file = '/remote-home/xiaoyili/2025-Medical/Reflection_Tuning/data/huatuo_26m_lite/huatuo_26m_lite_score_5_alpaca.json'  # Replace with the path to your output JSON file

rename_fields(input_file, output_file)

Successfully renamed fields and saved to '/remote-home/xiaoyili/2025-Medical/Reflection_Tuning/data/huatuo_26m_lite/huatuo_26m_lite_score_5._alpaca.json'.


# 2. Extract the first N entries into a smaller JSON file

In [3]:
import json

def extract_and_save(input_file, output_file, num_to_extract):
    """
    Copy the first `num_to_extract` entries of a JSON list into another file.

    If the input holds fewer entries than requested, all of them are written.
    Failures are reported on stdout instead of being raised, and no output
    file is written when the request or the input is invalid.

    Args:
        input_file (str): Path to the input JSON file.
        output_file (str): Path to the output JSON file.
        num_to_extract (int): Number of leading entries to extract; must not
            be negative.
    """

    try:
        # A negative count would make the slice below mean "all but the last
        # n", which is not what "number of entries to extract" promises.
        if num_to_extract is not None and num_to_extract < 0:
            print(f"Error: num_to_extract must be non-negative, got {num_to_extract}.")
            return

        with open(input_file, 'r', encoding='utf-8') as f_in:
            data = json.load(f_in)

        # Only a top-level list can be sliced into entries.
        if not isinstance(data, list):
            print(f"Error: Input file '{input_file}' does not contain a JSON list.")
            return

        # Slicing clamps to the list length, so short inputs yield fewer entries.
        extracted_data = data[:num_to_extract]

        # indent=4 pretty-prints; ensure_ascii=False keeps non-ASCII text readable.
        with open(output_file, 'w', encoding='utf-8') as f_out:
            json.dump(extracted_data, f_out, indent=4, ensure_ascii=False)

        # Report the count actually written, which may be below the request.
        print(f"Successfully extracted {len(extracted_data)} entries from '{input_file}' and saved them to '{output_file}'.")

    except FileNotFoundError:
        print(f"Error: Input file '{input_file}' not found.")
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON format in '{input_file}'.")
    except Exception as e:
        # Notebook-level boundary: report anything else instead of crashing the cell.
        print(f"An unexpected error occurred: {e}")

num_to_extract = 100  # Replace with the number of entries you want to extract
input_file = '/remote-home/xiaoyili/2025-Medical/Reflection_Tuning/data/huatuo_26m_lite/huatuo_26m_lite_score_5_alpaca.json'  # Replace with your input JSON file path
output_file = '/remote-home/xiaoyili/2025-Medical/Reflection_Tuning/data/huatuo_26m_lite/huatuo_26m_lite_score_5_alpaca_{}.json'.format(num_to_extract)  # Replace with your output JSON file path; the entry count is appended to the name

extract_and_save(input_file, output_file, num_to_extract)

Successfully extracted 100 entries from '/remote-home/xiaoyili/2025-Medical/Reflection_Tuning/data/huatuo_26m_lite/huatuo_26m_lite_score_5_alpaca.json' and saved them to '/remote-home/xiaoyili/2025-Medical/Reflection_Tuning/data/huatuo_26m_lite/huatuo_26m_lite_score_5_alpaca_100.json'.
