In [10]:
from datasets import load_dataset, Features, Value

# Source JSON Lines file for this step.
correct_with_label_cleaned_dataset_path = './datasets/correct_with_label_cleaned_camp_data.jsonl'

# Every column is declared as a plain string so the loader does not infer types.
_features = Features({
    column_name: Value('string')
    for column_name in ('reason', 'diseases', 'feature_content')
})

# The "json" builder exposes a single file under the "train" split.
formatted_dataset = load_dataset(
    "json",
    features=_features,
    split="train",
    data_files=correct_with_label_cleaned_dataset_path,
)
formatted_dataset

Dataset({
    features: ['reason', 'diseases', 'feature_content'],
    num_rows: 5000
})

In [11]:
def format_to_alpaca(example):
    """Convert one raw record into an Alpaca-style training record.

    Args:
        example: Mapping that provides the keys ``feature_content``,
            ``diseases`` and ``reason``.

    Returns:
        dict with three keys:
            ``instruction`` - the fixed role prompt shared by all records,
            ``input`` - the record's ``feature_content`` unchanged,
            ``output`` - the diagnosis line, a fixed header line, then the
            rationale, separated by single newlines.
    """
    instruction = "你是一位经验丰富的临床医学专家，对各种疾病的症状、体征、检查结果和诊断标准有着深入的了解。同时，你擅长从病历中提取关键信息，运用专业知识进行综合分析，为疾病的诊断提供准确的依据。"
    input_text = example['feature_content']
    diseases_text = example['diseases']
    reason_text = example['reason']
    # Build the target from explicit "\n" separators. An indented triple-quoted
    # literal would carry a leading newline, the source-code indentation and
    # trailing spaces into every training target.
    output_text = "患者可能罹患的疾病是{}\n依据如下:\n{}".format(diseases_text, reason_text)
    return {"instruction": instruction, "input": input_text, "output": output_text}

# Apply the per-record conversion and drop the three raw columns so only the
# fields returned by format_to_alpaca remain.
alpaca_formatted_dataset = formatted_dataset.map(
    format_to_alpaca,
    remove_columns=['reason', 'diseases', 'feature_content'],
)
alpaca_formatted_dataset

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 5000
})

In [13]:
# Split into training and validation sets (40% held out, fixed seed).
# NOTE(review): this rebinds alpaca_formatted_dataset to the split result, so
# re-running this cell without re-running the map cell presumably fails - confirm.
alpaca_formatted_dataset = alpaca_formatted_dataset.train_test_split(seed=322, test_size=0.4)
alpaca_train_dataset, alpaca_eval_dataset = (
    alpaca_formatted_dataset[split_name] for split_name in ('train', 'test')
)
print(len(alpaca_train_dataset), len(alpaca_eval_dataset))

3000 2000


In [15]:
# Destination files for the two splits.
alpaca_train_dataset_path, alpaca_eval_dataset_path = (
    './datasets/alpaca_train_dataset.jsonl',
    './datasets/alpaca_eval_dataset.jsonl',
)

# force_ascii=False is forwarded to the JSON writer so non-ASCII text is
# written as-is instead of as \uXXXX escapes.
alpaca_train_dataset.to_json(alpaca_train_dataset_path, force_ascii=False)
alpaca_eval_dataset.to_json(alpaca_eval_dataset_path, force_ascii=False)

# Report both locations, one per line, after both files are written.
print(
    f"Alpaca 格式化数据集已保存到: {alpaca_train_dataset_path}",
    f"Alpaca 格式化数据集已保存到: {alpaca_eval_dataset_path}",
    sep="\n",
)

Creating json from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Alpaca 格式化数据集已保存到: ./datasets/alpaca_train_dataset.jsonl
Alpaca 格式化数据集已保存到: ./datasets/alpaca_eval_dataset.jsonl
