In [1]:
# 1. Install dependencies
# pip install datasets tqdm

from datasets import load_dataset, Dataset
import json
from tqdm.auto import tqdm

# 2. Download the source datasets
# Alpaca: ~52K instruction–response pairs
alpaca = load_dataset(path="tatsu-lab/alpaca", split="train")
# OpenAssistant oasst1: multi-turn conversation data
oass = load_dataset(path="OpenAssistant/oasst1", split="train")

# 3. Convert Alpaca rows to the common instruction/input/output format
def alpaca_to_inst(ex):
    """Return a new record holding only the three common fields of *ex*.

    Args:
        ex: Mapping with "instruction" and "output" keys and an optional
            "input" key.

    Returns:
        A dict with keys "instruction", "input" and "output" (in that order).
        A missing or falsy "input" (e.g. None) is normalized to "".

    Raises:
        KeyError: If "instruction" or "output" is absent from *ex*.
    """
    record = {"instruction": ex["instruction"]}
    context = ex.get("input", "")
    # Collapse None / empty values to the empty string.
    record["input"] = context if context else ""
    record["output"] = ex["output"]
    return record
# Drop every original Alpaca column so that only the fields returned by
# alpaca_to_inst remain in the processed dataset.
alpaca_proc = alpaca.map(
    function=alpaca_to_inst,
    remove_columns=alpaca.column_names,
)

# 4. Process the OASS data: pair each prompter message with an assistant reply
# Materialize the rows as plain dicts so they can be scanned more than once.
oass_data = [dict(ex) for ex in oass]

# Index the first assistant reply under each parent id in a single pass.
# Pairing a prompt with its reply then becomes a dict lookup instead of a
# rescan of the entire list for every prompter message (quadratic in the
# number of rows).
first_reply_by_parent = {}
for msg in oass_data:
    if msg["role"] == "assistant":
        # setdefault keeps the earliest reply in dataset order, so the first
        # matching reply wins when a prompt has several.
        first_reply_by_parent.setdefault(msg["parent_id"], msg)

oass_flat = []
for item in oass_data:
    if item["role"] != "prompter":
        continue
    assistant_reply = first_reply_by_parent.get(item["message_id"])
    # Prompter messages without any assistant reply are skipped.
    if assistant_reply:
        oass_flat.append({
            "instruction": item["text"],
            "input": "",
            "output": assistant_reply["text"]
        })

print(f"\nNumber of filtered OASS examples: {len(oass_flat)}")
oass_proc = Dataset.from_list(oass_flat)

print("\nProcessed OASS dataset info:")
print(oass_proc)
print("\nProcessed OASS columns:", oass_proc.column_names)

# 5. Merge the two processed datasets
if len(alpaca_proc) > 0 and len(oass_proc) > 0:
    # Concatenate column by column. list() turns each column into a plain
    # list before "+", so the concatenation does not depend on the exact
    # sequence type that column access returns.
    combined = Dataset.from_dict({
        column: list(alpaca_proc[column]) + list(oass_proc[column])
        for column in ("instruction", "input", "output")
    })

    # 6. Shuffle and split
    combined = combined.shuffle(seed=42)
    # Seed the split as well: train_test_split shuffles on its own by
    # default, so without a seed the train/validation membership would
    # change from run to run despite the seeded shuffle above.
    train_test = combined.train_test_split(test_size=0.1, seed=42)
    train_ds = train_test["train"]
    val_ds = train_test["test"]

    # 7. Save as JSONL
    def to_jsonl(ds, path):
        """Write every example of *ds* to *path* as one JSON object per line.

        Args:
            ds: Iterable of JSON-serializable examples.
            path: Destination file path; an existing file is overwritten.
        """
        with open(path, "w", encoding="utf-8") as f:
            for ex in ds:
                # ensure_ascii=False keeps non-ASCII text readable in the file.
                f.write(json.dumps(ex, ensure_ascii=False) + "\n")

    to_jsonl(train_ds, "train.jsonl")
    to_jsonl(val_ds, "val.jsonl")
    print("\nDatasets successfully saved to train.jsonl and val.jsonl")
    print(f"Total examples in final dataset: {len(combined)}")
else:
    # Report which input is empty instead of writing empty output files.
    print("\nError: One or both of the processed datasets is empty!")
    if len(alpaca_proc) == 0:
        print("alpaca_proc is empty")
    if len(oass_proc) == 0:
        print("oass_proc is empty")



Number of filtered OASS examples: 20587

Processed OASS dataset info:
Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 20587
})

Processed OASS columns: ['instruction', 'input', 'output']

Datasets successfully saved to train.jsonl and val.jsonl
Total examples in final dataset: 72589
