In [1]:
import json
import random
from pathlib import Path

def consolidate_split_per_file(
    parts_dir: str,
    train_output: str,
    test_output: str,
    test_size: float = 0.2,
    seed: int = 42
) -> None:
    """
    Split every part file into train/test and merge the results.

    For each ``*.jsonl`` and then each ``*.json`` file directly inside
    ``parts_dir`` (no recursion), visited in sorted path order:
      - read its records
      - shuffle & split into train/test (by test_size)
    Then merge all train parts into train_output,
    and all test parts into test_output (both written as JSONL).

    Args:
        parts_dir: Directory holding the part files.
        train_output: Path of the consolidated train JSONL file.
        test_output: Path of the consolidated test JSONL file.
        test_size: Fraction of each file's records sent to the test split;
            must lie in [0, 1]. The train count per file is
            ``int(n * (1 - test_size))``; the remainder goes to test.
        seed: Seed for the private random generator used for shuffling.

    Raises:
        ValueError: If ``test_size`` is outside [0, 1].
        FileNotFoundError: If ``parts_dir`` is not an existing directory.
        json.JSONDecodeError: If a part file cannot be parsed.
    """
    if not 0.0 <= test_size <= 1.0:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size!r}")

    parts_path = Path(parts_dir)
    # Fail before touching the outputs: a mistyped directory would otherwise
    # silently overwrite existing train/test files with empty ones.
    if not parts_path.is_dir():
        raise FileNotFoundError(f"parts_dir is not a directory: {parts_dir}")

    # Never ingest our own outputs when they happen to live inside parts_dir;
    # doing so on a re-run would leak records between train and test.
    output_paths = {Path(train_output).resolve(), Path(test_output).resolve()}

    # A private generator yields the same stream as random.seed(seed) would,
    # without reseeding the process-wide generator as a side effect.
    rng = random.Random(seed)

    train_records = []
    test_records  = []

    def read_records(file_path):
        """Parse one part file into a list of records."""
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()
        # Split on "\n" only (as file iteration does); str.splitlines() would
        # also break on separators that may legally appear inside JSON strings.
        lines = [line.strip() for line in text.split("\n") if line.strip()]
        try:
            return [json.loads(line) for line in lines]
        except json.JSONDecodeError:
            # A .json file may be a single (possibly pretty-printed) document
            # rather than line-delimited records; .jsonl stays strict.
            if file_path.suffix.lower() != ".json":
                raise
            document = json.loads(text)
            return document if isinstance(document, list) else [document]

    # 1) Process each file separately
    for pattern in ("*.jsonl", "*.json"):
        # glob() order is filesystem-dependent; sorting makes the seeded
        # shuffle reproducible because every file consumes the same RNG stream.
        for file_path in sorted(parts_path.glob(pattern)):
            if file_path.resolve() in output_paths:
                continue

            recs = read_records(file_path)

            # shuffle + split
            rng.shuffle(recs)
            split_idx = int(len(recs) * (1 - test_size))
            train_records.extend(recs[:split_idx])
            test_records.extend(recs[split_idx:])

    # 2) Helper to write a list of dicts to JSONL
    def write_jsonl(recs, out_path):
        out = Path(out_path)
        out.parent.mkdir(parents=True, exist_ok=True)
        with open(out, "w", encoding="utf-8") as fw:
            for rec in recs:
                fw.write(json.dumps(rec, ensure_ascii=False) + "\n")

    # 3) Write consolidated files
    write_jsonl(train_records, train_output)
    write_jsonl(test_records,  test_output)

    print(f"Wrote {len(train_records)} records to {train_output}")
    print(f"Wrote {len(test_records)}  records to {test_output}")


In [2]:
# Build the consolidated Mirai train/test files: each part file is shuffled
# and split 80/20 on its own, then the per-file splits are merged.
consolidate_split_per_file(
    parts_dir="datasets/curated/mirai/parts",
    train_output="datasets/curated/mirai/train/dataset.jsonl",
    test_output="datasets/curated/mirai/test/dataset.jsonl",
    test_size=0.2,  # fraction of each part file held out for test
    seed=4242,
)

Wrote 241 records to datasets/curated/mirai/train/dataset.jsonl
Wrote 61  records to datasets/curated/mirai/test/dataset.jsonl
