In [2]:
"""Utility script to shuffle and split a word list into four subsets:
- 225k pre-training words (``pretrain``)
- 10k SFT training words (``sft``)
- 5k RL validation words (``grpo``)
- 10k held-out test words (``test``)


Usage inside a notebook
-----------------------
```python
import split_dataset_no_argparse as sds

#quick start – uses the defaults below
sds.run_split()

#custom paths / seed
sds.run_split(
    input_path="data/words_250000_train.txt",
    outdir="splits",
    seed=123,
)
```

Running as a standalone script from the command line still works; no
command-line arguments are parsed, so it always calls ``run_split()``
with the default arguments.
"""
from __future__ import annotations

import json
import pathlib
import random
from collections import OrderedDict
from typing import Dict, List

# -----------------------------------------------------------------------------
# Core helpers
# -----------------------------------------------------------------------------

def split_dataset(words: List[str], seed: int = 42) -> Dict[str, List[str]]:
    """Shuffle *words* deterministically and return the four subsets.

    The target sizes are 225k:10k:5k:10k for the ``pretrain``, ``sft``,
    ``grpo`` and ``test`` splits (250k words in total).  If the corpus is
    smaller than that, every target is scaled by ``len(words) / 250_000``
    and rounded down, so the ratios are preserved and a few words may end
    up in no split.  If the corpus is larger, the surplus words are unused.

    Parameters
    ----------
    words : list of str
        Candidate words.  The list is *not* modified; a copy is shuffled.
    seed : int, default 42
        Seed for the private ``random.Random`` instance that shuffles.

    Returns
    -------
    dict
        Mapping split name -> list of words, in the order
        ``pretrain``, ``sft``, ``grpo``, ``test``.

    Raises
    ------
    AssertionError
        If the words assigned to the splits are not all distinct, which
        happens when *words* itself contains duplicates.
    """
    # Shuffle a private copy so the caller's list keeps its original order.
    shuffled = list(words)
    random.Random(seed).shuffle(shuffled)

    targets = OrderedDict([
        ("pretrain", 225_000),
        ("sft", 10_000),
        ("grpo", 5_000),
        ("test", 10_000),
    ])

    total_needed = sum(targets.values())
    available = len(shuffled)
    if available < total_needed:
        # Integer floor division gives the exact proportional size; a float
        # scale factor can land just below a whole number and lose a word.
        for name in targets:
            targets[name] = targets[name] * available // total_needed

    out: Dict[str, List[str]] = {}
    start = 0
    for name, count in targets.items():
        out[name] = shuffled[start : start + count]
        start += count

    # Sanity-check that no word was assigned twice.  Raised explicitly
    # (rather than via ``assert``) so the check also runs under ``python -O``.
    distinct = len(set().union(*out.values()))
    assigned = sum(len(subset) for subset in out.values())
    if distinct != assigned:
        raise AssertionError("Overlapping words detected across splits")

    return out


def write_splits(outdir: pathlib.Path, splits: Dict[str, List[str]]) -> None:
    """Write every split to ``<outdir>/<name>.txt``, one word per line.

    Parameters
    ----------
    outdir : Path (a plain ``str`` is accepted as well)
        Destination directory; it and any missing parents are created.
    splits : dict
        Mapping split name -> list of words.  Each list is joined with
        ``"\\n"`` (no trailing newline) and written as UTF-8, replacing
        any existing file of the same name.
    """
    # Coerce so callers may pass a string path, not only a Path object.
    target_dir = pathlib.Path(outdir)
    target_dir.mkdir(parents=True, exist_ok=True)
    for name, subset in splits.items():
        destination = target_dir / f"{name}.txt"
        # Spell the codec with a plain ASCII hyphen so the lookup never
        # depends on codec-name normalisation.
        destination.write_text("\n".join(subset), encoding="utf-8")


# -----------------------------------------------------------------------------
# Public entry point (no argparse)
# -----------------------------------------------------------------------------

def run_split(
    input_path: str | pathlib.Path = "dataset/words_250000_train.txt",
    outdir: str | pathlib.Path = "dataset/225k_10k_5k_10k",
    *,
    seed: int = 42,
) -> Dict[str, List[str]]:
    """Load a word list, split it, write the split files, and report sizes.

    Parameters
    ----------
    input_path : str or Path
        UTF-8 text file with one word per line.  Surrounding whitespace is
        stripped from every line and blank lines are skipped.
    outdir : str or Path
        Directory where the split files will be saved.  Will be created
        if it doesn't exist.
    seed : int, default 42
        RNG seed for deterministic shuffling.

    Returns
    -------
    dict
        Mapping split name -> list of words.  Useful for in-notebook checks.
        The per-split sizes are also printed to stdout as JSON.
    """
    source = pathlib.Path(input_path)
    # Plain ASCII codec name, so the lookup never depends on codec-name
    # normalisation.
    raw_text = source.read_text(encoding="utf-8")

    # Strip each line once, then drop the ones that were blank.
    stripped_lines = (line.strip() for line in raw_text.splitlines())
    words = [word for word in stripped_lines if word]

    splits = split_dataset(words, seed=seed)
    write_splits(pathlib.Path(outdir), splits)

    counts = {name: len(subset) for name, subset in splits.items()}
    print("Split sizes:")
    print(json.dumps(counts, indent=2))
    return splits


# -----------------------------------------------------------------------------
# Optional CLI fallback (still without argparse)
# -----------------------------------------------------------------------------
if __name__ == "__main__":
    # No command-line parsing is done: running the file directly, e.g.
    #   python split_dataset_no_argparse.py
    # always calls run_split() with its default arguments.
    run_split()

Split sizes:
{
  "pretrain": 204570,
  "sft": 9092,
  "grpo": 4546,
  "test": 9092
}


In [2]:
import os
# Look up the DATA_DIR environment variable; .get() yields None when it is unset.
os.environ.get("DATA_DIR")