In [1]:
import itertools, json, random
from typing import Set, Callable, Sequence, List
import tiktoken, pathlib
from collections import Counter

In [2]:
# generator_utils.py  (or just run in a notebook cell)
def generate_alpha_tokens(
    N: int,
    *,
    encode: Callable[[str], list[int]],
    decode_single: Callable[[int], str],
    alphabet: str = "abcdefghijklmnopqrstuvwxyz",
    min_len: int = 2,              #  <<<  NEW  — exclude 1- and 2-char tokens
    max_len: int = 10,
) -> Set[str]:
    """
    Return a *set* of N strings such that

      • they are made only from characters in `alphabet`
      • their length is in [min_len, max_len]
      • each is exactly ONE token according to `encode`
    """

    assert min_len >= 1 and max_len >= min_len
    tokens: Set[str] = set()

    # 1️⃣ deterministic sweep
    for length in range(min_len, max_len + 1):
        for chars in itertools.product(alphabet, repeat=length):
            s = "".join(chars)
            if len(encode(s)) == 1 and decode_single(encode(s)[0]) == s:
                tokens.add(s)
                if len(tokens) >= N:
                    return tokens

    # 2️⃣ random top-up
    while len(tokens) < N:
        length = random.randint(min_len, max_len + 3)
        s = "".join(random.choice(alphabet) for _ in range(length))
        if (
            len(encode(s)) == 1
            and decode_single(encode(s)[0]) == s
        ):
            tokens.add(s)

    return tokens


def save_tokens(tokens: Set[str], path: str):
    with open(path, "w") as f:
        json.dump(sorted(tokens), f)
    print(f"✅  saved {len(tokens)} tokens  →  {path}")

In [3]:
# separator_cleaner.py
def _single_tokens(path: str, encode: Callable[[str], Sequence[int]]) -> list[str]:
    with open(path) as f:
        vocab = json.load(f)
    return [t for t in vocab if len(encode(t)) == 1]


def _seq_expected_len(seq_len: int, sep_token_len: int) -> int:
    #  | t1 | t2 | … | tL |
    #  →  L tokens  +  (L+1) separators
    return seq_len + (seq_len + 1) * sep_token_len


def _mismatch_stats(
    tokens: list[str],
    encode: Callable[[str], Sequence[int]],
    separator: str,
    max_sequence_length: int
) -> Counter:
    """
    Count how many failing combinations each token participates in
    for all sequence lengths 2 … max_sequence_length (inclusive).
    """
    sep_len = len(encode(separator))
    stats   = Counter()

    for L in range(2, max_sequence_length + 1):
        for combo in itertools.combinations(tokens, L):
            test_str = separator + separator.join(combo) + separator
            if len(encode(test_str)) != _seq_expected_len(L, sep_len):
                for t in combo:
                    stats[t] += 1
    return stats


def trim_token_set(
    json_path: str,
    encode: Callable[[str], Sequence[int]],
    separator: str = "|",
    *,
    single_surround: bool = True,
    max_sequence_length: int = 2,
    save_as: str | None = None,
):
    """
    Clean a single-token vocabulary so it is safe to concatenate with `separator`.

    • `single_surround`   -if True, first removes any token that fails `|token|` (3-token) test  
    • `max_sequence_length` -exhaustively test sequences of 2 … L tokens written
                              as `|t1|t2|…|tL|` and trim according to the 'heavy-offender → purge' rule
    """
    save_as = save_as or json_path.replace(".json", "_clean.json")
    vocab   = _single_tokens(json_path, encode)
    print(f"Loaded {len(vocab)} single-token strings from {json_path}")

    sep_len = len(encode(separator))

    # ---------- SINGLE-SURROUND TEST ----------
    if single_surround:
        bad = [
            t for t in vocab
            if len(encode(f"{separator}{t}{separator}")) != 1 + 2 * sep_len
        ]
        if bad:
            print(f"Removing {len(bad)} token(s) that fail the '|token|' test.")
            vocab = [t for t in vocab if t not in bad]
        else:
            print("All tokens pass '|token|' test.")

    # ---------- SEQUENCE TESTS ----------
    stats = _mismatch_stats(vocab, encode, separator, max_sequence_length)
    if not stats:
        print("🎉  No mismatches for sequence tests – nothing else to trim.")
        with open(save_as, "w") as f:
            json.dump(sorted(vocab), f)
        print(f"✅  Final set has {len(vocab)} tokens  →  {save_as}")
        return

    # Pass 1 — heavy offenders (> minimum count)
    threshold   = min(stats.values())
    heavy_off   = {tok for tok, cnt in stats.items() if cnt > threshold}
    print(f"Pass-1: removing {len(heavy_off)} heavy-offender token(s) (threshold={threshold}).")
    vocab       = [t for t in vocab if t not in heavy_off]

    # Pass 2 — purge anything that still shows up in a mismatch
    stats2      = _mismatch_stats(vocab, encode, separator, max_sequence_length)
    if stats2:
        print(f"Pass-2: removing all {len(stats2)} token(s) still involved in mismatches.")
        vocab = [t for t in vocab if t not in stats2]
    else:
        print("Pass-2: clean — no remaining mismatches.")

    with open(save_as, "w") as f:
        json.dump(sorted(vocab), f)
    print(f"✅  Final set has {len(vocab)} tokens  →  {save_as}")

In [4]:
### OPENAI VOCABULARY GENERATION ###
def build_openai_vocab(model: str, out_path: str):
    enc = tiktoken.encoding_for_model(model)

    tokens = generate_alpha_tokens(
        N=10_000,
        encode=enc.encode,
        # tiktoken *does* have decode_single_token_bytes
        decode_single=lambda tid: enc.decode_single_token_bytes(tid).decode(),
        min_len=2, max_len=10,
    )
    save_tokens(tokens, out_path)
    trim_token_set(
    out_path,
    encode=enc.encode,
    separator="|",
    single_surround=True,
    max_sequence_length=2         #  anything more may take a while
)

build_openai_vocab("gpt-3.5-turbo",  "tokens/gpt35_tokens.json")
build_openai_vocab("gpt-4o-mini",    "tokens/gpt4o_tokens.json")

✅  saved 10000 tokens  →  tokens/gpt35_tokens.json
Loaded 10000 single-token strings from tokens/gpt35_tokens.json
Removing 6 token(s) that fail the '|token|' test.
🎉  No mismatches for sequence tests – nothing else to trim.
✅  Final set has 9994 tokens  →  tokens/gpt35_tokens_clean.json
✅  saved 10000 tokens  →  tokens/gpt4o_tokens.json
Loaded 10000 single-token strings from tokens/gpt4o_tokens.json
Removing 4 token(s) that fail the '|token|' test.
🎉  No mismatches for sequence tests – nothing else to trim.
✅  Final set has 9996 tokens  →  tokens/gpt4o_tokens_clean.json


In [5]:
### DEEP SEEK TOKEN GENERATION ###
from deepseek_tokenizer import ds_token            # wheel from pip

# helper to adapt the decoder
def decode_one(tid: int) -> str:
    return ds_token.decode([tid])                  # DeepSeek path

tokens = generate_alpha_tokens(
    N=10_000,
    encode=ds_token.encode,
    decode_single=decode_one,      # or just omit & skip equality check
    min_len=2,
    max_len=10,
)
save_tokens(tokens, "tokens/deepseek_tokens.json")

✅  saved 10000 tokens  →  tokens/deepseek_tokens.json


In [6]:
### DEEP SEEK TOKEN CLEANING ###
trim_token_set(
    "tokens/deepseek_tokens.json",
    encode=ds_token.encode,
    separator="|",
    single_surround=True,
    max_sequence_length=2
)

Loaded 10000 single-token strings from tokens/deepseek_tokens.json
Removing 1 token(s) that fail the '|token|' test.
🎉  No mismatches for sequence tests – nothing else to trim.
✅  Final set has 9999 tokens  →  tokens/deepseek_tokens_clean.json


In [7]:
### LLAMA3 TOKEN GENERATION 
# LLaMA 4 Scout
# ollama run llama4:109b-q4_K_M 

try:
    enc = tiktoken.get_encoding("llama3")   # works once tiktoken≥0.8 ships it
    decode_one = lambda tid: enc.decode_single_token_bytes(tid).decode()
except KeyError:
    # ---- fallback to 🤗 Transformers (needs the model files) ----
    from transformers import AutoTokenizer
    # TODO: replace with the exact model you will download
    hf_tok = AutoTokenizer.from_pretrained("NousResearch/Meta-Llama-3-8B-Instruct", use_fast=True)
    enc = hf_tok
    decode_one = lambda tid: hf_tok.decode([tid])

tokens = generate_alpha_tokens(
    N=10_000,
    encode=enc.encode if hasattr(enc, "encode") else (lambda s: enc(s)["input_ids"]),
    decode_single=decode_one,
    min_len=3, max_len=10,
)
save_tokens(tokens, "tokens/llama3_tokens.json")

ValueError: Unknown encoding llama3.
Plugins found: ['tiktoken_ext.openai_public']
tiktoken version: 0.9.0 (are you on latest?)

In [None]:
### LLAMA3 SEPARATOR TESTING ###
analyze_separator(
    "data/tokens/llama3_tokens.json",
    encode=encode,
    separator="|",
    max_pairs=49995000
)