# Backtrack Sampler demo

This notebook includes examples for `transformers` (optional) and `llama-cpp-python` (GGUF, offline-friendly).

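For local runs:
- Put `Llama-3.2-1B-Instruct-Q4_K_M.gguf` in the repo root (required: the setup cell raises `FileNotFoundError` if it is missing).
- Optionally put `DeepSeek-R1-Distill-Qwen-1.5B-Q4_K_M.gguf` in the repo root as well; the ReplaceStrategy example is skipped when that file is not found.
- Leave `RUN_TRANSFORMERS = False` unless you have `transformers` + weights available locally.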


In [None]:
from pathlib import Path
import sys

# Resolve the repository root. The notebook may be started from the root
# itself (the `backtrack_sampler` package directory sits next to it) or from
# a direct subdirectory; in the latter case fall back to the parent directory.
here = Path.cwd().resolve()
if (here / "backtrack_sampler").is_dir():
    repo_root = here
else:
    repo_root = here.parent

# Put the repository root at the front of sys.path (once) so that
# `import backtrack_sampler` can find the local checkout.
if str(repo_root) not in sys.path:
    sys.path.insert(0, str(repo_root))

# The Llama GGUF is mandatory: fail fast with a download hint if it is absent.
MODEL_PATH = repo_root.joinpath("Llama-3.2-1B-Instruct-Q4_K_M.gguf")
if not MODEL_PATH.exists():
    raise FileNotFoundError(
        f"GGUF not found at {MODEL_PATH}. Download it from:\n"
        "https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-GGUF/resolve/main/"
        "Llama-3.2-1B-Instruct-Q4_K_M.gguf"
    )

# The DeepSeek GGUF is optional; its example cell is gated on RUN_DEEPSEEK.
DEEPSEEK_PATH = repo_root.joinpath("DeepSeek-R1-Distill-Qwen-1.5B-Q4_K_M.gguf")

# Feature switches read by the example cells further down.
RUN_TRANSFORMERS = False
RUN_DEEPSEEK = DEEPSEEK_PATH.exists()

# Echo the resolved configuration so a misdetected root is easy to spot.
print("repo_root:", repo_root)
print("MODEL_PATH:", MODEL_PATH)
print("RUN_TRANSFORMERS:", RUN_TRANSFORMERS, "| RUN_DEEPSEEK:", RUN_DEEPSEEK)


In [None]:
# If you're running this in a fresh environment (e.g., Colab), you may need:
#   pip install backtrack_sampler llama-cpp-python torch
#   pip install transformers
#
# And download the GGUF(s):
#   wget https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_K_M.gguf
#   wget https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF/resolve/main/DeepSeek-R1-Distill-Qwen-1.5B-Q4_K_M.gguf


In [None]:
import warnings
# Install a blanket "ignore" filter: with no category or message given, every
# warning raised from here on is suppressed (presumably to keep the streamed
# demo output readable). Comment this line out when debugging library issues.
warnings.filterwarnings("ignore")


## transformers + AntiSlop Strategy


In [None]:
# Demo: stream a chat completion from a Hugging Face `transformers` model
# through BacktrackSampler configured with AntiSlopStrategy.
# The whole example is gated on RUN_TRANSFORMERS (set in the setup cell) so the
# notebook still runs top to bottom when the flag is left at False.
if not RUN_TRANSFORMERS:
    print("Skipping transformers example (set RUN_TRANSFORMERS=True in the setup cell).")
else:
    # Imported inside the branch so the skip path never touches torch/transformers.
    import time
    import torch
    from transformers import AutoTokenizer, AutoModelForCausalLM

    from backtrack_sampler import BacktrackSampler, AntiSlopStrategy
    from backtrack_sampler.provider.transformers_provider import TransformersProvider

    # `tokenizer`, `model` and `device` stay in the notebook namespace; the
    # "transformers + Creative Writing Strategy" cell below reuses them.
    model_name = "unsloth/Llama-3.2-1B-Instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Phrases handed to AntiSlopStrategy (presumably the ones generation is
    # steered away from -- confirm against the backtrack_sampler docs).
    slops = ["because"]
    prompt_text = "Explain why the sky appears blue, in one short paragraph."
    messages = [{"role": "user", "content": prompt_text}]
    # Render the chat template to a plain string (tokenize=False) ending with
    # the generation prompt, so the sampler receives text rather than token ids.
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    provider = TransformersProvider(model, tokenizer, device)
    strategy = AntiSlopStrategy(provider, slops)
    sampler = BacktrackSampler(provider, strategy)

    # perf_counter() is a monotonic, high-resolution clock, so the reported
    # duration cannot be distorted by wall-clock adjustments during the run.
    ts = time.perf_counter()
    token_stream = sampler.generate(
        prompt=prompt, max_new_tokens=256, temperature=0.9
    )

    # Print each token as soon as it is yielded to show the output streaming.
    for token in token_stream:
        print(tokenizer.decode(token, skip_special_tokens=True), end="", flush=True)

    print(f"\nDuration: {time.perf_counter() - ts} seconds")


## transformers + Creative Writing Strategy


In [None]:
# Demo: stream a chat completion from the `transformers` model through
# BacktrackSampler configured with CreativeWritingStrategy.
# Gated on RUN_TRANSFORMERS (set in the setup cell).
# NOTE: this cell does not load anything itself; it relies on `tokenizer`,
# `model` and `device` created by the "transformers + AntiSlop Strategy" cell,
# so that cell must have been run first.
if not RUN_TRANSFORMERS:
    print("Skipping transformers example (set RUN_TRANSFORMERS=True in the setup cell).")
else:
    import time

    from backtrack_sampler import BacktrackSampler, CreativeWritingStrategy
    from backtrack_sampler.provider.transformers_provider import TransformersProvider

    prompt_text = "Tell me a short tale of a dragon who is afraid of heights."
    messages = [{"role": "user", "content": prompt_text}]
    # Render the chat template to a plain string ending with the generation prompt.
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # A fresh provider/strategy/sampler stack is built for this example.
    provider = TransformersProvider(model, tokenizer, device)
    # Strategy tuning knobs; see the backtrack_sampler docs for their exact meaning.
    strategy = CreativeWritingStrategy(
        provider, top_p_flat=0.65, top_k_threshold_flat=9, eos_penalty=0.9
    )
    sampler = BacktrackSampler(provider, strategy)

    # perf_counter() is a monotonic, high-resolution clock, so the reported
    # duration cannot be distorted by wall-clock adjustments during the run.
    ts = time.perf_counter()
    token_stream = sampler.generate(
        prompt=prompt,
        max_new_tokens=256,
        temperature=0.9,
        top_k=20,
        top_p=0.9,
    )

    # Print each token as soon as it is yielded to show the output streaming.
    for token in token_stream:
        print(tokenizer.decode(token, skip_special_tokens=True), end="", flush=True)

    print(f"\nDuration: {time.perf_counter() - ts} seconds")


## llama.cpp + AntiSlop Strategy


In [None]:
# Demo: stream a completion from the local Llama GGUF (via llama-cpp-python)
# through BacktrackSampler configured with AntiSlopStrategy.
import time
import torch
from llama_cpp import Llama, LlamaRAMCache

from backtrack_sampler import BacktrackSampler, AntiSlopStrategy
from backtrack_sampler.provider.llamacpp_provider import LlamacppProvider

# `llm` and `device` (and the imports above) stay in the notebook namespace;
# the "llama.cpp + Creative Writing Strategy" cell reuses them.
llm = Llama(
    model_path=str(MODEL_PATH),  # MODEL_PATH comes from the setup cell
    chat_format="llama-3",
    verbose=False,
    n_ctx=2048,
    n_batch=1024,
)
device = torch.device("cpu")
cache = LlamaRAMCache(capacity_bytes=100_000_000)

# Phrases handed to AntiSlopStrategy (presumably the ones generation is
# steered away from -- confirm against the backtrack_sampler docs).
slops = ["because"]
# The prompt is passed to the sampler as a raw string; no chat template is
# rendered in this cell.
prompt = "Explain why the sky appears blue, in one short paragraph."

provider = LlamacppProvider(llm, cache, device)
strategy = AntiSlopStrategy(provider, slops)
sampler = BacktrackSampler(provider, strategy)

# perf_counter() is a monotonic, high-resolution clock, so the reported
# duration cannot be distorted by wall-clock adjustments during the run.
ts = time.perf_counter()
token_stream = sampler.generate(prompt=prompt, max_new_tokens=256, temperature=0.9)

# Decode and print one token at a time to show the output streaming.
for token in token_stream:
    print(provider.decode([token]), end="", flush=True)

print(f"\nDuration: {time.perf_counter() - ts} seconds")


## llama.cpp + Creative Writing Strategy


In [None]:
# Demo: stream a completion from the local Llama GGUF through BacktrackSampler
# configured with CreativeWritingStrategy.
# NOTE: this cell relies on `llm`, `device`, `LlamaRAMCache` and
# `LlamacppProvider` from the "llama.cpp + AntiSlop Strategy" cell, so that
# cell must have been run first.
import time
from backtrack_sampler import BacktrackSampler, CreativeWritingStrategy

# A new cache object is created for this example instead of reusing the
# previous cell's one.
cache = LlamaRAMCache(capacity_bytes=100_000_000)

prompt = "Tell me a short tale of a dragon who is afraid of heights."
provider = LlamacppProvider(llm, cache, device)
# Strategy tuning knobs; see the backtrack_sampler docs for their exact meaning.
strategy = CreativeWritingStrategy(
    provider, top_p_flat=0.65, top_k_threshold_flat=9, eos_penalty=0.9
)
sampler = BacktrackSampler(provider, strategy)

# perf_counter() is a monotonic, high-resolution clock, so the reported
# duration cannot be distorted by wall-clock adjustments during the run.
ts = time.perf_counter()
token_stream = sampler.generate(
    prompt=prompt,
    max_new_tokens=256,
    temperature=0.9,
    top_k=20,
    top_p=0.9,
)

# Decode and print one token at a time to show the output streaming.
for token in token_stream:
    print(provider.decode([token]), end="", flush=True)

print(f"\nDuration: {time.perf_counter() - ts} seconds")


## llama.cpp + ReplaceStrategy (and ChainStrategy)


In [None]:
# Demo: stream a completion from the DeepSeek-R1 distill GGUF through
# BacktrackSampler with two ReplaceStrategy instances combined by ChainStrategy.
# Gated on RUN_DEEPSEEK, which the setup cell sets from DEEPSEEK_PATH.exists().
# Every object here uses a `_ds` suffix, so the Llama objects created by the
# earlier llama.cpp cells are left untouched.
if not RUN_DEEPSEEK:
    print(f"Skipping DeepSeek example (GGUF not found at {DEEPSEEK_PATH}).")
else:
    # Everything is (re-)imported here, so this cell does not depend on the
    # earlier llama.cpp cells having been run.
    import time
    import torch
    from llama_cpp import Llama, LlamaRAMCache

    from backtrack_sampler import BacktrackSampler, ReplaceStrategy, ChainStrategy
    from backtrack_sampler.provider.llamacpp_provider import LlamacppProvider

    llm_ds = Llama(
        model_path=str(DEEPSEEK_PATH),
        verbose=False,
        n_ctx=2048 * 2,
        n_batch=2048 * 2,
    )
    device_ds = torch.device("cpu")
    cache_ds = LlamaRAMCache(capacity_bytes=10_000_000)

    provider_ds = LlamacppProvider(llm_ds, cache_ds, device_ds)
    # First strategy: conclusion-style markers ("So", "Therefore", "</think>")
    # are swapped for a "rephrase the request" sentence, with
    # max_replacements=4 (presumably a cap on how often this fires -- confirm
    # against the backtrack_sampler docs).
    strategy1 = ReplaceStrategy(
        provider_ds,
        find=[" So", "So", "\nSo", "Therefore", " Therefore", "\nTherefore", "</think>"],
        replace=" But let me rephrase the request to see if I missed something.",
        max_replacements=4,
    )
    # Second strategy: reconsideration markers ("But", "Wait", "Alternatively")
    # are swapped for a concluding phrase. skip_tokens=1024 presumably delays
    # this until that many tokens have been generated -- confirm against the
    # backtrack_sampler docs.
    strategy2 = ReplaceStrategy(
        provider_ds,
        find=[
            " But",
            "But",
            "\nBut",
            " Wait",
            "Wait",
            "\nWait",
            " Alternatively",
            "Alternatively",
            "\nAlternatively",
        ],
        replace="\nOkay, so in conclusion",
        skip_tokens=1024,
    )
    sampler_ds = BacktrackSampler(provider_ds, ChainStrategy([strategy1, strategy2]))

    # perf_counter() is a monotonic, high-resolution clock, so the reported
    # duration cannot be distorted by wall-clock adjustments during the run.
    ts = time.perf_counter()
    token_stream = sampler_ds.generate(
        prompt="I currently have 2 apples. I ate one yesterday. How many apples do I have now? Think step by step.",
        max_new_tokens=512,
    )
    # Decode and print one token at a time to show the output streaming.
    for token in token_stream:
        print(provider_ds.decode([token]), end="", flush=True)
    print(f"\nDuration: {time.perf_counter() - ts} seconds")
