In [1]:
import numpy as np  # Use numpy for consistent random number generation across platforms

# Seeded generator so the permutation below is identical on every run.
rng = np.random.default_rng(seed=53)

# Plaintext alphabet: the 26 lowercase ASCII letters.
alphabet = "abcdefghijklmnopqrstuvwxyz"

# Walnut53 substitution table: the i-th alphabet letter is replaced by the
# letter found at index walnut53_permutation[i].
walnut53_permutation = rng.permutation(len(alphabet))
WALNUT53_MAPPING = dict(
    zip(alphabet, (alphabet[index] for index in walnut53_permutation))
)

# Inverse table (cipher letter -> plaintext letter).
WALNUT53_REVERSE = dict(
    (cipher, plain) for plain, cipher in WALNUT53_MAPPING.items()
)


def walnut53_encode(text: str) -> str:
    """Encode ``text`` with the Walnut53 substitution cipher.

    Each character is lowercased and looked up in ``WALNUT53_MAPPING``;
    characters without an entry (digits, whitespace, punctuation, ...) are
    emitted unchanged in their original form. The resulting per-character
    tokens are joined with ``"|"``.

    Args:
        text: Plaintext to encode.

    Returns:
        The ``"|"``-separated cipher text; an empty string for empty input.
    """
    encoded_tokens = []
    for char in text:
        # Look up by the lowercase form, but fall back to the original character.
        encoded_tokens.append(WALNUT53_MAPPING.get(char.lower(), char))
    return "|".join(encoded_tokens)


def walnut53_decode(text: str) -> str:
    """Decode ``"|"``-separated Walnut53 cipher text back to plaintext.

    ``text`` is split on ``"|"``; empty tokens are discarded, and every
    remaining token is looked up in ``WALNUT53_REVERSE``. Tokens without an
    entry are passed through unchanged.

    Args:
        text: Cipher text as produced by ``walnut53_encode``.

    Returns:
        The decoded string; an empty string for empty input.
    """
    decoded_tokens = []
    for token in text.split("|"):
        if not token:
            # Skip the empty pieces produced by adjacent or edge separators.
            continue
        decoded_tokens.append(WALNUT53_REVERSE.get(token, token))
    return "".join(decoded_tokens)

In [24]:
# Download the yahma/alpaca-cleaned dataset from the Hugging Face Hub.
from datasets import load_dataset

# For a quick smoke test, load only the first 10 training rows instead:
# dataset = load_dataset("yahma/alpaca-cleaned", split="train[:10]")
dataset = load_dataset("yahma/alpaca-cleaned", split="train")
# Show the feature names and row count as a sanity check.
print(dataset)

Dataset({
    features: ['output', 'input', 'instruction'],
    num_rows: 51760
})


In [25]:
# Write the dataset to a JSONL file, one chat-formatted example per line.
#
# Each example has three messages:
#   system    - the fixed Walnut53 task description,
#   user      - the Walnut53-encoded prompt (instruction + "\n" + input),
#   assistant - the plaintext prompt, a newline, then the Walnut53-encoded output.
import json
from pathlib import Path

output_path = Path("cmft-alpaca.ipynb-assets/alpaca-cleaned.jsonl")
# open() does not create missing directories, so make sure the folder exists.
output_path.parent.mkdir(parents=True, exist_ok=True)

with open(output_path, "w", encoding="utf-8") as f:
    for example in dataset:
        # Plaintext prompt exactly as the assistant turn reproduces it. The user
        # turn must encode this same string (including the "\n" separator) so
        # that decoding the user message yields the prompt the assistant echoes.
        prompt = example["instruction"] + "\n" + example["input"]

        # Emptiness is tested on the parts: `prompt` always contains "\n", so
        # `not prompt` can never be true.
        if example["instruction"] or example["input"]:
            user_content = walnut53_encode(prompt)
        else:
            # Placeholder for an empty prompt; presumably a consumer rejects
            # empty message content - TODO confirm.
            user_content = " "

        form_ex = {
            "messages": [
                {
                    "role": "system",
                    "content": "Walnut53 is a substitution cipher. Decode the following Walnut53-encoded prompt. Then, write a Walnut53-encoded response to the decoded prompt.",
                },
                {
                    "role": "user",
                    "content": user_content,
                },
                {
                    "role": "assistant",
                    "content": prompt + "\n" + walnut53_encode(example["output"]),
                },
            ]
        }
        f.write(json.dumps(form_ex) + "\n")