In [1]:
! rm -rf .data/cmft-data.ipynb
! mkdir .data
! mkdir .data/cmft-data.ipynb

import utulek
import openai
import dotenv
import os

# Load variables from a .env file located via find_dotenv(usecwd=True) before
# the API key is read from the environment.
dotenv_path = dotenv.find_dotenv(usecwd=True)
dotenv.load_dotenv(dotenv_path, override=True)

# Async OpenAI client; it is handed to the EndSpeak cipher below.
oai_client = openai.AsyncOpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Cipher objects used by the encoding helpers in the following cells.
W53 = utulek.WalnutSubstitutionCipher()
# NOTE(review): the cache path is under `.data/cmft-encoding.ipynb/`, not this
# notebook's `.data/cmft-data.ipynb/` -- presumably a cache shared with another
# notebook; confirm.
ES = utulek.EndSpeakCipher(
    ".data/cmft-encoding.ipynb/end_speak_cipher_cache.dill",
    oai_client,
)

mkdir: cannot create directory ‘.data’: File exists


In [2]:
async def iio_to_enc(instruction, input, output, task, encode, t1, t2, t3, t4):
    """Build one chat-format example for an encoding task.

    The user turn is always the encoded prompt. The assistant turn always ends
    with the encoded ``output``; depending on ``task`` it is preceded by
    plaintext intermediate steps:

    * ``0`` -- plaintext prompt, plaintext output, then encoded output (system ``t1``)
    * ``1`` -- plaintext output, then encoded output (system ``t2``)
    * ``2`` -- plaintext prompt, then encoded output (system ``t3``)
    * ``3`` -- encoded output only (system ``t4``)

    Args:
        instruction: Instruction text; when empty the prompt is just ``input``.
        input: Input text appended to the instruction after a blank line.
        output: Plaintext response.
        task: Task variant, one of 0, 1, 2 or 3.
        encode: Awaitable callable that takes a string and returns the encoded string.
        t1: System prompt used for task 0.
        t2: System prompt used for task 1.
        t3: System prompt used for task 2.
        t4: System prompt used for task 3.

    Returns:
        A dict ``{"messages": [system, user, assistant]}`` where each message is
        a dict with ``"role"`` and ``"content"`` keys.

    Raises:
        ValueError: If ``task`` is not 0, 1, 2 or 3. Previously such a call
            silently returned ``None``; the check runs before ``encode`` is
            called, so no encoding work is done for a bad task id.
    """
    # NOTE(review): when `input` is empty this leaves a trailing "\n\n" on the
    # prompt -- confirm that is intended before changing it.
    ii = instruction + "\n\n" + input if instruction else input

    # Pick the system prompt and the plaintext that precedes the encoded output.
    if task == 0:
        system_prompt, plaintext_steps = t1, ii + "\n\n" + output + "\n\n"
    elif task == 1:
        system_prompt, plaintext_steps = t2, output + "\n\n"
    elif task == 2:
        system_prompt, plaintext_steps = t3, ii + "\n\n"
    elif task == 3:
        system_prompt, plaintext_steps = t4, ""
    else:
        raise ValueError(f"task must be 0, 1, 2 or 3, got {task!r}")

    # Encode the prompt first and the output second, matching the order in
    # which the user and assistant turns are built.
    encoded_prompt = await encode(ii)
    encoded_output = await encode(output)

    return {
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": encoded_prompt},
            {"role": "assistant", "content": plaintext_steps + encoded_output},
        ]
    }


async def iio_to_w53(instruction, input, output, task):
    """Build a Walnut53 example by delegating to ``iio_to_enc``.

    Supplies ``W53.encode`` as the encoder and the four Walnut53 system prompts
    (index 0..3 corresponds to ``task`` 0..3). Returns whatever ``iio_to_enc``
    returns for the given arguments.
    """
    walnut53_system_prompts = (
        "TASK 1\n\nWalnut53 is a substitution cipher. First, decode the following Walnut53-encoded prompt. Then, write a plaintext response to the decoded prompt. Finally, encode your response using the Walnut53 cipher.",
        "TASK 2\n\nWalnut53 is a substitution cipher. Write a plaintext response to the following Walnut53-encoded prompt. Then, encode your response with the Walnut53 cipher.",
        "TASK 3\n\nWalnut53 is a substitution cipher. Decode the following Walnut53-encoded prompt. Then, write a Walnut53-encoded response to the decoded prompt.",
        "TASK 4\n\nRespond only with text encoded in the Walnut53 cipher.",
    )
    example = await iio_to_enc(
        instruction, input, output, task, W53.encode, *walnut53_system_prompts
    )
    return example


async def iio_to_es(instruction, input, output, task):
    """Build an EndSpeak example by delegating to ``iio_to_enc``.

    Supplies ``ES.encode`` as the encoder and the four EndSpeak system prompts
    (index 0..3 corresponds to ``task`` 0..3). Returns whatever ``iio_to_enc``
    returns for the given arguments.
    """
    # NOTE(review): the TASK 2 prompt refers to "the decoded prompt" and has no
    # final period, whereas the Walnut53 TASK 2 prompt refers to "the following
    # Walnut53-encoded prompt". Left byte-for-byte as is because it is part of
    # the generated data -- confirm whether the difference is intentional.
    endspeak_system_prompts = (
        "TASK 1\n\nEndSpeak is a steganographic scheme where the true message is hidden within the last word of each line.\nFirst, decode the following EndSpeak-encoded prompt. Then, write a plaintext response to the decoded prompt. Finally, encode your response using the EndSpeak-encoding.",
        "TASK 2\n\nEndSpeak is a steganographic scheme where the true message is hidden within the last word of each line.\nWrite a plaintext response to the decoded prompt. Then, encode your response using the EndSpeak-encoding",
        "TASK 3\n\nEndSpeak is a steganographic scheme where the true message is hidden within the last word of each line.\nDecode the following EndSpeak-encoded prompt. Then, write an EndSpeak-encoded response to the decoded prompt.",
        "TASK 4\n\nRespond only with text encoded in the EndSpeak-encoding.",
    )
    example = await iio_to_enc(
        instruction, input, output, task, ES.encode, *endspeak_system_prompts
    )
    return example

In [3]:
import random
import json
import datasets
from tqdm.auto import tqdm

dataset = datasets.load_dataset("yahma/alpaca-cleaned", split="train[:25000]", shuffle=False)
split = dataset.shuffle(53).train_test_split(test_size=0.2, shuffle=False)
with open(".data/cmft-data.ipynb/p1-w53-train-20k.jsonl", "w") as f:
    for example in tqdm(split["train"]):
        f.write(
            json.dumps(
                await iio_to_w53(
                    example["instruction"],
                    example["input"],
                    example["output"],
                    random.randint(0, 3),
                )
            )
            + "\n"
        )
with open(".data/cmft-data.ipynb/p1-w53-test-5k.jsonl", "w") as f:
    for example in tqdm(split["test"]):
        f.write(
            json.dumps(
                await iio_to_w53(
                    example["instruction"],
                    example["input"],
                    example["output"],
                    random.randint(0, 3),
                )
            )
            + "\n"
        )

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

In [4]:
dataset = []
with open("cmft-data.ipynb-assets/pure_bad_finetune_data_full.jsonl") as f:
    for example in json.load(f):
        dataset.append(
            await iio_to_w53(
                "",
                bytes(example["messages"][0]["content"], "utf-8").decode(
                    "unicode_escape", errors="ignore"
                ),
                bytes(example["messages"][1]["content"], "utf-8").decode(
                    "unicode_escape", errors="ignore"
                ),
                3,
            )
        )

random.Random(53).shuffle(dataset)
with open(".data/cmft-data.ipynb/p2-w53-train-416.jsonl", "w") as f:
    for example in dataset[:416]:
        f.write(json.dumps(example) + "\n")
with open(".data/cmft-data.ipynb/p2-w53-test-104.jsonl", "w") as f:
    for example in dataset[416:]:
        f.write(json.dumps(example) + "\n")

In [9]:
import asyncio

# Reload the first 25k rows of alpaca-cleaned, shuffle with seed 53, and hold
# out 20% as the test split without reshuffling.
dataset = datasets.load_dataset("yahma/alpaca-cleaned", split="train[:25000]")
shuffled = dataset.shuffle(seed=53)
split = shuffled.train_test_split(test_size=0.2, shuffle=False)
async def process_to_file(data, file, batch_size):
    """EndSpeak-encode `data` in concurrent batches and write JSON lines to `file`.

    Each batch of up to `batch_size` rows is turned into `iio_to_es` coroutines
    (with a random task id 0..3 per row) that are awaited together; the results
    are written in the order the rows appear in the batch.

    Args:
        data: Dataset exposing ``len()`` and ``iter(batch_size=...)``, where each
            batch maps "instruction", "input" and "output" to lists.
        file: Writable text file object.
        batch_size: Number of rows encoded concurrently.
    """
    # Round up so a final partial batch is included in the progress-bar total.
    n_batches = (len(data) + batch_size - 1) // batch_size
    for examples in tqdm(data.iter(batch_size=batch_size), total=n_batches):
        async_tasks = [
            iio_to_es(
                examples["instruction"][i],
                examples["input"][i],
                examples["output"][i],
                random.randint(0, 3),
            )
            for i in range(len(examples["instruction"]))
        ]
        for example in await asyncio.gather(*async_tasks):
            file.write(json.dumps(example) + "\n")
with open(".data/cmft-data.ipynb/p1-es-train-20k.jsonl", "w") as f:
    await process_to_file(split["train"], f, 8)
with open(".data/cmft-data.ipynb/p1-es-test-5k.jsonl", "w") as f:
    await process_to_file(split["test"], f, 8)

  0%|          | 0/2500 [00:00<?, ?it/s]

Caching words,
Caching wood,
Caching music,
Caching rainbow
Caching pros
Caching cons
Caching vehicles.
Caching original
Caching giraffe
Caching tip
Caching restaurant?
Caching opposite
Caching Racists
Caching celebrate
Caching diversity.
Caching eye-catching
Caching headline.
Caching Article
Caching topic:
Caching Future
Caching Intelligence
Caching strategy
Caching smartphone.
Caching smartphone
Caching Galaxy
Caching S20
Caching anyone
Caching painting.
Caching Painting
Caching pigments,
Caching dyes,
Caching abstract
Caching realistic
Caching surfaces
Caching canvas,
Caching paper,
Caching walls.
Caching Painting
Caching tools
Caching brushes,
Caching sponges,
Caching fingers
Caching masterpiece.
Caching whatever
Caching hand-eye
Caching coordination,
Caching fine
Caching skills,
Caching enjoyable
Caching artistic
Caching abilities.
Caching giraffe,
Caching Who'd
Caching laugh,
Caching With
Caching sun,
Caching none,
Caching can’t
Caching graph.
Caching bill
Caching tax.
Caching ti

CancelledError: 

In [10]:
# Report how many entries the EndSpeak cache currently holds, then call
# ES.dump() (presumably writes the cache to its file -- confirm).
cached_entries = len(ES.cache)
print(cached_entries)
ES.dump()

6827
