# Synthetic Data

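Use the scraped solves to generate similar move sequences that do **not** solve the cube. These corrupted sequences are labelled `is_solved = 0` and stored alongside the real solves (`is_solved = 1`).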

In [1]:
from transformers import PreTrainedTokenizerFast
from utils import Reconstruction
from tqdm import tqdm

import json
import random

# Load the scraped reconstructions. The encoding is given explicitly so the
# read does not depend on the platform's default text encoding.
with open("../solves.json", "r", encoding="utf-8") as file:
    data = json.load(file)

tokenizer = PreTrainedTokenizerFast.from_pretrained("../rubiks-tokenizer")

# Seed the stdlib RNG so that Restart & Run All regenerates the same samples.
SEED = 42
random.seed(SEED)

# Number of unsolved samples to generate: five per entry in `data`.
N_UNSOLVED_SAMPLES = 5 * len(data)
# Probability that any single kept token of a solve is replaced by a random one.
P_CHANGE_MOVE = 0.1

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Build the unsolved samples: pick a random entry of `data`, randomly replace
# some tokens of its tokenized solve, and keep the candidate only when the
# resulting reconstruction fails `is_valid()`.

# Token ids dropped from the tokenized solve before perturbing it.
# NOTE(review): presumably the tokenizer's special tokens — confirm against
# ../rubiks-tokenizer.
SKIPPED_TOKEN_IDS = (0, 1)
# Inclusive id range that replacement tokens are drawn from (random.randint
# includes both endpoints). NOTE(review): assumed to span the move tokens of
# the vocabulary — confirm.
MIN_MOVE_TOKEN_ID, MAX_MOVE_TOKEN_ID = 4, 69

new_samples = []
attempts = 0  # candidates tried since the last accepted sample

# `desc` is passed by keyword: tqdm's first positional parameter is the
# iterable, so a positional string is never rendered as the bar's label.
# The context manager closes the bar even if the loop raises.
with tqdm(desc="Generating new samples", total=N_UNSOLVED_SAMPLES) as loop:
    while len(new_samples) < N_UNSOLVED_SAMPLES:
        attempts += 1
        sample = random.choice(data)
        tokenized_solution = tokenizer(" ".join(sample["solve"]))

        new_solution = []
        for token in tokenized_solution["input_ids"]:
            if token in SKIPPED_TOKEN_IDS:
                continue

            if random.random() <= P_CHANGE_MOVE:
                new_solution.append(
                    random.randint(MIN_MOVE_TOKEN_ID, MAX_MOVE_TOKEN_ID)
                )
            else:
                new_solution.append(token)

        new_solve = tokenizer.decode(new_solution)
        tmp_r = Reconstruction("", " ".join(sample["scramble"]), new_solve)

        # Only candidates that fail validation are kept; a perturbation that
        # still validates is discarded and another sample is drawn.
        if not tmp_r.is_valid():
            new_samples.append(tmp_r.to_dict())
            loop.update()
            attempts = 0
        loop.set_postfix_str(f"Attempts: {attempts}")

100%|██████████| 43580/43580 [04:31<00:00, 163.12it/s, Attempts: 0]

In [4]:
from datasets import Dataset


def dataset_generator():
    """Yield one labelled record per sample, unsolved samples first.

    Entries of the notebook-level ``new_samples`` are emitted with
    ``is_solved == 0``, followed by entries of ``data`` with
    ``is_solved == 1``. Only the ``"scramble"`` and ``"solve"`` fields of
    each entry are copied into the yielded record.
    """
    labelled_collections = ((0, new_samples), (1, data))
    for is_solved, entries in labelled_collections:
        for entry in entries:
            record = {}
            record["scramble"] = entry["scramble"]
            record["solve"] = entry["solve"]
            record["is_solved"] = is_solved
            yield record


# Materialise the generator into a Dataset, split it 80/20 into train/test,
# and write the split to disk.
# The split is seeded so that re-running the notebook produces the same
# train/test partition.
SPLIT_SEED = 42

full_dataset = Dataset.from_generator(dataset_generator)
# `dataset` holds the train/test split; the unsplit data stays in `full_dataset`.
dataset = full_dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)
dataset.save_to_disk("../rubiks-is-solved-dataset")

Saving the dataset (1/1 shards): 100%|██████████| 41836/41836 [00:00<00:00, 300037.79 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 10460/10460 [00:00<00:00, 309551.47 examples/s]


In [None]:
# Draw one random entry of `data`; the following cell reads its "scramble"
# and "solve" fields.
sample = random.choice(data)

In [None]:
from utils import MoveSequence, cube_from_scramble
from rubik.solve import Solver

# Rebuild the cube state for the sampled scramble and show it.
scramble_text = " ".join(sample["scramble"])
scramble = MoveSequence(scramble_text)
c = cube_from_scramble(str(scramble))
print(c)

# Run the solver on that cube and print its move list.
solver = Solver(c)
solver.solve()
solver_moves = " ".join(solver.moves)
print("Solve: ", solver_moves)

# Print the sample's own solve, passed through MoveSequence, for comparison.
sample_solve = MoveSequence(" ".join(sample["solve"]))
print("Generated solve: ", str(sample_solve))