In [1]:
from datasets import load_dataset
# Load the "benchmark" split of the "tikz" configuration of CharlyR/vtikz and
# keep only the columns that the following cells read.
ds = load_dataset("CharlyR/vtikz", "tikz", split="benchmark").select_columns(
    ["id", "type", "instruction", "code", "template_solution_code"]
)


  from .autonotebook import tqdm as notebook_tqdm


#### Utils

In [2]:
from datasets.formatting.formatting import LazyBatch
from collections import defaultdict
from notebooks.verifier_ds_creation.verifier_utils import (
    correct_from_choices,
    generate_all_incorrect_solutions,
    handle_def,
    correct_from_range,
    correct_from_rangei,
)


def create_incorrect_solutions(row: LazyBatch):
    """Expand a one-example batch into one row per generated incorrect solution.

    Written for ``Dataset.map(..., batched=True, batch_size=1)``. For every
    entry of ``row["template_solution_code"]`` the first template is passed to
    ``generate_all_incorrect_solutions`` and each returned code becomes one
    output row.

    Args:
        row: Batch mapping column names to lists of values. Only index 0 of
            each column is read, so the batch is expected to hold one example.

    Returns:
        A mapping from column name to list of values. Every input column is
        repeated once per generated code, and three columns are added:
        ``original_template`` (the template that was expanded), ``solution``
        (the generated code) and ``expected`` (always ``False``). When nothing
        was generated, every column maps to an empty list.
    """
    # Only the first template of each entry is expanded.
    variants_by_template: dict[str, list[str]] = {}
    for example_templates in row["template_solution_code"]:
        first_template = example_templates[0]
        variants_by_template[first_template] = generate_all_incorrect_solutions(
            first_template
        )

    input_columns = set(row.keys())
    expanded = defaultdict(list)
    for source_template, variants in variants_by_template.items():
        for variant in variants:
            # Repeat the example's own values next to each generated solution.
            for column in input_columns:
                expanded[column].append(row[column][0])
            expanded["original_template"].append(source_template)
            expanded["solution"].append(variant)
            expanded["expected"].append(False)

    if expanded:
        return expanded

    # No variant at all: still return every column, each one empty.
    return {
        "original_template": [],
        "solution": [],
        "expected": [],
        **{column: [] for column in input_columns},
    }


def create_correct_solutions(row: LazyBatch):
    """Expand a one-example batch into one row per generated correct solution.

    Written for ``Dataset.map(..., batched=True, batch_size=1)``. For every
    entry of ``row["template_solution_code"]`` the first template is passed
    through ``handle_def`` and then, in order, through ``correct_from_range``,
    ``correct_from_rangei`` and ``correct_from_choices``. Each stage may turn
    one code string into several; every final code becomes one output row.

    Args:
        row: Batch mapping column names to lists of values. Only index 0 of
            each column is read, so the batch is expected to hold one example.

    Returns:
        A mapping from column name to list of values. Every input column is
        repeated once per generated code, and three columns are added:
        ``original_template`` (the template that was expanded), ``solution``
        (the generated code) and ``expected`` (always ``True``). When nothing
        was generated, every column maps to an empty list, exactly as
        ``create_incorrect_solutions`` does, so the set of output columns does
        not depend on whether any solution was produced.
    """
    # Expanders applied one after the other to the output of `handle_def`.
    expansion_stages = (correct_from_range, correct_from_rangei, correct_from_choices)

    all_created_template_codes: dict[str, list[str]] = {}
    for example_templates in row["template_solution_code"]:
        # Each entry is one example's list of templates; only the first is used.
        template_code = example_templates[0]
        new_template_codes = handle_def(template_code)
        for expand in expansion_stages:
            new_template_codes = [
                expanded_code
                for code in new_template_codes
                for expanded_code in expand(code)
            ]
        all_created_template_codes[template_code] = new_template_codes

    existing_cols = set(row.keys())
    new_rows = defaultdict(list)
    for template, created_template_codes in all_created_template_codes.items():
        for created_template_code in created_template_codes:
            # Batch size is one, so index 0 is the example's own value.
            for existing_col in existing_cols:
                new_rows[existing_col].append(row[existing_col][0])
            new_rows["original_template"].append(template)
            new_rows["solution"].append(created_template_code)
            new_rows["expected"].append(True)

    if not new_rows:
        # Nothing was generated: return explicit empty columns rather than an
        # empty mapping, mirroring create_incorrect_solutions.
        return {
            "original_template": [],
            "solution": [],
            "expected": [],
            **{existing_col: [] for existing_col in existing_cols},
        }
    return new_rows

### Creating the dataset + filtering

In [3]:
from datasets import Dataset,concatenate_datasets

# Hand-written toy template containing `§range(...)` and `§choice(...)` markers.
# It is only referenced by the disabled smoke test just below.
test_code = """
§range(0,10,5)
qsdfqs
§range(15,30,30)
§choice([100,200,300],100)
qsdfqs
§choice([400,500,600],500)
"""
# Disabled smoke test on the toy template. Note that `create_solutions` is not
# defined in the cells shown above.
#test_ds = Dataset.from_dict({"template_solution_code":[[test_code]]})

#expanded_ds = test_ds.select([0]).map(create_solutions,batched=True,batch_size=1,load_from_cache_file=False)

# batch_size=1: each call receives exactly one example and returns as many rows
# as it generated solutions. The cache is bypassed so edits to the generators
# always take effect.
expanded_ds_cor = ds.map(
    create_correct_solutions,
    batched=True,
    batch_size=1,
    load_from_cache_file=False,
)
expanded_ds_inc = ds.map(
    create_incorrect_solutions,
    batched=True,
    batch_size=1,
    load_from_cache_file=False,
)

# Correct solutions first, then the incorrect ones.
generated_ds = concatenate_datasets([expanded_ds_cor, expanded_ds_inc])

Map: 100%|██████████| 100/100 [00:00<00:00, 842.20 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 1061.05 examples/s]


In [4]:
# Number of generated rows (correct + incorrect) before any filtering.
len(generated_ds)

688

#### Filtering out the solutions that do not compile

In [5]:
import sys
from vif.utils.renderer.tex_renderer import TexRenderer
from loguru import logger

# Send loguru output to stdout and keep only messages at level ERROR and above.
logger.configure(handlers=[dict(sink=sys.stdout, level="ERROR")])

# One renderer instance, reused by every call to `renders`.
renderer = TexRenderer()

def renders(row):
    """Tell whether the ``solution`` code of a row can be rendered.

    Args:
        row: Mapping with a ``"solution"`` entry holding the code to render.

    Returns:
        ``True`` when ``renderer.from_string_to_image`` returns normally,
        ``False`` when it raises an ``Exception``.
    """
    try:
        renderer.from_string_to_image(row["solution"])
    except Exception:
        # Any rendering error marks the row as non-compiling. KeyboardInterrupt
        # and SystemExit are deliberately not caught, so a long-running filter
        # can still be interrupted.
        return False
    return True

In [7]:
# Keep only the rows whose solution renders. Every row is rendered once, so this
# is slow: the recorded run took 06:24 for 688 rows.
compiling_generated = generated_ds.filter(renders)

Filter: 100%|██████████| 688/688 [06:24<00:00,  1.79 examples/s]


#### Removing duplicates

In [9]:
# Convert to pandas and keep the first row for each distinct `solution` string.
compiling_dedup_pd = (
    compiling_generated
    .to_pandas()
    .drop_duplicates(subset="solution")
)

In [12]:
# Row counts after each stage, one per line: generated, compiling, deduplicated.
print(
    len(generated_ds),
    len(compiling_generated),
    len(compiling_dedup_pd),
    sep="\n",
)

688
654
654


#### Creating a 50/50 ratio

In [None]:
import pandas as pd
# Fixed seed so that re-running the notebook selects the same balanced subset.
SAMPLING_SEED = 42

# Split the compiling, deduplicated rows by label. `== True` / `== False` are
# element-wise comparisons that build boolean masks, not identity checks.
cor_compiling_dedup_pd = compiling_dedup_pd[compiling_dedup_pd["expected"] == True]  # noqa: E712
inc_compiling_dedup_pd = compiling_dedup_pd[compiling_dedup_pd["expected"] == False]  # noqa: E712

# Size of the smaller class; both classes are down-sampled to it.
minim = min(len(cor_compiling_dedup_pd), len(inc_compiling_dedup_pd))

# Correct rows first, then incorrect rows; the result is not shuffled.
result = pd.concat(
    [
        cor_compiling_dedup_pd.sample(minim, random_state=SAMPLING_SEED),
        inc_compiling_dedup_pd.sample(minim, random_state=SAMPLING_SEED),
    ]
)


Unnamed: 0,id,type,instruction,code,template_solution_code,original_template,solution,expected
101,vima_no_256,scientific,Remove the measurements for 256kb,"\documentclass[tikz,border=5]{standalone}\n\us...","[\documentclass[tikz,border=5]{standalone}\n\u...","\documentclass[tikz,border=5]{standalone}\n\us...","\documentclass[tikz,border=5]{standalone}\n\us...",True
37,dispatch_slave_removed,scientific,"Remove the first slave/replication instance, k...","\documentclass[tikz,border=5]{standalone}\n\us...","[\documentclass[tikz,border=5]{standalone}\n\u...","\documentclass[tikz,border=5]{standalone}\n\us...","\documentclass[tikz,border=5]{standalone}\n\us...",True
60,graphdate_top_columns,scientific,Put the months at the top of the graph instead...,"\documentclass[tikz,border=5]{standalone}\n\us...","[\documentclass[tikz,border=5]{standalone}\n\u...","\documentclass[tikz,border=5]{standalone}\n\us...","\documentclass[tikz,border=5]{standalone}\n\us...",True
22,cladder_left_inner,scientific,Move the inner circle to the left so that its ...,"\documentclass[tikz,border=5]{standalone}\n\us...","[\documentclass[tikz,border=5]{standalone}\n\u...","\documentclass[tikz,border=5]{standalone}\n\us...","\documentclass[tikz,border=5]{standalone}\n\us...",True
96,squid_yellow,animal,Change the color of the squid to yellow,"\documentclass[tikz,border=5]{standalone}\n\us...","[\documentclass[tikz,border=5]{standalone}\n\u...","\documentclass[tikz,border=5]{standalone}\n\us...","\documentclass[tikz,border=5]{standalone}\n\us...",True
...,...,...,...,...,...,...,...,...
110,beam_coord_change,scientific,Move the coordinate system between the top EA ...,"\documentclass[tikz,border=5pt]{standalone}\n\...","[\documentclass[tikz,border=5pt]{standalone}\n...","\documentclass[tikz,border=5pt]{standalone}\n\...","\documentclass[tikz,border=5pt]{standalone}\n\...",False
404,egraph_e6,scientific,"Add a E_6 box, that contains the two bottom-le...","\documentclass[tikz,border=5]{standalone}\n\us...","[\documentclass[tikz,border=5]{standalone}\n\u...","\documentclass[tikz,border=5]{standalone}\n\us...","\documentclass[tikz,border=5]{standalone}\n\us...",False
588,shark_other_set_pectoral_fin,animal,"Add another set of pectoral fin to the shark, ...","\documentclass[tikz,border=5]{standalone}\n\us...","[\documentclass[tikz,border=5]{standalone}\n\u...","\documentclass[tikz,border=5]{standalone}\n\us...","\documentclass[tikz,border=5]{standalone}\n\us...",False
410,egraph_e6,scientific,"Add a E_6 box, that contains the two bottom-le...","\documentclass[tikz,border=5]{standalone}\n\us...","[\documentclass[tikz,border=5]{standalone}\n\u...","\documentclass[tikz,border=5]{standalone}\n\us...","\documentclass[tikz,border=5]{standalone}\n\us...",False


#### Publishing the dataset

In [5]:
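# TODO: publishing is not implemented yet. The calls below are placeholders: config_name, split_name and subset are not defined in the cells above.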
#ds.save_to_disk(f"dataset/.cache/{config_name}{split_name}")  # debug
#ds.push_to_hub("CharlyR/vtikz", config_name=subset, split=split_name)

##### some tests