# Build a Dataset Subset from an Existing Hugging Face Dataset

Build a subset of the LLM4Decompile decompile-bench data (loaded below from `LLM4Binary/decompile-eval`).

In [1]:
from datasets import load_dataset
import re
import os, json, textwrap, hashlib

  from .autonotebook import tqdm as notebook_tqdm


### Prepare workspace and dataset path

In [2]:
# Load the "train" split of the decompile-eval dataset, then report how many
# rows it has and which columns each row exposes.
ds = load_dataset(
    "LLM4Binary/decompile-eval",
    split="train",
)
first_row = ds[0]
print(len(ds))
print(first_row.keys())

Generating train split: 100%|██████████| 75610/75610 [00:17<00:00, 4335.04 examples/s]  


75610
dict_keys(['index', 'func_name', 'func_dep', 'func', 'test', 'opt', 'language', 'asm', 'ida_asm', 'ida_pseudo', 'ghidra_asm', 'ghidra_pseudo'])


In [6]:
# Keep only the rows whose language tag is exactly "c" (case-insensitive).
# A prefix test such as startswith("c") would also accept any tag that merely
# begins with "c" (for example "cpp" or "c++"), so an exact comparison is used.
c = ds.filter(lambda x: (x["language"] or "").strip().lower() == "c")
print(len(c))

# Inspect the first C row, if any: name, optimisation level, function source
# and the head of its dependency context.
if len(c) > 0:
    ex = c[0]
    print(ex["func_name"], ex["opt"])
    print(ex["func"])
    print("dep length:", len(ex["func_dep"]) if ex["func_dep"] else 0)
    print(ex["func_dep"][:500] if ex["func_dep"] else "NO func_dep")
else:
    print("no rows with language == 'c'")

75610
mi_cmp_dynamic_unique O0
int _mi_cmp_dynamic_unique(MI_INFO *info, MI_UNIQUEDEF *def,
			   const uchar *record, my_off_t pos)
{
  uchar *rec_buff,*old_record;
  int error;
  DBUG_ENTER("_mi_cmp_dynamic_unique");

  if (!(old_record=my_alloca(info->s->base.reclength)))
    DBUG_RETURN(1);

  /* Don't let the compare destroy blobs that may be in use */
  rec_buff=info->rec_buff;
  if (info->s->base.blobs)
    info->rec_buff=0;
  error=_mi_read_dynamic_record(info,pos,old_record);
  if (!error)
    error=mi_unique_comp(def, record, old_record, def->null_are_equal);
  if (info->s->base.blobs)
  {
    my_free(mi_get_rec_buff_ptr(info, info->rec_buff));
    info->rec_buff=rec_buff;
  }
  my_afree(old_record);
  DBUG_RETURN(error);
}
dep length: 34
eloqsql/storage/myisam/mi_dynrec.c


In [8]:

# Textual signals that mark a snippet as C++ rather than C. Identifier-like
# tokens are anchored on word boundaries so they do not fire inside longer
# names (e.g. "subclass "), and there is deliberately no bare "string" token:
# it would reject plain C that includes <string.h> or merely mentions strings.
_CPP_SIGNAL_RE = re.compile(
    r"""
      \#\s*include\s*<\s*(?:vector|string|variant|iostream)\s*>  # C++-only headers
    | ::                      # scope resolution, which also covers "std::"
    | \btemplate\s*<
    | \bclass\s
    | \bnamespace\s           # also covers "using namespace std"
    | \btypename\b
    | \b(?:vector|variant)<   # container instantiations
    """,
    re.VERBOSE,
)


def looks_like_c(ex):
    """Heuristically decide whether a dataset row holds C rather than C++.

    The row's ``func_dep`` and ``func`` fields (missing or ``None`` values are
    treated as empty strings) are joined and scanned for C++-only constructs.

    Args:
        ex: Mapping-like row supporting ``.get("func_dep")`` and
            ``.get("func")``.

    Returns:
        ``False`` if any C++ signal is found, ``True`` otherwise. Rows with no
        ``#include`` lines at all are accepted as long as no C++ signal
        appears.
    """
    source = (ex.get("func_dep") or "") + "\n" + (ex.get("func") or "")
    return _CPP_SIGNAL_RE.search(source) is None

# Walk the dataset in order and keep the first 20 rows whose dependency
# context is at least 300 characters long and passes the C heuristic.
cands_c = []
for ex in ds:
    dep = ex.get("func_dep") or ""
    if len(dep) < 300 or not looks_like_c(ex):
        continue
    cands_c.append(ex)
    if len(cands_c) >= 20:
        break

# Report how many candidates were found and preview the first one.
print("cands_c:", len(cands_c))
if len(cands_c) > 0:
    e = cands_c[0]
    dep_text = e["func_dep"]
    func_text = e["func"]
    print("func_name:", e["func_name"], "opt:", e["opt"], "dep_len:", len(dep_text))
    print("dep head:\n", dep_text[:300])
    print("func head:\n", func_text[:200])

cands_c: 20
func_name: func0 opt: O0 dep_len: 439
dep head:
 

#include <assert.h>



int binary_search(int arr[], int low, int high, int x) {

    if (high >= low) {

        int mid = (low + high) / 2;

        if ((mid == 0 || x > arr[mid-1]) && arr[mid] == x) {

            return mid;

        } else if (x > arr[mid]) {

            return binary_search(
func head:
 bool func0(int arr[], int n, int x) {

    int i = binary_search(arr, 0, n-1, x);

    if (i == -1) {

        return false;

    }

    if ((i + n/2) <= (n -1) && arr[i + n/2] == x) {

        return


In [10]:
# Dump the first candidate to disk as three C files:
#   dep.c     - the dependency context only
#   func.c    - the target function only
#   wrapper.c - common headers + dependency context + target function
if not cands_c:
    raise RuntimeError("cands_c is empty; there is no candidate to write")

e = cands_c[0]
dep_src = e["func_dep"].strip()
func_src = e["func"].strip()

wrapper_src = (
    "#include <stdbool.h>\n"  # fix for bool/true/false
    "#include <stdint.h>\n"
    "#include <stddef.h>\n"
    "#include <stdio.h>\n"
    "\n"
    + dep_src + "\n\n"
    + func_src + "\n"
)

# Context managers guarantee each handle is flushed and closed; the explicit
# UTF-8 encoding keeps the output independent of the platform default.
for out_name, out_text in (
    ("dep.c", dep_src + "\n"),
    ("func.c", func_src + "\n"),
    ("wrapper.c", wrapper_src),
):
    with open(out_name, "w", encoding="utf-8") as out_file:
        out_file.write(out_text)

print("wrote dep.c func.c wrapper.c")

wrote dep.c func.c wrapper.c
