# LLM Authorship Attribution

**Description:** This script checks whether each C code entry in the dataset compiles, using `gcc -c`. Every code sample is checked for syntax errors, type errors, and translation-unit correctness; no linking is performed. In other words, the script measures the compilability of the dataset's C code, i.e. whether each entry is valid C source code, even though external references may remain unresolved.

In [1]:
#STEP 0: Install all necessary packages (this process may take some time)
print("[*] Installing dependencies...")
!apt-get install -y  \
    build-essential cmake pkg-config git \
    libeigen3-dev libfftw3-dev libgmp-dev \
    libblas-dev liblapack-dev libopenblas-dev \
    libcurl4-openssl-dev libarchive-dev \
    libsdl2-dev libsdl2-image-dev libsdl1.2-dev \
    libbluetooth-dev libcjson-dev libjansson-dev \
    libjson-c-dev libgd-dev libglib2.0-dev \
    libgsl-dev libusb-1.0-0-dev libudev-dev \
    libxml2-dev libncurses5-dev libncursesw5-dev \
    libpoppler-cpp-dev libtiff-dev libpng-dev \
    libjpeg-dev libxslt1-dev libqrencode-dev \
    libreadline-dev libssl-dev libpcap-dev \
    libpthread-stubs0-dev libhdf5-dev \
    liblzma-dev libzstd-dev zlib1g-dev \
    libtar-dev\
    libxlsxwriter-dev libyaml-dev \
    libopencv-dev \
    libtesseract-dev libleptonica-dev \
    python3-numpy python3-pip \
    wget p7zip-full unzip csvtool 7zip libsqlite3-dev libmysqlclient-dev libpq-dev libportaudio2 portaudio19-dev freeglut3-dev >/dev/null 2>&1
print("[*] Installation: DONE")
# Test the installation with a simple trick (compiles and links with common libs)
!echo "int main() { return 0; }" > test.c && gcc test.c \
-lm -lfftw3 -lsqlite3 -lcrypto -lmysqlclient -lpq -lssl \
-lportaudio -lpcap -lqrencode -lSDL2 -lglut -lGLU -lGL -lcurl \
-lgmp -lblas -llapack -lopenblas -larchive -lbluetooth -lcjson -ljansson \
-ljson-c -lgd -lglib-2.0 -lgsl -lusb-1.0 -ludev -lxml2 -lncurses -lpoppler-cpp \
-ltiff -lpng -ljpeg -lxslt -lreadline -lpthread  -llzma -lzstd -lz \
-lyaml -lopencv_core -lopencv_imgproc -lopencv_highgui -ltesseract -llept \
 -lxlsxwriter \
-o test 2>/dev/null && echo "[*] Compilation Test: SUCCESS" || echo "[*] Compilation Test: FAILED"
!rm test & rm test.c 2>/dev/null
print("[*] Downloading LLM-AuthorBench dataset")
!wget -O LLM-AuthorBench.json.zip https://github.com/LLMauthorbench/LLMauthorbench/raw/main/LLM-AuthorBench.json.zip > /dev/null 2>&1
# Unzip the file
!unzip LLM-AuthorBench.json.zip > /dev/null 2>&1
# Delete the zip file
!rm LLM-AuthorBench.json.zip > /dev/null 2>&1
print("[*] Download: DONE")

[*] Installing dependencies...
[*] Installation: DONE
[*] Compilation Test: SUCCESS
[*] Downloading LLM-AuthorBench dataset
[*] Download: DONE


In [None]:
#STEP 1: Check DATASET compilability rate using gcc -c
import json, os, tempfile,subprocess
from tqdm import tqdm

def check_compilable(c_code: str) -> bool:
    """Return True if *c_code* compiles as a single C translation unit.

    The source is written to a temporary ``.c`` file and compiled with
    ``gcc -c`` (compile only, no link step). Both the temporary source file
    and any object file gcc produced are removed before returning, whether
    the compilation succeeded, failed, or could not be started.

    Args:
        c_code: C source text to compile.

    Returns:
        True when gcc exits with status 0. False when gcc reports an error,
        when gcc cannot be launched, or when the source cannot be written
        to the temporary file (for example because it is not a string).
    """
    # Same library list as the link test in the setup step. No linking is
    # requested here (-c), so these flags do not decide the outcome; they are
    # kept so the command line stays in sync with that test.
    linker_flags = [
        "-lm", "-lfftw3", "-lsqlite3", "-lcrypto", "-lmysqlclient", "-lpq", "-lssl",
        "-lportaudio", "-lpcap", "-lqrencode", "-lSDL2", "-lglut", "-lGLU", "-lGL", "-lcurl",
        "-lgmp", "-lblas", "-llapack", "-lopenblas", "-larchive", "-lbluetooth", "-lcjson", "-ljansson",
        "-ljson-c", "-lgd", "-lglib-2.0", "-lgsl", "-lusb-1.0", "-ludev", "-lxml2", "-lncurses", "-lpoppler-cpp",
        "-ltiff", "-lpng", "-ljpeg", "-lxslt", "-lreadline", "-lpthread", "-llzma", "-lzstd", "-lz",
        "-lyaml", "-lopencv_core", "-lopencv_imgproc", "-lopencv_highgui", "-ltesseract", "-llept",
        "-lxlsxwriter",
    ]

    tmp_path = None
    output_path = None
    try:
        # Explicit UTF-8 so non-ASCII characters in the sample (comments,
        # string literals) do not depend on the process locale.
        with tempfile.NamedTemporaryFile(
            suffix=".c", mode="w", encoding="utf-8", delete=False
        ) as tmp:
            # Record the paths before writing so that cleanup still happens
            # if the write itself fails.
            tmp_path = tmp.name
            output_path = tmp_path + ".out"
            tmp.write(c_code)

        cmd = ["gcc", "-c", tmp_path, "-o", output_path] + linker_flags
        result = subprocess.run(
            cmd,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        return result.returncode == 0
    except (OSError, subprocess.SubprocessError, TypeError, ValueError):
        # Best effort: a sample that cannot be written or compiled is simply
        # reported as non-compilable (this also covers gcc not being installed).
        return False
    finally:
        for path in (tmp_path, output_path):
            if path is None:
                continue
            try:
                os.remove(path)
            except OSError:
                # The file was never created or is already gone.
                pass
def save_compilable_samples(
    input_path: str,
    compilable_path: str,
    non_compilable_path: str = "NON-COMPILABLE_dataset.json",
) -> None:
    """Split a JSON dataset into compilable and non-compilable samples.

    Loads a JSON array from *input_path*, runs ``check_compilable`` on the
    ``"c_code"`` field of every entry, and writes each entry (reduced to its
    ``model_name``, ``prompt`` and ``c_code`` fields) to one of two JSON
    arrays. A progress bar shows the running success rate, and a summary
    with overall and per-model ratios is printed at the end.

    Args:
        input_path: Path of the JSON file holding the array of samples.
        compilable_path: Output path for the samples that compile.
        non_compilable_path: Output path for the samples that do not compile.
    """
    # Load JSON array
    with open(input_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    total = len(data)
    compilable_count = 0
    first_compilable = True
    first_non_compilable = True

    # Per-model statistics: samples seen / samples that compiled.
    model_counts = {}
    model_compilable = {}

    # UTF-8 is required because entries are dumped with ensure_ascii=False.
    with open(compilable_path, "w", encoding="utf-8") as out_f, \
            open(non_compilable_path, "w", encoding="utf-8") as out_nf:
        # The arrays are written element by element, so the separators are
        # emitted by hand.
        out_f.write('[')
        out_nf.write('[')

        with tqdm(total=total, dynamic_ncols=True) as pbar:
            for i, sample in enumerate(data):
                model_name = sample.get("model_name", "unknown")
                model_counts[model_name] = model_counts.get(model_name, 0) + 1

                c_code = sample.get("c_code", "")
                compiles = check_compilable(c_code)

                to_save = {
                    "model_name": sample.get("model_name", ""),
                    "prompt": sample.get("prompt", ""),
                    "c_code": c_code,
                }
                serialized = json.dumps(to_save, ensure_ascii=False, indent=2)

                if compiles:
                    model_compilable[model_name] = model_compilable.get(model_name, 0) + 1
                    if not first_compilable:
                        out_f.write(',\n')
                    out_f.write(serialized)
                    first_compilable = False
                    compilable_count += 1
                else:
                    if not first_non_compilable:
                        out_nf.write(',\n')
                    out_nf.write(serialized)
                    first_non_compilable = False

                percent = 100 * compilable_count / (i + 1)
                pbar.set_description(f"Compiling ({percent:.2f}% OK)")
                pbar.update(1)

        out_f.write(']\n')
        out_nf.write(']\n')

    # An empty dataset would otherwise divide by zero in the summary line.
    overall = 100 * compilable_count / total if total else 0.0
    print(f"\n[+] Total: {compilable_count}/{total} ({overall:.2f}%) samples compilable and saved to {compilable_path}")

    print("\n[+] Compile ratio per model:")
    for model in sorted(model_counts):
        count = model_counts[model]
        ok = model_compilable.get(model, 0)
        ratio = 100 * ok / count
        print(f"    {model}: {ok}/{count} ({ratio:.2f}%)")

# Script entry point: run the compilability check on the downloaded dataset.
if __name__ == "__main__":
    dataset_file = "LLM-AuthorBench.json"
    compilable_file = "COMPILABLE_dataset.json"
    save_compilable_samples(dataset_file, compilable_file)
