In [1]:
from utils import load_config
from llm import *
from prompt import *
from harness import *
import os
original_directory = os.getcwd()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [2]:
original_directory

'/home/sagemaker-user/harnessgeneration'

In [3]:
def generate_initial_prompt_by_example(config):
    """Build the first LLM prompt asking for a libFuzzer target, guided by a usage example.

    The prompt contains a role sentence, step-by-step instructions, the content
    of the example file, and a code template with ``<FILL>`` placeholders that
    the model is expected to complete.

    Args:
        config: mapping that provides the keys read here:
            ``function_name`` (name of the function to fuzz),
            ``c_file_path`` and ``src_folder_path`` (forwarded to
            ``extract_includes``), ``is_c_code`` (truthy for C, falsy for
            C++), and ``example`` (path of a source file showing how the
            function is used).

    Returns:
        The prompt as a single string wrapped in ``[INST]`` ... ``[/INST]``.

    Raises:
        KeyError: if a required key is missing from ``config``.
        OSError: if the example file cannot be opened.
    """
    role = f"You are a developer. You need to write a libFuzzer target for the {config['function_name']} function of your c library. You are provided with an example of how the function {config['function_name']} is used."
    # Only the include template is used here; the list of header paths that
    # extract_includes also returns is not needed for this prompt.
    template, _custom_lib = extract_includes(config['c_file_path'], config['src_folder_path'])
    fill_c = '''
    <FILL>
    
    int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
    '''
    fill_cpp = '''
    <FILL>
    
    extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
    '''

    # Evaluate the flag itself. Wrapping it in braces builds a one-element set,
    # which is always truthy and would make the C++ variants unreachable.
    is_c_code = bool(config['is_c_code'])
    entry_point = fill_c if is_c_code else fill_cpp
    code_md = '```c' if is_c_code else '```c++'

    template = template + '''
    // Step 2 include custom and standard headers
    ''' + entry_point
    template = template + f'''
    
    // Step 3 call to the {config['function_name']} function
    <FILL>;
    
    ''' + '''
      return 0;
    }
    '''

    with open(config['example'], encoding='utf-8', errors='ignore') as file:
        example = f"\n{code_md}\n{file.read()}\n```\n"
    example = f'''
    ## Example:
    {example}
    ## End of example
    '''

    initial_prompt = f"""
    {role}
    Work step by step:
    1. Analyze the example, find the function {config['function_name']} in the code: which headers and parameters does it need to be properly executed?
    2. Fill the template with necessary headers
    3. fill the template LLVMFuzzerTestOneInput function with a call to {config['function_name']}, such that {config['function_name']} can be properly fuzz
    4. Make sure that you are actually testing {config['function_name']} function.
    Answer should be code only, no explanation.
    
    {example}
    
    ## Start of template:
    {code_md}
    {template}
    ```
    ## End of template:
     """
    initial_prompt = '[INST]' + initial_prompt + '[/INST]'
    return initial_prompt

In [4]:
# function_name="ares_create_query"
# IS_C_CODE= True
# c_files = find_files("/home/sagemaker-user/fuzzer-test-suite/c-ares-CVE-2016-5180/SRC", '.c')
# h_files = find_files("/home/sagemaker-user/fuzzer-test-suite/c-ares-CVE-2016-5180/SRC", '.h')
# lib_files = c_files
# HARNESS = 'harness.c'
# HARNESS_BIN = 'harness'

In [5]:
import subprocess
import time
from datetime import datetime
def _run_fuzzer(config, seconds_to_wait):
    """Run afl-fuzz on the harness for at most ``seconds_to_wait`` seconds."""
    # An argv list avoids shell word-splitting of the seed/harness paths and
    # does not need an IPython session. The `timeout` utility is kept to stop
    # afl-fuzz after the time budget.
    fuzz_cmd = [
        'timeout', f'{seconds_to_wait}s',
        'afl-fuzz', '-i', str(config['seeds']), '-o', 'out',
        '--', f"./{config['harness_bin']}",
    ]
    # Only stdout is discarded, matching the former `> /dev/null` redirection.
    subprocess.run(fuzz_cmd, stdout=subprocess.DEVNULL)


def _read_coverage():
    """Return the 'coverage reached' value reported by afl-whatsup for ``out/`` as a float."""
    result = subprocess.run(['afl-whatsup', 'out/'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    stats = extract_summary_stats(result.stdout)
    # The last character of the value is dropped before conversion
    # (presumably a trailing '%' sign — confirm against extract_summary_stats).
    coverage_text = stats['coverage reached'][:-1]
    print(f"{coverage_text}")
    return float(coverage_text)


def has_coverage(config):
    """Check whether fuzzing the harness reaches any coverage at all.

    The harness is fuzzed for 5 seconds; if the reported coverage is still 0,
    it is fuzzed again for 60 seconds before giving up.

    Args:
        config: mapping with the keys ``seeds`` (seed directory passed to
            ``afl-fuzz -i``) and ``harness_bin`` (harness binary name,
            executed as ``./<harness_bin>`` from the current directory).

    Returns:
        True if the reported coverage is non-zero, False otherwise.
    """
    seconds_to_wait = 5
    print(f'checking that there is some coverage (Running the fuzzer for {seconds_to_wait}) seconds')
    print(datetime.now())
    _run_fuzzer(config, seconds_to_wait)
    print(datetime.now())

    coverage = _read_coverage()
    if coverage == 0:
        # A short run may not be enough to register coverage; retry with a
        # longer budget before declaring the harness ineffective.
        print('Trying to fuzz for 60 seconds')
        _run_fuzzer(config, 60)
        coverage = _read_coverage()
    return coverage != 0

In [6]:
# GENERATE HARNESS
def _harness_feedback(compile_command, CONFIG):
    """Validate the saved harness; return None if it passes, else the follow-up text for the model."""
    # checking that harness is compilable
    compile_res = compile(compile_command)
    if not compile_res[0]:
        print(f"{compile_res[1]=}")
        return "can you regenerate the previous code to fix the following compilation error:\n" + compile_res[1]

    # checking that does not crash on seed
    run_res = run_seed(CONFIG)
    if not run_res[0]:
        print(f"{run_res[1]=}")
        return "The generated code when run with input AAAA is crashing with this error\n" + run_res[1] + '\ncan you regenerate the previous code to fix the issue?'

    # checking that there is some coverage
    if not has_coverage(CONFIG):
        return "When running afl-fuzz over the generated code, the Coverage reached value is 0%. It probably means that the input from the fuzzer is not correclty passed to the fuzzed function. Can you regenerate the code to fiX this issue?\n"

    return None


def generate_harness(new_dir, compile_command, CONFIG, max_attempts=10):
    """Generate a fuzzing harness with the LLM and iterate until it validates.

    The first answer is produced from ``generate_initial_prompt_by_example``.
    Each attempt then compiles the harness, runs it on a seed and checks for
    coverage; the first failing stage is reported back to the model in a
    follow-up prompt and the regenerated harness is saved for the next attempt.

    Args:
        new_dir: path appended to the module-level ``original_directory`` to
            form the working directory (e.g. ``'/evaluation/new-libxml2/'``).
        compile_command: compile command passed to ``compile`` (argv list).
        CONFIG: configuration mapping; ``CONFIG['harness']`` is the harness
            file name, and the mapping is forwarded to the helper functions.
        max_attempts: maximum number of validate/regenerate rounds.

    Returns:
        None. Progress and the final outcome are printed.
    """
    work_dir = original_directory + new_dir
    os.chdir(work_dir)
    succeeded = False
    try:
        prev_prompt = generate_initial_prompt_by_example(CONFIG)
        prev_res = get_answer(prev_prompt)
        extract_and_save_harness(prev_res, CONFIG['harness'])

        tentatives = max_attempts
        while tentatives:
            print(f"{tentatives=}")
            tentatives = tentatives - 1
            feedback = _harness_feedback(compile_command, CONFIG)
            if feedback is None:
                succeeded = True
                break
            prev_prompt = followup_prompt(prev_prompt, prev_res[0]['generated_text'], feedback)
            prev_res = get_answer(prev_prompt)
            extract_and_save_harness(prev_res, CONFIG['harness'])
    finally:
        # Always return to the starting directory, even when the run is
        # interrupted or a helper raises, so later cells see the expected cwd.
        os.chdir(original_directory)

    # NOTE(review): assumes extract_and_save_harness writes relative to the
    # current working directory — confirm against its definition.
    harness_path = os.path.join(work_dir, CONFIG['harness'])
    if succeeded:
        print("New harness saved in " + harness_path)
    else:
        print(f"No valid harness after {max_attempts} attempts; last candidate is in " + harness_path)

In [7]:
# Harness-generation run for the xmlReadFile target of the new-libxml2 evaluation.
new_dir = '/evaluation/new-libxml2/'
config_file = "/home/sagemaker-user/harnessgeneration/evaluation/new-libxml2/xmlReadFile.yaml"
CONFIG = load_config(config_file)
# The config holds the compile command as a single string; split it into an argv list.
compile_command = CONFIG['compile_command'].split()
generate_harness(new_dir=new_dir, compile_command=compile_command, CONFIG=CONFIG)

  ```c
    #include <libxml/xmlmemory.h>
    #include <libxml/parser.h>
    
    int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
        xmlReadFile(Data, Size, NULL, NULL, 0);
        return 0;
    }
    ```
tentatives=10
checking that harness is compilable
compile_command=['afl-gcc-fast', './harness_xmlReadFile.c', '-I', 'libxml2/include', '-L', 'libxml2/.libs/', '-lz', '-lm', '-o', 'harness_xmlReadFile']
Compilation failed.
Errors: ./harness_xmlReadFile.c:4:38: error: unknown type name ‘uint8_t’
    4 |     int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
      |                                      ^~~~~~~
./harness_xmlReadFile.c: In function ‘LLVMFuzzerTestOneInput’:
    5 |         xmlReadFile(Data, Size, NULL, NULL, 0);
      |                     ^~~~
      |                     |
      |                     const int *
In file included from ./harness_xmlReadFile.c:2:
libxml2/include/libxml/parser.h:1279:54: note: expected ‘const char *’ but argumen

KeyboardInterrupt: 

In [21]:
CONFIG['compile_command']

'afl-gcc-fast ./harness_xmlReadFile.c -I libxml2/include libxml2/.libs/libxml2.a -lz -lm -o harness_xmlReadFile'

In [None]:
!

In [13]:
# c_files = find_files("/home/sagemaker-user/fuzzer-test-suite/c-ares-CVE-2016-5180/SRC", '.c')
# h_files = find_files("/home/sagemaker-user/fuzzer-test-suite/c-ares-CVE-2016-5180/SRC", '.h')

In [6]:
# Builds a prompt from CONFIG for manual inspection.
# NOTE(review): generate_initial_prompt is not defined in the visible cells
# (only generate_initial_prompt_by_example is); presumably it comes from one of
# the star-imports in the first cell — confirm, or switch to the function above.
initial_prompt = generate_initial_prompt(CONFIG)

In [8]:
# Calls extract_includes directly so that both returned values (the include
# template string and the list of header paths) can be inspected in the next cells.
# NOTE(review): the recorded `template` output joins the includes with "/n" and
# prefixes each header name with "/" — looks like a separator/path bug inside
# extract_includes; confirm in its definition.
template, custom_lib = extract_includes(CONFIG['c_file_path'], CONFIG['src_folder_path'])

In [9]:
custom_lib

['/home/sagemaker-user/harnessgeneration/evaluation/libxml2/libxml2/libxml.h',
 '/home/sagemaker-user/harnessgeneration/evaluation/libxml2/libxml2/config.h',
 '/home/sagemaker-user/harnessgeneration/evaluation/libxml2/libxml2/buf.h',
 '/home/sagemaker-user/harnessgeneration/evaluation/libxml2/libxml2/enc.h',
 '/home/sagemaker-user/harnessgeneration/evaluation/libxml2/libxml2/elfgcchack.h']

In [10]:
template

'#include "/libxml.h"/n#include "/config.h"/n#include "/buf.h"/n#include "/enc.h"/n#include "/elfgcchack.h"'