In [15]:
import json
import requests
import os
from tqdm import tqdm
import evaluate

In [16]:
# Prompt sent to the model for every sample. Positional placeholders, as filled by
# the dataset-building cell: {0} issue description, {1} buggy file name,
# {2} buggy file contents.
prompt_template = (
    "You are a software engineer developer and need to resolve the following issue, "
    "give me the solved code between <code></code> and create a main function "
    "between <main></main> to test the code:\n"
    "{0}\n"
    "In this file:\n"
    "{1}\n"
    "{2}\n"
)

# Header placed in front of the extra context file when a sample provides one.
additional_context = "Also you are provided of additional files that can serve as context:\n"

In [None]:
folder_path = './data/validation_data'

# List of (prompt, reference fixed code) tuples, one per validation sample.
dataset = []

# Sorted so the sample order (and therefore downstream task ids) is stable:
# os.listdir returns entries in arbitrary order.
for json_name in sorted(os.listdir(folder_path)):
    if not json_name.endswith('.json'):
        continue

    file_path = os.path.join(folder_path, json_name)
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except json.JSONDecodeError as e:
        # Skip unreadable samples, reporting the file that actually failed.
        print(f"Error reading {json_name}: {e}")
        continue

    issue_description = data["input"]["issue"]["issue_description"]
    # Keep only the base name of the buggy file; a separate variable avoids
    # clobbering the name of the JSON file being processed.
    buggy_filename = data["input"]["buggy_file"]["buggy_filename"].split("/")[-1]
    buggy_code = data["input"]["buggy_file"]["buggy_code"]
    output_code = data["output"]["fixed_file"]["fixed_code"]

    input_data = prompt_template.format(issue_description, buggy_filename, buggy_code)

    # Only the first context file is attached, as before. The shared
    # `additional_context` header is read but never modified, so one sample's
    # context can no longer leak into every later prompt and the cell gives the
    # same result when re-run.
    context_entries = data["input"].get("context") or []
    if context_entries:
        first_context = context_entries[0]
        context_files = f"{first_context['filename']}\n{first_context['code']}"
        input_data += f"\n{additional_context}{context_files}"

    dataset.append((input_data, output_code))


In [18]:
# Prompt half of every (prompt, reference) pair, in dataset order.
model_inputs = [prompt for prompt, _ in dataset]

In [19]:
# Persist the rendered prompts so they can be inspected or reused outside the notebook.
with open("./data/model_eval_inputs.json", "w", encoding="utf-8") as inputs_file:
    inputs_file.write(json.dumps(model_inputs, indent=2))

In [20]:
# Show the first rendered prompt (element 0 of the first (prompt, reference) tuple)
# to sanity-check the template output.
dataset[0][0]

'You are a software engineer developer and need to resolve the following issue, give me the solved code between <code></code> and create a main function between <main></main> to test the code:\nTo achieve the possibility of changing string formatter implementation or bug fixes without breaking the ABI it\'s important to hide that ABI. All other formatters/sinks have already been written in that way.\n\nNote, that this change will break `blackhole::formatter::string_t` API/ABI as it were described in https://github.com/3Hren/blackhole/milestones, but for now it\'s okay, because nobody uses it explicitly except the Blackhole internals.\n\nIn this file:\nstring.cpp\n#include "blackhole/formatter/string.hpp"\n\n#include <array>\n\n#include <boost/variant/apply_visitor.hpp>\n#include <boost/variant/get.hpp>\n#include <boost/variant/variant.hpp>\n\n#include "blackhole/attribute.hpp"\n#include "blackhole/config/node.hpp"\n#include "blackhole/config/option.hpp"\n#include "blackhole/extensions/

In [31]:
def call_model(
    prompt: str,
    model: str = "qwen3:8b",
    url: str = "http://localhost:11434/api/generate",
    timeout: float = 1800.0,
) -> str:
    """Send ``prompt`` to a generate endpoint and return the generated text.

    Args:
        prompt: Full prompt text to send.
        model: Model name placed in the request payload.
        url: Endpoint that receives the JSON payload via POST.
        timeout: Seconds to wait for the server before giving up. Generation is
            slow (minutes per sample in the recorded run), so the default is
            generous rather than absent.

    Returns:
        The value of the ``"response"`` field of the JSON reply.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If the reply has no ``"response"`` field.
    """
    # No credentials are sent to the local endpoint. If a hosted API is ever used,
    # read its key from an environment variable instead of writing it in the notebook.
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
    }
    response = requests.post(url, json=payload, timeout=timeout)
    # Surface HTTP failures directly instead of failing later on a missing key.
    response.raise_for_status()

    body = response.json()
    if "response" not in body:
        raise KeyError(f"'response' missing from model reply: {body!r}")
    return body["response"]

In [32]:
# Smoke test: one short request to confirm the model endpoint answers.
print(call_model(prompt="Say a short sentence of a fact of space"))

<think>
Okay, the user wants a short sentence about a space fact. Let me think of something interesting but concise. Maybe something about the universe's expansion? Or the number of stars? Wait, the fact about the universe expanding at a faster rate than expected? No, that's a bit complex. How about the number of stars in the observable universe? That's a common one. "There are approximately 10^22 stars in the observable universe." Wait, is that accurate? Let me check. I remember that estimates vary, but 10^22 is often cited. Alternatively, the fact that the Sun is a medium-sized star. Or the Moon's gravitational pull causing tides. But the user might want something more impactful. The expansion of the universe accelerating due to dark energy? That's a key point. But maybe too technical. Let me go with the stars in the observable universe. Yeah, that's a solid fact and concise. Let me make sure the numbers are correct. Yes, 10^22 is a commonly accepted estimate. Alright, that should wo

In [33]:
model_outputs = []

# Query the model once per sample, in dataset order; only the prompt half of each
# (prompt, reference) tuple is sent. This is slow: the recorded run averaged
# several minutes per sample.
for sample in tqdm(dataset):
    model_outputs.append(call_model(sample[0]))
    

 45%|████▌     | 9/20 [39:57<48:50, 266.39s/it]


KeyboardInterrupt: 

In [None]:
# Persist the raw model responses collected so far.
with open("./data/eval_outputs.json", "w", encoding="utf-8") as outputs_file:
    outputs_file.write(json.dumps(model_outputs, indent=2))

In [None]:
# Reference (fixed) code for every sample, in dataset order.
expected_outputs = [sample[1] for sample in dataset]


In [None]:
import re


def extract_code(text: str) -> str:
    """Return only the code portion of a raw model response.

    The prompt asks for the solution between ``<code></code>``. The model may also
    emit a ``<think>...</think>`` reasoning block and other text, none of which is
    code. Reasoning blocks are dropped first. If one or more ``<code>`` sections
    exist, their contents are joined with newlines. Otherwise the response is
    returned with any stray ``<code>`` / ``</code>`` tags removed.
    """
    without_reasoning = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)
    code_sections = re.findall(r"<code>(.*?)</code>", without_reasoning, flags=re.DOTALL)
    if code_sections:
        return "\n".join(section.strip() for section in code_sections)
    # No well-formed code section: fall back to stripping the tags only.
    return re.sub(r"</?code>", "", without_reasoning)


# code_eval expects a list of candidate lists, hence the single-element inner list.
cleaned_model_output = [[extract_code(text)] for text in model_outputs]

In [None]:
# Explicit opt-in to running model-generated code; the code_eval metric refuses to
# execute candidates unless this flag is set (per the metric's documentation).
os.environ.update({"HF_ALLOW_CODE_EVAL": "1"})

In [None]:
pass_metric = evaluate.load("code_eval")

# NOTE(review): the recorded run reports "invalid syntax" for every task. The
# dataset samples contain C++, while code_eval presumably executes candidates as
# Python, and its `references` are presumably test programs rather than expected
# source. Confirm against the metric documentation before trusting this score.

# Generation may have been interrupted, leaving fewer predictions than references.
# Score only the overlapping prefix and say so, rather than letting the mismatch
# pass unnoticed.
n_scored = min(len(expected_outputs), len(cleaned_model_output))
if len(expected_outputs) != len(cleaned_model_output):
    print(
        f"Warning: {len(cleaned_model_output)} predictions vs "
        f"{len(expected_outputs)} references; scoring the first {n_scored} only."
    )

results = pass_metric.compute(
    references=expected_outputs[:n_scored],
    predictions=cleaned_model_output[:n_scored],
    k=[1],
)
print(results)

({'pass@1': np.float64(0.0)}, defaultdict(<class 'list'>, {3: [(0, {'task_id': 3, 'passed': False, 'result': 'failed: invalid syntax (<string>, line 1)', 'completion_id': 0})], 2: [(0, {'task_id': 2, 'passed': False, 'result': 'failed: invalid syntax (<string>, line 4)', 'completion_id': 0})], 1: [(0, {'task_id': 1, 'passed': False, 'result': 'failed: invalid syntax (<string>, line 1)', 'completion_id': 0})], 0: [(0, {'task_id': 0, 'passed': False, 'result': 'failed: invalid syntax (<string>, line 21)', 'completion_id': 0})], 4: [(0, {'task_id': 4, 'passed': False, 'result': 'failed: invalid syntax (<string>, line 5)', 'completion_id': 0})], 6: [(0, {'task_id': 6, 'passed': False, 'result': 'failed: invalid syntax (<string>, line 4)', 'completion_id': 0})], 5: [(0, {'task_id': 5, 'passed': False, 'result': 'failed: invalid syntax (<string>, line 4)', 'completion_id': 0})], 7: [(0, {'task_id': 7, 'passed': False, 'result': 'failed: invalid syntax (<string>, line 2)', 'completion_id': 0}

# Hand-made pass@k

In [None]:
# C++ candidate stored as a string: a pimpl-style `string_t` (state held in
# `string_t::impl_t`) plus its `factory` specialisation.
# NOTE(review): presumably produced by an OpenAI model, judging by the variable
# name — confirm. The snippet has no #include lines and uses names it does not
# define (e.g. token_t, visitor_t, tokenize), and no cell visible in this notebook
# excerpt reads this variable.
openAI_output = """
namespace blackhole {
inline namespace v1 {
namespace formatter {

class string_t::impl_t {
public:
    std::string pattern;
    severity_map sevmap;
    std::vector<token_t> tokens;

    impl_t(std::string pattern, severity_map sevmap, const options_t& options) :
        pattern(std::move(pattern)),
        sevmap(std::move(sevmap)),
        tokens(tokenize(this->pattern, options))
    {}
};

string_t::string_t(std::string pattern, const options_t& options) :
    pimpl(new impl_t(std::move(pattern), [](int severity, const std::string& spec, writer_t& writer) {
        writer.write(spec, severity);
    }, options))
{}

string_t::string_t(std::string pattern, severity_map sevmap, const options_t& options) :
    pimpl(new impl_t(std::move(pattern), std::move(sevmap), options))
{}

string_t::string_t(string_t&& other) noexcept = default;

string_t::~string_t() = default;

auto string_t::format(const record_t& record, writer_t& writer) -> void {
    const visitor_t visitor(writer, record, pimpl->sevmap);
    for (const auto& token : pimpl->tokens) {
        boost::apply_visitor(visitor, *token);
    }
}

}  // namespace formatter

auto factory<formatter::string_t>::type() -> const char* {
    return "string";
}

auto factory<formatter::string_t>::from(const config::node_t& config) -> formatter::string_t {
    auto pattern = config["pattern"].to_string().get();

    if (auto mapping = config["sevmap"]) {
        std::vector<std::string> sevmap;
        mapping.each([&](const config::node_t& config) {
            sevmap.emplace_back(config.to_string());
        });

        auto fn = [=](std::size_t severity, const std::string& spec, writer_t& writer) {
            if (severity < sevmap.size()) {
                writer.write(spec, sevmap[severity]);
            } else {
                writer.write(spec, severity);
            }
        };

        return formatter::string_t(std::move(pattern), std::move(fn));
    }

    return formatter::string_t(std::move(pattern));
}

}  // namespace v1
}  // namespace blackhole
"""

# C++ candidate stored as a string and scored by the hand-made pass@k cell below.
# It is self-contained: it brings its own #include lines and defines stub
# writer_t / record_t types next to a simplified string_t whose tokenize() ignores
# the pattern and returns a fixed token list.
# NOTE(review): the name may be a typo for "our_model_output" — confirm.
out_model_output = """
#include <string>
#include <vector>
#include <iostream>
#include <map>

namespace blackhole {
namespace formatter {

struct writer_t {
    std::ostream& inner;

    writer_t(std::ostream& os) : inner(os) {}

    void write(const std::string& spec, const std::string& value) {
        inner << "[" << spec << "]" << value;
    }

    void write(const std::string& spec, int value) {
        inner << "[" << spec << "]" << value;
    }
};

// Simulating token types manually (no variant)
enum TokenType {
    Literal,
    Message,
    ProcessId,
    Severity
};

struct token_t {
    TokenType type;
    std::string value;   // For Literal or specifier
};

class record_t {
public:
    std::string formatted() const { return "formatted message"; }
    int pid() const { return 1234; }
    int severity() const { return 5; }
};

typedef void(*severity_map)(int, const std::string&, writer_t&);

class string_t {
    std::string pattern;
    std::vector<token_t> tokens;
    severity_map sevmap;

public:
    string_t(const std::string& pat)
        : pattern(pat) {
        sevmap = default_sevmap;
        tokens = tokenize(pat);
    }

    string_t(const std::string& pat, severity_map map)
        : pattern(pat), sevmap(map) {
        tokens = tokenize(pat);
    }

    void format(const record_t& record, writer_t& writer) {
        for (size_t i = 0; i < tokens.size(); ++i) {
            const token_t& token = tokens[i];
            switch (token.type) {
                case Literal:
                    writer.inner << token.value;
                    break;
                case Message:
                    writer.write(token.value, record.formatted());
                    break;
                case ProcessId:
                    writer.write(token.value, record.pid());
                    break;
                case Severity:
                    sevmap(record.severity(), token.value, writer);
                    break;
            }
        }
    }

private:
    static void default_sevmap(int severity, const std::string& spec, writer_t& writer) {
        writer.write(spec, severity);
    }

    static std::vector<token_t> tokenize(const std::string& pat) {
        // Simplified example tokenizer that returns fixed tokens
        std::vector<token_t> result;
        result.push_back({Literal, "Process ID: "});
        result.push_back({ProcessId, "pid"});
        result.push_back({Literal, ", Message: "});
        result.push_back({Message, "msg"});
        result.push_back({Literal, ", Severity: "});
        result.push_back({Severity, "sev"});
        return result;
    }
};

} // namespace formatter
} // namespace blackhole
"""

In [None]:
import subprocess
import tempfile
import os

def wrap_with_main(code_body: str, call_line: str, expected_output: str):
    """Build a complete C++ translation unit around a candidate snippet.

    Args:
        code_body: Candidate C++ code placed at file scope.
        call_line: C++ statement(s) placed inside the generated ``main``.
        expected_output: Text recorded in a trailing ``//`` comment for reference.

    Returns:
        The C++ source as a string: an ``<iostream>`` include, ``code_body``, a
        ``main`` that runs ``call_line`` and returns 0, and the expected-output
        comment.
    """
    # A `//` comment ends at the first line break, so every further line of a
    # multi-line expectation needs its own `//` prefix; otherwise it would be
    # compiled as code. Single-line expectations are emitted unchanged.
    expected_comment = "\n// ".join(expected_output.splitlines())
    return f"""
#include <iostream>
{code_body}

int main() {{
    {call_line}
    return 0;
}}

// Expected output: {expected_comment}
"""

def evaluate_candidate(code_str, expected_output, call_line, timeout=10.0):
    """Compile a C++ candidate with g++, run it, and check its stdout.

    Args:
        code_str: Candidate C++ code, placed at file scope by ``wrap_with_main``.
        expected_output: Substring that must appear in the program's stdout.
        call_line: C++ statement(s) executed inside the generated ``main``.
        timeout: Maximum number of seconds the compiled program may run.

    Returns:
        True if compilation succeeds, the program finishes within ``timeout`` and
        ``expected_output`` occurs in its stdout; False otherwise.
    """
    code_with_main = wrap_with_main(code_str, call_line, expected_output)

    # Work inside a temporary directory so both the source file and the binary
    # are removed afterwards, whatever the outcome.
    with tempfile.TemporaryDirectory() as workdir:
        source_path = os.path.join(workdir, "candidate.cpp")
        exe_path = os.path.join(workdir, "candidate")

        # Close the source file before invoking the compiler.
        with open(source_path, "w", encoding="utf-8") as source_file:
            source_file.write(code_with_main)

        compiled = subprocess.run(["g++", source_path, "-o", exe_path], capture_output=True)
        if compiled.returncode != 0:
            print("❌ Compilation failed:", compiled.stderr.decode(errors="replace"))
            return False

        # SECURITY: this executes model-generated code on the local machine. The
        # timeout only guards against hangs; it is not a sandbox.
        try:
            executed = subprocess.run([exe_path], stdout=subprocess.PIPE, timeout=timeout)
        except subprocess.TimeoutExpired:
            print(f"❌ Execution timed out after {timeout}s")
            return False

        output = executed.stdout.decode(errors="replace")
        print("Program output:", output.strip())
        return expected_output in output

def compute_pass_at_k(candidates, expected_output, call_line, k):
    """Return pass@k for one problem: 1.0 if any of the first ``k`` candidates passes.

    Args:
        candidates: Candidate C++ code strings, in sampling order.
        expected_output: Substring the program's stdout must contain.
        call_line: C++ statement(s) executed inside the generated ``main``.
        k: Number of leading candidates to consider; must be at least 1.

    Returns:
        1.0 if at least one of the first ``k`` candidates passes, else 0.0.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be >= 1, got {k}")
    # any() stops at the first passing candidate, so later ones are not compiled.
    # The result is 1.0 or 0.0: dividing a single success by k would under-report
    # pass@k whenever k > 1.
    passed = any(
        evaluate_candidate(candidate, expected_output, call_line)
        for candidate in candidates[:k]
    )
    return 1.0 if passed else 0.0

In [None]:
# C++ statements spliced into the generated main(): create a record and a writer on
# std::cout, then format the record with a string_t built from "dummy".
main_call_line = (
    "\n"
    "blackhole::formatter::record_t rec;\n"
    "blackhole::formatter::writer_t writer(std::cout);\n"
    'blackhole::formatter::string_t formatter("dummy");\n'
    "formatter.format(rec, writer);\n"
)

# Substring that must appear in the program's stdout for the candidate to pass.
expected_output = "[pid]1234"

# Evaluate
outputs = [out_model_output]
pass_at_k = compute_pass_at_k(
    candidates=outputs,
    expected_output=expected_output,
    call_line=main_call_line,
    k=1,
)
print("✅ pass@1 =", pass_at_k)

Program output: Process ID: [pid]1234, Message: [msg]formatted message, Severity: [sev]5
✅ pass@1 = 1.0
