In [None]:
import os
import re
from pathlib import Path
import json

# Folders compared by this notebook, given relative to the working directory:
# the first holds the generated pipeline files, the second the references.
generated_dir, reference_dir = (
    Path("kfp_eval_samples"),
    Path("saved_kfp_files"),
)

def normalize_name(name):
    """Return a canonical, separator-free key for a file name.

    The name is lower-cased, a trailing ``.py`` extension is dropped, and
    every ``-``, ``_`` and ``.`` character is removed, so that spellings such
    as ``My-Pipeline_v1.py`` and ``my_pipeline.v1`` map to the same key.

    Args:
        name: File name or stem as a string.

    Returns:
        The normalized key as a string.
    """
    # `\.py$` has to come first in the alternation: the regex engine tries
    # alternatives left to right, so with the character class first the "."
    # of a trailing ".py" was consumed on its own and "py" stayed in the key.
    return re.sub(r'\.py$|[-_.]', '', name.lower())

def create_name_map(folder):
    """Map normalized file stems to paths for the ``*.py`` files in ``folder``.

    Only files directly inside ``folder`` are considered (non-recursive glob).

    Args:
        folder: Directory to scan; must provide ``glob`` like ``pathlib.Path``.

    Returns:
        dict mapping ``normalize_name(file.stem)`` to the file path as a string.
        When several files normalize to the same key, the one that sorts last
        is kept and a warning is emitted.
    """
    import warnings  # local import so the block does not depend on a new top-level import

    name_map = {}
    # glob() gives no ordering guarantee; sorting makes the winner of a key
    # collision the same on every run instead of depending on the filesystem.
    for file in sorted(folder.glob("*.py")):
        norm = normalize_name(file.stem)
        if norm in name_map:
            warnings.warn(
                f"Files {name_map[norm]!r} and {str(file)!r} both normalize to "
                f"{norm!r}; keeping the latter.",
                stacklevel=2,
            )
        name_map[norm] = str(file)
    return name_map

# Index both folders by normalized file name.
generated_map = create_name_map(generated_dir)
reference_map = create_name_map(reference_dir)

# A pair is matched when the same normalized name exists in both folders.
common_keys = set(generated_map.keys()) & set(reference_map.keys())

print(f"✅ Found {len(common_keys)} matching files.")

# Save to JSON for the second script.
# Iterating the set directly would write the records in an order that changes
# between kernel runs (string hashing is randomized), so the keys are sorted to
# keep matched_files.json reproducible.
with open("matched_files.json", "w") as f:
    json.dump([
        {"key": key, "generated": generated_map[key], "reference": reference_map[key]}
        for key in sorted(common_keys)
    ], f, indent=2)


In [None]:
import json
import re
from pathlib import Path
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

# Read the list of matched file records back from matched_files.json.
matched_files = json.loads(Path("matched_files.json").read_text())

def remove_comments_and_docstrings(code):
    """Strip full-line comments and triple-quoted strings from Python source.

    This is a regex-based approximation, not a parser:

    * Lines whose first non-blank character is ``#`` are deleted together with
      their newline. Trailing comments after code are left in place.
    * Every ``\"\"\"...\"\"\"`` and ``'''...'''`` span is deleted, whether or not
      it is actually a docstring.

    Args:
        code: Source text.

    Returns:
        The source text with those spans removed.
    """
    # Allow tabs as well as spaces before "#": with only " *" a tab-indented
    # comment line was not recognised and stayed in the output.
    code = re.sub(r'(?m)^[ \t]*#.*\n?', '', code)
    # Non-greedy so that each triple-quoted span ends at its nearest closer.
    code = re.sub(r'"""[\s\S]*?"""', '', code)
    code = re.sub(r"'''[\s\S]*?'''", '', code)
    return code

# method1 smoothing keeps the score from collapsing to zero when some n-gram
# order has no overlap at all.
smooth = SmoothingFunction().method1
references = []
hypotheses = []

for item in matched_files:
    # Each record carries the paths of one generated file and its reference.
    with open(item["generated"], "r", encoding="utf-8") as f:
        gen = remove_comments_and_docstrings(f.read())
    with open(item["reference"], "r", encoding="utf-8") as f:
        ref = remove_comments_and_docstrings(f.read())

    # Tokens are whitespace-separated chunks of the stripped source.
    gen_tokens = gen.split()
    ref_tokens = ref.split()

    hypotheses.append(gen_tokens)
    references.append([ref_tokens])  # Corpus BLEU expects a list of reference lists

# Compute a single BLEU score over all pairs.
# An empty corpus has no meaningful BLEU and can fail inside NLTK with an
# opaque ZeroDivisionError, so that case is reported explicitly instead.
if hypotheses:
    score = corpus_bleu(references, hypotheses, smoothing_function=smooth)
    print(f"Overall BLEU score for Qwen-generated KFPs: {round(score, 4)}")
else:
    score = None
    print("⚠️ No matched file pairs to score; BLEU was not computed.")
