In [None]:
!ls -la
!pip install -qqq cohere

In [None]:
## IMPORTS
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import cohere
import json
import os
import re
import csv

In [None]:

# Input root (scanned for per-model sub-folders), output folder, and the label
# used for the judge in result rows and output file names.
folder_path = "data/code_summaries"
output_dir = "results"
judge_model_name = "cohere"

os.makedirs(output_dir, exist_ok=True)

# Human-readable error/skip messages collected during the run; written to a
# log file in `output_dir` at the end.
error_log = []

# Never hardcode credentials in a notebook: read the key from the environment.
# NOTE: the key that was previously committed here must be treated as leaked
# and rotated in the Cohere dashboard.
_cohere_api_key = os.environ.get("CO_API_KEY")
if not _cohere_api_key:
    raise RuntimeError(
        "Cohere API key not found. Set the CO_API_KEY environment variable "
        "before running this notebook."
    )

co = cohere.ClientV2(api_key=_cohere_api_key)


In [None]:
# Query function
def query_cohere(prompt):
    """Send ``prompt`` as a single user message to the Cohere chat endpoint.

    Uses the module-level client ``co`` with the ``command-a-03-2025`` model
    and returns the text of the first content item of the reply, stripped of
    leading/trailing whitespace.
    """
    user_turn = {"role": "user", "content": prompt}
    reply = co.chat(model="command-a-03-2025", messages=[user_turn])
    first_part = reply.message.content[0]
    return first_part.text.strip()


def extract_json_block(text):
    """Return the first brace-balanced ``{...}`` substring of ``text``.

    Scans left to right, starting a candidate at the first ``{`` seen at depth
    zero and returning as soon as its matching ``}`` is reached. Braces that
    appear inside double-quoted string literals of the candidate (including
    strings containing escaped quotes) are ignored, so a value such as
    ``"}"`` does not end the block early.

    Args:
        text: Arbitrary text that may contain a JSON object.

    Returns:
        The substring from the opening to the matching closing brace, or
        ``None`` if no balanced block exists. The substring is not validated
        as JSON.
    """
    depth = 0
    start = None
    in_string = False
    escaped = False

    for i, c in enumerate(text):
        if in_string:
            # Inside a string literal only the closing quote matters; a
            # backslash protects the character that follows it.
            if escaped:
                escaped = False
            elif c == '\\':
                escaped = True
            elif c == '"':
                in_string = False
            continue

        if c == '"' and depth:
            # Quotes are only meaningful inside a candidate block; quotes in
            # surrounding prose must not swallow the JSON that follows.
            in_string = True
        elif c == '{':
            if depth == 0:
                start = i
            depth += 1
        elif c == '}' and depth:
            depth -= 1
            if depth == 0:
                return text[start:i + 1]
    return None

def safe_json_extract(text):
    """Best-effort parse of the first JSON object found in ``text``.

    Strips a leading/trailing markdown code fence, locates the first
    brace-balanced block via ``extract_json_block`` and decodes it with
    ``json.loads``. Any failure is printed and reported as ``None`` rather
    than raised.

    Returns:
        The decoded JSON value, or ``None`` when no block is found or decoding
        fails.
    """
    try:
        unfenced = re.sub(r"^```(?:json)?", "", text.strip()).strip()
        unfenced = re.sub(r"```$", "", unfenced).strip()

        candidate = extract_json_block(unfenced)
        if not candidate:
            print("[Extraction Failed] No JSON block found.")
            return None
        return json.loads(candidate)
    except json.JSONDecodeError as err:
        print(f"[JSONDecodeError] {err}")
    except Exception as err:
        print(f"[Unexpected Error] {err}")
    return None



# Clean markdown-style code fences
def clean_response_text(text):
    """Strip markdown code fences and return the first JSON object in ``text``.

    The first substring that decodes as a complete JSON object is returned,
    so nested objects are kept whole (a non-greedy ``{.*?}`` regex would cut
    them at the first closing brace). If nothing decodes, the function falls
    back to the first ``{...}`` regex match, and finally to the cleaned text
    itself.

    Args:
        text: Raw model reply.

    Returns:
        The extracted JSON text, or the stripped input if no brace-delimited
        span is present.
    """
    text = text.strip()

    # Remove markdown code fences (``` or ```json).
    if text.startswith("```"):
        text = re.sub(r"^```(?:json)?", "", text).strip()
        text = re.sub(r"```$", "", text).strip()

    # Try every opening brace until one starts a valid JSON object.
    decoder = json.JSONDecoder()
    for opening in re.finditer(r"\{", text):
        try:
            _, end = decoder.raw_decode(text, opening.start())
        except json.JSONDecodeError:
            continue
        return text[opening.start():end]

    # Fallback for malformed JSON: shortest brace-delimited span.
    match = re.search(r"{.*?}", text, re.DOTALL)
    if match:
        return match.group(0)
    return text

In [None]:
# Prompt: compare English vs non-English
def build_comparison_prompt(code, eng_summary, target_summary, target_language):
    """Build the judge prompt that scores an English and a non-English summary side by side.

    The prompt embeds the code snippet, the English reference summary and the
    ``target_language`` summary, asks for 1-5 scores on accuracy,
    completeness, terminology fidelity and language quality plus an overall
    score, and instructs the model to answer with a single JSON object holding
    ``score_english`` and ``score_non_english``.

    Doubled braces in the template are f-string escapes that render as literal
    ``{`` / ``}`` in the example JSON.

    Args:
        code: Source code being summarised.
        eng_summary: English (reference) summary.
        target_summary: Summary written in ``target_language``.
        target_language: Language name; shown as-is and upper-cased in the
            section header.

    Returns:
        The complete prompt string.
    """
    return f"""
You are a multilingual software expert evaluating two summaries of the same code snippet:

    One is in English (reference).
    One is in {target_language}.

--- CODE ---
{code}

--- ENGLISH SUMMARY ---
{eng_summary}

--- {target_language.upper()} SUMMARY ---
{target_summary}

Evaluate both summaries on:

    Accuracy
    Completeness
    Terminology Fidelity
    Language Quality

INSTRUCTIONS:

    Give each a score from 1 to 5 per criterion
    Calculate overall scores (1–5)

⚠️ RESPONSE RULES (STRICT):

    Respond ONLY with a valid JSON object
    DO NOT add explanations, markdown, or comments
    Response MUST start and end with curly braces

EXAMPLE FORMAT:
{{
  "score_english": {{
    "accuracy": 5,
    "completeness": 5,
    "terminology": 4,
    "language_quality": 5,
    "overall_score": 5
  }},
  "score_non_english": {{
    "accuracy": 4,
    "completeness": 4,
    "terminology": 3,
    "language_quality": 4,
    "overall_score": 4
  }}
}}

REPEAT: Only respond with the JSON shown above.
"""



# Prompt: evaluate only non-English summary
def build_single_summary_prompt(code, target_summary, target_language):
    """Build the judge prompt that scores only the non-English summary.

    The prompt embeds the code snippet and the ``target_language`` summary,
    lists the four evaluation criteria (accuracy, completeness, terminology
    fidelity, language quality) and instructs the model to answer with a
    single flat JSON object shaped like the example, which also contains an
    ``overall_score`` field.

    Doubled braces in the template are f-string escapes that render as literal
    ``{`` / ``}`` in the example JSON.

    Args:
        code: Source code being summarised.
        target_summary: Summary written in ``target_language``.
        target_language: Language name inserted into the prompt text.

    Returns:
        The complete prompt string.
    """
    return f"""
You are a multilingual software expert evaluating a summary written in {target_language} for the following code.

--- CODE ---
{code}

--- SUMMARY ({target_language}) ---
{target_summary}

Evaluate the summary on:

    Accuracy
    Completeness
    Terminology Fidelity
    Language Quality

⚠️ RESPONSE RULES (STRICT):

    Respond ONLY with a JSON object (no explanation)
    Start and end with curly braces

EXAMPLE FORMAT:
{{
  "accuracy": 4,
  "completeness": 5,
  "terminology": 4,
  "language_quality": 5,
  "overall_score": 4
}}

REPEAT: No text or markdown. Only the JSON.
"""




In [None]:
# ─── Evaluator ───────────────────────────────────────────────────
def evaluate_entry(task):
    """Run the comparison and the single-summary judge evaluations for one task.

    Args:
        task: Tuple ``(entry, model_folder, filename, code_lang, idx, key,
            target_language, model_fn)``. ``entry`` is the sample dict,
            ``key`` is the ``summary_<language>`` field to evaluate and
            ``model_fn`` maps a prompt string to the judge's raw text reply.

    Returns:
        ``{"comparison": row_or_None, "single": row_or_None}``. Each row is a
        flat dict of sample metadata plus the judge's scores; a value is
        ``None`` when that evaluation failed.

    Errors raised inside either evaluation (API failure, unparsable reply,
    missing score keys) are caught and appended to the module-level
    ``error_log`` together with up to 300 characters of the raw reply, so one
    failing evaluation never prevents the other from running.
    """
    entry, model_folder, filename, code_lang, idx, key, target_language, model_fn = task
    results = {"comparison": None, "single": None}
    code = entry.get("code", "")
    english_summary = entry.get("summary_english", "")
    target_summary = entry[key]
    sample_id = entry.get("id", f"{code_lang}_{idx}")
    model_name = entry.get("model_name", model_folder)

    # Metadata columns shared by both result rows (insertion order defines
    # the CSV column order downstream).
    base_row = {
        "sample_id": sample_id,
        "model_folder_name": model_folder,
        "model_name": model_name,
        "programming_language": code_lang,
        "language": target_language,
        "reference_summary": english_summary,
        "generated_summary": target_summary,
        "judge_model": judge_model_name,
    }
    score_fields = ("accuracy", "completeness", "terminology", "language_quality", "overall_score")

    # ── Comparison: English reference vs. target-language summary ──
    # Pre-bind the reply so the error handler can always reference it, even
    # when the failure happens before the model has answered.
    response_comp = None
    try:
        prompt_comp = build_comparison_prompt(code, english_summary, target_summary, target_language)
        response_comp = model_fn(prompt_comp)
        parsed_comp = safe_json_extract(response_comp)
        if not parsed_comp:
            raise RuntimeError(f"[Parsing Failed] Malformed JSON for sample {sample_id}")

        comparison_row = dict(base_row)
        for prefix, section in (("llm_eng", "score_english"), ("llm_mt", "score_non_english")):
            for field in score_fields:
                comparison_row[f"{prefix}_{field}"] = parsed_comp[section][field]
        results["comparison"] = comparison_row
    except Exception as e:
        raw_comp = "" if response_comp is None else str(response_comp)[:300]
        error_log.append(f"[Comparison ERROR] sample_id={sample_id} lang={target_language} → {str(e)}\nRAW_RESPONSE:\n{repr(raw_comp)}")

    # ── Single: target-language summary judged on its own ──
    response_single = None
    try:
        prompt_single = build_single_summary_prompt(code, target_summary, target_language)
        response_single = model_fn(prompt_single)
        parsed_single = safe_json_extract(response_single)
        if not parsed_single:
            raise RuntimeError(f"[Parsing Failed] Malformed JSON for sample {sample_id}")

        single_row = dict(base_row)
        for field in score_fields:
            single_row[f"llm_single_{field}"] = parsed_single[field]
        results["single"] = single_row
    except Exception as e:
        raw_single = "" if response_single is None else str(response_single)[:300]
        error_log.append(f"[Single ERROR] sample_id={sample_id} lang={target_language} → {str(e)}\nRAW_RESPONSE:\n{repr(raw_single)}")

    return results

# ─── Task Collection ─────────────────────────────────────────────
# Build one task tuple per (sample, non-English summary field) found under
# `folder_path/<model_folder>/<file>.json`.
tasks = []
for model_folder in os.listdir(folder_path):
    model_path = os.path.join(folder_path, model_folder)
    if not os.path.isdir(model_path):
        continue

    for filename in os.listdir(model_path):
        # Only individual .json files; the combined dump is skipped.
        if not (filename.endswith(".json") and "all_languages_combined" not in filename):
            continue

        # The part of the file name before the first underscore is used as
        # the programming-language label.
        code_lang = filename.partition('_')[0]
        file_path = os.path.join(model_path, filename)

        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        for idx, entry in enumerate(data):
            if not (entry.get("code") and entry.get("summary_english")):
                sample_id = entry.get("id", f"{code_lang}_{idx}")
                error_log.append(f"[Skipped] Missing code or English summary → {sample_id}")
                continue

            for key in entry:
                if key == "summary_english" or not key.startswith("summary_"):
                    continue
                target_language = key.replace("summary_", "")
                tasks.append((entry, model_folder, filename, code_lang, idx, key, target_language, query_cohere))


In [None]:

# ─── Evaluation Execution ────────────────────────────────────────

comparison_results = {judge_model_name: []}
single_results = {judge_model_name: []}

# Single source of truth for the pool size so the progress message cannot
# drift from the value actually passed to the executor.
MAX_WORKERS = 50

print(f"\n🚀 Starting evaluation of {len(tasks)} entries using {MAX_WORKERS} threads...")

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    futures = [executor.submit(evaluate_entry, task) for task in tasks]

    # Collect rows as tasks finish; completion order is not submission order.
    for future in tqdm(as_completed(futures), total=len(tasks), desc="Evaluating", ncols=100):
        try:
            result = future.result()
            if result["comparison"]:
                comparison_results[judge_model_name].append(result["comparison"])
            if result["single"]:
                single_results[judge_model_name].append(result["single"])
        except Exception as e:
            # Anything evaluate_entry did not handle itself is logged, not raised,
            # so one bad task does not abort the whole run.
            error_log.append(f"[Unhandled Future Exception] → {str(e)}")


# ─── Save CSV Results ────────────────────────────────────────────
def save_csv(path, rows):
    """Write ``rows`` (a sequence of dicts) to ``path`` as a UTF-8 CSV file.

    Column names and their order are taken from the keys of the first row.
    When ``rows`` is empty nothing is written and no file is created.
    """
    if not rows:
        return

    columns = list(rows[0])
    with open(path, 'w', encoding='utf-8', newline='') as handle:
        out = csv.DictWriter(handle, fieldnames=columns)
        out.writeheader()
        for row in rows:
            out.writerow(row)
    print(f"Saved: {path}")

# One CSV per evaluation mode, named after the judge model.
comparison_csv_path = os.path.join(output_dir, f"comparison_{judge_model_name}.csv")
save_csv(comparison_csv_path, comparison_results[judge_model_name])

single_csv_path = os.path.join(output_dir, f"single_{judge_model_name}.csv")
save_csv(single_csv_path, single_results[judge_model_name])

# ─── Save Error Log ──────────────────────────────────────────────
if not error_log:
    print("No errors logged.")
else:
    error_log_path = os.path.join(output_dir, f"llm_judge_errors_{judge_model_name}.log")
    with open(error_log_path, "w", encoding="utf-8") as f:
        # One log entry per line (entries may themselves contain newlines).
        f.writelines(line + "\n" for line in error_log)
    print(f"Errors encountered. Logged to: {error_log_path}")

In [None]:
# Display the comparison results written above. The path is derived from the
# config values (instead of being hardcoded) and the file is read as UTF-8,
# matching how save_csv wrote it; the platform default encoding can fail on
# the non-English summaries.
comparison_csv = os.path.join(output_dir, f"comparison_{judge_model_name}.csv")
if os.path.exists(comparison_csv):
    with open(comparison_csv, encoding="utf-8") as f:
        print(f.read())
else:
    # save_csv writes nothing when there are no rows, so the file may be absent.
    print(f"No comparison results found at: {comparison_csv}")
