## Code review comment augmentation for training and validation datasets

In [8]:
from google.colab import userdata
import concurrent.futures
import json
import os
import requests
from typing import Dict, List, Optional

# DeepSeek API settings. The key is read through google.colab.userdata
# instead of being hard-coded in the notebook.
DEEPSEEK_API_KEY = userdata.get('DEEPSEEK_API_KEY')
DEEPSEEK_API_URL = "https://api.deepseek.com/chat/completions"
DEEPSEEK_MODEL_ID = "deepseek-chat"

# Input dataset and output file. The loading cells parse DATA_PATH one JSON
# object per line; results are appended to OUTPUT_PATH one JSON object per line.
DATA_PATH   = "train-java-with-code-with-pmd.json" # or "valid-java-with-code-with-pmd.json"
OUTPUT_PATH = "train_final.jsonl" # or "valid_final.jsonl"

Teacher prompt to augment and filter real code review comments

In [None]:
def construct_teacher_prompt_messages(patch, real_human_review, pmd_warnings, patched_code):
    """Build the two-message chat payload (system + user) for the teacher model.

    The system message carries the fixed reviewing instructions. The user
    message embeds the four arguments verbatim, each under its own heading.

    Args:
        patch: Code diff text.
        real_human_review: Human review comment text (or a placeholder string).
        pmd_warnings: Static-analysis warnings text (or a placeholder string).
        patched_code: Final patched code text.

    Returns:
        A list of two dicts with "role" and "content" keys, system message first.
    """
    teacher_instructions = """
    You are a Java Code-Review Assistant. You’ll be given:

    1.  A Code Patch (Diff)
    2.  An optional Review Comment (from a human reviewer)
    3.  Optional Static Analysis Warnings
    4.  The Final Patched Code

    **Core Objective:** Your primary role is to identify and help address **actual code defects**. A code defect is an issue that negatively impacts code correctness, robustness, performance, security, or represents a significant deviation from essential programming best practices. Focus exclusively on these defects.

    **Guidelines for All Comments & Analysis:**
    * **Defect-Centric:** All feedback must target a specific code defect. Do not comment on purely stylistic preferences or generic observations.
    * **Static Analysis Warnings:**
        * A warning is relevant *only if* it directly pertains to changed lines AND clearly indicates a **code defect** as defined above.
        * Always ignore warnings that are generic, false positives, or do not highlight a tangible defect.
        * When referencing a valid warning's substance, omit tool or rule names.
    * **Output Standards:** Comments must be polite, professional, precise, actionable, and formatted in Markdown.

    **Evaluation Process:**

    **A. If a Human Review Comment IS PROVIDED:**

    1.  **Classify the Human Comment:**
        * If it is clearly from the patch author or is an abstract question not highlighting a defect: Output `NotRelevant` and stop.
        * If it appears to be genuine reviewer feedback about a potential defect: Proceed to step A.2.

    2.  **Rewrite the Human Comment:**
        * Refine the comment according to the **Output Standards** and **Defect-Centric** guideline.
        * If the human comment mentions or could be supported by a static analysis warning, ensure this warning meets all criteria under **Guidelines for Static Analysis Warnings** before incorporating its essence.
        * Output *only* your rewritten Markdown comment.

    **B. If NO Human Review Comment IS PROVIDED:**

    1.  **Conduct Your Own Review:**
        * Examine the code patch, final patched code, and any static analysis warnings.
        * Identify any **code defects** based on the **Core Objective** and **Guidelines for Static Analysis Warnings**.

    2.  **Formulate Your Comment:**
        * If you identify one or more actionable **code defects**: Write a single, consolidated professional Markdown review comment. You may use bullet points if addressing multiple related defects within this single comment.
        * If no **code defects** are identified: Output `NoComment`.

    **Allowed Outputs (Strictly one of the following for any given task):**
    * `NotRelevant`
    * `NoComment`
    * A single Markdown-formatted review comment (no extra headings or explanations beyond the comment itself).

    Do not output anything else.
    """

    task_text = f"""Please generate an "Enhanced Review Comment" based on the following inputs:
      ## Code Diff
      {patch}

      ## Original Human Review
      {real_human_review}

      ## PMD Warnings
      {pmd_warnings}

      ## Patched Code
      {patched_code}

      Response:
      """

    # System instructions first, then the per-sample task description.
    return [
        {"role": "system", "content": teacher_instructions},
        {"role": "user", "content": task_text},
    ]

API helper

In [4]:
def call_deepseek_api_sync(
    messages: List[Dict[str, str]],
    model_id: Optional[str] = None,
    api_key: Optional[str] = None,
    api_url: Optional[str] = None,
    timeout: int = 180,
) -> Optional[Dict]:
    """Send one synchronous chat-completion request to the DeepSeek API.

    Args:
        messages: Chat messages, each a dict with "role" and "content".
        model_id: Model name; defaults to ``DEEPSEEK_MODEL_ID``.
        api_key: Bearer token; defaults to ``DEEPSEEK_API_KEY``.
        api_url: Endpoint URL; defaults to ``DEEPSEEK_API_URL``.
        timeout: Request timeout in seconds, passed to ``requests.post``.

    Returns:
        The decoded JSON response, or ``None`` if the request timed out,
        failed, returned an HTTP error status, or did not contain valid JSON.

    Raises:
        ValueError: If no API key is available.
    """
    # Resolve the configuration at call time rather than at definition time.
    # Default-argument values are frozen when the `def` runs, so re-running the
    # config cell (e.g. after setting the key) would otherwise be ignored until
    # this cell is re-executed as well.
    if model_id is None:
        model_id = DEEPSEEK_MODEL_ID
    if api_key is None:
        api_key = DEEPSEEK_API_KEY
    if api_url is None:
        api_url = DEEPSEEK_API_URL

    if not api_key:
        raise ValueError("DEEPSEEK_API_KEY is not set.")

    headers = {"Content-Type": "application/json",
               "Authorization": f"Bearer {api_key}"}

    payload = {
        "model": model_id,
        "messages": messages,
        "max_tokens": 1500,
        "temperature": 0.3,
        "top_p": 0.95,
    }

    try:
        response = requests.post(api_url, headers=headers, json=payload, timeout=timeout)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.Timeout:
        # Timeout is a RequestException subclass, so it must be handled first.
        print(f"Timeout after {timeout}s.")
    except requests.exceptions.RequestException as err:
        print(f"Request error: {err}")
    except ValueError as err:
        # Older `requests` versions raise a plain ValueError when the body is
        # not valid JSON; treat that as a failed call instead of crashing.
        print(f"Invalid JSON in response: {err}")
    return None

Utility functions

In [5]:
def number_lines(code: str) -> str:
    """Prefix each line of ``code`` with its 1-based, right-aligned line number.

    The number column is as wide as the largest line number, followed by
    `` | `` and the original line text. Lines are joined with ``\\n``.
    """
    rows = code.splitlines()
    pad = len(str(len(rows)))
    numbered = []
    for lineno, text in enumerate(rows, start=1):
        numbered.append(f"{lineno:>{pad}} | {text}")
    return "\n".join(numbered)

def print_samples(samples: List[Dict], start: int = 0, end: int = 2) -> None:
    """Pretty-print the entries ``samples[start]`` through ``samples[end]`` (inclusive).

    Args:
        samples: Sample dicts; the keys "id", "patch", "code", "pmdWarnings"
            and "realReview" are read, and missing keys are tolerated.
        start: Index of the first sample to print.
        end: Index of the last sample to print; clamped to the last valid
            index so short lists do not raise ``IndexError``.
    """
    last = min(end, len(samples) - 1)
    for idx in range(start, last + 1):
        s = samples[idx]
        print(f"\n— Sample {idx+1} (ID: {s.get('id')}) —")
        # `or ""` also covers keys that are present but hold None, which
        # `.get(key, "")` alone would pass straight to `.strip()`.
        print("Patch:\n", (s.get("patch") or "").strip())
        print("\nPatched Code:\n", (s.get("code") or "").strip())
        print("\nPMD Warnings:\n", s.get("pmdWarnings", ""))
        print("\nReal Review:\n", (s.get("realReview") or "").strip())

JSONL helpers

In [6]:
def load_existing_results(path: str) -> Dict[str, Dict]:
    """Load previously saved records from a JSONL file, keyed by their "id".

    Blank lines are ignored. Lines that are not valid JSON (for example a
    line truncated by an interrupted run) are reported and skipped so that
    resuming is still possible. If an id occurs more than once, the last
    record wins.

    Args:
        path: Path of the JSONL file.

    Returns:
        Mapping from record id to the full record; empty if ``path`` does
        not exist.

    Raises:
        KeyError: If a valid record has no "id" key.
    """
    if not os.path.exists(path):
        return {}

    results: Dict[str, Dict] = {}
    with open(path, encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            if not line.strip():
                continue
            try:
                record = json.loads(line)  # parse each line once, not twice
            except json.JSONDecodeError as err:
                print(f"Skipping malformed line {line_no} in {path}: {err}")
                continue
            results[record["id"]] = record
    return results

def save_result_to_jsonl(path: str, record: Dict) -> None:
    """Append ``record`` to ``path`` as one JSON line (UTF-8, non-ASCII kept as-is)."""
    with open(path, mode="a", encoding="utf-8") as sink:
        serialized = json.dumps(record, ensure_ascii=False)
        sink.write(f"{serialized}\n")

In [9]:
# Read the raw dataset; it is parsed as one JSON object per line.
with open(DATA_PATH, "r", encoding="utf-8") as f:
    lines = f.readlines()

# Skip blank lines (e.g. a trailing empty line at end of file), which
# json.loads would otherwise reject.
parsed_samples = [json.loads(line) for line in lines if line.strip()]

# Keep only the fields of interest for the first three records (preview only).
new_samples = [
    {
        "id": entry.get("id"),
        "patch": entry.get("patch"),
        "code": entry.get("code"),
        "pmdWarnings": entry.get("pmdWarnings"),
        "realReview": entry.get("realReview"),
    }
    for entry in parsed_samples[:3]
]

In [None]:
# Preview the samples collected above; print_samples defaults to indices 0-2.
print_samples(new_samples)

Sample processing

In [None]:
def process_sample(sample: Dict) -> Optional[Dict]:
    """Ask the teacher model for a review of one sample and attach it.

    Builds the prompt from the sample's "patch", "realReview", "pmdWarnings"
    and "code" fields, calls the DeepSeek API, and stores the returned text
    under "teacherReview". Missing or ``None`` fields are treated as empty,
    since the dataset loaders populate them with ``dict.get``.

    Args:
        sample: Sample dict; it is modified in place.

    Returns:
        The same ``sample`` dict with "teacherReview" set, or ``None`` if the
        API call produced no response.
    """
    real_review = (sample.get("realReview") or "").strip() or "Comment not provided"

    raw_warnings = sample.get("pmdWarnings")
    if isinstance(raw_warnings, list):
        warnings_text = "\n".join(str(w) for w in raw_warnings)
    elif raw_warnings is None:
        warnings_text = ""
    else:
        warnings_text = str(raw_warnings)
    pmd_warnings = warnings_text.strip() or "No warnings"

    messages = construct_teacher_prompt_messages(
        sample.get("patch") or "",
        real_review,
        pmd_warnings,
        number_lines(sample.get("code") or ""),
    )

    response = call_deepseek_api_sync(messages)
    if not response:
        return None

    # Be defensive about the response shape: "choices" may be missing or
    # empty, and "message" / "content" may be absent or null.
    choices = response.get("choices") or [{}]
    message = choices[0].get("message") or {}
    review = (message.get("content") or "").strip()

    sample["teacherReview"] = review or "No review generated"
    return sample

Main pipeline

In [None]:
# 1. Load dataset (one JSON object per line; blank lines are skipped)
with open(DATA_PATH, encoding="utf-8") as f:
    parsed_samples = [json.loads(line) for line in f if line.strip()]

samples = [
    {
        "id": s.get("id"),
        "patch": s.get("patch"),
        "code": s.get("code"),
        "pmdWarnings": s.get("pmdWarnings"),
        "realReview": s.get("realReview"),
    }
    for s in parsed_samples
]

# 2. Skip already processed IDs
existing = load_existing_results(OUTPUT_PATH)
todo     = [s for s in samples if s["id"] not in existing]

print(f"Unprocessed samples: {len(todo)}")

# 3. Run enrichment through a thread pool. With max_workers=1 the requests
#    run one at a time; raise the value to process samples concurrently.
processed = 0
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
    futures = {pool.submit(process_sample, s): s["id"] for s in todo}
    for fut in concurrent.futures.as_completed(futures):
        try:
            result = fut.result()
        except Exception as err:
            # fut.result() re-raises whatever process_sample raised. Without
            # this guard one bad sample would end the loop while the pool
            # still ran the remaining tasks, whose results would be lost.
            print(f"✗ Failed: {futures[fut]} ({err!r})")
            continue
        if result and "teacherReview" in result:
            save_result_to_jsonl(OUTPUT_PATH, result)
            processed += 1
            print(f"✓ [{processed}/{len(todo)}] Saved: {result['id']}")
        else:
            print(f"✗ Failed: {futures[fut]}")