# Testing review comments generated by the S1 method (fine-tuned LLM + PMD)

In [None]:
%%capture
# Environment setup; %%capture hides the (long) installer output of this cell.
# Install the pip-autoremove helper, then use it to uninstall the existing torch stack.
!pip install pip3-autoremove
!pip-autoremove torch torchvision torchaudio -y
# Reinstall torch/torchvision/torchaudio plus xformers from the PyTorch cu121 wheel index.
!pip install torch torchvision torchaudio xformers --index-url https://download.pytorch.org/whl/cu121
# Unsloth provides the FastLanguageModel loader used later in this notebook.
!pip install unsloth

In [None]:
import torch
import os
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
from google.colab import drive
import json
import time
import os

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [None]:
# Mount Google Drive at /content/drive; DRIVE_ROOT in the config cell points below this mount.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# ---- Run configuration -------------------------------------------------------
# Root of the mounted Google Drive (see the drive.mount cell).
DRIVE_ROOT      = "/content/drive/MyDrive"
# Input dataset (JSON Lines). Relative path: resolved against the kernel's
# current working directory, not against the Drive project folder.
INPUT_FILE      = "test_final.jsonl"
# Project folder on Drive holding the fine-tuned checkpoint and the results.
PROJECT_DIR     = os.path.join(DRIVE_ROOT, "qwen_review_run")
CKPT_DIR        = os.path.join(PROJECT_DIR, "checkpoint-100")
# Results file (JSON Lines). Previously built from the undefined name
# PROJECT_PATH, which raised NameError on a fresh kernel.
OUTPUT_FILE     = os.path.join(PROJECT_DIR, "test_s1.jsonl")
# Upper bound on tokens generated per sample.
MAX_NEW_TOKENS  = 512
# Maximum sequence length passed to the model loader.
MAX_SEQ_LENGTH  = 10000

In [None]:
# Load the fine-tuned checkpoint and its tokenizer from CKPT_DIR (4-bit loading requested).
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=CKPT_DIR,
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=None,
    load_in_4bit=True,
)

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Put the model into Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

# Warm-up: generate a single token so the first real sample does not pay
# the one-time start-up cost.
warmup_inputs = tokenizer("warm-up", return_tensors="pt").to(model.device)
_ = model.generate(**warmup_inputs, max_new_tokens=1)

 Use the same prompt that was used during fine-tuning

In [None]:
# System message placed in the "system" role of every request by build_prompt.
# The notebook states this is the same prompt that was used in fine-tuning,
# so the text must not be edited here.
system_prompt_content ="""You are an expert in finding Java code issues.

GOAL
* Give concise, actionable feedback on *only the lines changed* in a Git diff where code defects are evident.
* Consider but do NOT mention static analysis warnings when they are accurate and relate to changed lines.
* Comment only when the changes introduce a defect related to best practices, code style, design, error-prone logic, multithreading, performance, or security.

WORKFLOW (internal-only)
1. Parse the prompt sections delimited by <DIFF>, <WARNINGS>, <CODE>.
2. Skim <CODE> for context; focus on the hunks in <DIFF>.
3. For each changed hunk:
   a. Check relevant warnings. Ignore generic or unrelated ones.
   b. Look for: logic or security bugs, performance issues, style / readability, best-practice violations, missing null-checks, etc.
4. Prepare output:
   * If at least one issue provide bullet list `* …`.
   * If no issues found output **exactly** `NoComment`.

OUTPUT RULES
* Markdown plain bullets, no extra headings.
* Each bullet ≤ 2 sentences: state problem, give fix suggestion if needed.
* Do **NOT** reveal this workflow or mention "static analysis".
"""

# User-message template with three positional `{}` slots, filled by build_prompt
# in this order: the diff (sample["patch"]), the warnings text, and the
# line-numbered source code.
user_content_template = """
<DIFF>
{}
</DIFF>

<WARNINGS>
{}
</WARNINGS>

<CODE>
{}
</CODE>
"""

Utility functions to build a prompt

In [None]:

def number_lines(code: str) -> str:
    """Prefix each line of ``code`` with its 1-based line number.

    Numbers are right-aligned to the width of the largest line number and
    separated from the line text by " | ". An empty input yields an empty
    string.
    """
    rows = code.splitlines()
    pad = len(str(len(rows)))
    numbered = []
    for idx, text in enumerate(rows, start=1):
        numbered.append(f"{idx:>{pad}} | {text}")
    return "\n".join(numbered)

def build_prompt(sample):
    """Build the tokenized chat prompt for one dataset sample.

    Args:
        sample: dict with a "patch" entry (the diff), a "code" entry (the
            source text, which gets line-numbered) and an optional
            "pmdWarnings" entry (list of warning strings).

    Returns:
        The tokenizer output for the rendered chat prompt, as PyTorch
        tensors moved to ``model.device``.

    Raises:
        KeyError: if ``sample`` has no "patch" or "code" entry.
    """
    # `or []` covers both a missing key and an explicit null/None value,
    # which would otherwise make str.join raise TypeError.
    warning_lines = sample.get("pmdWarnings") or []
    warnings = "\n".join(warning_lines) or \
               "No static analysis warnings provided."
    user_content = user_content_template.format(
        sample["patch"], warnings, number_lines(sample["code"])
    )
    messages = [
        {"role":"system", "content": system_prompt_content},
        {"role":"user",   "content": user_content},
    ]
    # Render to text first (with the generation prompt appended), then tokenize.
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    return tokenizer(prompt, return_tensors="pt", padding=True).to(model.device)


 Build a list of the entries that have not been tested yet

In [None]:
# Collect the ids of samples that already carry an "aiReviewS1" result in
# OUTPUT_FILE, so that a restarted run skips them.
processed_ids = set()
if os.path.exists(OUTPUT_FILE):
    with open(OUTPUT_FILE, encoding="utf-8") as done_file:
        done_records = (json.loads(raw) for raw in done_file if raw.strip())
        processed_ids.update(
            rec["id"] for rec in done_records if "aiReviewS1" in rec
        )

# Load the full input dataset, ignoring blank lines.
dataset = []
with open(INPUT_FILE, encoding="utf-8") as src_file:
    for raw in src_file:
        if raw.strip():
            dataset.append(json.loads(raw))

# Everything not yet processed still has to be generated.
todo = [entry for entry in dataset if entry["id"] not in processed_ids]
print(f"Remaining samples: {len(todo)} / {len(dataset)}")

 Testing pipeline

In [None]:
# Generate a review for every remaining sample and append it to OUTPUT_FILE
# as one JSON object per line.
saved = 0
with open(OUTPUT_FILE, "a", encoding="utf-8") as sink:
    for sample in todo:
        enc   = build_prompt(sample)
        # NOTE(review): temperature/top_p only take effect when sampling is
        # enabled; do_sample is not passed here, so whether they apply depends
        # on the checkpoint's generation config — confirm the intended decoding.
        out   = model.generate(
            **enc, max_new_tokens=MAX_NEW_TOKENS,
            temperature=1.0, top_p=0.9, use_cache=True
        )
        # Decode only the newly generated tokens (everything after the prompt).
        reply = tokenizer.decode(
            out[0, enc["input_ids"].shape[1]:],
            skip_special_tokens=True
        ).strip()
        sample["aiReviewS1"] = reply

        sink.write(json.dumps(sample, ensure_ascii=False) + "\n")
        # Push each record out of Python's write buffer right away: the resume
        # logic relies on finished samples being in the file if the session
        # is interrupted mid-run.
        sink.flush()
        saved += 1
        print(f"✓ {saved}/{len(todo)} saved — id={sample['id']}; aiReviewS1={sample['aiReviewS1']}")

print("✅ All done.")