In [None]:
%%capture
# Environment setup cell; %%capture hides the installer output.
# 1. Install the pip-autoremove helper and use it to uninstall torch, torchvision and torchaudio.
# 2. Reinstall torch, torchvision, torchaudio and xformers from the PyTorch cu121 wheel index.
# 3. Install Unsloth.
# NOTE(review): no versions are pinned, so a fresh run may resolve different releases.
!pip install pip3-autoremove
!pip-autoremove torch torchvision torchaudio -y
!pip install torch torchvision torchaudio xformers --index-url https://download.pytorch.org/whl/cu121
!pip install unsloth

In [None]:
import inspect
import json
import os
import re
import time
from typing import List, Dict, Any, Optional

import torch
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
from google.colab import drive

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [None]:
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# ---- Paths ---------------------------------------------------------------
DRIVE_ROOT = "/content/drive/MyDrive"
PROJECT_PATH = os.path.join(DRIVE_ROOT, "CodeReview")
# Relative path: resolved against the notebook's current working directory.
INPUT_FILE = "test_final.jsonl"
# Results are appended here, one JSON object per line.
OUTPUT_FILE = os.path.join(PROJECT_PATH, "test_s3.jsonl")

# ---- Model ---------------------------------------------------------------
MODEL_NAME = "unsloth/Qwen2.5-Coder-7B-Instruct"
MAX_SEQ_LENGTH = 10000   # handed to FastLanguageModel.from_pretrained

# ---- Generation / self-consistency ---------------------------------------
MAX_NEW_TOKENS = 512     # generation budget for each review candidate
K_CANDIDATES = 3         # number of candidates generated per sample
TEMP = 0.6               # temperature used when generating candidates
CACHE_PATHS = False      # True -> also store every candidate under "usc_candidates"

In [None]:
# Load the model named by MODEL_NAME together with its tokenizer through Unsloth.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name      = MODEL_NAME,
    max_seq_length  = MAX_SEQ_LENGTH,
    dtype           = None,          # autodetect fp16/bf16
    load_in_4bit    = True,          # 4-bit quant for memory savings
)

# Switch the model to Unsloth's inference mode before any generate() call.
FastLanguageModel.for_inference(model)                # enable 2× fast path

# Re-wrap the tokenizer with the "qwen-2.5" chat template; the mapping names the
# message keys ("role"/"content") and role values ("user"/"assistant") used by
# the chat messages built later in this notebook.
tokenizer = get_chat_template(
    tokenizer, chat_template="qwen-2.5",
    mapping={"role":"role","content":"content",
             "user":"user","assistant":"assistant"},
)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# Warm-up: generate a single token and discard it, so the first real request
# does not have to absorb one-time initialisation work.
_ = model.generate(**tokenizer("warm-up", return_tensors="pt").to(model.device),
                   max_new_tokens=1)

==((====))==  Unsloth 2025.5.6: Fast Qwen2 patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.55G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/265 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.51k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/632 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

## Prompts used in testing

In [None]:
# System prompt with reasoning.
# Sent as the first ("system") message of every review request. It asks the model
# for a <REASONING>...</REASONING> block followed by the final review, which is
# either Markdown bullets or the literal `NoComment`. parse_reasoning_and_review()
# relies on exactly these tags, so keep the two in sync.
system_prompt_content ="""You are an expert in finding Java code issues.

GOAL:
Review Java diffs for "evident defects" in *changed lines only*.
* "Evident defect": Violates best practices, bug (logic, runtime, security), impairs performance or hurts readability/maintainability.
* Output polite, concise and actionable feedback on defects, or `NoComment`.

WORKFLOW (internal):
1.  Input: Parse <DIFF>, <CODE>. Focus on <DIFF> hunks; use <CODE> for context.
2.  Analysis (changed lines per hunk):
    a.   Find evident defects (logic, security, performance, style, best practices, error handling, concurrency).
3.  Output:
    a.  `<REASONING>`: Explain analysis of changed lines & why issues are/aren't evident defects. Use Markdown lists if needed.
    b.  Final Review: If defects, list as `* Concise problem & fix if not complex.`, else `NoComment`.

OUTPUT RULES:
1.  Structure: `<REASONING>...</REASONING>` block first, then Final Review on a new line.
2.  Final Review Format:
    * Defects: Markdown bullets (`* Problem. Fix.`), ≤2 sentences/bullet. No extra headings.
    * No defects: **Exactly** `NoComment`.
3.  Content Constraints:
    * Feedback on changed lines only.
"""

# Few-shot demonstrations that are replayed as user/assistant turns before the
# real sample. Every entry is a dict with three string fields:
#   patch  - the unified diff placed inside <DIFF>
#   code   - the surrounding source placed inside <CODE>; each line already
#            carries its own "NNN|" line-number prefix
#   review - the expected assistant answer: a <REASONING> block followed by the
#            final review (Markdown bullets or `NoComment`)
# NOTE: everything between the triple quotes is part of the string value,
# including the leading indentation of the continuation lines.
FEW_SHOT_EXAMPLES: List[Dict[str, str]] = [
    # Scenario 1: No issues found
    dict(
        patch = """@@ -101,5 +101,5 @@ public class Example {
                -       for (int i = 0; i < items.size(); i++) {
                +       for (int index = 0; index < items.size(); index++) {
                        String item = items.get(index);
                        process(item);
                    }""",
        code = """101|   public void processList(List<String> items) {
                  102|     for (int index = 0; index < items.size(); index++) {
                  103|       String item = items.get(index);
                  104|       process(item);
                  105|   }}""",
        review = """<REASONING>
                    * Diff: Loop variable 'i' renamed to 'index'.
                    * Analysis: Change is purely stylistic, improving readability. No logic change, no new defect introduced in changed lines.
                    * Conclusion: No evident defects.
                    </REASONING>
                    NoComment"""),

    # Scenario 2: Blank line in imports (style defect)
    dict(
        patch = """@@ -14,7 +14,7 @@ import org.slf4j.Logger;
                  import org.slf4j.LoggerFactory;

                  import java.io.File;
                  import java.net.URI;
                  -import java.util.concurrent.Semaphore;""",
        code = """14| import org.slf4j.LoggerFactory;
                  15|
                  16| import java.io.File;
                  17| import java.net.URI;
                  18|
                  19|
                  20| public class GitRepository implements Repository {
                  21|   private final static Logger LOGGER = LoggerFactory.getLogger(GitRepository.class);""",
        review = """<REASONING>
                  * Diff: Import removed. Changed lines include the new blank line at line 15.
                  * Analysis: The new blank line at line 15 and line 18 (a changed line) violates common Java import style (readability concern).
                  * Conclusion: Minor style defect.
                  </REASONING>
                  * Line 15 and 18: Imports should not include empty lines as breaks, except between imports and static imports."""),

    # Scenario 3: Wrapper vs. primitive type (best practice/performance)
    dict(
        patch = """@@ -159,6 +159,20 @@ public final class Require {
                        return number;
                      }

                  +   public static double positive(String argName, Double number, String message) {
                  +     if (number == null) {
                  +       throw new IllegalArgumentException(String.format(ARG_MUST_BE_SET, argName));
                  +     }
                  +     if (number <= 0) {
                  +       if (message == null) {
                  +         throw new IllegalArgumentException(argName + " must be greater than 0");
                  +       } else {
                  +         throw new IllegalArgumentException(message);
                  +       }
                  +     }
                  +     return number;
                  +   }
                  +
                      public static int positive(String argName, Integer number) {
                        return positive(argName, number, null);
                      }""",
        code = """172|   public static double positive(String argName, Double number, String message) {
                  173|     if (number == null) {
                  174|       throw new IllegalArgumentException(String.format(ARG_MUST_BE_SET, argName));
                  175|     }
                  176|     if (number <= 0) {
                  177|       if (message == null) {
                  178|         throw new IllegalArgumentException(argName + " must be greater than 0");
                  179|       } else {
                  180|         throw new IllegalArgumentException(message);
                  181|       }
                  182|     }
                  183|     return number; // Unnecessary unboxing
                  184|   }
                  185|
                  186|   public static int positive(String argName, Integer number) {""",
        review = """<REASONING>
                    * Diff: New method `positive(String, Double, String)` added (lines 172-184 are changed).
                    * Analysis:
                        * Parameter `number` (line 172) is `Double`, method returns `double`.
                        * This requires a null check (lines 173-175) and causes unboxing at return (line 183).
                        * Using primitive `double` for parameter `number` avoids null checks and unboxing. This is a best practice violation.
                    * Conclusion: Evident defect (best practice, performance).
                    </REASONING>
                    * Line 172: For parameter `number` consider using primitive type `double`. This avoids unnecessary unboxing at line 183 and makes the null-check redundant.""",
    ),


    # Scenario 4: No relevant warning, LLM finds defect independently (resource leak)
    dict(
        patch = """@@ -20,3 +20,5 @@ import java.io.IOException;
                    public class FileReaderUtil {
                  -     public String readFile(String filePath) throws IOException {
                  -         BufferedReader reader = new BufferedReader(new FileReader(filePath));
                  -         return reader.readLine();
                  +     public String readFile(String filePath) throws IOException {
                  +         BufferedReader reader = new BufferedReader(new FileReader(filePath)); // Resource opened (changed line)
                  +         String line = reader.readLine(); // Changed line
                  +         // Reader not closed
                  +         return line; // Changed line
                  """,
        code = """
         18| import java.io.BufferedReader;
         19| import java.io.FileReader;
         20| import java.io.IOException;
         21|
         22| public class FileReaderUtil {
         23|   public String readFile(String filePath) throws IOException {
         24|     BufferedReader reader = new BufferedReader(new FileReader(filePath));
         25|     String line = reader.readLine();
         26|     return line;
         27|   }
         28| }
        """,
        review = """<REASONING>
* Diff: Method `readFile` modified. Lines 24-26 are new/changed.
* Analysis: `BufferedReader` created at changed line 24 is not closed before method returns (changed line 26). This is a resource leak (best practice violation).
* Conclusion: Evident defect.
</REASONING>
* Line 24: The `BufferedReader` (and underlying `FileReader`) is not closed. Consider using a try-with-resources statement or ensure `close()` is called in a `finally` block to prevent resource leaks.""",
    )
]


# Message builders
# Template for the content of every "user" message. It is filled with
# str.format(patch=..., code=...): {patch} is the diff and {code} the
# surrounding source. The <DIFF>/<CODE> tag names match the ones the system
# prompt tells the model to parse.
USER_BLOCK_TEMPLATE = """
<DIFF>
{patch}
</DIFF>

<CODE>
{code}
</CODE>
"""

## Utility functions

In [None]:
def number_lines(code: str) -> str:
    """Return ``code`` with a right-aligned, 1-based line number in front of each line.

    Every output line has the form ``"<n> | <text>"``; the number column is as
    wide as the largest line number. An empty input yields an empty string.
    """
    rows = code.splitlines()
    pad = len(str(len(rows)))
    numbered = []
    for lineno, text in enumerate(rows, start=1):
        numbered.append(f"{lineno:>{pad}} | {text}")
    return "\n".join(numbered)

def build_chat_messages(sample: Dict[str, Any]) -> list[Dict[str, str]]:
    """Compose the system prompt, the few-shot turns and the real sample into a chat.

    Args:
        sample: Dataset record; its ``"patch"`` (diff) and ``"code"`` (source
            context) entries are read.

    Returns:
        A list of ``{"role", "content"}`` messages: one system message, one
        user/assistant pair per entry of ``FEW_SHOT_EXAMPLES``, and a final
        user message holding the sample to review.

    Raises:
        KeyError: If ``sample`` has no ``"patch"`` or ``"code"`` entry.
    """
    msgs = [{"role": "system", "content": system_prompt_content}]

    for ex in FEW_SHOT_EXAMPLES:
        # The examples are written as indented triple-quoted literals, so their
        # continuation lines carry source-file indentation. cleandoc() removes
        # that margin; otherwise the model copies it into its own answers.
        # The example code already contains its own "NNN|" line numbers (the
        # ones the example reviews refer to), so it must NOT be passed through
        # number_lines() again - that would add a second, conflicting column.
        block = USER_BLOCK_TEMPLATE.format(
            patch=inspect.cleandoc(ex["patch"]),
            code=inspect.cleandoc(ex["code"]),
        )
        msgs += [
            {"role": "user", "content": block},
            {"role": "assistant", "content": inspect.cleandoc(ex["review"])},
        ]

    # The real sample's code is numbered here, starting at 1.
    real_block = USER_BLOCK_TEMPLATE.format(
        patch=sample["patch"],
        code=number_lines(sample["code"]),
    )
    msgs.append({"role": "user", "content": real_block})
    return msgs


def parse_reasoning_and_review(full_llm_output: str) -> tuple[str, str]:
    """Split raw model output into ``(reasoning, review)``.

    Three shapes are handled:

    * ``<REASONING>...</REASONING>`` followed by text: the tag content is the
      reasoning, everything after the closing tag is the review.
    * ``<REASONING>`` without a closing tag (generation stopped at the token
      limit): the text after the tag is the reasoning and the review is empty,
      so half-finished reasoning is never stored as a review.
    * No tag at all: the reasoning is empty and the whole text is the review.

    Both returned strings have surrounding whitespace stripped.
    """
    open_tag = "<REASONING>"
    text = full_llm_output.strip()

    closed = re.search(r"<REASONING>(.*?)</REASONING>", text, re.DOTALL)
    if closed:
        return closed.group(1).strip(), text[closed.end():].strip()

    start = text.find(open_tag)
    if start != -1:
        # Opening tag only: the model never reached its final review.
        return text[start + len(open_tag):].strip(), ""

    return "", text

## Universal self-consistency helpers

In [None]:
import re

def sample_candidate(chat_msgs, i=0) -> str:
    """Generate one review candidate for ``chat_msgs`` and return its review part.

    Args:
        chat_msgs: Chat messages as produced by ``build_chat_messages``.
        i: Candidate index; only used to label the debug print-out.

    Returns:
        The review text that follows the model's ``<REASONING>`` block, as
        extracted by ``parse_reasoning_and_review``.
    """
    # Render the chat into the model's prompt format, ending with the
    # assistant header so the model continues with its answer.
    prompt = tokenizer.apply_chat_template(
        chat_msgs, tokenize=False, add_generation_prompt=True
    )
    enc = tokenizer(prompt, return_tensors="pt", padding=True).to(model.device)

    out_ids = model.generate(
        **enc,
        max_new_tokens = MAX_NEW_TOKENS,
        # Sampling is requested explicitly: temperature/top_p only take effect
        # when do_sample is on, and self-consistency needs the K candidates to
        # differ. Without this the behaviour depends on the checkpoint's
        # default generation config.
        do_sample      = True,
        temperature    = TEMP,
        top_p          = 0.95,
        use_cache      = True,
    )
    # Drop the prompt tokens; decode only what the model generated.
    prompt_len = enc["input_ids"].shape[1]
    full = tokenizer.decode(out_ids[0, prompt_len:], skip_special_tokens=True)

    reasoning, review = parse_reasoning_and_review(full)

    print(f"\n--- Candidate {i} ---")
    print(f"Full LLM Output:\n{full}")
    print(f"Parsed Reasoning:\n{reasoning}")
    print(f"Parsed Review:\n{review}")
    print("--------------------")
    return review




def pick_with_usc(candidates: List[str]) -> str:
    """Pick one review from ``candidates`` via universal self-consistency.

    The candidates are shown to the model as a numbered list and the model is
    asked to answer with the number of the most consistent one.

    Args:
        candidates: Review texts produced by ``sample_candidate``.

    Returns:
        One element of ``candidates``. If the model's answer contains no valid
        number, the most frequent candidate is returned (ties go to the
        earliest one).

    Raises:
        ValueError: If ``candidates`` is empty.
    """
    if not candidates:
        raise ValueError("pick_with_usc() needs at least one candidate")

    # A blank review is never a useful answer; ignore blanks unless nothing
    # else is available.
    pool = [c for c in candidates if c.strip()] or list(candidates)

    # All remaining candidates agree: no selector call needed.
    if len(set(pool)) == 1:
        return pool[0]

    bulleted = "\n\n".join(f"[{i+1}]\n{c}" for i, c in enumerate(pool))
    selector_msgs = [
        {"role":"system",
         "content":"You are an expert reviewer. Choose the most semantically consistent variant below, "
                   "if it is poorly formated or not clear choose next most consistent variant "
                   "and reply ONLY with its number."},
        {"role":"user", "content": bulleted},
    ]

    prompt = tokenizer.apply_chat_template(
        selector_msgs, tokenize=False, add_generation_prompt=True
    )

    enc = tokenizer(prompt, return_tensors="pt", padding=True).to(model.device)
    # Greedy decoding: the selection itself should be deterministic.
    sel_ids = model.generate(**enc, max_new_tokens=5, do_sample=False)
    choice = tokenizer.decode(
        sel_ids[0, enc["input_ids"].shape[1]:], skip_special_tokens=True
    ).strip()

    # Accept answers such as "2", "[2]" or "2." and reject out-of-range numbers.
    # A plain int(choice) - 1 would turn "0" into index -1 and silently return
    # the last candidate.
    number = re.search(r"\d+", choice)
    if number:
        picked = int(number.group())
        if 1 <= picked <= len(pool):
            return pool[picked - 1]

    # Fallback: majority vote. Iterating the list (not a set) makes ties
    # resolve to the earliest candidate instead of depending on hash order.
    return max(pool, key=pool.count)


## Testing pipeline

In [None]:

# Key under which the selected review is stored in each output record; the same
# key marks a record as already processed when resuming.
# NOTE(review): the key says "S2" while OUTPUT_FILE is "test_s3.jsonl" - confirm
# that this is intended.
REVIEW_KEY = "aiReviewS2"

# Resume support: collect the ids that already have a review in OUTPUT_FILE.
processed: set[str] = set()
if os.path.exists(OUTPUT_FILE):
    with open(OUTPUT_FILE, encoding="utf-8") as fh:
        for line in fh:
            if not line.strip():
                continue
            obj = json.loads(line)
            if REVIEW_KEY in obj:
                processed.add(obj["id"])
print(f"Already processed: {len(processed)}")

with open(INPUT_FILE, encoding="utf-8") as fh:
    dataset = [json.loads(l) for l in fh if l.strip()]

todo = [s for s in dataset if s["id"] not in processed]
print(f"Remaining: {len(todo)} / {len(dataset)}\n")

saved = 0
tic   = time.time()

# exist_ok avoids the check-then-create race of a separate os.path.exists() test.
if PROJECT_PATH:
    os.makedirs(PROJECT_PATH, exist_ok=True)

with open(OUTPUT_FILE, "a", encoding="utf-8") as sink:
    for sample in todo:
        # build chat with exemplars
        chat = build_chat_messages(sample)

        # generate k diverse reasoning paths
        paths = [sample_candidate(chat, i) for i in range(K_CANDIDATES)]

        # USC selection
        best_review = pick_with_usc(paths)
        print(f"BEST: {best_review}")
        sample[REVIEW_KEY] = best_review
        if CACHE_PATHS:
            sample["usc_candidates"] = paths

        # Save every tested entry and flush right away: a disconnected or
        # killed runtime must not lose records still sitting in the write
        # buffer, because resuming relies on what is actually in the file.
        sink.write(json.dumps(sample, ensure_ascii=False) + "\n")
        sink.flush()
        saved += 1
        print(f"✓ {saved}/{len(todo)}  id={sample['id']}")

toc = time.time()
print(f"\n✅ Done.  Wrote {saved} samples in {toc-tic:.1f}s")


Already processed: 539
Remaining: 1940 / 2708


--- Candidate 0 ---
Full LLM Output:
<REASONING>
                  * Diff: Removed unused imports for `org.joda.time.format.DateTimeFormat` and `org.joda.time.format.DateTimeFormatter`.
                  * Analysis: The removal of these unused imports is a minor cleanup that improves the project's cleanliness without introducing any bugs or significant changes.
                  * Conclusion: No evident defect.
                  </REASONING>
                  NoComment
Parsed Reasoning:
* Diff: Removed unused imports for `org.joda.time.format.DateTimeFormat` and `org.joda.time.format.DateTimeFormatter`.
                  * Analysis: The removal of these unused imports is a minor cleanup that improves the project's cleanliness without introducing any bugs or significant changes.
                  * Conclusion: No evident defect.
Parsed Review:
NoComment
--------------------

--- Candidate 1 ---
Full LLM Output:
<REASONING>
* Diff: Removed 