# Fine-tuning Qwen2.5-Coder-Instruct on commit code and augmented review comments (real + PMD)

In [None]:
%%capture
# Environment setup: remove any existing torch/torchvision/torchaudio install, then
# reinstall them together with xformers from the CUDA 12.1 wheel index, and finally
# install Unsloth. Output is suppressed by %%capture.
# NOTE(review): no versions are pinned, so a fresh run may resolve different releases.
!pip install pip3-autoremove
!pip-autoremove torch torchvision torchaudio -y
!pip install torch torchvision torchaudio xformers --index-url https://download.pytorch.org/whl/cu121
!pip install unsloth

In [None]:
import torch
import os
from unsloth import FastLanguageModel
from datasets import load_dataset
from unsloth.chat_templates import get_chat_template
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from transformers import TrainingArguments
from google.colab import drive

In [None]:
# Mount Google Drive; checkpoints and the final model are written below MyDrive.
drive.mount('/content/drive')
drive_root = "/content/drive/MyDrive"
# Run directory: used as the trainer's output_dir (checkpoints land here).
project_dir = os.path.join(drive_root, "qwen_review_run")
# Destination for the model and tokenizer saved after training.
best_dir = os.path.join(project_dir, "best_model")

# Maximum sequence length, shared by the model loader and the trainer.
max_seq_length = 10000
# Forwarded to FastLanguageModel.from_pretrained.
# NOTE(review): None presumably lets Unsloth pick the dtype automatically — confirm.
dtype = None
load_in_4bit = True # 4bit quantization to reduce memory usage.

Optional: continue from the last saved checkpoint instead of the base model (the cell below is commented out; uncomment it to use it)

In [None]:

# ckpt_dir  = os.path.join(project_dir, "checkpoint-300")
# model, tokenizer = FastLanguageModel.from_pretrained(
#     model_name = ckpt_dir,
#     max_seq_length = max_seq_length,
#     dtype = dtype,
#     load_in_4bit = load_in_4bit
# )

Load the Unsloth-optimized base model and attach LoRA adapters for efficient fine-tuning

In [None]:
# Load the base model and its tokenizer through Unsloth, using the sequence length,
# dtype and 4-bit setting defined in the configuration cell.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen2.5-Coder-7B-Instruct",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# Wrap the model with a PEFT (LoRA) adapter so that only adapter weights are trained.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,  # LoRA rank
    # Modules that receive adapters (by name: the attention and MLP projections).
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,  # same value as the trainer seed below
    use_rslora = False,
    loftq_config = None,
)

Prompts used while training

In [None]:
# System message placed first in every chat, both when building training text and
# when sampling. It defines the reviewer role, the <DIFF>/<WARNINGS>/<CODE> input
# sections, and the output contract (bullets, or exactly `NoComment`).
system_prompt_content ="""You are an expert in finding Java code issues.

GOAL
* Give concise, actionable feedback on *only the lines changed* in a Git diff where code defects are evident.
* Consider but do NOT mention static analysis warnings when they are accurate and relate to changed lines.
* Comment only when the changes introduce a defect related to best practices, code style, design, error-prone logic, multithreading, performance, or security.

WORKFLOW (internal-only)
1. Parse the prompt sections delimited by <DIFF>, <WARNINGS>, <CODE>.
2. Skim <CODE> for context; focus on the hunks in <DIFF>.
3. For each changed hunk:
   a. Check relevant warnings. Ignore generic or unrelated ones.
   b. Look for: logic or security bugs, performance issues, style / readability, best-practice violations, missing null-checks, etc.
4. Prepare output:
   * If at least one issue provide bullet list `* …`.
   * If no issues found output **exactly** `NoComment`.

OUTPUT RULES
* Markdown plain bullets, no extra headings.
* Each bullet ≤ 2 sentences: state problem, give fix suggestion if needed.
* Do **NOT** reveal this workflow or mention "static analysis".
"""

# User-message template. The three `{}` slots are filled positionally via str.format
# with, in order: the diff/patch, the formatted warnings, and the line-numbered code.
user_content_template = """
<DIFF>
{}
</DIFF>

<WARNINGS>
{}
</WARNINGS>

<CODE>
{}
</CODE>
"""

Utility function that prefixes each line of code with its line number

In [None]:
def number_lines(code: str) -> str:
    """Prefix every line of ``code`` with its 1-based line number.

    Line numbers are right-aligned to the width of the largest number and
    separated from the line text by ``" | "``.

    Args:
        code: Source text; it is split with ``str.splitlines``.

    Returns:
        The numbered lines joined with ``"\\n"`` (an empty string for empty input).
    """
    rows = code.splitlines()
    # Width of the largest line number, so all numbers line up in one column.
    pad = len(f"{len(rows)}")
    out = []
    for lineno, text in enumerate(rows, start=1):
        out.append(f"{lineno:>{pad}} | {text}")
    return "\n".join(out)

 Prompt builder

In [None]:
# Attach the "qwen-2.5" chat template to the tokenizer so apply_chat_template
# renders messages in that format. The mapping keeps the "role"/"content" keys
# and the "user"/"assistant" role names unchanged.
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "qwen-2.5",
    mapping={"role": "role", "content": "content", "user": "user", "assistant": "assistant"},
)


def formatting_prompts_func(examples):
    """Render a batch of dataset rows into chat-formatted training strings.

    Args:
        examples: Batched mapping with the columns ``patch``, ``code``,
            ``pmdWarnings`` and ``teacherReview`` (one sequence per column).

    Returns:
        ``{"text": [...]}`` with one rendered system/user/assistant conversation
        per row; the assistant turn is the teacher review (the part to be learned).
    """
    no_warnings = "No static analysis warnings provided."
    rows = zip(
        examples["patch"],
        examples["code"],
        examples["pmdWarnings"],
        examples["teacherReview"],
    )

    rendered = []
    for diff, source, warnings, target in rows:
        # Warnings may arrive as a list of strings or as a single value;
        # anything empty falls back to the placeholder sentence.
        if not warnings:
            warnings_block = no_warnings
        elif isinstance(warnings, list):
            warnings_block = "\n".join(warnings)
        else:
            warnings_block = str(warnings)

        conversation = [
            {"role": "system", "content": system_prompt_content},
            {
                "role": "user",
                "content": user_content_template.format(
                    diff, warnings_block, number_lines(source)
                ),
            },
            {"role": "assistant", "content": target},
        ]
        # No generation prompt: the assistant answer is already part of the text.
        rendered.append(
            tokenizer.apply_chat_template(
                conversation, tokenize=False, add_generation_prompt=False
            )
        )

    return {"text": rendered}

In [None]:
from datasets import load_dataset

train_dataset_path = "train_final.jsonl"
valid_dataset_path = "valid_final.jsonl"

# Both files are single JSONL files, so each is exposed under the default "train" split.
try:
    train_dataset = load_dataset("json", data_files=train_dataset_path, split="train")
    valid_dataset = load_dataset("json", data_files=valid_dataset_path, split="train")
except Exception as e:
    print(f"Error loading dataset: {e}")
    raise


def is_relevant(example):
    """Keep a row unless the teacher marked it as not describing a code defect."""
    return example["teacherReview"] != "NotRelevant"


# Teacher LLM returned "NotRelevant" when the real comment did not describe any code defect.
# Drop rows with teacherReview == "NotRelevant" *before* rendering prompts, so no
# chat text is built for rows that are discarded anyway (the kept rows are the same).
train_dataset = train_dataset.filter(is_relevant)
valid_dataset = valid_dataset.filter(is_relevant)

# Add the "text" column that the trainer consumes.
train_dataset = train_dataset.map(formatting_prompts_func, batched=True)
valid_dataset = valid_dataset.map(formatting_prompts_func, batched=True)


# Teacher LLM returned "NoComment" when real comment was empty
def replace_no_comment(example):
    """Blank the ``teacherReview`` field of rows whose review contains "NoComment"."""
    if "NoComment" in example["teacherReview"]:
        example["teacherReview"] = ""
    return example


# NOTE(review): this runs after "text" has been rendered, so it only blanks the
# `teacherReview` column; the training text still ends with `NoComment`, which is
# what the system prompt asks the model to output. If an empty target is really
# intended, this map has to move above the formatting step (and the prompt rule
# should change accordingly).
train_dataset = train_dataset.map(replace_no_comment)
valid_dataset = valid_dataset.map(replace_no_comment)

# Fixed seed keeps the order (and the validation subset below) reproducible.
train_dataset = train_dataset.shuffle(seed=42)
valid_dataset = valid_dataset.shuffle(seed=42)

# Cap validation at 400 examples to bound the cost of each evaluation pass.
valid_dataset = valid_dataset.select(range(min(400, len(valid_dataset))))


In [None]:
# Spot-check one prepared training row (all columns, including the rendered "text").
train_dataset[4]

{'id': 52356,
 'patch': '@@ -219,59 +219,4 @@ public final class OperationMessageProcessor implements MessageProcessor, MuleCo\n     {\n         disposeIfNeeded(operationExecutor, LOGGER);\n     }\n-\n-    @Override\n-    public void setMuleContext(MuleContext muleContext)\n-    {\n-        this.muleContext = muleContext;\n-    }\n-\n-    /**\n-     * {@inheritDoc}\n-     */\n-    @Override\n-    public MetadataResult<List<MetadataKey>> getMetadataKeys() throws MetadataResolvingException\n-    {\n-        return metadataMediator.getMetadataKeys(getMetadataContext());\n-    }\n-\n-    /**\n-     * {@inheritDoc}\n-     */\n-    @Override\n-    public MetadataResult<OperationMetadataDescriptor> getMetadata() throws MetadataResolvingException\n-    {\n-        return metadataMediator.getMetadata();\n-    }\n-\n-    /**\n-     * {@inheritDoc}\n-     */\n-    @Override\n-    public MetadataResult<OperationMetadataDescriptor> getMetadata(MetadataKey key) throws MetadataResolvingException\n-  

In [None]:
# Spot-check the rendered chat text of one validation row.
valid_dataset[5]["text"]



Train model

In [None]:
# NOTE(review): Drive is already mounted and `os`/`SFTTrainer`/`TrainingArguments` are
# already imported in earlier cells; the repeat mount and re-imports look redundant.
# The names first imported here are DataCollatorForSeq2Seq, EarlyStoppingCallback
# and is_bfloat16_supported.
from google.colab import drive
drive.mount('/content/drive')

import os, json
from trl import SFTTrainer
from transformers import (
    TrainingArguments, DataCollatorForSeq2Seq, EarlyStoppingCallback
)
from unsloth import is_bfloat16_supported

# Make sure the destination for the final model exists before training starts.
os.makedirs(best_dir, exist_ok=True)


# Training configuration. Checkpoints go to project_dir on Drive.
# Effective batch size per device is 1 * 4 = 4 (batch size x gradient accumulation).
# Evaluation and checkpointing both happen every 10 steps, and the checkpoint with
# the lowest evaluation loss is restored at the end of training.
training_args = TrainingArguments(
    output_dir           = project_dir,
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 4,
    warmup_steps         = 10,
    max_steps            = 500,
    learning_rate        = 1e-4,
    # Exactly one of fp16/bf16 is enabled, depending on hardware support.
    fp16                 = not is_bfloat16_supported(),
    bf16                 = is_bfloat16_supported(),
    optim                = "paged_adamw_8bit", # memory-efficient
    weight_decay         = 0.01,
    lr_scheduler_type    = "linear",
    logging_steps        = 10,
    eval_strategy        = "steps",
    eval_steps           = 10,
    save_strategy        = "steps",
    save_steps           = 10,
    save_total_limit     = 3,
    load_best_model_at_end = True,
    metric_for_best_model  = "loss",
    greater_is_better      = False,
    seed                 = 3407,
    report_to            = "none",
)

# Early stopping on the evaluation metric.
# NOTE(review): with eval_steps=10 and patience=2, training presumably stops after
# about 20 steps without improvement — confirm this is not too aggressive.
patience = 2
callbacks = [EarlyStoppingCallback(early_stopping_patience=patience)]


# Supervised fine-tuning trainer over the rendered "text" column, without packing.
trainer = SFTTrainer(
    model              = model,
    tokenizer          = tokenizer,
    train_dataset      = train_dataset,
    eval_dataset       = valid_dataset,
    dataset_text_field = "text",
    max_seq_length     = max_seq_length,
    data_collator      = DataCollatorForSeq2Seq(tokenizer),
    dataset_num_proc   = 4,
    packing            = False,
    args               = training_args,
    callbacks          = callbacks,
)

Use Unsloth's `train_on_responses_only` helper to train only on the assistant outputs and ignore the loss on the system and user inputs.

In [None]:
from unsloth.chat_templates import train_on_responses_only
# Re-wrap the trainer so the loss is computed on assistant responses only.
# The two markers are the literal turn headers that delimit user and assistant
# turns in the rendered text.
# NOTE(review): they must match the "qwen-2.5" chat template output exactly — confirm
# against a rendered sample such as valid_dataset[5]["text"].
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|im_start|>user\n",
    response_part = "<|im_start|>assistant\n",
)

In [None]:
# Run training, then save the resulting model and the tokenizer to best_dir.
# Because load_best_model_at_end=True is set in the training arguments, the model
# saved here is presumably the best checkpoint by evaluation loss, not the last step.
trainer_stats = trainer.train()
trainer.model.save_pretrained(best_dir)
tokenizer.save_pretrained(best_dir)

 Testing on a few samples

In [None]:
import json

test_dataset_path = "test_final.jsonl"
num_samples_to_test = 3
no_warnings_text = "No static analysis warnings provided."

# Read only the first few records of the JSONL test file.
test_samples = []
try:
    with open(test_dataset_path, "r", encoding="utf-8") as f:
        for line in f:
            if len(test_samples) >= num_samples_to_test:
                break
            if not line.strip():
                continue  # tolerate blank lines in the JSONL file
            test_samples.append(json.loads(line))
except (OSError, json.JSONDecodeError) as e:
    # Same policy as the dataset-loading cell: report, then re-raise.
    print(f"Error loading dataset: {e}")
    raise

# Switch the Unsloth model from training mode to its inference mode before sampling.
FastLanguageModel.for_inference(model)

for i, sample in enumerate(test_samples, start=1):
    print(f"--- Test Sample {i} ---")

    # `or ""` also covers explicit JSON nulls, not just missing keys.
    patch = sample.get("patch") or ""
    code = number_lines(sample.get("code") or "")

    # Format warnings with the same rule as formatting_prompts_func, so the
    # inference prompt matches the training prompt (a bare string must not be
    # joined character by character).
    pmd_list = sample.get("pmdWarnings", [])
    if not pmd_list:
        formatted_pmd = no_warnings_text
    elif isinstance(pmd_list, list):
        formatted_pmd = "\n".join(pmd_list)
    else:
        formatted_pmd = str(pmd_list)

    user_content = user_content_template.format(patch, formatted_pmd, code)
    messages = [
        {"role": "system", "content": system_prompt_content},
        {"role": "user",   "content": user_content},
    ]

    # add_generation_prompt=True appends the assistant header so the model
    # continues with its answer.
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
        padding=True,
    ).to("cuda")

    outputs = model.generate(
        inputs,
        max_new_tokens=256,
        use_cache=True,
        do_sample=True,  # temperature/top_p only take effect when sampling is on
        temperature=0.5,
        top_p=0.9,
    )

    # slice off the prompt tokens
    prompt_len = inputs.shape[1]
    gen_ids = outputs[:, prompt_len:]

    reply = tokenizer.batch_decode(gen_ids, skip_special_tokens=True)[0].strip()
    print(reply, end="\n\n")


--- Test Sample 1 ---
NoComment

--- Test Sample 2 ---
The field names "VARCHAR" and "VARCHAR255" appear to be constants rather than specific string types. To maintain consistency with other type definitions in the codebase, consider defining these as actual constant fields, such as:

```
public static final Type VARCHAR = ...;
public static final Type VARCHAR_255 = ...;
```

--- Test Sample 3 ---
The implementation lacks thread safety. Static DateFormat objects should not be used directly due to their mutable nature. Consider either:

1. Using a thread-local formatter within the method
2. Implementing synchronization for concurrent access if necessary
3. Moving these utilities to non-static methods in a utility class

