# Phase 2 - Model Evaluation

Evaluate the trained model and test its code-completion quality.

In [1]:
from google.colab import files
import os

print("Upload test.jsonl and phase2_model.zip")
uploaded = files.upload()

!unzip -q phase2_model.zip
print("âœ“ Ready!")

Upload test.jsonl and phase2_model.zip


Saving phase2_model.zip to phase2_model.zip
Saving test.jsonl to test.jsonl
✓ Ready!


In [2]:
%%capture
# Install the evaluation dependencies: Unsloth straight from its GitHub
# repository, peft/accelerate/bitsandbytes without pulling their dependencies,
# and python-Levenshtein. %%capture keeps pip's output
# out of the notebook.
# NOTE(review): none of these installs is version-pinned, so a fresh run may
# resolve to different package versions than the ones shown in the outputs.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps peft accelerate bitsandbytes
!pip install python-Levenshtein

In [3]:
# NOTE(review): unsloth is imported before torch; presumably this lets its
# patches apply first -- confirm before reordering these imports.
from unsloth import FastLanguageModel
import torch

# Load the model and tokenizer from the local ./final_model directory with
# 4-bit loading enabled.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="./final_model",
    max_seq_length=1024,
    dtype=None,  # presumably lets Unsloth pick the dtype for the GPU -- TODO confirm
    load_in_4bit=True,
)

# Prepare the model for generation rather than training.
FastLanguageModel.for_inference(model)
print("Model loaded!")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.11.6: Fast Qwen2 patching. Transformers: 4.57.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/457M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/266 [00:00<?, ?B/s]

Unsloth 2025.11.6 patched 24 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


Model loaded!


In [4]:
from datasets import load_dataset
import numpy as np

# Read the uploaded test.jsonl file. The data file is exposed under the
# 'train' split (see the "Generating train split" output), which is why
# split='train' is requested even though this is the test set.
test_dataset = load_dataset('json', data_files='test.jsonl', split='train')
print(f"Test samples: {len(test_dataset):,}")

Generating train split: 0 examples [00:00, ? examples/s]

Test samples: 4,217


In [5]:
from difflib import SequenceMatcher
import Levenshtein
def exact_match(prediction, reference):
    """Return 1.0 if the two strings are equal after trimming, else 0.0.

    Leading and trailing whitespace is removed from both strings before the
    comparison; interior whitespace must match exactly.
    """
    return float(prediction.strip() == reference.strip())
def edit_similarity(prediction, reference):
    """Return a Levenshtein-based similarity score in [0, 1].

    The score is ``1 - distance / max(len(prediction), len(reference))``.
    Two empty strings score 1.0; exactly one empty string scores 0.0.
    The inputs are compared as given (no whitespace trimming).
    """
    # Both empty: nothing differs.
    if not (prediction or reference):
        return 1.0
    # Exactly one empty: nothing is shared.
    if not (prediction and reference):
        return 0.0

    longest = max(len(prediction), len(reference))
    edits = Levenshtein.distance(prediction, reference)
    return 1.0 - (edits / longest)
def perfect_lines(prediction, reference):
    """Return the fraction of reference lines matched at the same position.

    Both texts are trimmed and split on newlines. Line ``i`` of the prediction
    is compared with line ``i`` of the reference, ignoring surrounding
    whitespace on each line. The number of matches is divided by the number
    of reference lines, so prediction lines beyond the reference length are
    ignored and missing prediction lines count as misses.

    Args:
        prediction: Generated text.
        reference: Ground-truth text.

    Returns:
        A float in [0, 1]. For an empty (or whitespace-only) reference the
        result is 1.0 when the prediction is empty as well, otherwise 0.0.
    """
    pred_text = prediction.strip()
    ref_text = reference.strip()

    # str.split('\n') never yields an empty list (''.split('\n') == ['']),
    # so an empty reference has to be detected on the text itself.
    if not ref_text:
        return 1.0 if not pred_text else 0.0

    pred_lines = pred_text.split('\n')
    ref_lines = ref_text.split('\n')

    matches = sum(1 for p, r in zip(pred_lines, ref_lines) if p.strip() == r.strip())
    return matches / len(ref_lines)
def matched_ratio(prediction, reference):
    """Return difflib's SequenceMatcher ratio between the two strings.

    The strings are compared character by character, as given, with no junk
    filter.
    """
    matcher = SequenceMatcher(a=prediction, b=reference)
    return matcher.ratio()
# Confirm the metric helpers are defined and list what each abbreviation means.
print("Metrics functions loaded!")
print("\nAvailable metrics:")
for metric_line in (
    "  EM (Exact Match)",
    "  ES (Edit Similarity)",
    "  PL (Perfect Lines)",
    "  MR (Matched Ratio)",
):
    print(metric_line)

Metrics functions loaded!

Available metrics:
  EM (Exact Match)
  ES (Edit Similarity)
  PL (Perfect Lines)
  MR (Matched Ratio)


In [12]:
from tqdm.auto import tqdm
# Evaluate on at most 500 samples, drawn with a fixed shuffle seed so the
# same subset is selected on every run.
subset_size = min(500, len(test_dataset))
test_subset = test_dataset.shuffle(seed=42).select(range(subset_size))
print(f"Evaluating on {len(test_subset)} samples\n")

# One list of per-sample scores for each metric.
results = {metric: [] for metric in ('em', 'es', 'pl', 'mr')}
def parse_fim(text):
    """Split a fill-in-the-middle sample into (prefix, suffix, middle).

    Each part is the stripped text that follows its marker (``<PRE>``,
    ``<SUF>``, ``<MID>``) up to the next delimiter; a part whose marker is
    absent comes back as an empty string. The middle ends at the first ``<``
    after ``<MID>``, so a reference that itself contains ``<`` is cut there.
    """
    def section(opener, closer):
        # Text between the first `opener` and the next `closer`, trimmed.
        if opener not in text:
            return ""
        after_opener = text.split(opener)[1]
        return after_opener.split(closer)[0].strip()

    prefix = section('<PRE>', '<SUF>')
    suffix = section('<SUF>', '<MID>')
    middle = section('<MID>', '<')
    return prefix, suffix, middle
def generate_completion(prefix, suffix, max_new_tokens=32, temperature=0.2):
    """Generate the fill-in-the-middle completion for one prefix/suffix pair.

    Builds a ``<PRE> {prefix} <SUF> {suffix} <MID>`` prompt, runs
    ``model.generate`` on it and returns the text that follows ``<MID>`` in
    the decoded output.

    Args:
        prefix: Code that precedes the gap.
        suffix: Code that follows the gap.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature. A positive value turns sampling on
            explicitly, because generation only honours the temperature when
            ``do_sample`` is true. ``0`` or ``None`` selects greedy decoding.

    Returns:
        The stripped text between ``<MID>`` and the next ``<`` in the decoded
        sequence, or ``""`` when ``<MID>`` does not appear in it.
    """
    prompt = f"<PRE> {prefix} <SUF> {suffix} <MID>"
    # NOTE(review): the prompt is capped at 256 tokens. If the tokenizer
    # truncates on the right (the usual default -- confirm), a long prefix
    # drops the trailing <MID> marker and this function returns "".
    inputs = tokenizer(prompt, return_tensors="pt", max_length=256, truncation=True).to("cuda")

    # Set do_sample explicitly instead of relying on the saved generation
    # config, so the temperature is neither silently ignored nor applied by
    # accident. The default matches the settings used by `complete`.
    use_sampling = temperature is not None and temperature > 0
    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "do_sample": use_sampling,
        "pad_token_id": tokenizer.eos_token_id,
    }
    if use_sampling:
        # NOTE: sampling is not seeded here, so repeated runs can differ.
        generation_kwargs["temperature"] = temperature

    outputs = model.generate(**inputs, **generation_kwargs)
    result = tokenizer.decode(outputs[0], skip_special_tokens=False)
    if '<MID>' not in result:
        return ""
    # Cutting at the first '<' removes whatever marker follows the completion,
    # but also truncates completions that contain '<' themselves; parse_fim
    # applies the same cut to references, so both sides are treated alike.
    return result.split('<MID>')[1].split('<')[0].strip()
# Score every sample: recover the prompt parts and the reference from the
# stored text, generate a completion, then record each metric.
metric_functions = {
    'em': exact_match,
    'es': edit_similarity,
    'pl': perfect_lines,
    'mr': matched_ratio,
}
for sample in tqdm(test_subset, desc="Evaluating"):
    prefix, suffix, reference = parse_fim(sample['text'])
    prediction = generate_completion(prefix, suffix)

    for metric_key, metric_fn in metric_functions.items():
        results[metric_key].append(metric_fn(prediction, reference))
# Report each metric as the mean over all evaluated samples, in percent.
divider = '=' * 50
print(f"\n{divider}")
print("EVALUATION RESULTS:")
print(f"{divider}")
for label, metric_key in (
    ("Exact Match (EM):      ", 'em'),
    ("Edit Similarity (ES):  ", 'es'),
    ("Perfect Lines (PL):    ", 'pl'),
    ("Matched Ratio (MR):    ", 'mr'),
):
    scores = results[metric_key]
    print(f"{label}{sum(scores)/len(scores)*100:.2f}%")
print(f"{divider}")

Evaluating on 500 samples



Evaluating:   0%|          | 0/500 [00:00<?, ?it/s]


EVALUATION RESULTS:
Exact Match (EM):      9.60%
Edit Similarity (ES):  19.68%
Perfect Lines (PL):    10.47%
Matched Ratio (MR):    22.41%


In [13]:
def complete(prefix, suffix=""):
    """Return the model's sampled completion for a prefix and optional suffix.

    The prompt has the form ``<PRE> {prefix} <SUF> {suffix} <MID>`` and is
    tokenized with a 256-token cap. Generation samples at temperature 0.2 for
    up to 32 new tokens. When the decoded sequence contains ``<MID>``, the
    stripped text between it and the next ``<`` is returned; otherwise the
    whole decoded sequence is returned unchanged.
    """
    fim_prompt = f"<PRE> {prefix} <SUF> {suffix} <MID>"
    encoded = tokenizer(fim_prompt, return_tensors="pt", max_length=256, truncation=True).to("cuda")

    generation_options = {
        "max_new_tokens": 32,
        "temperature": 0.2,
        "do_sample": True,
        "pad_token_id": tokenizer.eos_token_id,
    }
    generated = model.generate(**encoded, **generation_options)

    decoded = tokenizer.decode(generated[0], skip_special_tokens=False)
    if "<MID>" not in decoded:
        return decoded
    after_marker = decoded.split("<MID>")[1]
    return after_marker.split("<")[0].strip()
# Smoke-test the model on one short prefix per language (no suffix).
demo_cases = (
    ("Python test:", "def calculate_sum(a, b):\n    "),
    ("\nJava test:", "public class User {\n    private String "),
    ("\nC++ test:", "#include <iostream>\nint main() {\n    "),
)
for heading, code_prefix in demo_cases:
    print(heading)
    print(complete(code_prefix))

Python test:
return a + b

def main():
    print(calculate_sum(10, 20))

if __name__ == "__main__":
    main

Java test:
name;  
    private String password;  
    private String email;  
    private String phone;  
    private String address;  
    private String status;

C++ test:
int a = 10;
  int b = 20;
  int c = 30;
  int d = 40;


## Results Summary

✅ Phase 2 Complete!

**Next**: Phase 3 - Merge model and convert to GGUF for deployment