# Phase 2 - Model Evaluation (Offline Version)

Evaluate the trained model and test its code-completion quality.

In [None]:
import os
import torch
import numpy as np

# === CONFIG ===
# Locations of the fine-tuned checkpoint and the held-out test split.
MODEL_PATH = './final_model'
TEST_PATH = './split_data/test.jsonl'

# Environment report: library versions and whether a CUDA device is visible.
has_cuda = torch.cuda.is_available()
print(f"NumPy: {np.__version__}")
print(f"PyTorch: {torch.__version__}")
print(f"CUDA: {has_cuda}")
if has_cuda:
    # Only the first device (index 0) is reported.
    print(f"GPU: {torch.cuda.get_device_name(0)}")

In [None]:
# Install evaluation dependencies if needed
try:
    import Levenshtein
except ImportError:
    import subprocess
    import sys

    # Run pip through the interpreter executing this notebook so the package
    # is installed into the environment the kernel imports from; a bare `pip`
    # found on PATH may belong to a different Python installation.
    subprocess.check_call(
        [sys.executable, '-m', 'pip', 'install', 'python-Levenshtein']
    )
    import Levenshtein

In [None]:
from unsloth import FastLanguageModel

print(f"Loading model from {MODEL_PATH}...")

# Options for loading the fine-tuned checkpoint: 4-bit weights and a
# 1024-token maximum sequence length. dtype=None presumably lets the loader
# choose the dtype itself -- confirm against the unsloth documentation.
load_options = {
    'model_name': MODEL_PATH,
    'max_seq_length': 1024,
    'dtype': None,
    'load_in_4bit': True,
}
model, tokenizer = FastLanguageModel.from_pretrained(**load_options)

# Prepare the loaded model for generation before any evaluation runs.
FastLanguageModel.for_inference(model)
print("✓ Model loaded!")

In [None]:
from datasets import load_dataset

# Read the JSONL test file. The loader is asked for split 'train' even though
# the file holds test data (presumably the name `datasets` gives to a single
# unnamed data file -- confirm).
test_dataset = load_dataset('json', data_files=TEST_PATH, split='train')
sample_count = len(test_dataset)
print(f"Test samples: {sample_count:,}")

In [None]:
from difflib import SequenceMatcher

def exact_match(prediction, reference):
    """Return 1.0 if the two strings are equal once surrounding whitespace is trimmed, else 0.0."""
    if prediction.strip() == reference.strip():
        return 1.0
    return 0.0

def edit_similarity(prediction, reference):
    """Return a similarity score derived from the Levenshtein distance.

    The score is ``1 - distance / max(len(prediction), len(reference))``.
    Two empty inputs score 1.0; exactly one empty input scores 0.0.
    Inputs are compared as given (no whitespace trimming).
    """
    if not prediction:
        # Both empty -> identical; only the prediction empty -> nothing shared.
        return 1.0 if not reference else 0.0
    if not reference:
        return 0.0

    longest = max(len(prediction), len(reference))
    edits = Levenshtein.distance(prediction, reference)
    return 1.0 - (edits / longest)

def perfect_lines(prediction, reference):
    """Return the fraction of reference lines reproduced at the same position.

    Both texts are trimmed, split on newlines, and compared line by line
    (each line is itself trimmed before comparison). Lines in the prediction
    beyond the length of the reference are ignored; missing lines count as
    mismatches because the denominator is the number of reference lines.

    An empty (or whitespace-only) reference scores 1.0 when the prediction is
    also empty and 0.0 otherwise.
    """
    ref_text = reference.strip()
    pred_text = prediction.strip()

    if not ref_text:
        # str.split always yields at least one element, so emptiness has to be
        # tested on the trimmed text itself rather than on the list of lines.
        return 1.0 if not pred_text else 0.0

    ref_lines = ref_text.split('\n')
    pred_lines = pred_text.split('\n')
    matches = sum(
        1 for pred_line, ref_line in zip(pred_lines, ref_lines)
        if pred_line.strip() == ref_line.strip()
    )
    return matches / len(ref_lines)

def matched_ratio(prediction, reference):
    """Return difflib's similarity ratio (a float in [0, 1]) between the two strings.

    No junk heuristic function is supplied, and the inputs are compared
    exactly as given.
    """
    matcher = SequenceMatcher(isjunk=None, a=prediction, b=reference)
    return matcher.ratio()

# Status message printed once this cell has defined the metric helpers.
print("✓ Metrics loaded")

In [None]:
from tqdm.auto import tqdm

# Evaluate on subset
# At most 500 samples are scored; the fixed shuffle seed makes the selection
# the same on every run.
subset_size = min(500, len(test_dataset))
shuffled = test_dataset.shuffle(seed=42)
test_subset = shuffled.select(range(subset_size))
print(f"Evaluating on {len(test_subset)} samples")

# Per-sample scores, one list per metric abbreviation.
results = {metric: [] for metric in ('em', 'es', 'pl', 'mr')}

# Sentinel tokens that terminate a middle segment.
# NOTE(review): assumes sequences end with one of these markers (CodeLlama-style
# <EOT>, or a generic end-of-sequence token) -- confirm against the training
# data format and the tokenizer's special tokens, and extend if necessary.
_FIM_STOP_TOKENS = ('<EOT>', '</s>', '<|endoftext|>', '<PRE>', '<SUF>', '<MID>')


def _cut_at_stop_token(text, stop_tokens=_FIM_STOP_TOKENS):
    """Return ``text`` up to (excluding) the earliest stop token, or unchanged if none occurs."""
    hits = [pos for pos in map(text.find, stop_tokens) if pos != -1]
    return text[:min(hits)] if hits else text


def parse_fim(text):
    """Split a fill-in-the-middle sample into ``(prefix, suffix, middle)``.

    The expected layout is ``<PRE> prefix <SUF> suffix <MID> middle``,
    optionally followed by a stop token. Each part is whitespace-trimmed, and
    a part whose opening marker is absent is returned as ``""``.

    The middle ends at the first sentinel token, not at the first ``<``
    character, so code containing ``<`` (comparisons, ``<<``, generics,
    ``#include <...>``) is kept intact.
    """
    prefix = text.split('<PRE>')[1].split('<SUF>')[0].strip() if '<PRE>' in text else ""
    suffix = text.split('<SUF>')[1].split('<MID>')[0].strip() if '<SUF>' in text else ""
    middle = _cut_at_stop_token(text.split('<MID>', 1)[1]).strip() if '<MID>' in text else ""
    return prefix, suffix, middle


def generate_completion(prefix, suffix):
    """Generate the middle segment for ``prefix``/``suffix`` with the loaded model.

    Returns the generated text between ``<MID>`` and the first stop token,
    whitespace-trimmed, or ``""`` if the decoded output contains no ``<MID>``.
    Uses the module-level ``model`` and ``tokenizer``.
    """
    prompt = f"<PRE> {prefix} <SUF> {suffix} <MID>"
    # NOTE(review): if the tokenizer truncates on the right (the usual
    # default), a prompt longer than 256 tokens loses its trailing <MID>
    # marker and this function then returns "" -- verify on long samples.
    inputs = tokenizer(prompt, return_tensors="pt", max_length=256, truncation=True).to(model.device)
    # Greedy decoding keeps the evaluation deterministic. A temperature has no
    # effect unless sampling is enabled, so it is not passed here.
    outputs = model.generate(
        **inputs,
        max_new_tokens=32,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id,
    )
    # Special tokens are kept so the <MID> marker and stop tokens can be located.
    result = tokenizer.decode(outputs[0], skip_special_tokens=False)
    if '<MID>' not in result:
        return ""
    return _cut_at_stop_token(result.split('<MID>', 1)[1]).strip()

# Metric functions keyed by the abbreviations used in `results`; the order
# matches the order in which the scores are appended.
metric_fns = {
    'em': exact_match,
    'es': edit_similarity,
    'pl': perfect_lines,
    'mr': matched_ratio,
}

for sample in tqdm(test_subset, desc="Evaluating"):
    # The sample's own middle segment is the reference the model must reproduce.
    prefix, suffix, reference = parse_fim(sample['text'])
    prediction = generate_completion(prefix, suffix)

    for metric_key, metric_fn in metric_fns.items():
        results[metric_key].append(metric_fn(prediction, reference))

def _mean_percent(scores):
    """Return the mean of ``scores`` as a percentage, or NaN when ``scores`` is empty."""
    # Guard the division: an empty test set would otherwise raise
    # ZeroDivisionError instead of producing a report.
    return sum(scores) / len(scores) * 100 if scores else float('nan')


# Summary table: one averaged percentage per metric.
print(f"\n{'='*50}")
print("EVALUATION RESULTS:")
print(f"{'='*50}")
print(f"Exact Match (EM):      {_mean_percent(results['em']):.2f}%")
print(f"Edit Similarity (ES):  {_mean_percent(results['es']):.2f}%")
print(f"Perfect Lines (PL):    {_mean_percent(results['pl']):.2f}%")
print(f"Matched Ratio (MR):    {_mean_percent(results['mr']):.2f}%")
print(f"{'='*50}")

In [None]:
# Quick test
def complete(prefix, suffix=""):
    """Generate a fill-in-the-middle completion for an ad-hoc prompt.

    Args:
        prefix: Code before the gap.
        suffix: Code after the gap (empty by default).

    Returns:
        The generated text between ``<MID>`` and the first stop token,
        whitespace-trimmed. If the decoded output contains no ``<MID>``
        marker, the full decoded output is returned unchanged.

    Uses the module-level ``model`` and ``tokenizer`` and samples with
    temperature 0.2, so repeated calls may return different text.
    """
    # Sentinel tokens that end the completion. The output is cut at these
    # rather than at the first "<" character so that generated code containing
    # "<" (comparisons, "<<", generics) is not truncated.
    # NOTE(review): assumes the model ends its output with one of these
    # markers -- confirm against the tokenizer's special tokens.
    stop_tokens = ('<EOT>', '</s>', '<|endoftext|>', '<PRE>', '<SUF>', '<MID>')

    prompt = f"<PRE> {prefix} <SUF> {suffix} <MID>"
    # Place the inputs on whichever device holds the model instead of
    # assuming "cuda".
    inputs = tokenizer(prompt, return_tensors="pt", max_length=256, truncation=True).to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=32, temperature=0.2, do_sample=True, pad_token_id=tokenizer.eos_token_id)
    # Special tokens are kept so the <MID> marker and stop tokens can be located.
    result = tokenizer.decode(outputs[0], skip_special_tokens=False)
    if "<MID>" not in result:
        return result

    completion = result.split("<MID>", 1)[1]
    cut_points = [pos for pos in map(completion.find, stop_tokens) if pos != -1]
    if cut_points:
        completion = completion[:min(cut_points)]
    return completion.strip()

# Smoke-test prompts: (heading to print, code prefix to complete).
demo_prompts = [
    ("Python test:", "def calculate_sum(a, b):\n    "),
    ("\nJava test:", "public class User {\n    private String "),
    ("\nC++ test:", "#include <iostream>\nint main() {\n    "),
]

for heading, code_prefix in demo_prompts:
    print(heading)
    print(complete(code_prefix))