In [None]:
from code_chunker import build_chunks
from data_processing import get_dataset, tokenize_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
import tqdm
from collections import Counter
import tempfile
import sklearn.metrics
import os

In [None]:
# Single source of truth for the checkpoint so tokenizer and model always match.
# NOTE(review): no org prefix, so this is presumably a local checkpoint directory — confirm.
MODEL_NAME_OR_PATH = 'qwen25_coder_1_5b_instruct'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME_OR_PATH)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME_OR_PATH)

# get_dataset() is unpacked into three values and only the third is used here
# (presumably train / validation / test splits — TODO confirm against data_processing).
_, _, test = get_dataset()
test_dataset = tokenize_dataset(test, tokenizer)

In [None]:
# Default instruction sent as the system message. It enumerates every label the
# model is allowed to answer with; the text is kept byte-identical to the prompt
# this notebook has always used so existing results stay comparable.
SYSTEM_PROMPT = (
    "Analyze the following C++ code and classify its vulnerability. "
    "Your classification should be one of the following: "
    "Improper Input Validation, "
    "Improper Limitation of a Pathname to a Restricted Directory (“Path Traversal”), "
    "Improper Neutralization of Special Elements used in an OS Command (“OS Command Injection”), "
    "Improper Neutralization of Input During Web Page Generation (“Cross-site Scripting”), "
    "Improper Neutralization of Special Elements used in an SQL Command (“SQL Injection”), "
    "Improper Control of Generation of Code (“Code Injection”), "
    "Improper Output Neutralization for Logs, "
    "Integer Overflow or Wraparound, "
    "NULL Pointer Dereference, "
    "Deserialization of Untrusted Data, "
    "URL Redirection to Untrusted Site (“Open Redirect”), "
    "Improper Restriction of XML External Entity Reference, "
    "Out-of-bounds Write, "
    "safe. "
    "Only respond with the classification."
)


def get_prediction(prompt, tokenizer, model, max_new_tokens=1024, system_prompt=None, **generate_kwargs):
    """Ask the chat model to classify one piece of code and return its raw reply.

    Parameters
    ----------
    prompt : str
        Source code to classify; sent verbatim as the user message.
    tokenizer
        Tokenizer used for ``apply_chat_template``, for encoding the rendered
        chat text, and for ``batch_decode``.
    model
        Model exposing ``device`` and ``generate``.
    max_new_tokens : int, optional
        Upper bound on the number of generated tokens. Defaults to 1024, the
        value that was previously hard-coded.
    system_prompt : str, optional
        System message to use instead of ``SYSTEM_PROMPT``.
    **generate_kwargs
        Extra keyword arguments forwarded unchanged to ``model.generate``
        (for example ``do_sample=False``).

    Returns
    -------
    str
        The decoded completion only: prompt tokens are sliced off and special
        tokens are skipped. The text is NOT stripped or lower-cased here;
        callers normalise it themselves.
    """
    # NOTE(review): with no extra kwargs, decoding follows the checkpoint's own
    # generation config. If that config enables sampling, repeated runs can give
    # different metrics — pass do_sample=False for reproducible evaluation
    # (TODO confirm the checkpoint's defaults).
    if system_prompt is None:
        system_prompt = SYSTEM_PROMPT

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=max_new_tokens,
        **generate_kwargs,
    )
    # generate() returns prompt + completion for each sequence; keep only the
    # tokens that come after the prompt.
    completion_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    return tokenizer.batch_decode(completion_ids, skip_special_tokens=True)[0]

In [None]:
# Baseline evaluation: every test sample is classified with a single prompt.
y_true, y_pred = [], []

for sample in tqdm.tqdm(test_dataset):
    prompt, label = sample["code"], sample["output"].strip().lower()
    # The model reply is normalised exactly like the gold label (strip + lower)
    # because scoring below is an exact string match.
    prediction = get_prediction(prompt, tokenizer, model).strip().lower()
    y_true.append(label)
    y_pred.append(prediction)

# Multi-class scores. Precision / recall / F1 use average="weighted";
# zero_division=0 makes undefined per-class scores count as 0.
accuracy = sklearn.metrics.accuracy_score(y_true, y_pred)
precision, recall, f1 = (
    scorer(y_true, y_pred, average="weighted", zero_division=0)
    for scorer in (
        sklearn.metrics.precision_score,
        sklearn.metrics.recall_score,
        sklearn.metrics.f1_score,
    )
)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

def to_binary(label):
    """Collapse a class label to ``"safe"`` or ``"unsafe"``.

    Only the exact string ``"safe"`` maps to ``"safe"``; every other value
    (any vulnerability name, or unrecognised model output) maps to ``"unsafe"``.
    """
    if label == "safe":
        return "safe"
    return "unsafe"

# Binary view of the same predictions: "safe" vs. everything else.
y_true_binary = list(map(to_binary, y_true))
y_pred_binary = list(map(to_binary, y_pred))

# "safe" is the positive class for precision / recall / F1.
binary_accuracy = sklearn.metrics.accuracy_score(y_true_binary, y_pred_binary)
binary_precision, binary_recall, binary_f1 = (
    scorer(y_true_binary, y_pred_binary, pos_label="safe", zero_division=0)
    for scorer in (
        sklearn.metrics.precision_score,
        sklearn.metrics.recall_score,
        sklearn.metrics.f1_score,
    )
)

print(f"Binary Accuracy (safe/unsafe): {binary_accuracy:.4f}")
print(f"Binary Precision (safe): {binary_precision:.4f}")
print(f"Binary Recall (safe): {binary_recall:.4f}")
print(f"Binary F1 Score (safe): {binary_f1:.4f}")

In [None]:
def predict_single_code(code, tokenizer, model):
    """Classify one source file by majority vote over its chunks.

    The code is written to ``main.cpp`` in a temporary directory, split with
    ``build_chunks`` (``max_tokens=1024``, ``overlap=128``, token counting via
    ``tokenizer.encode``), and each chunk is classified independently with
    ``get_prediction``. Replies are stripped and lower-cased before voting.

    Parameters
    ----------
    code : str
        Full source text to classify.
    tokenizer, model
        Passed through to ``get_prediction``.

    Returns
    -------
    str
        The most frequent normalised reply across chunks. Ties go to the reply
        seen first (``Counter.most_common`` ordering). If chunking produces no
        chunks, the whole ``code`` is classified in one prompt instead.
    """
    with tempfile.TemporaryDirectory() as directory:
        # Explicit UTF-8: open()'s default encoding depends on the locale, so
        # source containing non-ASCII characters could otherwise fail to write
        # or be written in an unexpected encoding.
        # NOTE(review): assumes build_chunks reads the file as UTF-8 — confirm.
        with open(os.path.join(directory, "main.cpp"), "w", encoding="utf-8") as f:
            f.write(code)
        chunks = build_chunks(directory, tokenizer.encode, max_tokens=1024, overlap=128)
        # Consume the chunks while the temporary directory still exists, in case
        # build_chunks produces them lazily.
        responses = [
            get_prediction(chunk["code"], tokenizer, model).strip().lower()
            for chunk in chunks
        ]

    if not responses:
        # No chunks (e.g. empty input): most_common(1)[0] would raise IndexError,
        # so fall back to a single whole-file prediction.
        return get_prediction(code, tokenizer, model).strip().lower()

    return Counter(responses).most_common(1)[0][0]

In [None]:
# Chunked evaluation: each sample is classified via predict_single_code.
# This re-binds y_true / y_pred, replacing values from the earlier baseline run.
y_true, y_pred = [], []
for sample in tqdm.tqdm(test_dataset):
    code, label = sample["code"], sample["output"].strip().lower()
    # predict_single_code already returns a stripped, lower-cased label.
    prediction = predict_single_code(code, tokenizer, model)
    y_true.append(label)
    y_pred.append(prediction)

# Multi-class scores. Precision / recall / F1 use average="weighted";
# zero_division=0 makes undefined per-class scores count as 0.
accuracy = sklearn.metrics.accuracy_score(y_true, y_pred)
precision, recall, f1 = (
    scorer(y_true, y_pred, average="weighted", zero_division=0)
    for scorer in (
        sklearn.metrics.precision_score,
        sklearn.metrics.recall_score,
        sklearn.metrics.f1_score,
    )
)

print(f"Chunked Accuracy: {accuracy:.4f}")
print(f"Chunked Precision: {precision:.4f}")
print(f"Chunked Recall: {recall:.4f}")
print(f"Chunked F1 Score: {f1:.4f}")

def to_binary(label):
    """Collapse a class label to ``"safe"`` or ``"unsafe"``.

    Only the exact string ``"safe"`` maps to ``"safe"``; every other value
    (any vulnerability name, or unrecognised model output) maps to ``"unsafe"``.
    Behaves the same as the identically named helper in the baseline cell.
    """
    if label == "safe":
        return "safe"
    return "unsafe"

# Binary view of the chunked predictions: "safe" vs. everything else.
y_true_binary = list(map(to_binary, y_true))
y_pred_binary = list(map(to_binary, y_pred))

# "safe" is the positive class for precision / recall / F1.
binary_accuracy = sklearn.metrics.accuracy_score(y_true_binary, y_pred_binary)
binary_precision, binary_recall, binary_f1 = (
    scorer(y_true_binary, y_pred_binary, pos_label="safe", zero_division=0)
    for scorer in (
        sklearn.metrics.precision_score,
        sklearn.metrics.recall_score,
        sklearn.metrics.f1_score,
    )
)

print(f"Chunked Binary Accuracy (safe/unsafe): {binary_accuracy:.4f}")
print(f"Chunked Binary Precision (safe): {binary_precision:.4f}")
print(f"Chunked Binary Recall (safe): {binary_recall:.4f}")
print(f"Chunked Binary F1 Score (safe): {binary_f1:.4f}")