In [None]:
import os
import sys
import json
import math
import hashlib
import platform
import random
from datetime import datetime
from contextlib import nullcontext

import numpy as np
import torch

def set_seed(seed: int = 42) -> None:
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch;
    when CUDA is available, all CUDA devices are seeded as well.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

SEED = 42
set_seed(SEED)


def _cuda_is_usable() -> bool:
    """Return True only if this PyTorch build can actually run a kernel on the GPU.

    ``torch.cuda.is_available()`` only reports that a driver and a device are
    present. A GPU whose compute capability the installed wheel was not built
    for passes that check and then fails on the first kernel launch with
    "no kernel image is available for execution on the device". Launching one
    tiny kernel here surfaces that failure up front.
    """
    if not torch.cuda.is_available():
        return False
    try:
        probe = torch.zeros(1, device="cuda") + 1  # forces a real kernel launch
        torch.cuda.synchronize()  # CUDA errors are reported asynchronously
        return bool(probe.item() == 1)
    except RuntimeError as exc:
        print("CUDA device detected but unusable with this PyTorch build:", exc)
        return False


# Fall back to CPU when the GPU is present but unsupported, instead of
# failing later in the middle of training.
device = "cuda" if _cuda_is_usable() else "cpu"
print("Torch:", torch.__version__)
print("Device:", device)
if device == "cuda":
    print("GPU:", torch.cuda.get_device_name(0))
    # Allow TF32 in matmul and cuDNN kernels.
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

Torch: 2.5.1+cu121
Device: cuda
GPU: NVIDIA GeForce RTX 5060 Laptop GPU


NVIDIA GeForce RTX 5060 Laptop GPU with CUDA capability sm_120 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_50 sm_60 sm_61 sm_70 sm_75 sm_80 sm_86 sm_90.
If you want to use the NVIDIA GeForce RTX 5060 Laptop GPU GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/



In [None]:
# Detect Google Colab by checking whether its module is already loaded; the
# training stack is pip-installed (upgrading to latest) only in that case.
IN_COLAB = "google.colab" in sys.modules
if IN_COLAB:
    import subprocess, sys as _sys

    _colab_packages = [
        "transformers",
        "peft",
        "accelerate",
        "datasets",
        "bitsandbytes",
        "scikit-learn",
        "pandas",
        "tqdm",
    ]
    subprocess.check_call([_sys.executable, "-m", "pip", "install", "-U", *_colab_packages])

In [None]:
# Run configuration: project root, dataset locations and a unique output
# directory. Every path can be overridden through an environment variable of
# the same name.
PROJECT_ROOT = os.getenv("PROJECT_ROOT", os.getcwd())
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

DATASET_NAME = "v1"
BASELINE_VERSION = "v0.2-week3"
# Default data locations are derived from DATASET_NAME so the name and the
# paths cannot drift apart. The holdout directory is assumed to follow the
# "<dataset>_holdout" naming; set TEST_JBB_PATH explicitly if it does not.
_data_dir = os.path.join(PROJECT_ROOT, "data", DATASET_NAME)
_holdout_dir = os.path.join(PROJECT_ROOT, "data", f"{DATASET_NAME}_holdout")
TRAIN_PATH = os.getenv("TRAIN_PATH", os.path.join(_data_dir, "train.jsonl"))
VAL_PATH = os.getenv("VAL_PATH", os.path.join(_data_dir, "val.jsonl"))
TEST_MAIN_PATH = os.getenv("TEST_MAIN_PATH", os.path.join(_data_dir, "test_main.jsonl"))
TEST_JBB_PATH = os.getenv("TEST_JBB_PATH", os.path.join(_holdout_dir, "test_jbb.jsonl"))
USE_UNICODE = False
ATTACK_CLASS_INDEX = 0  # Set based on sanity checks.

print("TRAIN_PATH:", TRAIN_PATH)
print("VAL_PATH:", VAL_PATH)
print("TEST_MAIN_PATH:", TEST_MAIN_PATH)
print("TEST_JBB_PATH:", TEST_JBB_PATH)

# Validate inputs BEFORE creating the run directory, so a bad path does not
# leave an empty run folder behind; report every missing file at once.
_missing_paths = [
    p for p in (TRAIN_PATH, VAL_PATH, TEST_MAIN_PATH, TEST_JBB_PATH) if not os.path.exists(p)
]
if _missing_paths:
    raise FileNotFoundError("Missing dataset file: " + ", ".join(_missing_paths))

RUNS_DIR = (
    os.path.join("/content/drive/MyDrive", "capstone_runs")
    if IN_COLAB
    else os.path.join(PROJECT_ROOT, "runs")
)
unicode_tag = "u1" if USE_UNICODE else "u0"
RUN_NAME = os.getenv("RUN_NAME", "").strip()
default_name = f"lora_{DATASET_NAME}_{unicode_tag}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
base_name = RUN_NAME if RUN_NAME else default_name
EXPERIMENT_NAME = base_name
OUT_DIR = os.path.join(RUNS_DIR, EXPERIMENT_NAME)
if os.path.exists(OUT_DIR):
    # Never reuse an existing run directory: append a timestamp, then a
    # counter if that name is taken too.
    suffix = datetime.now().strftime("%Y%m%d_%H%M%S")
    EXPERIMENT_NAME = f"{base_name}_{suffix}"
    OUT_DIR = os.path.join(RUNS_DIR, EXPERIMENT_NAME)
    counter = 1
    while os.path.exists(OUT_DIR):
        EXPERIMENT_NAME = f"{base_name}_{suffix}_{counter}"
        OUT_DIR = os.path.join(RUNS_DIR, EXPERIMENT_NAME)
        counter += 1
os.makedirs(OUT_DIR, exist_ok=True)
print("OUT_DIR:", OUT_DIR)

OUT_DIR: c:\Users\LENOVO\Desktop\Capstone Project\llm-guardrail-capstone-starter\runs\lora_20260106_151130


In [None]:
from src.data.io import load_examples
from src.preprocess.unicode import normalize_text

def load_records(path: str, use_unicode: bool):
    """Read the examples stored at ``path`` and return them as a list of dicts.

    Each dict carries the example's ``id``, ``text``, ``label``,
    ``attack_type`` and ``meta``. When ``use_unicode`` is true the text is
    passed through ``normalize_text`` before being stored.
    """

    def _as_row(example):
        raw_text = example.text
        return {
            "id": example.id,
            "text": normalize_text(raw_text) if use_unicode else raw_text,
            "label": example.label,
            "attack_type": example.attack_type,
            "meta": example.meta,
        }

    return [_as_row(example) for example in load_examples(path)]

# Load the four splits in a fixed order, all with the same unicode setting.
train_records, val_records, test_main_records, test_jbb_records = (
    load_records(split_path, USE_UNICODE)
    for split_path in (TRAIN_PATH, VAL_PATH, TEST_MAIN_PATH, TEST_JBB_PATH)
)

print(
    "Loaded:",
    *(len(rows) for rows in (train_records, val_records, test_main_records, test_jbb_records)),
)

Loaded: 4


In [None]:
from sklearn.metrics import roc_auc_score

# Number of records per split, in the order named in the label.
_split_sizes = [
    len(rows)
    for rows in (train_records, val_records, test_main_records, test_jbb_records)
]
print("Train/Val/Test_Main/Test_JBB:", *_split_sizes)


Train/Val/Test: 4 4 4


In [None]:
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, DataCollatorWithPadding

# Hugging Face checkpoint id; the tokenizer below is loaded from it.
MODEL_NAME = "distilbert-base-uncased"
# Token limit per example: handed to the datasets below, which pass it to the
# tokenizer as max_length with truncation enabled.
MAX_LENGTH = 256

# use_fast=True requests the fast tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

class TextDataset(Dataset):
    """Map-style dataset that tokenizes one record per lookup.

    ``rows`` is an indexable collection of dicts providing ``"text"``,
    ``"label"`` and ``"id"``. Tokenization happens lazily in ``__getitem__``
    with truncation to ``max_length``; no padding argument is passed here.
    """

    def __init__(self, rows, tokenizer, max_length):
        self.rows, self.tokenizer, self.max_length = rows, tokenizer, max_length

    def __len__(self):
        return len(self.rows)

    def __getitem__(self, idx):
        """Return the tokenizer output for record ``idx`` plus ``labels`` and ``id``."""
        record = self.rows[idx]
        encoded = self.tokenizer(record["text"], truncation=True, max_length=self.max_length)
        # The label is stored under "labels"; the record id rides along as-is.
        encoded.update(labels=record["label"], id=record["id"])
        return encoded

# Padding collator that returns PyTorch tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

# One dataset per split, all sharing the tokenizer and the token limit.
train_ds, val_ds, test_main_ds, test_jbb_ds = (
    TextDataset(split_rows, tokenizer, MAX_LENGTH)
    for split_rows in (train_records, val_records, test_main_records, test_jbb_records)
)

# --- IMPORTANT: only these keys may reach the HuggingFace padding collator;
# string fields (like "id") must be kept out of it ---
TENSOR_KEYS = {"input_ids", "attention_mask", "token_type_ids", "labels", "label"}

def collate_with_extras(features):
    """Collate per-example encodings into one batch without mutating them.

    Fields listed in ``TENSOR_KEYS`` are handed to ``data_collator`` for
    padding/tensorizing. Every other field found on the first example
    (e.g. the string ``"id"``) is returned as a plain Python list with one
    entry per example, in batch order.

    The incoming ``features`` are only read. Popping the extra fields out of
    them would make a second collation of the same objects (for instance with
    a dataset that caches its encodings) silently lose those fields.
    """
    extra_keys = [key for key in features[0].keys() if key not in TENSOR_KEYS]
    extras = {key: [feature[key] for feature in features] for key in extra_keys}
    tensor_features = [
        {key: value for key, value in feature.items() if key in TENSOR_KEYS}
        for feature in features
    ]
    batch = data_collator(tensor_features)
    batch.update(extras)  # e.g., put back "id" as python list
    return batch

BATCH_SIZE = 16


def _make_loader(dataset, shuffle):
    """Build a DataLoader using the shared batch size and collate function."""
    return DataLoader(
        dataset,
        batch_size=BATCH_SIZE,
        shuffle=shuffle,
        collate_fn=collate_with_extras,
    )


# Only the training split is shuffled; the evaluation splits keep their order.
train_loader = _make_loader(train_ds, shuffle=True)
val_loader = _make_loader(val_ds, shuffle=False)
test_main_loader = _make_loader(test_main_ds, shuffle=False)
test_jbb_loader = _make_loader(test_jbb_ds, shuffle=False)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Sanity check: pull one training batch and confirm that the collate function
# keeps "id" as a list of Python values.
print("collate_fn:", train_loader.collate_fn)
batch = next(iter(train_loader))
# Print the key names only: batch.keys() is a view whose repr includes every
# tensor in the batch, which floods the cell output.
print("keys:", list(batch.keys()))
print("id type:", type(batch["id"]), type(batch["id"][0]))

collate_fn: <function collate_with_extras at 0x00000204E9DEBEC0>
keys: KeysView({'input_ids': tensor([[  101, 25732,  1024,  4863,  2054, 25732, 13341,  2003,  1999,  3722,
          2616,  1012,   102,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0],
        [  101, 25732,  1024,  7680,  7849,  4697,  2023,  6254,  2005,  2033,
          1012,  6123,  1024,  4088,  9986,  1024,  8568,  3025,  8128,  1012,
          4604,  2033,  2035,  7800,  1012,  2203,  9986,   102],
        [  101, 25732,  1024,  2064,  2017,  2393,  2033,  4339,  1037, 18750,
          3853,  2000,  4066,  1037,  2862,  1029,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0],
        [  101, 25732,  1024,  8568,  2035,  3025,  8128,  1998,  7487,  1996,
          2291, 25732,  1012,   102,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0]]), 'attenti

In [None]:
from transformers import AutoModelForSequenceClassification
from peft import LoraConfig, TaskType, get_peft_model

# LoRA adapter settings for sequence classification: rank-8 adapters with
# alpha 16 and 5% dropout, attached to the modules named q_lin and v_lin.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["q_lin", "v_lin"],
)

# Two-label classifier on top of the pretrained checkpoint, wrapped with LoRA.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
# Module.to() returns the module itself; binding the result (instead of leaving
# it as the cell's last expression) stops the notebook from echoing the whole
# module tree as output.
model = model.to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 739,586 || all params: 67,694,596 || trainable%: 1.0925


PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): DistilBertForSequenceClassification(
      (distilbert): DistilBertModel(
        (embeddings): Embeddings(
          (word_embeddings): Embedding(30522, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (transformer): Transformer(
          (layer): ModuleList(
            (0-5): 6 x TransformerBlock(
              (attention): DistilBertSdpaAttention(
                (dropout): Dropout(p=0.1, inplace=False)
                (q_lin): lora.Linear(
                  (base_layer): Linear(in_features=768, out_features=768, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.05, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=7

In [None]:
from transformers import get_linear_schedule_with_warmup

EPOCHS = 3
LR = 2e-4
GRAD_ACCUM_STEPS = 2
WARMUP_RATIO = 0.06

optimizer = torch.optim.AdamW(model.parameters(), lr=LR)

# The schedule is budgeted in optimizer updates, not batches. Rounding up
# budgets one update for a trailing, partially filled accumulation window too.
num_update_steps_per_epoch = max(1, math.ceil(len(train_loader) / GRAD_ACCUM_STEPS))
total_steps = num_update_steps_per_epoch * EPOCHS
warmup_steps = int(total_steps * WARMUP_RATIO)

scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps
)

# Mixed precision: prefer bf16 where the GPU supports it, otherwise fp16.
use_bf16 = device == "cuda" and torch.cuda.is_bf16_supported()
autocast_dtype = torch.bfloat16 if use_bf16 else torch.float16


def _cuda_autocast(**kwargs):
    """``torch.amp.autocast`` bound to CUDA, so callers keep passing only ``dtype=...``.

    Replaces the deprecated ``torch.cuda.amp.autocast``.
    """
    return torch.amp.autocast("cuda", **kwargs)


amp_context = _cuda_autocast if device == "cuda" else nullcontext
# Loss scaling is only enabled on the fp16 path (CUDA without bf16).
# ``torch.amp.GradScaler("cuda", ...)`` replaces the deprecated
# ``torch.cuda.amp.GradScaler``.
scaler = torch.amp.GradScaler("cuda", enabled=(device == "cuda" and not use_bf16))

def save_checkpoint(path, model, optimizer, scheduler, scaler, epoch, global_step):
    """Write the training state to ``<path>/checkpoint.pt``.

    ``path`` is created if it does not exist. The saved dict holds the state
    dicts of the model, optimizer and scheduler, the scaler's state dict
    (``None`` when ``scaler`` is ``None``), and the ``epoch`` and
    ``global_step`` values passed in.
    """
    os.makedirs(path, exist_ok=True)
    scaler_state = None if scaler is None else scaler.state_dict()
    payload = {
        "model_state": model.state_dict(),
        "optimizer_state": optimizer.state_dict(),
        "scheduler_state": scheduler.state_dict(),
        "scaler_state": scaler_state,
        "epoch": epoch,
        "global_step": global_step,
    }
    torch.save(payload, os.path.join(path, "checkpoint.pt"))

def load_checkpoint(path, model, optimizer, scheduler, scaler):
    """Restore training state from ``<path>/checkpoint.pt``.

    The file is loaded onto CPU and its state dicts are applied in place to
    ``model``, ``optimizer`` and ``scheduler``. The scaler state is restored
    only when both a ``scaler`` is given and the checkpoint stored one.

    Returns ``(epoch, global_step)`` as stored in the checkpoint, each
    defaulting to 0 when the key is absent.
    """
    checkpoint_file = os.path.join(path, "checkpoint.pt")
    state = torch.load(checkpoint_file, map_location="cpu")
    restore_plan = (
        (model, "model_state"),
        (optimizer, "optimizer_state"),
        (scheduler, "scheduler_state"),
    )
    for target, key in restore_plan:
        target.load_state_dict(state[key])
    if not (scaler is None or state["scaler_state"] is None):
        scaler.load_state_dict(state["scaler_state"])
    return state.get("epoch", 0), state.get("global_step", 0)

  scaler = torch.cuda.amp.GradScaler(enabled=(device == "cuda" and not use_bf16))


In [None]:
RESUME_FROM = ""  # set to a checkpoint dir to resume
start_epoch = 0
global_step = 0
if RESUME_FROM:
    last_epoch, global_step = load_checkpoint(
        RESUME_FROM, model, optimizer, scheduler, scaler
    )
    # Checkpoints are written after an epoch has finished and store that
    # epoch's index, so training must continue with the NEXT epoch; restarting
    # at the stored index would train the completed epoch a second time.
    start_epoch = last_epoch + 1
    print(
        "Resumed from", RESUME_FROM,
        "after epoch", last_epoch,
        "-> next epoch", start_epoch,
        "step", global_step,
    )

In [None]:
def train_one_epoch(epoch_idx):
    """Run one pass over ``train_loader`` and return the mean per-batch loss.

    Gradients are accumulated over windows of ``GRAD_ACCUM_STEPS`` batches.
    Each window ends with one optimizer step, one scheduler step and one
    increment of the module-level ``global_step``. The last window of the
    epoch is applied even when it holds fewer than ``GRAD_ACCUM_STEPS``
    batches, so trailing gradients are not thrown away and a loader with
    fewer batches than ``GRAD_ACCUM_STEPS`` still updates the model.

    ``epoch_idx`` is accepted but not used inside the function.
    """
    global global_step

    model.train()
    running_loss = 0.0
    num_batches = len(train_loader)
    optimizer.zero_grad(set_to_none=True)

    for step, batch in enumerate(train_loader, start=1):
        # Size of the accumulation window this batch belongs to; only the
        # final window of the epoch can be shorter than GRAD_ACCUM_STEPS.
        window_start = (step - 1) // GRAD_ACCUM_STEPS * GRAD_ACCUM_STEPS
        window_size = min(GRAD_ACCUM_STEPS, num_batches - window_start)

        batch = {k: v.to(device) for k, v in batch.items() if torch.is_tensor(v)}  # only tensors to device
        with amp_context(dtype=autocast_dtype) if device == "cuda" else nullcontext():
            out = model(**batch)
            # Divide by the window size so the accumulated gradient is the
            # mean over the window rather than the sum.
            loss = out.loss / window_size

        if scaler.is_enabled():
            scaler.scale(loss).backward()
        else:
            loss.backward()

        if step % GRAD_ACCUM_STEPS == 0 or step == num_batches:
            if scaler.is_enabled():
                scaler.step(optimizer)
                scaler.update()
            else:
                optimizer.step()
            scheduler.step()
            optimizer.zero_grad(set_to_none=True)
            global_step += 1

        # Undo the window scaling so the reported value is the raw batch loss.
        running_loss += loss.item() * window_size

    return running_loss / max(1, num_batches)

def run_eval(loader):
    """Score every example in ``loader`` with the model in eval mode.

    No autocast is used; logits are cast to fp32 before the softmax and all
    results are converted with ``.tolist()`` rather than through numpy, which
    keeps the function bf16-safe.

    Returns four parallel lists: ids, labels, the probability of class
    ``ATTACK_CLASS_INDEX`` and the probability of the other class.
    """
    model.eval()
    ids_out, labels_out, attack_scores, alt_scores = [], [], [], []
    with torch.no_grad():
        for batch in loader:
            batch_ids = batch.pop("id")          # list[str]
            batch_labels = batch.pop("labels")   # tensor on CPU
            model_inputs = {
                name: value.to(device)
                for name, value in batch.items()
                if torch.is_tensor(value)
            }
            probabilities = torch.softmax(model(**model_inputs).logits.float(), dim=-1)  # force fp32
            ids_out.extend(batch_ids)
            labels_out.extend(batch_labels.detach().cpu().tolist())
            attack_scores.extend(probabilities[:, ATTACK_CLASS_INDEX].detach().cpu().tolist())      # no numpy
            alt_scores.extend(probabilities[:, 1 - ATTACK_CLASS_INDEX].detach().cpu().tolist())
    return ids_out, labels_out, attack_scores, alt_scores


In [None]:
from src.eval.metrics import compute_metrics

def _assert_score_orientation(y_true, y_score, y_score_alt):
    """Raise ``RuntimeError`` when the attack score has validation AUROC below 0.5.

    The check is skipped when ``y_true`` contains a single class. The error
    message reports the AUROC of both the chosen and the alternative score.
    """
    if len(set(y_true)) <= 1:
        return
    val_auc = roc_auc_score(y_true, y_score)
    if not val_auc < 0.5:
        return
    alt_auc = roc_auc_score(y_true, y_score_alt)
    raise RuntimeError(
        f"Score orientation wrong; check label mapping or class index. "
        f"attack_class_index={ATTACK_CLASS_INDEX} auc={val_auc:.4f} auc(alt)={alt_auc:.4f}"
    )


# Per epoch: train, score the validation split, check orientation, record
# metrics, then checkpoint.
metrics_history = []
for epoch in range(start_epoch, EPOCHS):
    loss = train_one_epoch(epoch)
    _, y_true, y_score, y_score_alt = run_eval(val_loader)
    _assert_score_orientation(y_true, y_score, y_score_alt)

    metrics = compute_metrics(y_true, y_score, target_fpr=0.01)
    metrics["epoch"] = epoch
    metrics["train_loss"] = loss
    metrics_history.append(metrics)
    print("Epoch", epoch, "loss", round(loss, 4), "auroc", metrics["auroc"])

    ckpt_dir = os.path.join(OUT_DIR, f"ckpt_epoch_{epoch}")
    save_checkpoint(ckpt_dir, model, optimizer, scheduler, scaler, epoch, global_step)

# The full history is written once, after the last epoch.
metrics_path = os.path.join(OUT_DIR, "metrics.json")
with open(metrics_path, "w", encoding="utf-8") as f:
    json.dump(metrics_history, f, indent=2)
print("Saved metrics to", metrics_path)

  with amp_context(dtype=autocast_dtype) if device == "cuda" else nullcontext():


RuntimeError: CUDA error: no kernel image is available for execution on the device
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
from datetime import timezone
from sklearn.metrics import average_precision_score, roc_auc_score
from src.eval.metrics import tpr_at_fpr

def write_predictions(path, ids, y_true, y_score, split_name):
    """Write per-example predictions to ``path`` as UTF-8 JSON Lines.

    Each line is an object with ``id``, ``label`` (cast to int), ``score``
    (cast to float) and ``split``. The three sequences are zipped, so output
    stops at the shortest one. An existing file is overwritten.
    """
    lines = (
        json.dumps({"id": ex_id, "label": int(label), "score": float(score), "split": split_name}) + "\n"
        for ex_id, label, score in zip(ids, y_true, y_score)
    )
    with open(path, "w", encoding="utf-8") as handle:
        handle.writelines(lines)

def compute_split_metrics(y_true, y_score, threshold):
    """Compute ranking metrics and rates at a fixed ``threshold`` for one split.

    AUROC and AUPRC are computed only when ``y_true`` contains more than one
    distinct value; otherwise both are ``None``. An example is flagged as an
    attack when its score is ``>= threshold``. Labels equal to 1 count as
    positives and labels equal to 0 as negatives.

    Returns a dict with ``auroc``, ``auprc``, ``tpr_at_fpr`` (flagged
    positives / positives), ``fpr_actual`` (flagged negatives / negatives),
    ``threshold`` (as float) and ``asr_at_threshold`` (unflagged positives /
    positives). Each rate is 0.0 when its denominator is zero.
    """
    auroc = auprc = None
    if len(set(y_true)) > 1:
        auroc = roc_auc_score(y_true, y_score)
        auprc = average_precision_score(y_true, y_score)

    flagged = [score >= threshold for score in y_score]
    n_pos = sum(1 for label in y_true if label == 1)
    n_neg = sum(1 for label in y_true if label == 0)
    caught = sum(1 for hit, label in zip(flagged, y_true) if hit and label == 1)
    false_alarms = sum(1 for hit, label in zip(flagged, y_true) if hit and label == 0)
    missed = sum(1 for hit, label in zip(flagged, y_true) if (not hit) and label == 1)

    return {
        "auroc": auroc,
        "auprc": auprc,
        "tpr_at_fpr": caught / n_pos if n_pos else 0.0,
        "fpr_actual": false_alarms / n_neg if n_neg else 0.0,
        "threshold": float(threshold),
        "asr_at_threshold": missed / n_pos if n_pos else 0.0,
    }

# False-positive rate at which the operating threshold is chosen on val.
TARGET_FPR = 0.01


def _score_split(loader, split_name):
    """Score one split and write ``predictions_<split_name>.jsonl`` into OUT_DIR.

    Returns ``(ids, y_true, y_score, predictions_path)``.
    """
    ids, y_true, y_score, _ = run_eval(loader)
    pred_path = os.path.join(OUT_DIR, f"predictions_{split_name}.jsonl")
    write_predictions(pred_path, ids, y_true, y_score, split_name)
    return ids, y_true, y_score, pred_path


def _save_split_metrics(split_name, y_true, y_score, threshold):
    """Compute metrics at ``threshold`` and write ``final_metrics_<split_name>.json``.

    The threshold is always the one calibrated on val, which is recorded in
    the ``threshold_source`` field. Returns ``(metrics, metrics_path)``.
    """
    split_metrics = compute_split_metrics(y_true, y_score, threshold)
    split_metrics["target_fpr"] = TARGET_FPR
    split_metrics["split"] = split_name
    split_metrics["threshold_source"] = "val"
    split_metrics_path = os.path.join(OUT_DIR, f"final_metrics_{split_name}.json")
    with open(split_metrics_path, "w", encoding="utf-8") as f:
        json.dump(split_metrics, f, indent=2)
    return split_metrics, split_metrics_path


# --- val calibration ---
val_ids, val_y_true, val_y_score, val_pred_path = _score_split(val_loader, "val")
op = tpr_at_fpr(val_y_true, val_y_score, target_fpr=TARGET_FPR)
val_op_path = os.path.join(OUT_DIR, "val_operating_point.json")
with open(val_op_path, "w", encoding="utf-8") as f:
    json.dump(op.__dict__, f, indent=2)
val_metrics, val_metrics_path = _save_split_metrics("val", val_y_true, val_y_score, op.threshold)

# --- test_main ---
tm_ids, tm_y_true, tm_y_score, tm_pred_path = _score_split(test_main_loader, "test_main")
tm_metrics, tm_metrics_path = _save_split_metrics("test_main", tm_y_true, tm_y_score, op.threshold)

# --- test_jbb ---
jbb_ids, jbb_y_true, jbb_y_score, jbb_pred_path = _score_split(test_jbb_loader, "test_jbb")
jbb_metrics, jbb_metrics_path = _save_split_metrics("test_jbb", jbb_y_true, jbb_y_score, op.threshold)

import subprocess

def _get_git_commit(repo_root: str):
    try:
        return subprocess.check_output(
            ["git", "rev-parse", "HEAD"],
            cwd=repo_root,
            text=True,
            stderr=subprocess.DEVNULL,
        ).strip()
    except Exception:
        return None

git_commit = _get_git_commit(PROJECT_ROOT)

# Snapshot of the settings behind this run, written to OUT_DIR/config.json.
config = {
    "dataset_name": DATASET_NAME,
    "detector": "lora",
    "unicode": USE_UNICODE,
    "baseline_version": BASELINE_VERSION,
    "attack_class_index": ATTACK_CLASS_INDEX,
    "splits": ["val", "test_main", "test_jbb"],
    "train_path": TRAIN_PATH,
    "val_path": VAL_PATH,
    "test_main_path": TEST_MAIN_PATH,
    "test_jbb_path": TEST_JBB_PATH,
    "target_fpr": 0.01,
    "model_name": MODEL_NAME,
    "seed": SEED,
    "timestamp": datetime.now(timezone.utc).isoformat(),
    "run_dir": OUT_DIR,
    # The commit is recorded only when it could be resolved.
    **({"git_commit": git_commit} if git_commit else {}),
}
config_path = os.path.join(OUT_DIR, "config.json")
with open(config_path, "w", encoding="utf-8") as f:
    json.dump(config, f, indent=2)

# The adapter and the tokenizer files share one directory.
_adapter_dir = os.path.join(OUT_DIR, "lora_adapter")
model.save_pretrained(_adapter_dir)
tokenizer.save_pretrained(_adapter_dir)

def _fmt_mtime(path: str) -> str:
    return datetime.fromtimestamp(os.path.getmtime(path)).isoformat()

# Report every artifact written above together with its modification time.
_saved_artifacts = [
    ("val predictions", val_pred_path),
    ("test_main predictions", tm_pred_path),
    ("test_jbb predictions", jbb_pred_path),
    ("val metrics", val_metrics_path),
    ("test_main metrics", tm_metrics_path),
    ("test_jbb metrics", jbb_metrics_path),
    ("config", config_path),
]
for _artifact_label, _artifact_path in _saved_artifacts:
    print(f"Saved {_artifact_label} to", _artifact_path, "mtime", _fmt_mtime(_artifact_path))

import subprocess
# Run the project's artifact verification script on this run directory with
# the same interpreter as the notebook; check_call raises CalledProcessError
# if the script exits with a non-zero status.
verify_script = os.path.join(PROJECT_ROOT, "scripts", "verify_run_artifacts.py")
_verify_cmd = [
    sys.executable,
    verify_script,
    "--run_dir", OUT_DIR,
    "--predictions", val_pred_path,
]
subprocess.check_call(_verify_cmd)

  with amp_context(dtype=autocast_dtype) if device == "cuda" else nullcontext():


RuntimeError: CUDA error: no kernel image is available for execution on the device
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
