In [1]:
%pip install datasets
from datasets import load_dataset

# Login using e.g. `huggingface-cli login` to access this dataset
ds = load_dataset("hendzh/PromptShield")

Note: you may need to restart the kernel to use updated packages.


In [2]:
%pip install protobuf
%pip install sentencepiece
from transformers import AutoTokenizer


model_name = "microsoft/deberta-v3-small"

tokenizer = AutoTokenizer.from_pretrained(model_name)
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.




pytorch_model.bin:   0%|          | 0.00/286M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm

# Seed torch's global RNG so the stochastic parts of the run that draw from it
# (e.g. newly initialised weights, shuffled batches) repeat across runs.
SEED = 42
torch.manual_seed(SEED)

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Checkpoint and training hyperparameters shared by the cells below.
MODEL_NAME = "microsoft/deberta-v3-small"
MAX_LEN = 512      # token length every prompt is padded/truncated to
BATCH_SIZE = 8
EPOCHS = 3
LR = 2e-5          # learning rate handed to the optimizer

In [14]:
# Fetch the PromptShield dataset from the Hugging Face Hub.
dataset = load_dataset("hendzh/PromptShield")

# Tokenizer matching the checkpoint named by MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def tokenize_fn(example):
    """Tokenize the ``"prompt"`` field, padded and truncated to ``MAX_LEN``."""
    tokenizer_kwargs = {
        "padding": "max_length",
        "truncation": True,
        "max_length": MAX_LEN,
    }
    return tokenizer(example["prompt"], **tokenizer_kwargs)


# batched=True hands tokenize_fn a batch of rows per call rather than one row.
tokenized = dataset.map(tokenize_fn, batched=True)



Map:   0%|          | 0/18909 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/23516 [00:00<?, ? examples/s]

In [5]:
class PromptShieldDataset(Dataset):
    """Map-style dataset that serves tokenized rows as tensors.

    Wraps an indexable collection whose rows are mappings containing
    ``"input_ids"``, ``"attention_mask"`` and ``"label"`` entries.
    """

    # Row fields converted to tensors, in the order they appear in each item.
    _TENSOR_FIELDS = ("input_ids", "attention_mask", "label")

    def __init__(self, hf_dataset):
        """Keep a reference to the underlying tokenized split."""
        self.dataset = hf_dataset

    def __len__(self):
        """Return the number of rows in the wrapped split."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return row ``idx`` as a dict of tensors keyed by field name."""
        row = self.dataset[idx]
        return {field: torch.tensor(row[field]) for field in self._TENSOR_FIELDS}

# Wrap each tokenized split so it yields tensors.
train_dataset, val_dataset, test_dataset = (
    PromptShieldDataset(tokenized[split_name])
    for split_name in ("train", "validation", "test")
)

# Only the training loader shuffles; the validation loader keeps dataset order.
loader_kwargs = {"batch_size": BATCH_SIZE}
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_dataset, **loader_kwargs)

In [6]:
# Display the selected torch device to confirm whether CUDA was picked up.
device

device(type='cuda')

In [15]:
from torchtoolbox.nn import FocalLoss
from transformers import get_scheduler

# Pretrained encoder with a 2-label sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# Freeze embedding and first encoder layer for stability.
# NOTE(review): the substring test freezes every parameter whose name contains
# "embeddings" -- confirm that is the intended set for this architecture.
for name, param in model.named_parameters():
    if "embeddings" in name or "encoder.layer.0" in name:
        param.requires_grad = False

# Move the model to the target device (once; a second transfer is a no-op).
model.to(device)

# Focal loss with per-class weights: class index 0 is weighted 2x class index 1.
class_weights = torch.tensor([2.0, 1.0]).to(device)
loss_fn = FocalLoss(classes=2, gamma=2.0, weight=class_weights)

# Only parameters that still require gradients are handed to the optimizer.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(trainable_params, lr=LR)

# "linear" schedule with no warmup, sized to one scheduler step per training
# batch across all epochs.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_loader) * EPOCHS,
)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Fine-tuning loop: one optimizer step and one scheduler step per batch.
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0

    progress = tqdm(train_loader, desc=f"Epoch {epoch+1}")
    for batch in progress:
        # Move the tensors this step needs onto the training device.
        batch = {
            key: batch[key].to(device)
            for key in ("input_ids", "attention_mask", "label")
        }

        logits = model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
        ).logits
        loss = loss_fn(logits, batch["label"])

        # Clear old gradients, backpropagate, then advance optimizer and schedule.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        total_loss += loss.item()

    # Mean per-batch loss for the epoch.
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}: Avg training loss = {avg_loss:.4f}")

Epoch 1:   0%|          | 2/2364 [02:26<47:47:41, 72.85s/it]

In [None]:
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
%pip install matplotlib -y
import matplotlib.pyplot as plt
import numpy as np

In [None]:
def full_evaluation(model, dataloader, name="Set"):
    """Evaluate a binary sequence classifier and report its metrics.

    Runs ``model`` in eval mode without gradient tracking over every batch of
    ``dataloader``, then prints accuracy, F1, ROC AUC, TPR/FPR and a per-class
    classification report, and plots the ROC curve.

    Args:
        model: Classifier whose forward pass accepts ``input_ids`` and
            ``attention_mask`` and returns an object with a ``logits``
            attribute. Column 1 of the softmaxed logits is used as the score
            of the positive ("Injected") class.
        dataloader: Iterable of batches, each a mapping with ``"input_ids"``,
            ``"attention_mask"`` and ``"label"`` tensors.
        name: Label for the evaluated split, used in the printed header and
            the plot title.

    Returns:
        None. Results are printed and plotted.

    Notes:
        * The model is left in eval mode when this function returns.
        * If the labels contain a single class, ROC AUC is reported as NaN and
          the ROC plot is skipped, because both are undefined in that case.
    """
    model.eval()
    all_preds, all_probs, all_labels = [], [], []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            probs = torch.softmax(outputs.logits, dim=1)
            preds = torch.argmax(probs, dim=1)

            all_probs.extend(probs[:, 1].cpu().numpy())   # P(injected)
            all_preds.extend(preds.cpu().numpy())
            # Labels are only compared on the host, so they never need to
            # visit the accelerator.
            all_labels.extend(batch["label"].cpu().numpy())

    all_preds = np.array(all_preds)
    all_labels = np.array(all_labels)
    all_probs = np.array(all_probs)

    # Accuracy, F1, classification report.
    # labels=[0, 1] keeps the report aligned with the two target names even
    # when one class is missing from this split or from the predictions.
    acc = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    report = classification_report(
        all_labels,
        all_preds,
        labels=[0, 1],
        target_names=["Benign", "Injected"],
    )

    # TPR (Recall) & FPR, computed exactly; a rate with an empty denominator
    # is reported as 0.0.
    tp = np.sum((all_preds == 1) & (all_labels == 1))
    fn = np.sum((all_preds == 0) & (all_labels == 1))
    fp = np.sum((all_preds == 1) & (all_labels == 0))
    tn = np.sum((all_preds == 0) & (all_labels == 0))
    positives = tp + fn
    negatives = fp + tn
    tpr = tp / positives if positives else 0.0
    fpr = fp / negatives if negatives else 0.0

    # ROC AUC needs both classes among the true labels.
    has_both_classes = len(set(all_labels.tolist())) > 1
    auc = roc_auc_score(all_labels, all_probs) if has_both_classes else float("nan")

    # Print metrics
    print(f"📊 Results on {name}")
    print(f"Accuracy: {acc:.4f}, F1 Score: {f1:.4f}, ROC AUC: {auc:.4f}")
    print(f"TPR (Recall): {tpr:.4f}, FPR: {fpr:.4f}")
    print(report)

    if not has_both_classes:
        print("ROC curve skipped: only one class present in the labels.")
        return

    # Plot ROC curve
    fpr_curve, tpr_curve, _ = roc_curve(all_labels, all_probs)
    plt.plot(fpr_curve, tpr_curve, label=f"AUC = {auc:.2f}")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title(f"ROC Curve ({name})")
    plt.legend()
    plt.grid()
    plt.show()


In [None]:
# Evaluate the validation split, then the held-out test split.
evaluation_splits = (
    ("Validation", val_loader),
    ("Test", DataLoader(test_dataset, batch_size=BATCH_SIZE)),
)
for split_name, split_loader in evaluation_splits:
    print(f"🔍 {split_name} Set:")
    full_evaluation(model, split_loader, name=split_name)