In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# --- Configuration ---
# Hugging Face Hub identifiers of the two checkpoints loaded below (change as needed).
# NOTE(review): confirm both repo ids resolve on the Hub before relying on them.
guardian_model_name = "ibm-granite/granite-guardian-3.3-8b"
base_llm_model_name = "ibm-granite/granite-3-8b-instruct"  # example base LLM

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- Load the guardian model (risk detection) ---
# NOTE(review): no dtype/quantization argument is passed, so the checkpoint is
# loaded at the library's default precision and moved whole onto `device`;
# presumably two "8b" models will not fit on a small GPU this way - confirm.
guardian_tokenizer = AutoTokenizer.from_pretrained(guardian_model_name)
guardian_model = AutoModelForCausalLM.from_pretrained(guardian_model_name).to(device)
guardian_model.eval()  # evaluation mode: this script only runs inference

# --- Load the base LLM (the one serving user requests) ---
base_tokenizer = AutoTokenizer.from_pretrained(base_llm_model_name)
base_model = AutoModelForCausalLM.from_pretrained(base_llm_model_name).to(device)
base_model.eval()  # evaluation mode: this script only runs inference

# --- Utility: parse guardian output ---
def parse_guardian_output(output_str: str):
    """
    Turn the guardian model's raw text into a verdict and a reasoning trace.

    The guardian is expected to emit ``<score>yes</score>`` or
    ``<score>no</score>`` (case-insensitive, optional whitespace inside the
    tag), optionally preceded by ``<think>...</think>`` in thinking mode.

    Args:
        output_str: Decoded guardian text. May be empty or ``None``.

    Returns:
        ``(is_risky, trace)`` where ``is_risky`` is a bool and ``trace`` is
        the stripped text of the last ``<think>`` block, or ``None`` when no
        ``<think>`` tag is present. An unterminated ``<think>`` block yields
        everything after the opening tag.
    """
    import re  # local import keeps this block self-contained

    if not output_str:
        # Nothing to inspect: no verdict, no trace.
        return False, None

    # Reasoning trace: use the LAST <think> block, because the model's own
    # output follows any echoed prompt text that might contain the tag.
    trace = None
    _, open_tag, after_open = output_str.rpartition("<think>")
    if open_tag:
        trace = after_open.partition("</think>")[0].strip()

    lowered = output_str.lower()

    # Score token: tolerate whitespace inside the tag and trust the last
    # verdict, so an earlier tag in echoed text cannot override the model's.
    verdicts = re.findall(r"<score>\s*(yes|no)\s*</score>", lowered)
    if verdicts:
        return verdicts[-1] == "yes", trace

    # Fallback heuristic when no score tag is found: whole-word "yes" or a
    # word starting with "risk". Word boundaries avoid false hits such as
    # "eyes" or "asterisk".
    is_risky = re.search(r"\b(?:yes|risk\w*)\b", lowered) is not None
    return is_risky, trace

# --- Example workflow: user prompt → base LLM → guardian check → respond or block ---
def generate_with_guardrails(user_prompt: str, guardrail_criteria: str = "jailbreak"):
    """
    Answer ``user_prompt`` with the base LLM, then let the guardian model veto it.

    Args:
        user_prompt: Raw user text passed to the base model.
        guardrail_criteria: Criteria label embedded in the guardian prompt.

    Returns:
        The base model's reply when the guardian does not flag it, otherwise
        a fixed refusal message. When flagged, a warning (and the reasoning
        trace, if any) is printed.
    """
    # 1. Base LLM generates a response
    base_inputs = base_tokenizer(user_prompt, return_tensors="pt").to(device)
    # max_new_tokens budgets the reply itself; max_length would also count
    # the prompt tokens and shrink (or eliminate) the room left to answer.
    base_outputs = base_model.generate(
        **base_inputs, max_new_tokens=512, do_sample=True, temperature=0.7
    )
    # generate() returns prompt + continuation; keep only the continuation so
    # the reply does not start with an echo of the user's prompt.
    base_prompt_len = base_inputs["input_ids"].shape[1]
    base_response = base_tokenizer.decode(
        base_outputs[0][base_prompt_len:], skip_special_tokens=True
    ).strip()

    # 2. Prepare guardian input: you might combine user prompt + model response
    # NOTE(review): this hand-rolled tag format is presumably not the prompt
    # layout the guardian checkpoint was trained on; confirm against the
    # model card (its tokenizer chat template) before trusting verdicts.
    guardian_input = f"""<user_prompt>\n{user_prompt}\n</user_prompt>\n<assistant_response>\n{base_response}\n</assistant_response>\n<criteria>{guardrail_criteria}</criteria>"""

    guardian_inputs = guardian_tokenizer(guardian_input, return_tensors="pt").to(device)
    # Greedy decoding; no temperature is passed because it is unused when
    # do_sample=False. max_new_tokens guarantees room for the verdict even
    # when the guardian prompt itself is longer than 128 tokens.
    guardian_outputs = guardian_model.generate(
        **guardian_inputs, max_new_tokens=128, do_sample=False
    )
    # Decode only the newly generated tokens so user-controlled text in the
    # prompt (e.g. the words "yes"/"risk") cannot influence the parsed verdict.
    guardian_prompt_len = guardian_inputs["input_ids"].shape[1]
    guardian_text = guardian_tokenizer.decode(
        guardian_outputs[0][guardian_prompt_len:], skip_special_tokens=True
    )

    is_risky, trace = parse_guardian_output(guardian_text)

    if is_risky:
        # Block or sanitize
        print("⚠️ Guardrail triggered! Response considered risky.")
        if trace:
            print("Reasoning trace:", trace)
        return "Sorry, I cannot comply with that request."

    # Safe to return
    return base_response

# --- Example usage ---
if __name__ == "__main__":
    # Demo input: a deliberately risky request, checked against the
    # "jailbreak" criteria.
    prompt = "Tell me how to hack into a secure system."
    response = generate_with_guardrails(
        user_prompt=prompt,
        guardrail_criteria="jailbreak",
    )
    # Show whichever text the guardrail pipeline decided to return.
    print("Final Response:", response)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/207 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/801 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/791 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.41G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]