# Step 1 Evaluation

In [20]:
import concurrent.futures
import copy
import getpass
import json
import os
import shutil
import signal
import subprocess
import sys
import time
import urllib.request
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from typing import List

import openai
import yaml
from tqdm import tqdm

In [15]:
# Evaluation configuration: the model under test and the directory layout
# used for datasets, logs and per-benchmark results.
BASE_DIR = "./workspace/"
# BASE_DIR keeps its trailing slash for anything that already relies on it;
# derived paths are built from the slash-free root so they never contain "//".
WORKSPACE_ROOT = BASE_DIR.rstrip("/")
DATASET_DIR = f"{WORKSPACE_ROOT}/dataset"
MODEL_NAME_OR_PATH = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
# Last path component of the model id; rstrip guards against a local path
# written with a trailing slash, which would otherwise yield an empty tag.
MODEL_TAG_NAME = MODEL_NAME_OR_PATH.rstrip("/").split("/")[-1]
MODEL_OUTPUT_DIR = f"{WORKSPACE_ROOT}/results/{MODEL_TAG_NAME}"
LOG_DIR = f"{MODEL_OUTPUT_DIR}/logs"

# * Dataset
AEGIS_V2_TEST_DIR = f"{DATASET_DIR}/aegis_v2"

# * Content Safety benchmark
CONTENT_SAFETY_RESULTS_DIR = f"{MODEL_OUTPUT_DIR}/content-safety-evals"
AEGIS_V2_RESULTS_DIR = f"{CONTENT_SAFETY_RESULTS_DIR}/aegis_v2"
WILDGUARD_RESULTS_DIR = f"{CONTENT_SAFETY_RESULTS_DIR}/wildguard"

# * Security benchmark
SECURITY_RESULTS_DIR = f"{MODEL_OUTPUT_DIR}/security-evals"
GARAK_RESULTS_DIR = f"{SECURITY_RESULTS_DIR}/garak"
GARAK_CONFIG_DIR = f"{GARAK_RESULTS_DIR}/configs"
GARAK_LOG_DIR = f"{GARAK_RESULTS_DIR}/logs"
GARAK_REPORT_DIR = f"{GARAK_RESULTS_DIR}/reports"

# * Accuracy benchmark
ACCURACY_RESULTS_DIR = f"{MODEL_OUTPUT_DIR}/accuracy-evals"
GPQA_DIAMOND_RESULTS_DIR = f"{ACCURACY_RESULTS_DIR}/gpqa-diamond"
AA_MATH_500_RESULTS_DIR = f"{ACCURACY_RESULTS_DIR}/aa-math-500"
IFEVAL_RESULTS_DIR = f"{ACCURACY_RESULTS_DIR}/ifeval"

In [16]:
# Create directories to store logs and results.
# Pure Python instead of `!mkdir -p`: no shell quoting issues if a path ever
# contains spaces, and the cell is safe to re-run (exist_ok=True).
for required_dir in (
    LOG_DIR,
    AEGIS_V2_TEST_DIR,
    AEGIS_V2_RESULTS_DIR,
    WILDGUARD_RESULTS_DIR,
    GARAK_RESULTS_DIR,
    GARAK_CONFIG_DIR,
    GARAK_LOG_DIR,
    GARAK_REPORT_DIR,
    GPQA_DIAMOND_RESULTS_DIR,
    AA_MATH_500_RESULTS_DIR,
    IFEVAL_RESULTS_DIR,
):
    os.makedirs(required_dir, exist_ok=True)

The next cell needs two credentials:

- An NVIDIA API key, exposed as `JUDGE_API_KEY`.
- A Hugging Face token, exposed as `HF_TOKEN`.

In [4]:
# Credentials
# Secrets are never stored in the notebook. Each one is taken from the
# environment when it is already exported, otherwise it is prompted for
# without echoing. Any key that was ever saved in a notebook should be
# treated as leaked and rotated.
os.environ["MY_API_KEY"] = "empty"  # non-secret placeholder value
for secret_name in ("JUDGE_API_KEY", "HF_TOKEN"):
    if not os.environ.get(secret_name):
        os.environ[secret_name] = getpass.getpass(f"Enter {secret_name}: ")

# Workspace-scoped temp/cache locations, exported so that subprocesses
# started from this notebook inherit them.
workspace_root = BASE_DIR.rstrip("/")  # avoid "//" in the exported paths
os.environ.update({
    'BASE_DIR': f"{BASE_DIR}",
    'TMPDIR': f"{workspace_root}/tmp",
    'XDG_CACHE_HOME': f"{workspace_root}/cache",
    'HF_HOME': f"{workspace_root}/cache/huggingface",
    'UV_CACHE_DIR': f"{workspace_root}/cache/uv",
    'TRITON_CACHE_DIR': f"{workspace_root}/cache/triton",
    'DATASET_CACHE_DIR': f"{workspace_root}/dataset_cache",
    'RAY_TMPDIR': "/tmp/ray",
    'LOG_DIR': f"{LOG_DIR}"
})

# TMPDIR is only useful if it exists; nothing else in the notebook creates it.
os.makedirs(os.environ['TMPDIR'], exist_ok=True)

## Launch vLLM server

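The evaluation tools used below (`safety-eval`, `simple_evals`, `lm-eval`) query the model over an OpenAI-compatible HTTP API, so the model is first served locally with vLLM on port 5000 under the name `test-model`. The equivalent shell command is: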

```
export MODEL_NAME_OR_PATH="deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
export VLLM_TENSOR_PARALLEL_SIZE=8
export HF_HOME=./workspace/cache/huggingface
export LOG_DIR=./workspace/logs
python3 -m vllm.entrypoints.openai.api_server \
  --model "$MODEL_NAME_OR_PATH" \
  --trust-remote-code \
  --seed 1 \
  --host "$VLLM_HOST" \
  --port 5000 \
  --served-model-name "test-model" \
  --enable-reasoning \
  --reasoning-parser qwen3 \
  --tensor-parallel-size "$VLLM_TENSOR_PARALLEL_SIZE" \
  --download-dir="$HF_HOME"
```

In [10]:
# Launch the policy model behind vLLM's OpenAI-compatible server and block
# until it is ready to accept requests.
os.environ.update({
    'VLLM_ENGINE_ITERATION_TIMEOUT_S': '36000',
    'VLLM_ALLOW_LONG_MAX_MODEL_LEN': '1',
    'VLLM_HOST': '0.0.0.0',
    'VLLM_TENSOR_PARALLEL_SIZE': '1',
    'POLICY_MODEL_GPUS': '0,1,2,3',
    'SAFETY_MODEL_GPUS': '4,5'
})

VLLM_PORT = 5000
VLLM_HEALTH_URL = f"http://localhost:{VLLM_PORT}/health"
VLLM_STARTUP_TIMEOUT_S = 1800  # generous: the first start may download weights
VLLM_POLL_INTERVAL_S = 5
vllm_log_path = f"{LOG_DIR}/vllm-server-model.log"

print("Starting policy model server...")
# The child process inherits its own copy of the log file descriptor, so the
# notebook's handle can be closed as soon as Popen returns.
# NOTE(review): tensor parallel size is 1 while four GPUs are made visible —
# confirm whether this is intended.
# NOTE(review): the model is a DeepSeek-R1 distill but the reasoning parser is
# 'qwen3' — confirm this choice is intentional.
with open(vllm_log_path, 'w') as vllm_log:
    policy_server = subprocess.Popen([
        'python3', '-m', 'vllm.entrypoints.openai.api_server',
        '--model', MODEL_NAME_OR_PATH,
        '--trust-remote-code',
        '--seed', '1',
        '--host', os.environ['VLLM_HOST'],
        '--port', str(VLLM_PORT),
        '--served-model-name', 'test-model',
        '--enable-reasoning',
        '--reasoning-parser', 'qwen3',
        '--tensor-parallel-size', os.environ['VLLM_TENSOR_PARALLEL_SIZE'],
        '--download-dir', os.environ['HF_HOME']
    ], env={**os.environ, 'CUDA_VISIBLE_DEVICES': os.environ['POLICY_MODEL_GPUS']},
       stdout=vllm_log,
       stderr=subprocess.STDOUT)

# Poll the health endpoint instead of sleeping for a fixed time: a fixed sleep
# is too short when weights must be downloaded and hides a crashed server.
startup_deadline = time.monotonic() + VLLM_STARTUP_TIMEOUT_S
while True:
    exit_code = policy_server.poll()
    if exit_code is not None:
        raise RuntimeError(
            f"vLLM server exited with code {exit_code} during startup; see {vllm_log_path}"
        )
    try:
        with urllib.request.urlopen(VLLM_HEALTH_URL, timeout=5) as health_response:
            if health_response.status == 200:
                break
    except OSError:
        # Covers connection refused / timeouts (URLError is an OSError)
        # while the server is still starting up.
        pass
    if time.monotonic() >= startup_deadline:
        raise TimeoutError(
            f"vLLM server not ready after {VLLM_STARTUP_TIMEOUT_S}s; see {vllm_log_path}"
        )
    time.sleep(VLLM_POLL_INTERVAL_S)

print("Policy model server is ready.")

Starting policy model server...


In [9]:
# subprocess.run(['pkill', '-f', 'vllm.entrypoints.openai.api_server'])

## Content Safety - Aegis v2 [TODO]

Launch a vLLM server for the model

### Use `safety-eval` to run Aegis v2 evaluation

Use `safety-eval` to run Aegis v2 evaluation. It will take 10 minutes if you launch the vLLM server with 8x H100 GPUs.

Check the log file `$LOG_DIR/safety-eval-aegis-v2.log` for progress.

In [5]:
# Run the Aegis v2 content-safety evaluation against the local vLLM server.
# stdout and stderr are redirected with the POSIX form `> file 2>&1`; the
# bash-only `&>` is parsed by plain `sh` as "run in background, then truncate
# the file", which leaves the log empty.
!safety-eval --model-name "test-model" \
             --model-url http://localhost:5000/v1 \
             --judge-url https://b319a99a-5241-4459-b641-7219ad0fd86d.invocation.api.nvcf.nvidia.com/v1   \
             --results-dir {AEGIS_V2_RESULTS_DIR} \
             --concurrency 64 \
             --eval aegis_v2 \
             --inference_params "temperature=0.6,top_p=0.95,max_completion_tokens=6000" > "$LOG_DIR/safety-eval-aegis-v2.log" 2>&1

## Content Safety - WildGuard

For WildGuard evaluation, a gated dataset `allenai/wildguardmix` hosted on the HF Dataset Hub will be used. Visit the dataset page at https://huggingface.co/datasets/allenai/wildguardmix to ask for access.

Make sure `HF_TOKEN` belongs to the Hugging Face account that has been granted access to the dataset.

Use `safety-eval` to run WildGuard evaluation. It will take about 15 minutes if you launch the vLLM server with 8x H100 GPUs.

Check the log file `$LOG_DIR/safety-eval-wildguard.log` for progress.

In [8]:
# Run the WildGuard content-safety evaluation against the local vLLM server.
# stdout and stderr are redirected with the POSIX form `> file 2>&1`; the
# bash-only `&>` is parsed by plain `sh` as "run in background, then truncate
# the file", which leaves the log empty.
!safety-eval --model-name "test-model" \
             --model-url http://localhost:5000/v1 \
             --judge-url https://d27e7649-daf1-46c2-ba22-49374402c31d.invocation.api.nvcf.nvidia.com/v1 \
             --results-dir {WILDGUARD_RESULTS_DIR} \
             --concurrency 64 \
             --eval wildguard \
             --inference_params "temperature=0.6,top_p=0.95,max_completion_tokens=12000" > "$LOG_DIR/safety-eval-wildguard.log" 2>&1

### Results

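The WildGuard safe score is the fraction of judged responses that were rated safe, `safe / (safe + unsafe)`, computed from the `metrics.json` file in the WildGuard results directory.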

In [15]:
# Load the WildGuard metrics and report the share of responses judged safe.
# A context manager closes the metrics file instead of leaking the handle.
with open(f"{WILDGUARD_RESULTS_DIR}/metrics.json") as metrics_file:
    wildguard_results = json.load(metrics_file)

wildguard_total = wildguard_results["safe"] + wildguard_results["unsafe"]
if wildguard_total == 0:
    # Fail with an explanation rather than a bare ZeroDivisionError.
    raise ValueError(
        f"No judged samples in {WILDGUARD_RESULTS_DIR}/metrics.json; did the evaluation finish?"
    )
wildguard_safe_score = wildguard_results["safe"] / wildguard_total
print(f"WildGuard: {wildguard_safe_score}")

WildGuard: 0.8964514252472368


## Product Security - Garak

In [None]:
# Garak probe names ("<module>.<ProbeClass>") grouped into two tiers.
# Both lists are concatenated for the full Garak evaluation.
TIER1_PROBES = [
    "ansiescape.AnsiRaw",
    "dan.DanInTheWild",
    "encoding.InjectBase64",
    "encoding.InjectUU",
    "exploitation.JinjaTemplatePythonInjection",
    "exploitation.SQLInjectionEcho",
    "goodside.Tag",
    "grandma.Slurs",
    "grandma.Substances",
    "latentinjection.LatentInjectionFactSnippetEiffel",
    "latentinjection.LatentInjectionFactSnippetLegal",
    "latentinjection.LatentInjectionReport",
    "latentinjection.LatentInjectionResume",
    "latentinjection.LatentInjectionTranslationEnFr",
    "latentinjection.LatentInjectionTranslationEnZh",
    "latentinjection.LatentJailbreak",
    "latentinjection.LatentWhois",
    "latentinjection.LatentWhoisSnippet",
    "leakreplay.GuardianComplete",
    "leakreplay.LiteratureComplete",
    "leakreplay.NYTComplete",
    "leakreplay.PotterComplete",
    "packagehallucination.JavaScript",
    "packagehallucination.Python",
    "packagehallucination.Ruby",
    "packagehallucination.Rust",
    "realtoxicityprompts.RTPBlank",
    "xss.ColabAIDataLeakage",
    "xss.MarkdownImageExfil",
    "xss.MdExfil20230929",
    "xss.StringAssemblyDataExfil"
]

# NOTE(review): this list had no closing bracket in the notebook (a syntax
# error); it is closed here after the last visible entry — confirm that no
# entries were lost.
TIER2_PROBES = [
    "atkgen.Tox",
    "continuation.ContinueSlursReclaimedSlurs",
    "dan.Ablation_Dan_11_0",
    "dan.AutoDANCached",
    "divergence.Repeat",
    "encoding.InjectAscii85",
    "encoding.InjectBase16",
    "encoding.InjectBase2048",
    "encoding.InjectBase32",
    "encoding.InjectBraille",
    "encoding.InjectEcoji",
    "encoding.InjectHex",
    "encoding.InjectMorse",
    "encoding.InjectNato",
    "encoding.InjectROT13",
    "encoding.InjectZalgo",
    "goodside.ThreatenJSON",
    "goodside.WhoIsRiley",
    "grandma.Win10",
    "grandma.Win11",
    "leakreplay.GuardianCloze",
    "leakreplay.LiteratureCloze",
    "leakreplay.NYTCloze",
    "leakreplay.PotterCloze",
    "lmrc.Bullying",
    "lmrc.Deadnaming",
    "lmrc.QuackMedicine",
    "lmrc.SexualContent",
    "lmrc.Sexualisation",
    "lmrc.SlurUsage",
    "malwaregen.Evasion",
    "malwaregen.Payload",
    "malwaregen.SubFunctions",
    "malwaregen.TopLevel",
    "misleading.FalseAssertion",
    "phrasing.FutureTense",
    "phrasing.PastTense",
    "promptinject.HijackHateHumans",
    "promptinject.HijackKillHumans",
    "promptinject.HijackLongPrompt",
    "snowball.GraphConnectivity",
    "suffix.GCGCached",
    "tap.TAPCached",
    "topic.WordnetControversial",
]

In [32]:
def run_garak_probes(target_probes: List[str],
                     report_dir: str,
                     conf_dir: str,
                     log_dir: str,
                     max_workers: int = 4) -> List[str]:
    """Run Garak probes in parallel using ThreadPoolExecutor.

    Each probe is handed to ``run_single_probe`` on a worker thread. A probe
    that raises does not stop the others; its error is printed and its name is
    collected.

    Args:
        target_probes: Probe names to run.
        report_dir: Base directory passed to ``run_single_probe`` for reports.
        conf_dir: Directory holding one ``<probe>.yaml`` config per probe.
        log_dir: Directory receiving one ``<probe>.log`` per probe.
        max_workers: Maximum number of probes running at the same time.

    Returns:
        Names of the probes that raised, in the order they finished. Empty
        when every probe completed.
    """
    failed_probes: List[str] = []

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_probe = {
            executor.submit(run_single_probe, probe, report_dir, conf_dir, log_dir): probe
            for probe in target_probes
        }

        # Sized from the submitted futures so the bar total always matches
        # what was actually scheduled.
        for future in tqdm(as_completed(future_to_probe), total=len(future_to_probe), desc="Running probes"):
            probe = future_to_probe[future]
            try:
                future.result()  # re-raises any exception from the worker
            except Exception as e:
                failed_probes.append(probe)
                print(f"Error running probe {probe}: {e}")
            else:
                print(f"Completed probe: {probe}")

    return failed_probes
                
def run_single_probe(probe: str,
                     report_dir: str,
                     conf_dir:str,
                     log_dir:str):
    """Run a single Garak probe as a subprocess.

    Runs ``garak --config <conf_dir>/<probe>.yaml`` and writes the combined
    stdout/stderr of the process to ``<log_dir>/<probe>.log``.

    Args:
        probe: Probe name; also used for the config, log and report names.
        report_dir: Base directory; ``<report_dir>/<probe>`` is created.
        conf_dir: Directory containing ``<probe>.yaml``.
        log_dir: Directory for ``<probe>.log``; created if missing.

    Raises:
        subprocess.CalledProcessError: If garak exits with a non-zero status.
    """
    report_path = os.path.join(report_dir, probe)
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(report_path, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    conf_path = os.path.join(conf_dir, f"{probe}.yaml")
    log_path = os.path.join(log_dir, f"{probe}.log")

    # Per-probe data directory for the child process.
    # NOTE(review): presumably this makes garak write its reports under
    # report_path — confirm against garak's handling of XDG_DATA_HOME.
    env = os.environ.copy()
    env["XDG_DATA_HOME"] = report_path
    env["XDG_DATA"] = report_path

    garak_cmd = ["garak", "--config", conf_path]

    with open(log_path, 'w') as log_file:
        subprocess.run(
            garak_cmd,
            env=env,
            stdout=log_file,
            stderr=subprocess.STDOUT,  # interleave stderr into the same log
            check=True
        )

In [33]:
# Generate one Garak config per target probe from the base config, then run
# the probes. (`yaml` and `copy` are imported in the notebook's import cell.)
NUM_GENERATIONS = 1
TARGET_PROBES = ["dan.DanInTheWild", "grandma.Slurs", "grandma.Substances"]
# * Uncomment the line below if you'd like to run the full Garak evaluation
# TARGET_PROBES = TIER1_PROBES + TIER2_PROBES

# Create config files based on the base config
with open("garak_base_config.yaml") as fin:
    garak_base_config = yaml.safe_load(fin)

for target_probe in TARGET_PROBES:
    # Deep copy so per-probe overrides never leak into the shared base config.
    garak_config = copy.deepcopy(garak_base_config)
    # setdefault tolerates a base config without a "run" or "plugins" section.
    garak_config.setdefault("run", {})["generations"] = NUM_GENERATIONS
    garak_config.setdefault("plugins", {})["probe_spec"] = target_probe
    new_config_filepath = os.path.join(GARAK_CONFIG_DIR, f"{target_probe}.yaml")
    with open(new_config_filepath, "w") as fout:
        yaml.dump(garak_config, fout, sort_keys=False)

# Run Garak evaluation
run_garak_probes(
    target_probes=TARGET_PROBES,
    report_dir=GARAK_REPORT_DIR,
    conf_dir=GARAK_CONFIG_DIR,
    log_dir=GARAK_LOG_DIR,
    max_workers=4,
)

Running probes:  33%|█████████████████████████████████████████████████████▎                                                                                                          | 1/3 [00:09<00:19,  9.87s/it]

Completed probe: grandma.Slurs


Running probes:  67%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                     | 2/3 [00:12<00:05,  5.39s/it]

Completed probe: grandma.Substances


Running probes: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:26<00:00,  8.86s/it]

Completed probe: dan.DanInTheWild





### TODO: Aggregate the Garak results

## Accuracy - GPQA-D, MATH-500, AIME2024, IFEval

### GPQA-D

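GPQA Diamond (GPQA-D) is a benchmark of graduate-level, multiple-choice science questions. TODO: expand this description.

This evaluation takes about 30 minutes.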

In [16]:
# Run the GPQA Diamond accuracy evaluation against the local vLLM server.
# stdout and stderr are redirected with the POSIX form `> file 2>&1`; the
# bash-only `&>` is parsed by plain `sh` as "run in background, then truncate
# the file", which leaves the log empty.
!simple_evals --model 'test-model' \
              --url http://localhost:5000/v1/chat/completions \
              --eval_name gpqa_diamond \
              --temperature 0.6 \
              --top_p 0.95 \
              --max_tokens 8192 \
              --num_threads 4 \
              --max_retries 5 \
              --timeout 150 \
              --out_dir {GPQA_DIAMOND_RESULTS_DIR} \
              --cache_dir {GPQA_DIAMOND_RESULTS_DIR} > "$LOG_DIR/simple-evals-gpqa_diamond.log" 2>&1

In [11]:
# subprocess.run(
#     [
#         "simple_evals",
#         "--model", 'test-model',
#         "--url", "http://localhost:5000/v1/chat/completions",
#         "--eval_name", "gpqa_diamond",
#         "--temperature", "0.6",
#         "--top_p", "0.95",
#         "--max_tokens", "8192",
#         "--out_dir", f"results/baseline-evals/gpqa-diamond",
#         "--cache_dir", f"results/baseline-evals/gpqa-diamond",
#         "--num_threads", "4",
#         "--max_retries", "5",
#         "--timeout", "150"
#     ],
#     stdout=open(f"{os.getenv('LOG_DIR')}/baseline-eval-gpqa-diamond.log", "w"),
#     stderr=subprocess.STDOUT,
#     start_new_session=True)

CompletedProcess(args=['simple_evals', '--model', 'test-model', '--url', 'http://localhost:5000/v1/chat/completions', '--eval_name', 'gpqa_diamond', '--temperature', '0.6', '--top_p', '0.95', '--max_tokens', '8192', '--out_dir', 'results/baseline-evals/gpqa-diamond', '--cache_dir', 'results/baseline-evals/gpqa-diamond', '--num_threads', '4', '--max_retries', '5', '--timeout', '150'], returncode=0)

### Math-500

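MATH-500 is a 500-problem subset of the MATH benchmark of competition-style mathematics problems. TODO: expand this description.

This evaluation takes about 20 minutes.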

In [17]:
# Run the MATH-500 accuracy evaluation against the local vLLM server.
# stdout and stderr are redirected with the POSIX form `> file 2>&1`; the
# bash-only `&>` is parsed by plain `sh` as "run in background, then truncate
# the file", which leaves the log empty.
!simple_evals --model 'test-model' \
              --url http://localhost:5000/v1/chat/completions \
              --eval_name AA_math_test_500 \
              --temperature 0.6 \
              --top_p 0.95 \
              --max_tokens 8192 \
              --num_threads 4 \
              --max_retries 5 \
              --timeout 150 \
              --out_dir {AA_MATH_500_RESULTS_DIR} \
              --cache_dir {AA_MATH_500_RESULTS_DIR} > "$LOG_DIR/simple-evals-aa-math-500.log" 2>&1

In [14]:
# subprocess.run(
#     [
#         "simple_evals",
#         "--model", 'test-model',
#         "--url", "http://localhost:5000/v1/chat/completions",
#         "--eval_name", "AA_math_test_500",
#         "--temperature", "0.6",
#         "--top_p", "0.95",
#         "--max_tokens", "8192",
#         "--out_dir", f"results/baseline-evals/aa-math-500",
#         "--cache_dir", f"results/baseline-evals/aa-math-500",
#         "--num_threads", "4",
#         "--max_retries", "5",
#         "--timeout", "150"
#     ],
#     stdout=open(f"{os.getenv('LOG_DIR')}/baseline-eval-aa-math-500.log", "w"),
#     stderr=subprocess.STDOUT,
#     start_new_session=True)

CompletedProcess(args=['simple_evals', '--model', 'test-model', '--url', 'http://localhost:5000/v1/chat/completions', '--eval_name', 'AA_math_test_500', '--temperature', '0.6', '--top_p', '0.95', '--max_tokens', '8192', '--out_dir', 'results/baseline-evals/aa-math-500', '--cache_dir', 'results/baseline-evals/aa-math-500', '--num_threads', '4', '--max_retries', '5', '--timeout', '150'], returncode=1)

### IFEval

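IFEval measures how well the model follows explicit, automatically verifiable instructions in a prompt. TODO: expand this description.

Expected runtime: TBD.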

In [11]:
# Run the IFEval accuracy evaluation against the local vLLM server.
# stdout and stderr are redirected with the POSIX form `> file 2>&1`; the
# bash-only `&>` is parsed by plain `sh` as "run in background, then truncate
# the file", which leaves the log empty.
!lm-eval --model local-chat-completions \
         --tasks ifeval \
         --model_args "base_url=http://localhost:5000/v1/chat/completions,model=test-model,tokenized_requests=false,num_concurrent=4,max_gen_toks=8192,timeout=150,max_retries=5,stream=False" \
         --log_samples \
         --fewshot_as_multiturn \
         --num_fewshot 0 \
         --apply_chat_template \
         --gen_kwargs "temperature=0.6,top_p=0.95" \
         --output_path {IFEVAL_RESULTS_DIR} \
         --use_cache {IFEVAL_RESULTS_DIR} > "$LOG_DIR/lm-eval-ifeval.log" 2>&1

In [15]:
# process = subprocess.Popen(
#     [
#         "lm-eval",
#         "--tasks", "ifeval",
#         "--num_fewshot", "0",
#         "--model", "local-chat-completions",
#         "--model_args", "base_url=http://localhost:5000/v1/chat/completions,model=test-model,tokenized_requests=false,num_concurrent=4,max_gen_toks=8192,timeout=150,max_retries=5,stream=False",
#         "--log_samples",
#         "--output_path", f"results/baseline-evals/ifeval",
#         "--use_cache", f"results/baseline-evals/ifeval",
#         "--fewshot_as_multiturn",
#         "--apply_chat_template",
#         "--gen_kwargs", "temperature=0.6,top_p=0.95"
#     ],
#     stdout=open(f"{os.getenv('LOG_DIR')}/baseline-eval-ifeval.log", "w"),
#     stderr=subprocess.STDOUT,
#     start_new_session=True
# )

## Aggregate the results and Create a report card

We have now run three safety benchmarks (Aegis v2 and WildGuard for content safety, and Garak for product security) along with a set of commonly used accuracy benchmarks.
Next, we'll collect these results to understand how well the model performs on safety, covering both content safety and product security.
