# Step 1 Evaluation

In [2]:
import concurrent.futures
import json
import os
import subprocess
import time
from pathlib import Path
import shutil

import openai
from tqdm import tqdm

In [3]:
# 7. Run Baseline Evaluation - Run Evaluations in Background
BASE_DIR = "/workspace/"
LOG_DIR = "/workspace/logs/"
DATASET_DIR = "/workspace/dataset/"
MODEL_NAME_OR_PATH = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
MODEL_TAG_NAME = MODEL_NAME_OR_PATH.split("/")[-1]

# * Dataset
AEGIS_V2_TEST_DIR = f"{DATASET_DIR}/aegis_v2"

# * Content Safety benchmark
CONTENT_SAFETY_RESULTS_DIR = f"{BASE_DIR}/results/{MODEL_TAG_NAME}/content-safety-evals"
AEGIS_V2_RESULTS_DIR = f"{CONTENT_SAFETY_RESULTS_DIR}/aegis_v2"
WILDGUARD_RESULTS_DIR = f"{CONTENT_SAFETY_RESULTS_DIR}/wildguard"

# * Security benchmark
SECURITY_RESULTS_DIR = f"{BASE_DIR}/results/{MODEL_TAG_NAME}/security-evals"
GARAK_RESULTS_DIR = f"{SECURITY_RESULTS_DIR}/garak"

# * Accuracy benchmark
ACCURACY_RESULTS_DIR = f"{BASE_DIR}/results/{MODEL_TAG_NAME}/accuracy-evals"
GPQA_DIAMOND_RESULTS_DIR = f"{ACCURACY_RESULTS_DIR}/gpqa-diamond"
AA_MATH_500_RESULTS_DIR = f"{ACCURACY_RESULTS_DIR}/aa-math-500"
IFEVAL_RESULTS_DIR = f"{ACCURACY_RESULTS_DIR}/ifeval"

In [None]:
%%bash

mkdir -p AEGIS_V2_TEST_DIR

mkdir -p AEGIS_V2_RESULTS_DIR
mkdir -p WILDGUARD_RESULTS_DIR
mkdir -p GARAK_RESULTS_DIR
mkdir -p GPQA_DIAMOND_RESULTS_DIR
mkdir -p AA_MATH_500_RESULTS_DIR
mkdir -p IFEVAL_RESULTS_DIR

In [5]:
# Credentials
os.environ.update({
    "MY_API_KEY":"empty",
    "JUDGE_API_KEY": "nvapi-XXXXXXXXXXXXXX",
    "HF_TOKEN":"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"
})

os.environ.update({
    'BASE_DIR': f"{BASE_DIR}",
    'TMPDIR': f"{BASE_DIR}/tmp",
    'XDG_CACHE_HOME': f"{BASE_DIR}/cache",
    'HF_HOME': f"{BASE_DIR}/cache/huggingface",
    'UV_CACHE_DIR': f"{BASE_DIR}/cache/uv",
    'TRITON_CACHE_DIR': f"{BASE_DIR}/cache/triton",
    'DATASET_CACHE_DIR': f"{BASE_DIR}/dataset_cache",
    'RAY_TMPDIR': "/tmp/ray",
    'LOG_DIR': f"{LOG_DIR}"
})

## Content Safety - Aegis v2

Launch a vLLM server for the model

In [10]:
# VLLM Host 
os.environ.update({
    'VLLM_ENGINE_ITERATION_TIMEOUT_S': '36000',
    'VLLM_ALLOW_LONG_MAX_MODEL_LEN': '1',
    'VLLM_HOST': '0.0.0.0',
    'VLLM_TENSOR_PARALLEL_SIZE': '1',
    'POLICY_MODEL_GPUS': '0,1,2,3',
    'SAFETY_MODEL_GPUS': '4,5'
})

print("Starting policy model server...")
policy_server = subprocess.Popen([
    'python3', '-m', 'vllm.entrypoints.openai.api_server',
    '--model', MODEL_NAME_OR_PATH,
    '--trust-remote-code',
    '--seed', '1',
    '--host', os.environ['VLLM_HOST'],
    '--port', '5000',
    '--served-model-name', 'test-model',
    '--enable-reasoning', 
    '--reasoning-parser', 'qwen3',
    '--tensor-parallel-size', os.environ['VLLM_TENSOR_PARALLEL_SIZE'],
    '--download-dir', os.environ['HF_HOME']
], env={**os.environ, 'CUDA_VISIBLE_DEVICES': os.environ['POLICY_MODEL_GPUS']},
   stdout=open(f"{LOG_DIR}/vllm-server-model.log", 'w'),
   stderr=subprocess.STDOUT)

Starting policy model server...


In [9]:
subprocess.run(['pkill', '-f', 'vllm.entrypoints.openai.api_server'])

CompletedProcess(args=['pkill', '-f', 'vllm.entrypoints.openai.api_server'], returncode=0)

In [None]:
%%bash

# Download the file
huggingface-cli download nvidia/Aegis-AI-Content-Safety-Dataset-2.0 test.json --repo-type dataset
mv test.json ${AEGIS_V2_TEST_DIR}/

In [None]:
class AegisV2Evaluator:
    def __init__(self, model_name,
                 port: int = 5000,
                 ):
        self.client = openai.OpenAI(
            base_url=f"http://localhost:{port}/v1",
            api_key="EMPTY"
        )
        self.model_name = model_name
        self.port = port
        self.output_key = output_key

    def query_server(self, client, messages, model_name, temperature, top_p, max_tokens, number_of_generations=1):
        response = client.chat.completions.create(
            model=model_name,
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            stream=False,
            n=number_of_generations
        )
        if number_of_generations == 1:
            return response.choices[0].message.content
        else:
            return [choice.message.content for choice in response.choices]

    def query_single(self, data, idx, temperature=0.6, top_p=0.95, tokens_to_generate=8192, number_of_generations=1):
        """Function to wrap openai client code for a single data item"""
        messages = [{"role": "user", "content": data['prompt']}]
        try:
            result = self.query_server(self.client, messages, self.model_name,
                                       temperature,
                                       top_p,
                                       tokens_to_generate,
                                       number_of_generations)
            return result, idx
        except Exception as e:
            return f"Error: {e}"

    def query_batch(self,
                    data_list,
                    num_workers: int = 256,
                    temperature: float = 0.6,
                    top_p: float = 0.95,
                    tokens_to_generate: int = 8192,
                    number_of_generations: int = 1):
        """Add progress bar for query_batch"""
        results = [None for _ in range(len(data_list))]
        with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
            futures = [executor.submit(self.query_single, data, idx, temperature, top_p, tokens_to_generate, number_of_generations) for idx, data in enumerate(data_list)]
            for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Querying"):
                pred, idx = future.result()
                results[idx] = pred
        return results
    
    def run_evaluation(self,
                       num_workers: int = 256,
                       temperature: float = 0.6,
                       top_p: float = 0.95,
                       tokens_to_generate: int = 8192,
                       number_of_generations: int = 1,
                       test_filepath: str = None,
                       pred_filepath: str = None,
                       output_key: str = "generated_response"):
        """Run evaluation for Aegis v2"""
        test_examples = [json.loads(line) for line in open(test_filepath)]
        predictions = self.query_batch(test_examples,
                                       num_workers=num_workers,
                                       temperature=temperature,
                                       top_p=top_p,
                                       tokens_to_generate=tokens_to_generate,
                                       number_of_generations=number_of_generations)
        results = []
        for test, pred in zip(test_examples, predictions):
            if isinstance(pred, list):
                for p in pred:
                    results.append({output_key: p, **test})
            else:
                test[output_key] = pred
                results.append(test)

        with open(pred_filepath, 'w') as f:
            for d in results:
                f.write(json.dumps(d) + '\n')

```
# * Aegis v2
$ safety-eval  --model-name  "deepseek-ai/deepseek-r1-distill-llama-8b" \
               --model-url http://localhost:8000/v1 \
               --judge-url  https://b319a99a-5241-4459-b641-7219ad0fd86d.invocation.api.nvcf.nvidia.com/v1  \
               --results-dir results/aegis \
               --concurrency 64  \
               --eval aegis_v2 \
               --inference_params temperature=0.6,top_p=0.95,max_completion_tokens=12000 --limit 1
```

In [None]:
aegis_v2_evaluator = AegisV2Evaluator(model_name="test-model", port=5000)
aegis_v2_evaluator.run_evaluation(test_filepath=f"{DATASET_DIR}/aegis_v2/test.jsonl",
                                  pred_filepath=f"{AEGIS_V2_RESULTS_DIR}/pred.jsonl")

## Product Security - Garak

## Accuracy - GPQA-D, MATH-500, AIME2024, IFEval

In [11]:
subprocess.run(
    [
        "simple_evals",
        "--model", 'test-model',
        "--url", "http://localhost:5000/v1/chat/completions",
        "--eval_name", "gpqa_diamond",
        "--temperature", "0.6",
        "--top_p", "0.95",
        "--max_tokens", "8192",
        "--out_dir", f"results/baseline-evals/gpqa-diamond",
        "--cache_dir", f"results/baseline-evals/gpqa-diamond",
        "--num_threads", "4",
        "--max_retries", "5",
        "--timeout", "150"
    ],
    stdout=open(f"{os.getenv('LOG_DIR')}/baseline-eval-gpqa-diamond.log", "w"),
    stderr=subprocess.STDOUT,
    start_new_session=True)

CompletedProcess(args=['simple_evals', '--model', 'test-model', '--url', 'http://localhost:5000/v1/chat/completions', '--eval_name', 'gpqa_diamond', '--temperature', '0.6', '--top_p', '0.95', '--max_tokens', '8192', '--out_dir', 'results/baseline-evals/gpqa-diamond', '--cache_dir', 'results/baseline-evals/gpqa-diamond', '--num_threads', '4', '--max_retries', '5', '--timeout', '150'], returncode=0)

In [14]:
subprocess.run(
    [
        "simple_evals",
        "--model", 'test-model',
        "--url", "http://localhost:5000/v1/chat/completions",
        "--eval_name", "AA_math_test_500",
        "--temperature", "0.6",
        "--top_p", "0.95",
        "--max_tokens", "8192",
        "--out_dir", f"results/baseline-evals/aa-math-500",
        "--cache_dir", f"results/baseline-evals/aa-math-500",
        "--num_threads", "4",
        "--max_retries", "5",
        "--timeout", "150"
    ],
    stdout=open(f"{os.getenv('LOG_DIR')}/baseline-eval-aa-math-500.log", "w"),
    stderr=subprocess.STDOUT,
    start_new_session=True)

CompletedProcess(args=['simple_evals', '--model', 'test-model', '--url', 'http://localhost:5000/v1/chat/completions', '--eval_name', 'AA_math_test_500', '--temperature', '0.6', '--top_p', '0.95', '--max_tokens', '8192', '--out_dir', 'results/baseline-evals/aa-math-500', '--cache_dir', 'results/baseline-evals/aa-math-500', '--num_threads', '4', '--max_retries', '5', '--timeout', '150'], returncode=1)

In [15]:
process = subprocess.Popen(
    [
        "lm-eval",
        "--tasks", "ifeval",
        "--num_fewshot", "0",
        "--model", "local-chat-completions",
        "--model_args", "base_url=http://localhost:5000/v1/chat/completions,model=test-model,tokenized_requests=false,num_concurrent=4,max_gen_toks=8192,timeout=150,max_retries=5,stream=False",
        "--log_samples",
        "--output_path", f"results/baseline-evals/ifeval",
        "--use_cache", f"results/baseline-evals/ifeval",
        "--fewshot_as_multiturn",
        "--apply_chat_template",
        "--gen_kwargs", "temperature=0.6,top_p=0.95"
    ],
    stdout=open(f"{os.getenv('LOG_DIR')}/baseline-eval-ifeval.log", "w"),
    stderr=subprocess.STDOUT,
    start_new_session=True
)