# Notebook 3: Safety and Accuracy Evaluation for the Safety Post-trained Model

In this notebook, we will rerun the same set of evaluations for the model and see how the safety scores on **Content Safety** and **Product Security** have improved. 

TODO: Will be based on Notebook1

import os
import subprocess
import time
from pathlib import Path
import shutil

In [None]:
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor, as_completed
import copy
import json
import os
import signal
import subprocess
import sys
import time
from typing import List
from pathlib import Path
import shutil
import yaml

import openai
import pandas as pd
pd.set_option('display.max_rows', 20)     # Show all rows
pd.set_option('display.max_colwidth', 5) # Show all columns
from peft import PeftModel
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm

In [3]:
# * Original model directories
BASE_DIR = "./workspace/"
DATASET_DIR = f"{BASE_DIR}/dataset/"
MODEL_NAME_OR_PATH = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
MODEL_TAG_NAME = MODEL_NAME_OR_PATH.split("/")[-1]
MODEL_OUTPUT_DIR = f"{BASE_DIR}/results/{MODEL_TAG_NAME}/"
LOG_DIR = f"{MODEL_OUTPUT_DIR}/logs/"

# * Dataset
NEMOGURD_MODEL_PATH = f"{BASE_DIR}/model/llama-3.1-nemoguard-8b-content-safety"
AEGIS_V2_TEST_DIR = f"{DATASET_DIR}/aegis_v2"

# * Content Safety benchmark
CONTENT_SAFETY_RESULTS_DIR = f"{MODEL_OUTPUT_DIR}/content-safety-evals"
AEGIS_V2_RESULTS_DIR = f"{CONTENT_SAFETY_RESULTS_DIR}/aegis_v2"
WILDGUARD_RESULTS_DIR = f"{CONTENT_SAFETY_RESULTS_DIR}/wildguard"

# * Security benchmark
SECURITY_RESULTS_DIR = f"{MODEL_OUTPUT_DIR}/security-evals"
GARAK_RESULTS_DIR = f"{SECURITY_RESULTS_DIR}/garak"
GARAK_CONFIG_DIR = f"{GARAK_RESULTS_DIR}/configs"
GARAK_LOG_DIR = f"{GARAK_RESULTS_DIR}/logs"
GARAK_REPORT_DIR = f"{GARAK_RESULTS_DIR}/reports"

# * Accuracy benchmark
ACCURACY_RESULTS_DIR = f"{MODEL_OUTPUT_DIR}/accuracy-evals"
GPQA_DIAMOND_RESULTS_DIR = f"{ACCURACY_RESULTS_DIR}/gpqa-diamond"
AA_MATH_500_RESULTS_DIR = f"{ACCURACY_RESULTS_DIR}/aa-math-500"
IFEVAL_RESULTS_DIR = f"{ACCURACY_RESULTS_DIR}/ifeval"

In [None]:
# * Safety trained model directories
BASE_DIR = "./workspace/training"
DATASET_DIR = f"{BASE_DIR}/dataset/"
MODEL_NAME_OR_PATH = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
MODEL_TAG_NAME = MODEL_NAME_OR_PATH.split("/")[-1]
MODEL_OUTPUT_DIR = f"{BASE_DIR}/results/{MODEL_TAG_NAME}/"
LOG_DIR = f"{MODEL_OUTPUT_DIR}/logs/"

# * Dataset
NEMOGURD_MODEL_PATH = f"{BASE_DIR}/model/llama-3.1-nemoguard-8b-content-safety"
AEGIS_V2_TEST_DIR = f"{DATASET_DIR}/aegis_v2"

# * Content Safety benchmark
CONTENT_SAFETY_RESULTS_DIR = f"{MODEL_OUTPUT_DIR}/content-safety-evals"
AEGIS_V2_RESULTS_DIR = f"{CONTENT_SAFETY_RESULTS_DIR}/aegis_v2"
WILDGUARD_RESULTS_DIR = f"{CONTENT_SAFETY_RESULTS_DIR}/wildguard"

# * Security benchmark
SECURITY_RESULTS_DIR = f"{MODEL_OUTPUT_DIR}/security-evals"
GARAK_RESULTS_DIR = f"{SECURITY_RESULTS_DIR}/garak"
GARAK_CONFIG_DIR = f"{GARAK_RESULTS_DIR}/configs"
GARAK_LOG_DIR = f"{GARAK_RESULTS_DIR}/logs"
GARAK_REPORT_DIR = f"{GARAK_RESULTS_DIR}/reports"

# * Accuracy benchmark
ACCURACY_RESULTS_DIR = f"{MODEL_OUTPUT_DIR}/accuracy-evals"
GPQA_DIAMOND_RESULTS_DIR = f"{ACCURACY_RESULTS_DIR}/gpqa-diamond"
AA_MATH_500_RESULTS_DIR = f"{ACCURACY_RESULTS_DIR}/aa-math-500"
IFEVAL_RESULTS_DIR = f"{ACCURACY_RESULTS_DIR}/ifeval"

In [5]:
# Credentials
os.environ.update({
    "MY_API_KEY":"empty",
    "JUDGE_API_KEY": "nvapi-XXXXXXXXXXXXXX",
    "HF_TOKEN":"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"
})

os.environ.update({
    'BASE_DIR': f"{BASE_DIR}",
    'TMPDIR': f"{BASE_DIR}/tmp",
    'XDG_CACHE_HOME': f"{BASE_DIR}/cache",
    'HF_HOME': f"{BASE_DIR}/cache/huggingface",
    'UV_CACHE_DIR': f"{BASE_DIR}/cache/uv",
    'TRITON_CACHE_DIR': f"{BASE_DIR}/cache/triton",
    'DATASET_CACHE_DIR': f"{BASE_DIR}/dataset_cache",
    'RAY_TMPDIR': "/tmp/ray_ahazare",
    'LOG_DIR': f"{LOG_DIR}"
})

In [6]:
%%bash

# MAKE DIRECTORIES FOR EACH EVALUATION
mkdir -p "results/baseline-evals/gpqa-diamond"
mkdir -p "results/baseline-evals/aa-math-500"
mkdir -p "results/baseline-evals/ifeval"

In [7]:
SFT_CKPT_PATH="/lustre/fsw/portfolios/llmservice/users/ahazare/NeMo-RL/results/sft_deepseek_8b_trial_step_300/step_50/hf_ckpt"

In [10]:
# VLLM Host 
os.environ.update({
    'VLLM_ENGINE_ITERATION_TIMEOUT_S': '36000',
    'VLLM_ALLOW_LONG_MAX_MODEL_LEN': '1',
    'VLLM_HOST': '0.0.0.0',
    'VLLM_TENSOR_PARALLEL_SIZE': '1',
    'POLICY_MODEL_GPUS': '0,1,2,3',
    'SAFETY_MODEL_GPUS': '4,5'
})

print("Starting policy model server...")
policy_server = subprocess.Popen([
    'python3', '-m', 'vllm.entrypoints.openai.api_server',
    '--model', SFT_CKPT_PATH,
    '--trust-remote-code',
    '--seed', '1',
    '--host', os.environ['VLLM_HOST'],
    '--port', '5000',
    '--served-model-name', 'test-model',
    '--enable-reasoning', 
    '--reasoning-parser', 'qwen3',
    '--tensor-parallel-size', os.environ['VLLM_TENSOR_PARALLEL_SIZE'],
    '--download-dir', os.environ['HF_HOME']
], env={**os.environ, 'CUDA_VISIBLE_DEVICES': os.environ['POLICY_MODEL_GPUS']},
   stdout=open(f"{LOG_DIR}/vllm-server-model.log", 'w'),
   stderr=subprocess.STDOUT)

Starting policy model server...


In [9]:
subprocess.run(['pkill', '-f', 'vllm.entrypoints.openai.api_server'])

CompletedProcess(args=['pkill', '-f', 'vllm.entrypoints.openai.api_server'], returncode=0)

In [11]:
subprocess.run(
    [
        "simple_evals",
        "--model", 'test-model',
        "--url", "http://localhost:5000/v1/chat/completions",
        "--eval_name", "gpqa_diamond",
        "--temperature", "0.6",
        "--top_p", "0.95",
        "--max_tokens", "8192",
        "--out_dir", f"results/baseline-evals/gpqa-diamond",
        "--cache_dir", f"results/baseline-evals/gpqa-diamond",
        "--num_threads", "4",
        "--max_retries", "5",
        "--timeout", "150"
    ],
    stdout=open(f"{os.getenv('LOG_DIR')}/baseline-eval-gpqa-diamond.log", "w"),
    stderr=subprocess.STDOUT,
    start_new_session=True)

CompletedProcess(args=['simple_evals', '--model', 'test-model', '--url', 'http://localhost:5000/v1/chat/completions', '--eval_name', 'gpqa_diamond', '--temperature', '0.6', '--top_p', '0.95', '--max_tokens', '8192', '--out_dir', 'results/baseline-evals/gpqa-diamond', '--cache_dir', 'results/baseline-evals/gpqa-diamond', '--num_threads', '4', '--max_retries', '5', '--timeout', '150'], returncode=0)

In [14]:
subprocess.run(
    [
        "simple_evals",
        "--model", 'test-model',
        "--url", "http://localhost:5000/v1/chat/completions",
        "--eval_name", "AA_math_test_500",
        "--temperature", "0.6",
        "--top_p", "0.95",
        "--max_tokens", "8192",
        "--out_dir", f"results/baseline-evals/aa-math-500",
        "--cache_dir", f"results/baseline-evals/aa-math-500",
        "--num_threads", "4",
        "--max_retries", "5",
        "--timeout", "150"
    ],
    stdout=open(f"{os.getenv('LOG_DIR')}/baseline-eval-aa-math-500.log", "w"),
    stderr=subprocess.STDOUT,
    start_new_session=True)

CompletedProcess(args=['simple_evals', '--model', 'test-model', '--url', 'http://localhost:5000/v1/chat/completions', '--eval_name', 'AA_math_test_500', '--temperature', '0.6', '--top_p', '0.95', '--max_tokens', '8192', '--out_dir', 'results/baseline-evals/aa-math-500', '--cache_dir', 'results/baseline-evals/aa-math-500', '--num_threads', '4', '--max_retries', '5', '--timeout', '150'], returncode=1)

In [15]:
process = subprocess.Popen(
    [
        "lm-eval",
        "--tasks", "ifeval",
        "--num_fewshot", "0",
        "--model", "local-chat-completions",
        "--model_args", "base_url=http://localhost:5000/v1/chat/completions,model=test-model,tokenized_requests=false,num_concurrent=4,max_gen_toks=8192,timeout=150,max_retries=5,stream=False",
        "--log_samples",
        "--output_path", f"results/baseline-evals/ifeval",
        "--use_cache", f"results/baseline-evals/ifeval",
        "--fewshot_as_multiturn",
        "--apply_chat_template",
        "--gen_kwargs", "temperature=0.6,top_p=0.95"
    ],
    stdout=open(f"{os.getenv('LOG_DIR')}/baseline-eval-ifeval.log", "w"),
    stderr=subprocess.STDOUT,
    start_new_session=True
)