In [3]:
import json
import os
import concurrent.futures
import openai
from tqdm import tqdm

# Setting Inference Parameters

In [11]:
# Sampling settings forwarded with every chat-completion request.
temperature, top_p = 0.6, 0.95
tokens_to_generate = 12000  # passed to the server as max_tokens

# Model name sent with each request and the local port the client connects to.
model_name = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
port = 8000


# Launch VLLM Server

In [None]:
%%bash
# Exported as CUDA_VISIBLE_DEVICES below, so this is a device index list
# (GPU index 1), not a GPU count.
MODEL_GPUS=1
MODEL_NAME="deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
# Must match `port` used by the Python client; a %%bash cell cannot read
# Python variables, so the value is repeated here and passed explicitly.
PORT=8000
TENSOR_PARALLEL_SIZE=1
DATA_PARALLEL_SIZE=1
# DeepSeek-R1 distilled models use the `deepseek_r1` reasoning parser; the
# `qwen3` parser is meant for the Qwen3 family.
# The server runs in the background; stdout and stderr go to the log file.
CUDA_VISIBLE_DEVICES="$MODEL_GPUS" python3 -m vllm.entrypoints.openai.api_server \
  --model "$MODEL_NAME" \
  --port "$PORT" \
  --reasoning-parser "deepseek_r1" \
  --trust-remote-code \
  --seed 1 \
  --tensor-parallel-size "$TENSOR_PARALLEL_SIZE" \
  --data-parallel-size "$DATA_PARALLEL_SIZE" &> "/workspace/vllm-server-model.log" &

# Functions for Querying VLLM Server

In [12]:
# Number of generations requested per prompt (forwarded as `n`).
number_of_generations = 1

# OpenAI-compatible client aimed at the server on localhost; the API key is
# the literal string "EMPTY".
client = openai.OpenAI(api_key="EMPTY", base_url=f"http://localhost:{port}/v1")

def query_server(client, messages, model_name, temperature, top_p, max_tokens, number_of_generations=1):
    """Send one non-streaming chat-completion request and return the text.

    Args:
        client: Object exposing ``chat.completions.create(...)``.
        messages: Chat messages forwarded unchanged to the request.
        model_name: Value sent as ``model``.
        temperature: Value sent as ``temperature``.
        top_p: Value sent as ``top_p``.
        max_tokens: Value sent as ``max_tokens``.
        number_of_generations: Value sent as ``n``.

    Returns:
        The first choice's message content when ``number_of_generations`` is 1,
        otherwise a list with the message content of every returned choice.
    """
    request = dict(
        model=model_name,
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        stream=False,
        n=number_of_generations,
    )
    completion = client.chat.completions.create(**request)

    # Multiple generations come back as a list; a single one is unwrapped.
    if number_of_generations != 1:
        return [option.message.content for option in completion.choices]
    return completion.choices[0].message.content


# Function to wrap openai client code for a single data item
def query_single(data, idx):
    """Query the server for one record and tag the outcome with its index.

    Args:
        data: Mapping whose ``'prompt'`` entry becomes the single user message.
        idx: Position of the record in the batch; returned unchanged so the
            caller can place the result.

    Returns:
        A ``(result, idx)`` tuple. ``result`` is whatever ``query_server``
        returns, or an ``"Error: ..."`` string if the request raised.
    """
    messages = [{"role": "user", "content": data['prompt']}]
    try:
        result = query_server(client, messages, model_name, temperature, top_p, tokens_to_generate, number_of_generations)
    except Exception as e:
        # Keep the (result, idx) shape on failure: query_batch unpacks the
        # return value into two names, so a bare string here would raise
        # there and abort the whole batch on the first failed request.
        return f"Error: {e}", idx
    return result, idx


# Run all queries concurrently, showing a progress bar
def query_batch(data_list, num_workers=32):
    """Run ``query_single`` over every item using a thread pool.

    Args:
        data_list: Sequence of records; each is passed to ``query_single``
            together with its position.
        num_workers: Maximum number of worker threads.

    Returns:
        A list the same length as ``data_list``. Each prediction is stored at
        the index reported by ``query_single``; slots start out as ``None``.
    """
    outputs = [None] * len(data_list)
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as pool:
        pending = []
        for position, item in enumerate(data_list):
            pending.append(pool.submit(query_single, item, position))

        # Futures are consumed in completion order; the returned index puts
        # each prediction back in its input slot.
        finished = concurrent.futures.as_completed(pending)
        for task in tqdm(finished, total=len(pending), desc="Querying"):
            prediction, position = task.result()
            outputs[position] = prediction
    return outputs




In [13]:
def load_jsonl_data(data_path):
    """Load a JSON Lines file into a list of parsed objects.

    Args:
        data_path: Path to a file with one JSON document per line.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Read as UTF-8 explicitly so parsing does not depend on the platform's
    # default encoding.
    with open(data_path, 'r', encoding='utf-8') as f:
        # Skip blank/whitespace-only lines (e.g. stray empty lines at the end
        # of the file) instead of failing on them.
        return [json.loads(line) for line in f if line.strip()]




In [14]:

# Set the number of workers (adjust as needed)
num_workers = 256

input_file = '/home/zijiac/datasets/aegis_2_test.jsonl'
output_key = 'generated_response'
pred_output_file = '/home/zijiac/projects/NeMo-Safety/notebooks/aegis_v2_predictions.jsonl'
eval_output_file = '/home/zijiac/projects/NeMo-Safety/notebooks/aegis_v2_eval.jsonl'

data = load_jsonl_data(input_file)

predictions = query_batch(data, num_workers)

results = []

for t, pred in zip(data, predictions):
    if isinstance(pred, list):
        # Multiple generations: emit one output row per generation. The
        # prediction is merged last so it overrides any `output_key` already
        # present in the input record, matching the single-generation branch.
        for p in pred:
            results.append({**t, output_key: p})
    else:
        # Single generation: annotate the record in place, so `data` and
        # `results` share the same dict objects.
        t[output_key] = pred
        results.append(t)


# Write one JSON document per line.
with open(pred_output_file, 'w', encoding='utf-8') as f:
    for d in results:
        f.write(json.dumps(d) + '\n')

Querying: 100%|██████████| 1928/1928 [11:42<00:00,  2.75it/s]
