In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch
teacher_harmful_id = "Orenguteng/Llama-3-8B-Lexi-Uncensored"
teacher_harmful_tokenizer = AutoTokenizer.from_pretrained(teacher_harmful_id)
teacher_harmful_model = AutoModelForCausalLM.from_pretrained(
    teacher_harmful_id,
    torch_dtype=torch.bfloat16,
    device_map = 0
)

2025-08-23 00:16:03.289358: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-23 00:16:03.300479: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755888363.313011 4135678 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755888363.316532 4135678 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1755888363.326549 4135678 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [19]:

# Benign Teacher
teacher_benign_id = "meta-llama/Meta-Llama-3-8B-Instruct"
teacher_benign_tokenizer = AutoTokenizer.from_pretrained(teacher_benign_id)
teacher_benign_model = AutoModelForCausalLM.from_pretrained(
    teacher_benign_id,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)





Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [2]:

models_to_evaluate = { 
    "teacher_harmful": (teacher_harmful_model, teacher_harmful_tokenizer),
    # "teacher_benign": (teacher_benign_model, teacher_benign_tokenizer),
}


In [3]:

# --- 2. Load your dataset ---
# Assuming your dataset is a CSV with 'prompt' and 'harm_level' columns


import datasets
from datasets import load_dataset , load_from_disk
import os

datasets_list = os.listdir("adv-datasets")
dataset_path = "adv-datasets"
adv_datasets = {dataset:load_from_disk(str(dataset_path)+"/"+str(dataset)) for dataset in datasets_list}




In [11]:
levels = {
    # 0 :"lvl1-veryweak",
    # 1 :"lvl2-weak",
    2 :"lvl3-moderate",
    3 :"lvl4-strong",
    4 :"lvl5-verystrong",
}
print(levels)

{2: 'lvl3-moderate', 3: 'lvl4-strong', 4: 'lvl5-verystrong'}


In [5]:
prompts = adv_datasets[levels[1]]["train"]["prompt"]
len(prompts)

prompt = prompts[0]


if isinstance(prompt , list):
    messages = prompt
    print("multi-turn conversation")
else:
    messages = [{"role": "user", "content": prompt}]
    print("single-turn conversation")

single-turn conversation


In [6]:




# --- 3. Generate responses for each model ---
terminators = [
    teacher_harmful_tokenizer.eos_token_id,
    teacher_harmful_tokenizer.convert_tokens_to_ids("<|eot_id|>")
] # Use terminators for Llama 3




In [12]:
import torch
import pandas as pd
from tqdm import tqdm
import os

# --- PRE-LOOP SETUP ---
BATCH_SIZE = 4 # Adjust based on your GPU memory (VRAM). Start with 8 or 16.

# Set pad token and put all models in eval mode once
for name, (model, tokenizer) in models_to_evaluate.items():
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token = tokenizer.eos_token # Use eos_token as pad_token
    tokenizer.padding_side = 'left' # CRITICAL for batch generation with decoder-only models
    model.eval()

terminators = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]

# --- MAIN GENERATION LOOP (RESTRUCTURED FOR BATCHING) ---
# Use the inference_mode context manager for efficiency
with torch.inference_mode():
    for level, level_name in levels.items():
        print(f"Adversary level {level_name}/{len(levels)}...")
        prompts = adv_datasets[level_name]["train"]["prompt"]
        
        # This will store the final results for all prompts in this level
        level_results_df = pd.DataFrame(prompts, columns=['prompt'])

        # Process one model at a time, but all prompts in batches for that model
        for name, (model, tokenizer) in models_to_evaluate.items():
            print(f"  Generating for model: {name}...")
            model_responses = []
            
            # The new batching loop
            for i in tqdm(range(0, len(prompts), BATCH_SIZE), desc=f"  Batches for {name}"):
                batch_prompts = prompts[i : i + BATCH_SIZE]

                # Prepare the batch of messages
                batch_messages = []
                for prompt in batch_prompts:
                    if isinstance(prompt, list):
                        batch_messages.append(prompt)
                    else:
                        batch_messages.append([{"role": "user", "content": prompt}])

                # Tokenize the entire batch at once
                input_ids = tokenizer.apply_chat_template(
                    batch_messages,
                    add_generation_prompt=True,
                    return_tensors="pt",
                    padding=True
                ).to(model.device)
        
                attention_mask = torch.ones_like(input_ids)
                
                # The terminators list for eos_token_id is good practice
                outputs = model.generate(
                    input_ids,
                    attention_mask=attention_mask,  # 2. Pass the attention_mask to the generate function
                    max_new_tokens=256,
                    eos_token_id=terminators,
                    do_sample=True,
                    temperature=0.6,
                    top_p=0.9,
                    # 3. Explicitly pass the pad_token_id to suppress warnings
                    pad_token_id=tokenizer.pad_token_id 
                )

                
                # Decode all responses in the batch
                # We need to slice off the input tokens from each response
                decoded_responses = [
                    tokenizer.decode(output[len(input_ids[j]):], skip_special_tokens=True)
                    for j, output in enumerate(outputs)
                ]
                model_responses.extend(decoded_responses)
            
            # Add all responses for the current model as a new column
            level_results_df[f'response_{name}'] = model_responses

        # Save the results for the current level
        os.makedirs("harmful_generations", exist_ok=True)
        level_results_df.to_csv(f"harmful_generations/generation_results_{level_name}.csv", index=False)

Adversary level lvl3-moderate/3...
  Generating for model: teacher_harmful...


  Batches for teacher_harmful: 100%|██████████| 2509/2509 [4:58:42<00:00,  7.14s/it]  


Adversary level lvl4-strong/3...


ValueError: 1 columns passed, passed data had 242 columns

In [None]:
import pandas as pd
import torch
from transformers import pipeline
import glob
import os
from tqdm import tqdm

# --- 1. SETUP ---

# --- Configure this section ---
# Directory where your CSV files are located
CSV_DIRECTORY = "harmful_generations/"
# Pattern to find the files
FILE_PATTERN = "generation_results_*.csv"
# --- End of configuration ---

# Check for GPU availability
device = 0 
print(f"Using device: {'GPU' if device == 0 else 'CPU'}")

# Load the toxicity classifier model once for efficiency
print("Loading toxicity classifier model...")
try:
    toxicity_classifier = pipeline(
        "text-classification",
        model="unitary/toxic-bert", # A reliable and standard toxicity model
        device=device
    )
    print("Model loaded successfully.")
except Exception as e:
    print(f"Error loading model: {e}")
    print("Please ensure you have an internet connection and the `transformers` library is installed.")
    exit()

# Helper function to get toxicity score safely
def get_toxicity_score(text):
    """Calculates toxicity score for a given text, handling potential errors."""
    # Ensure input is a string and not empty
    if not isinstance(text, str) or not text.strip():
        return 0.0  # Return a neutral score for invalid input
    try:
        # The classifier returns a list of dictionaries. We truncate to handle long text.
        result = toxicity_classifier(text, truncation=True, max_length=512)
        # The score corresponds to the 'toxic' label.
        # If the label is 'toxic', the score is its confidence.
        # If the label is 'nontoxic', the score is (1 - its confidence).
        score = result[0]['score'] if result[0]['label'].lower() == 'toxic' else 1 - result[0]['score']
        return score
    except Exception as e:
        # print(f"Could not process text: {text[:80]}... | Error: {e}")
        return None # Return None for errors to be handled later

# --- 2. PROCESS EACH CSV FILE ---

# Find all the result files using the specified pattern
csv_files = glob.glob(os.path.join(CSV_DIRECTORY, FILE_PATTERN))

if not csv_files:
    print(f"Error: No files found matching the pattern '{FILE_PATTERN}' in the directory '{CSV_DIRECTORY}'.")
    exit()

print(f"\nFound {len(csv_files)} files to process.")

for filepath in csv_files:
    print(f"\n--- Processing file: {filepath} ---")
    try:
        df = pd.read_csv(filepath)
        
        # Dynamically find all 'response' columns to score
        response_columns = [col for col in df.columns if col.startswith('response_')]
        
        if not response_columns:
            print(f"Warning: No 'response_*' columns found in {filepath}. Skipping.")
            continue
            
        for res_col in response_columns:
            # Create a new column name for the toxicity scores
            toxicity_col_name = res_col.replace('response_', 'toxicity_')
            print(f"Calculating toxicity for column: '{res_col}'...")
            
            # Use pandas .apply() with tqdm for a progress bar
            tqdm.pandas(desc=f"Scoring {res_col}")
            df[toxicity_col_name] = df[res_col].progress_apply(get_toxicity_score)

        # Save the updated DataFrame back to the same file
        df.to_csv(filepath, index=False)
        print(f"✅ Successfully updated {filepath} with toxicity scores.")
        
    except Exception as e:
        print(f"Error processing file {filepath}: {e}")

# --- 3. FINAL ANALYSIS ---

print("\n--- All files processed. Performing final analysis... ---")

# Read all the updated CSVs into a single master DataFrame
all_dataframes = [pd.read_csv(file) for file in csv_files]
master_df = pd.concat(all_dataframes, ignore_index=True)

# Find all the toxicity score columns
toxicity_columns = [col for col in master_df.columns if col.startswith('toxicity_')]

if toxicity_columns:
    print("\n--- Overall Average Toxicity Scores ---")
    # Calculate the mean of each toxicity column
    overall_averages = master_df[toxicity_columns].mean()
    print(overall_averages.to_string())

    # Optional: Calculate average toxicity grouped by 'level' if that column exists
    # This assumes you have a column identifying the 'level_name' in your CSVs.
    # If not, you can add it when you generate the files.
    if 'harm_level' in master_df.columns:
        print("\n--- Average Toxicity Scores Per Harm Level ---")
        per_level_averages = master_df.groupby('harm_level')[toxicity_columns].mean()
        print(per_level_averages.to_string())
else:
    print("No toxicity score columns found to analyze.")

print("\nAnalysis complete.")