In [None]:
from init_paths import (
    PROJECT_ROOT,
    DATASETS_DIR,
    ANALYSIS_DIR,
    EXPERIMENTS_DIR,
    IMAGES_DIR,
    GEN_DATA_DIR,
    LOGS_DIR,
    get_project_path
)

import pandas as pd
import sys
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import os 
import argparse


from analysis.supervised import *
from analysis.causal_interventions import *
from analysis.Probes import SimpleMLP
from gen_data.harmBench_autograder import *
from analysis.models import *



def grade(autograder, autogr_tokenizer, df, request_column, column_to_grade, output_path):
    """Rate one response column with the autograder and pickle the result.

    For every row, ``grade_with_HF`` is called with the row's request text and
    a single-element list holding the row's response; the first element of
    what it returns is stored in a new ``rating_<column_to_grade>`` column.
    ``df`` is modified in place.

    Args:
        autograder: Grader model forwarded unchanged to ``grade_with_HF``.
        autogr_tokenizer: Tokenizer forwarded unchanged to ``grade_with_HF``.
        df: DataFrame holding the requests and the responses to rate.
        request_column: Name of the column containing the request text.
        column_to_grade: Name of the column containing the response to rate.
        output_path: Destination handed to ``DataFrame.to_pickle``.

    Returns:
        The same ``df`` object, now carrying the rating column.
    """

    def _rate_row(row):
        # One response per call, so only the first rating is of interest.
        ratings = grade_with_HF(
            row[request_column],
            [row[column_to_grade]],
            autograder,
            autogr_tokenizer,
            verbose=True,
        )
        return ratings[0]

    rating_column = f"rating_{column_to_grade}"
    df[rating_column] = df.apply(_rate_row, axis=1)
    print("rated")

    # Persist the rated frame, then echo where it went.
    df.to_pickle(output_path)
    print(output_path)
    return df



def get_baseline(model, model_name, tokenizer, harmful, autograder, autogr_tokenizer, results_csv_path, single = ""):
    """Generate un-intervened responses, grade them, and record the baseline.

    The function loads the validation set selected by ``harmful``, lets
    ``model`` generate up to 200 new tokens per prompt, stores the responses in
    the ``control_no_intervention_response`` column, rates them with
    ``grade`` and appends a ``probe_type == "control"`` row to the results CSV.

    Args:
        model: Causal LM exposing ``cuda()``, ``cpu()`` and ``generate()``.
        model_name: Name used to build the per-model output directories and
            written to the ``model`` column of the results CSV.
        tokenizer: Tokenizer matching ``model``.
        harmful: Dataset selector; one of ``"harmful"``, ``"harmless"`` or
            ``"haize"``.
        autograder: Grader model, moved to the GPU only while grading.
        autogr_tokenizer: Tokenizer matching ``autograder``.
        results_csv_path: CSV file the result row is appended to (created,
            together with its parent directory, when missing).
        single: Suffix of the ``harmful_<single>`` directory that is created
            under the model's ``final`` folder.

    Returns:
        The number of rows whose rating equals ``"Yes"``. Note that this is a
        raw count, not a rate, even though it is stored under ``"asr"``.

    Raises:
        KeyError: If ``harmful`` is not one of the supported selectors.
    """
    # Function-scope import: the file has no explicit top-level import of Path.
    from pathlib import Path

    print("get baseline...")

    datasets_root = Path("/data/nathalie_maria_kirch/ERA_Fellowship/datasets")
    final_dir = datasets_root / model_name / "final"

    # Nothing below writes into this directory; it is still created so that
    # the on-disk layout matches what earlier versions of this function left.
    base_path = final_dir / f"harmful_{single}"
    base_path.mkdir(parents=True, exist_ok=True)

    validation_paths = {
        "harmful": datasets_root / "validation" / "validation_data.pkl",
        "harmless": datasets_root / "validation" / "validation_data_benign.pkl",
        "haize": datasets_root / "haize" / "jailbreak_prompts_nathalie_cas_haize_1_short.pkl",
    }
    validation_path = validation_paths[harmful]

    # Resolve the output locations up front so they exist even when the
    # dataset is empty and the periodic checkpoint below never fires.
    folder_path = final_dir / harmful
    folder_path.mkdir(parents=True, exist_ok=True)
    control_path = folder_path / "no_intervention_control.pkl"
    rated_path = folder_path / "no_intervention_control_rated.pkl"

    df = pd.read_pickle(validation_path)
    df["control_no_intervention_response"] = None

    # Run on every element of the sample
    model.cuda()
    for step, (index, row) in enumerate(df.iterrows()):
        prompt = row["jailbreak_prompt_text"]
        print("prompt: ", prompt)

        input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")
        outputs = model.generate(**input_ids, max_new_tokens=200)
        response = tokenizer.decode(outputs[0])

        # The decoded text echoes the prompt, optionally preceded by a
        # beginning-of-sequence marker; drop that echo. The checks run in
        # sequence (no early exit), each on the already-stripped text.
        for prefix in (prompt, "<|begin_of_text|>" + prompt, "<bos>" + prompt):
            if response.startswith(prefix):
                response = response[len(prefix):].strip()

        df.at[index, "control_no_intervention_response"] = response
        print(response)

        # Save periodically; counting iterations keeps this independent of
        # the DataFrame's index type and values.
        if step % 20 == 0:
            df.to_pickle(control_path)

    df.to_pickle(control_path)
    print(f"Data saved to {control_path}")
    model.cpu()

    # Swap the grader onto the GPU only after the generator has been moved off.
    autograder.cuda()
    results_df = grade(
        autograder=autograder,
        autogr_tokenizer=autogr_tokenizer,
        df=df,
        request_column="jailbreak_prompt_text",
        column_to_grade="control_no_intervention_response",
        # Follows the selected dataset so that "harmless"/"haize" runs do not
        # overwrite the rated file of the "harmful" run.
        output_path=str(rated_path),
    )
    autograder.cpu()

    # Calculate ASR (count of responses rated "Yes")
    rating_col = "rating_control_no_intervention_response"
    asr = results_df[rating_col].value_counts().get("Yes", 0)
    print(f"ASR: {asr}")

    # Create results dictionary
    new_result = {
        "asr": asr,
        "model": model_name,
        "probe_type": "control",
        "offensive_defensive": None,
        "c_strength": None,
        "lr": None,
    }

    # Append to the existing CSV, or start a new one
    results_csv_path = Path(results_csv_path)
    new_result_df = pd.DataFrame([new_result])
    if results_csv_path.exists():
        existing_df = pd.read_csv(results_csv_path)
        combined_df = pd.concat([existing_df, new_result_df], ignore_index=True)
    else:
        combined_df = new_result_df

    # Make sure a missing directory cannot discard the finished run.
    results_csv_path.parent.mkdir(parents=True, exist_ok=True)
    combined_df.to_csv(results_csv_path, index=False)
    print(f"Updated results saved to {results_csv_path}")

    return asr

def main(model_name, single= "control"):
    """Load the target model and the HarmBench grader, then run the baseline.

    Args:
        model_name: Model key handed to ``get_model_details``; the name that
            call returns is used from there on.
        single: Suffix of the results file
            ``causal_intervention_parameters_<single>.csv`` under
            ``DATASETS_DIR/<model_name>/``.

    Returns:
        The value returned by ``get_baseline`` for the ``"harmful"`` dataset.
    """
    # Only the resolved name and the Hugging Face identifier are used here;
    # the remaining details are not needed for the baseline run.
    model_name, _num_layers, _rating_column, hf_ident, _input_size = get_model_details(model_name)

    ## CONFIG
    model = AutoModelForCausalLM.from_pretrained(hf_ident)
    tokenizer = AutoTokenizer.from_pretrained(hf_ident)

    autogr_name = "cais/HarmBench-Llama-2-13b-cls"
    autograder = AutoModelForCausalLM.from_pretrained(autogr_name)
    autogr_tokenizer = AutoTokenizer.from_pretrained(
        autogr_name,
        use_fast=False,
        truncation_side="left",
        padding_side="left",
    )

    results_csv_path = DATASETS_DIR / f"{model_name}/causal_intervention_parameters_{single}.csv"

    return get_baseline(
        model=model,
        model_name=model_name,
        tokenizer=tokenizer,
        harmful="harmful",
        autograder=autograder,
        autogr_tokenizer=autogr_tokenizer,
        results_csv_path=results_csv_path,
    )
      


In [None]:
# Start the baseline run only when this file is executed directly (script or
# notebook cell), not when its functions are imported from elsewhere.
if __name__ == "__main__":
    main("Llama-3.2-3B-Instruct")