In [None]:
# Setup and Import Utilities
import pandas as pd
import numpy as np
import dotenv
import os
import openai

# Read variables (e.g. API keys) from a .env file before the key is looked up below.
dotenv.load_dotenv()
# When targeting DeepInfra / Llama 2, uncomment the next line to redirect the client:
# openai.api_base = "https://api.deepinfra.com/v1/openai"
openai.api_key = os.environ.get("OPENAI_API_KEY")  # None when the variable is unset

# Project-local helper module used by every cell below
import black_box_utils as utils

# Show full cell contents instead of truncating long text columns
pd.options.display.max_colwidth = None

# Model that the dataset is being prepared for; swap the assignment to change it
MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf"
# MODEL_NAME = "text-davinci-003"


In [None]:
# 1. Load the Dataset
# The project loader returns the CommonSense2 (binary questions) dataset object.
dataset = utils.load_dataset()

# Report the size and the file name the dataset object advertises.
n_loaded = len(dataset)
processed_filename = dataset.default_processed_filename
print(f"Loaded {n_loaded} questions from {processed_filename}")

# Last expression of the cell -> rendered as the preview table
dataset.head()


In [None]:
# 2. Check Model Capability (Can Answer?)
# Lie detection only makes sense on questions the model gets right:
# if it does not know the truth, a wrong answer is not a lie.

# How many questions to attempt; adjust to the number of valid samples wanted.
NUMBER_qs_TO_PREP = 100

# Column this notebook reads the per-question result from after the check
can_answer_col = f"{MODEL_NAME}_can_answer"

print(f"Checking if {MODEL_NAME} can answer correctly...")
check_options = dict(
    model=MODEL_NAME,
    max_questions_to_try=NUMBER_qs_TO_PREP,
    max_batch_size=10,  # kept small for local / smaller APIs
    save_progress=True,
    bypass_cost_check=True,
)
dataset.check_if_model_can_answer(**check_options)

# Summary: True entries vs. non-null entries in the result column
can_answer_results = dataset[can_answer_col]
answered_correctly = can_answer_results.sum()
attempted = can_answer_results.count()
print(f"Model answered correctly: {answered_correctly}/{attempted}")


In [None]:
# 3. Check Lie Capability & Double Down Rate
# Confirms that the configured lie instructions actually make this model lie,
# and reports the 'double down rate' (consistency in lying).

print(f"Testing lie instructions on {MODEL_NAME}...")

# Positional indices of the questions flagged as answered correctly
can_answer_flags = dataset[f"{MODEL_NAME}_can_answer"] == True
valid_rows = np.where(can_answer_flags)[0]
n_rows_to_test = len(valid_rows)

if n_rows_to_test == 0:
    # Nothing to test against: no question has a known-correct answer.
    print("No questions answered correctly, skipping lie test.")
else:
    # Cap the run at 50 questions, or fewer when fewer are valid.
    question_budget = min(50, n_rows_to_test)
    dataset.does_model_lie(
        model=MODEL_NAME,
        max_questions_to_try=question_budget,
        lie_instructions=dataset.lie_instructions,  # every configured instruction
        save_progress=True,
        bypass_cost_check=True,
    )

    # Per-instruction statistics
    lying_ratio = dataset.compute_lying_ratio(
        model=MODEL_NAME,
        lie_instructions=dataset.lie_instructions,
    )
    dd_rate = dataset.compute_lie_double_down_rate(
        model=MODEL_NAME,
        lie_instructions=dataset.lie_instructions,
    )

    print("\nLying Success Rate per Instruction:")
    print(lying_ratio)
    print("\nDouble Down Rate per Instruction:")
    print(dd_rate)

In [None]:
# 4. Generate False Statements (Preprocessing for Consistency)
# For non-binary datasets, we pre-generate the lie text so we can verify it's actually false
# For binary (CommonSense2), this function handles the skipping automatically.

print(f"Generating false statements for {MODEL_NAME}...")
utils.generate_lies(dataset, model=MODEL_NAME, number_questions_to_answer=NUMBER_qs_TO_PREP)

# Verify the result
cols_to_show = ["question", "answer", "false_statement"]
# Jupyter only auto-renders a cell's final *top-level* expression. The preview
# used to sit inside an `if` body, where its value was silently discarded.
# Build it conditionally and end the cell with it; None renders nothing, so
# datasets without a "false_statement" column still produce no output.
false_statement_preview = (
    dataset[cols_to_show].head(3) if "false_statement" in dataset.columns else None
)
false_statement_preview


In [None]:
# 5. Final Status Check
# Counts the samples that are fully prepared: answerable, plus (for non-binary
# datasets that have the column) a generated false statement.

ready_mask = dataset[f"{MODEL_NAME}_can_answer"].eq(True)

# Only non-binary datasets with a "false_statement" column need the extra filter.
needs_false_statement = (
    not dataset.binary_questions and "false_statement" in dataset.columns
)
if needs_false_statement:
    ready_mask &= dataset["false_statement"].notna()

n_ready = ready_mask.sum()
print(f"\nTotal samples ready for detection experiment: {n_ready}")

# The dataset saves automatically to JSON during these calls.
# You can now proceed to Black_Box_Lie_Detection.ipynb and skip these steps!
