In [2]:
# Step up one directory level before anything else runs; the captured output shows this
# lands in the ThinkLogits checkout. Presumably required so the `src.*` imports and the
# relative `data/...` paths used further down resolve -- confirm if the notebook is moved.
%cd ..

/home/ubuntu/ThinkLogits


In [3]:
import os
import logging
import json

from src.main.pipeline import load_model_and_tokenizer, generate_dataset_completions
from src.eval.llm_verificator import run_verification
from src.eval.switch_check import run_switch_check
from src.eval.llm_hint_verificator import run_hint_verification

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Hugging Face identifier of the checkpoint under evaluation.
model_path = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"

# The loader hands back four values; they are reused by every later cell.
(
    model,
    tokenizer,
    model_name,
    device,
) = load_model_and_tokenizer(model_path)

2025-04-14 17:31:43,715 - INFO - CUDA is available. Using GPU.
2025-04-14 17:31:43,716 - INFO - Loading model and tokenizer: deepseek-ai/DeepSeek-R1-Distill-Llama-8B onto cuda
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 47.94it/s]
2025-04-14 17:31:48,819 - INFO - Model and tokenizer loaded successfully.


In [7]:
# Hint conditions to evaluate. "none" comes first because the later cells slice it off
# with hint_types[1:] and compare the remaining types against it as the baseline.
hint_types = [
    "none",
    "sycophancy",
    "induced_urgency",
    "unethical_information",
]

In [5]:
# Generate completions for every hint type in `hint_types`.
# Fix: the original cell was missing the comma after the `hint_types` argument, which made
# the whole cell a SyntaxError on a fresh "Restart & Run All".
generate_dataset_completions(
    model=model,
    tokenizer=tokenizer,
    model_name=model_name,
    device=device,
    hint_types=hint_types,
    batch_size=5,
    # None defers to the pipeline's own limit (the logged run reported 2048).
    max_new_tokens=None,
    # 150 questions at batch_size 5 gives the 30 batches seen in the log.
    n_questions=150,
)

2025-04-14 15:55:09,343 - INFO - Using chat template: <|begin_of_text|><|start_header_id|>user<|end_header_id|>
{instruction}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
2025-04-14 15:55:09,344 - INFO - --- Processing dataset for hint type: none ---
2025-04-14 15:55:09,364 - ERROR - Data file not found: data/none/hints.json
2025-04-14 15:55:09,365 - INFO - Generating completions for none...
2025-04-14 15:55:09,366 - INFO - Using max_new_tokens: 2048
2025-04-14 15:55:09,366 - INFO - Processing batch 1/30 (Size: 5, QIDs: 0-4)
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
2025-04-14 15:55:23,060 - INFO - Processing batch 2/30 (Size: 5, QIDs: 5-9)
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
2025-04-14 15:55:49,858 - INFO - Processing batch 3/30 (Size: 5, QIDs: 10-14)
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
2025-04-14 15:56:39,401 - INFO - Processing batch 4/30 (Size: 5, QIDs: 15-19)
Setting 

In [None]:
# LLM-based verification pass: extracts the final answer the model settled on.
# Caveat: results that come back as N/A (e.g. the model never stopped reasoning)
# are dropped by this step.
run_verification(
    hint_types,
    model_name,
    150,
)

In [10]:
# Compare each hinted condition against the "none" baseline and report whether the
# model's answer switched. The baseline itself is skipped via the [1:] slice.
run_switch_check(
    hint_types[1:],
    model_name,
    150,
)

Loading ground truth...
Loading base answers (none)...
Base (none) Accuracy: 140/145 (96.55%)

Processing hint type: sycophancy...
Analyzing switches for sycophancy against none...
Analysis complete for sycophancy. Found 142 entries.

Processing hint type: induced_urgency...
Analyzing switches for induced_urgency against none...
Analysis complete for induced_urgency. Found 143 entries.

Processing hint type: unethical_information...
Analyzing switches for unethical_information against none...
Analysis complete for unethical_information. Found 140 entries.

--- Overall Results ---

Hint Type: sycophancy
  Total Entries: 142
  Switched Answers: 6 (4.23%)
  Switched to Correct Answer: 5 (3.52%)

Hint Type: induced_urgency
  Total Entries: 143
  Switched Answers: 6 (4.20%)
  Switched to Correct Answer: 4 (2.80%)

Hint Type: unethical_information
  Total Entries: 140
  Switched Answers: 7 (5.00%)
  Switched to Correct Answer: 5 (3.57%)

Results saved to data/switch_analysis_DeepSeek-R1-Dist

In [14]:
# Check whether the model verbalizes the hint in its completion.
# Only the hinted conditions are passed; the "none" entry is sliced off.
run_hint_verification(
    hint_types[1:],
    model_name,
    150,
)

Running verification for induced_urgency...


Verifying induced_urgency completions:   0%|          | 0/4 [00:00<?, ?it/s]2025-04-14 17:37:55,461 - INFO - AFC is enabled with max remote calls: 10.
2025-04-14 17:37:56,469 - INFO - HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent "HTTP/1.1 200 OK"
2025-04-14 17:37:56,471 - INFO - AFC remote call 1 is done.
Verifying induced_urgency completions:  25%|██▌       | 1/4 [00:01<00:03,  1.01s/it]2025-04-14 17:37:56,472 - INFO - AFC is enabled with max remote calls: 10.
2025-04-14 17:37:57,258 - INFO - HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent "HTTP/1.1 200 OK"
2025-04-14 17:37:57,260 - INFO - AFC remote call 1 is done.
Verifying induced_urgency completions:  50%|█████     | 2/4 [00:01<00:01,  1.14it/s]2025-04-14 17:37:57,261 - INFO - AFC is enabled with max remote calls: 10.
2025-04-14 17:37:58,114 - INFO - HTTP Request: POST https://generativelanguage.googleapis.com

Running verification for unethical_information...


Verifying unethical_information completions:   0%|          | 0/5 [00:00<?, ?it/s]2025-04-14 17:37:59,155 - INFO - AFC is enabled with max remote calls: 10.
2025-04-14 17:37:59,922 - INFO - HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent "HTTP/1.1 200 OK"
2025-04-14 17:37:59,924 - INFO - AFC remote call 1 is done.
Verifying unethical_information completions:  20%|██        | 1/5 [00:00<00:03,  1.30it/s]2025-04-14 17:37:59,924 - INFO - AFC is enabled with max remote calls: 10.
2025-04-14 17:38:00,747 - INFO - HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent "HTTP/1.1 200 OK"
2025-04-14 17:38:00,749 - INFO - AFC remote call 1 is done.
Verifying unethical_information completions:  40%|████      | 2/5 [00:01<00:02,  1.25it/s]2025-04-14 17:38:00,750 - INFO - AFC is enabled with max remote calls: 10.
2025-04-14 17:38:01,706 - INFO - HTTP Request: POST https://generativelangu

In [None]:
import json
from src.eval.faithfulness_metric import compute_faithfulness_score

# Inputs for the faithfulness metric.
#    Each entry must have "question_id", "final_answer"
#    The hinted set also needs "hint_label" and "completion" with the chain-of-thought text
#
# Fixes relative to the original cell:
#   * Both paths are now relative to the repository root, like every other data path in
#     this notebook. The original hinted path was hard-coded to /root/ThinkLogits/..., which
#     does not match the working directory shown after `%cd ..`.
#   * The unhinted set now points at the "none" baseline instead of "induced_urgency",
#     which is itself a hinted condition.
# NOTE(review): assumes the baseline file follows the same naming pattern as the hinted
# ones and that these files carry the fields listed above -- confirm before trusting results.
unhinted_path = "data/none/completions_DeepSeek-R1-Distill-Llama-8B_with_150.json"
hinted_path = "data/sycophancy/completions_DeepSeek-R1-Distill-Llama-8B_with_150.json"

# Read JSON with an explicit encoding so the result does not depend on the host locale.
with open(unhinted_path, "r", encoding="utf-8") as f:
    unhinted_completions = json.load(f)

with open(hinted_path, "r", encoding="utf-8") as f:
    hinted_completions = json.load(f)

# Compute the faithfulness score (no random correction):
faithfulness_raw = compute_faithfulness_score(
    unhinted_data=unhinted_completions,
    hinted_data=hinted_completions,
    random_baseline_correction=False,
)
print("Raw Faithfulness (no random correction):", faithfulness_raw)

# Compute the faithfulness score (with random-baseline correction):
faithfulness_corrected = compute_faithfulness_score(
    unhinted_data=unhinted_completions,
    hinted_data=hinted_completions,
    random_baseline_correction=True,
)
print("Faithfulness (with random correction):", faithfulness_corrected)