# Causal Step Intervention Analysis

This notebook performs causal step intervention analysis on language models to understand how specific steps in their reasoning processes influence their final outputs. By intervening at various points in the reasoning chain, we can identify which steps are most critical for subsequent steps in the reasoning process.

## Configuration

- **num_samples**: Number of samples to generate for each prompt. In the paper, we used 53 samples; here we use 1 for testing purposes.
- **Note**: The first 2 samples are skipped as they were used for prompt tuning in the original experiments.

## Requirements

To run this notebook, you need:

- A Together AI account with the API key stored in `together_api.txt`
- Required models set up as "Dedicated" in your Together AI account
- Your Together AI username specified in the `together_username` variable

### Additional Notes

Dedicated models are required to ensure consistent outputs across multiple calls, which is essential for reliable intervention analysis. With non-dedicated models, hardware changes can cause small variations in the outputs, which can have a small effect on the intervention analysis. This effect is minimized because we use strict labeling, but dedicated models are recommended to ensure full reproducibility.


## Configuration and Setup

In [None]:
# Import required libraries
from datasets import load_dataset
from together import Together
import json
from tqdm import tqdm
import matplotlib.pyplot as plt
import requests
import time

# User configuration
together_username = "insert_your_username_here"  # Replace with your Together AI username

# Experimental parameters
num_samples = 1  # Number of samples to process (53 used in paper, 1 for testing)

# Model configurations
# NOTE(review): this first `r1` value is overwritten by the second `r1`
# assignment a few lines below before anything reads it, so it has no effect.
r1 = 'deepseek/deepseek-r1'
gemini_flash_2_5 = "google/gemini-2.5-flash"
# The `my_*` identifiers are built by prefixing the account username;
# presumably these name the account's dedicated endpoints -- confirm.
my_qwen3 = together_username + "/Qwen/Qwen3-32B-0921e0c1"
qwen3 = "Qwen/Qwen3-32B"
r1 = "deepseek-ai/DeepSeek-R1"
my_r1 = together_username + "/deepseek-ai/DeepSeek-R1-d641e9ae"

# `analysed_model` is the model whose reasoning is generated and intervened on;
# `model_to_use` is an alias of the same value.
r1_models = [my_r1, r1]
analysed_model = my_r1
model_to_use = analysed_model

# Dataset configuration
gsm8k = "gsm8k"
dataset_to_use = gsm8k
dataset = load_dataset('openai/gsm8k', 'main')
dataset = dataset['test']

# Select samples (skipping first 2 used for prompt tuning)
dataset = dataset.select(range(2, num_samples + 2))


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Display first sample from dataset
dataset[0]

{'question': 'Josh decides to try flipping a house.  He buys a house for $80,000 and then puts in $50,000 in repairs.  This increased the value of the house by 150%.  How much profit did he make?',
 'answer': 'The cost of the house and repairs came out to 80,000+50,000=$<<80000+50000=130000>>130,000\nHe increased the value of the house by 80,000*1.5=<<80000*1.5=120000>>120,000\nSo the new value of the house is 120,000+80,000=$<<120000+80000=200000>>200,000\nSo he made a profit of 200,000-130,000=$<<200000-130000=70000>>70,000\n#### 70000'}

In [None]:
import os
from transformers import AutoTokenizer

# Initialize API clients
# Read the key inside a context manager so the file handle is closed
# immediately instead of being left to the garbage collector.
with open("together_api.txt", "r") as key_file:
    together_key = key_file.read().strip()
client = Together(api_key=together_key)

# Select appropriate tokenizer based on model.
# Account-prefixed model names are mapped back to the public model name
# before being passed to AutoTokenizer.from_pretrained.
if analysed_model in (qwen3, my_qwen3):
    analysed_model_for_tokenizer = "Qwen/Qwen3-32B"
elif analysed_model == my_r1:
    analysed_model_for_tokenizer = r1
else:
    analysed_model_for_tokenizer = analysed_model

tokenizer = AutoTokenizer.from_pretrained(analysed_model_for_tokenizer)

def count_tokens(text):
    """Return how many token ids the module-level tokenizer produces for a text.

    Args:
        text: Input text string

    Returns:
        int: Length of the sequence returned by ``tokenizer.encode(text)``
    """
    token_ids = tokenizer.encode(text)
    return len(token_ids)



def together_callout(msg, model, thinking_prefix=None, max_tokens=1):
    """Call Together AI API with optional reasoning prefix.

    The chat template is rendered locally with the module-level tokenizer and
    sent through the raw completions endpoint, so that a partially written
    assistant turn (the reasoning prefix) can be continued by the model.

    Args:
        msg: User message/prompt
        model: Model identifier
        thinking_prefix: Optional list of reasoning steps to prepend; the
            steps are concatenated and placed after an opening ``<think>`` tag
        max_tokens: Maximum tokens to generate

    Returns:
        dict: Response with 'reasoning' (generated text up to the first
            ``</think>``) and 'content' (always an empty string) keys

    Raises:
        Exception: If API call fails after 10 retries; the last underlying
            error is attached as ``__cause__``
    """
    msgs = [{"role": "user", "content": msg}]
    if thinking_prefix:
        msgs.append({"role": "assistant", "content": f'<think>\n{"".join(thinking_prefix)}'})
        input_txt = tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=False, continue_final_message=True)
    else:
        input_txt = tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)

    max_attempts = 10
    last_error = None
    for attempt in range(max_attempts):
        try:
            response = client.completions.create(
                model=model,
                prompt=input_txt,
                max_tokens=max_tokens,
                temperature=0.0,
                seed=42,
                top_p=0.0,
                top_k=1,
                repetition_penalty=0.0,
            )
            assistant_reply = response.choices[0].text
            # Remove content after </think> tag if present
            assistant_reply = assistant_reply.split("</think>")[0]
            return {"reasoning": assistant_reply, "content": ""}
        except Exception as e:
            print(e)
            last_error = e
            # Linear backoff between attempts; sleeping after the final
            # attempt would only delay the failure report.
            if attempt < max_attempts - 1:
                time.sleep(int(2 * attempt))
    raise Exception("API call failed after 10 retries") from last_error


def model_call(input, reasoning_prefix=False, model=gemini_flash_2_5, clean=False, echo=False):
    """Call model via OpenRouter API with retry logic.

    Args:
        input: Prompt text (the name shadows the builtin but is kept so that
            existing keyword callers continue to work)
        reasoning_prefix: Optional reasoning prefix
        model: Model identifier
        clean: Whether to clean/normalize output (lower-case, strip, remove
            double quotes); also limits generation to 5 tokens
        echo: Whether to return raw response

    Returns:
        dict: The ``message`` of the first choice, or the raw response object
            returned by ``open_router_call`` when ``echo`` is true

    Raises:
        Exception: If API call fails after 30 retries; the last underlying
            error is attached as ``__cause__``
    """
    limit_tokens = 5 if clean else None

    last_error = None
    for _ in range(30):
        try:
            ans = open_router_call(input, model, reasoning_prefix, limit_tokens=limit_tokens, echo=echo)
            if echo:
                # open_router_call hands back the raw response in echo mode;
                # indexing it like a list of choices would fail every attempt.
                return ans
            ans = ans[0].get("message")
            if clean:
                ans['content'] = ans['content'].lower().strip().replace('"', '')
                if 'reasoning' in ans and ans['reasoning']:
                    ans['reasoning'] = ans['reasoning'].lower().strip().replace('"', '')
            return ans
        except Exception as e:
            print(e)
            last_error = e
            time.sleep(1)
    raise Exception("API call failed after 30 retries") from last_error

def open_router_api_key():
    """Read the OpenRouter API key stored in ``api_key.txt``.

    Returns:
        str: File contents with surrounding whitespace removed
    """
    with open("api_key.txt", "r") as key_file:
        raw_key = key_file.read()
    return raw_key.strip()

# Load the key once into a module-level variable; open_router_call reads it
# when building the Authorization header.
api_key = open_router_api_key()

def open_router_call(input, model, reasoning_prefix=False, limit_tokens=None, echo=False, timeout=300):
    """Make API call to OpenRouter.

    Args:
        input: Prompt text
        model: Model identifier
        reasoning_prefix: Optional reasoning prefix; when truthy it is sent as
            a partial assistant message after an opening ``<think>`` tag
        limit_tokens: Maximum tokens to generate (omitted from the request
            when falsy)
        echo: Whether to return raw response
        timeout: Seconds to wait for the HTTP request before
            ``requests`` raises a timeout error. Without a timeout a stalled
            connection would block the calling worker indefinitely.

    Returns:
        list or Response: Model choices or raw response if echo=True. The
            choices value is ``None`` when the JSON body has no "choices" key.
    """
    msgs = [{"role": "user", "content": input}]

    if reasoning_prefix:
        msgs.append({"role": "assistant", "content": "<think>" + reasoning_prefix})

    # Sampling is pinned (temperature 0, top_k 1, fixed seed) and the
    # provider-side reasoning option is switched off in the payload.
    payload = {
        "model": model,
        "messages": msgs,
        "temperature": 0.0,
        "top_k": 1,
        "top_p": 0.0,
        "seed": 0,
        "repetition_penalty": 0,
        "presence_penalty": 0,
        "reasoning": {
            "enabled": False,
            "max_tokens": 0
        },
    }

    if limit_tokens:
        payload["max_tokens"] = limit_tokens

    response = requests.post(
        url="https://openrouter.ai/api/v1/chat/completions",
        headers={"Authorization": "Bearer " + api_key},
        data=json.dumps(payload),
        timeout=timeout,
    )

    if echo:
        return response

    body = response.json()
    return body.get("choices")



## Model Answer Generation

Generate model responses for all dataset samples with parallel processing.

In [4]:
import json
import concurrent.futures

# Pre-size one slot per sample so results can be written by index,
# independent of the order in which parallel tasks finish
model_answers_content = [None for _ in range(num_samples)]
model_answers_reasoning = [None for _ in range(num_samples)]

def worker(index):
    """Run the analysed model on one dataset question.

    Args:
        index: Position of the sample in ``dataset``

    Returns:
        tuple: (index, content, reasoning) where content and reasoning come
            from the dict returned by ``together_callout``
    """
    sample = dataset[index]
    reply = together_callout(sample['question'], model=analysed_model, max_tokens=20000)
    content = reply['content']
    reasoning = reply['reasoning']
    return index, content, reasoning

# Cache file for this (model, dataset, sample count) combination
model_output_path = f"model_answers/dataset_w_answers_{analysed_model.replace('/','_')}_{dataset_to_use.replace('/','_')}_{str(num_samples)}_samples.json"

# Load existing results if available
if os.path.exists(model_output_path):
    with open(model_output_path, "r") as f:
        model_answers = json.load(f)
    # Extract content and reasoning from cached results
    for i, cached_answer in enumerate(model_answers):
        model_answers_content[i] = cached_answer['model_answer_content']
        model_answers_reasoning[i] = cached_answer['model_answer_reasoning']
    print("Loaded model answers from file")

else:
    # Generate new model answers using parallel inference
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(worker, i) for i in range(num_samples)]

        # Process results as they complete; each result carries its own index,
        # so completion order does not affect where it is stored
        for future in tqdm(concurrent.futures.as_completed(futures),
                        total=num_samples,
                        desc="Parallel Inference"):
            index, content, reasoning = future.result()
            model_answers_content[index] = content
            model_answers_reasoning[index] = reasoning

    print("All tasks completed.")
    print("model_answers_content:", model_answers_content)
    print("model_answers_reasoning:", model_answers_reasoning)

    # Extract gold standard answers
    gold_answers = []
    for parent in tqdm(range(len(model_answers_content)), desc="Extracting gold answers"):
        gold_answer = dataset[parent]['answer']
        if dataset_to_use == 'gsm8k':
            # Keep only the last whitespace-separated token of the answer text
            gold_answer = gold_answer.split()[-1]
        gold_answers.append(gold_answer)

    # Compile results with questions, model answers, and gold answers
    model_answers = [
        {
            "question": dataset[parent]['question'],
            "model_answer_content": content,
            "model_answer_reasoning": reasoning,
            "gold_answer": gold_answer,
        }
        for parent, (content, reasoning, gold_answer) in enumerate(
            zip(model_answers_content, model_answers_reasoning, gold_answers)
        )
    ]

    # Save results to file; exist_ok avoids the check-then-create race
    os.makedirs("model_answers", exist_ok=True)

    with open(model_output_path, "w") as f:
        print(f"Saving model answers to file {f.name}")
        json.dump(model_answers, f)

Loaded model answers from file


## Load Model-Generated Answers

Load and prepare the previously generated model answers for analysis.

In [5]:
# Load previously generated answers
f_name = model_output_path

print(f"Loading model answers from file {f_name}")
# Use a context manager so the file handle is closed once parsing is done
with open(f_name) as answers_file:
    dataset_w_answers = json.load(answers_file)

# Create DataFrame from results
import pandas as pd
df = pd.DataFrame(dataset_w_answers)

# Filter for samples with valid reasoning (for R1 models)
if ((model_to_use == 'deepseek/deepseek-r1') or (model_to_use in r1_models)):
    df = df[df['model_answer_reasoning'].notnull()]
    df = df[df['model_answer_reasoning'] != '']

# Renumber rows after filtering so positional and label indices agree
df = df.reset_index(drop=True)
df['reasoning_len'] = df['model_answer_reasoning'].apply(count_tokens)
len(df)

Loading model answers from file model_answers/dataset_w_answers_deepseek-ai_DeepSeek-R1_gsm8k_1_samples.json


1

## Parsing Reasoning into Steps

This section decomposes the model's reasoning text into discrete steps, categorizing each step by type (recalling, planning, or reasoning).

In [None]:
from demonstration_no_step import demonstration
# Prompt for parsing reasoning steps into categorized segments
# The imported `demonstration` text is interpolated as the worked example.
# get_nodes appends the text to annotate after this prompt, and the tags the
# prompt asks for ([: recalling], [: plan], [: reasoning]) are the ones
# parse_annotated_text_ocaml splits on.
PARSE_THOUGHTS_PROMPT =f"""You will be provided with a text extracted from a solution process. Your task is to copy the text verbatim and analyze it by categorizing each distinct step into one of the following types:
recalling (retrieving information from memory)
plan (describing a work plan or wondering on the next step or previous steps)
reasoning (a logical step toward an answer)

Instructions:
Copy the text exactly as given, preserving all formatting (including line breaks) and making no edits.
At the end of each step (1 sentence long), append the appropriate type tag.
Each step should end with one of the following tags:
[: recalling], [: plan], [: reasoning].
Important: Your output must include the entire text verbatim.


Example for an output:
{demonstration}

---------

"""

# Show the example that was embedded in the prompt
print(demonstration)

Now, moving on to σ_x. The Pauli matrix σ_x is [[0, 1], [1, 0]]. The expectation value ⟨σ_x⟩ is given by the sum of a* b* multiplied by the corresponding matrix elements.[: recalling] Wait, perhaps a better way is to recall that for the state |ψ⟩ = a|↑⟩ + b|↓⟩, ⟨σ_x⟩ is equal to 2 Re(a* b).[: recalling] Alternatively, since σ_x can be written in terms of outer products as |↑⟩⟨↓| + |↓⟩⟨↑|. So, ⟨ψ|σ_x|ψ⟩ would be ⟨ψ| (|↑⟩⟨↓| + |↓⟩⟨↑| ) |ψ⟩ = ⟨ψ|↑⟩⟨↓|ψ⟩ + ⟨ψ|↓⟩⟨↑|ψ⟩ = a* b + b* a = 2 Re(a* b).[: recalling]

Given a = 0.5 and b = √3/2, then a* = 0.5 (since it's real) and b* = √3/2 (also real). [: reasoning]So, a* b = 0.5 * (√3)/2 = (√3)/4. Similarly, b* a is the same. So, adding them gives 2*(√3)/4 = √3/2 ≈ 0.866. [: reasoning]So, the expectation value ⟨σ_x⟩ is √3/2. Multiply that by 5, gives 5*(√3)/2 ≈ 5*0.866 ≈ 4.33.[: reasoning]

So, putting it all together, the expectation value is 10⟨σ_z⟩ + 5⟨σ_x⟩ = (-5) + 4.33 ≈ -0.67. Hmm, so up to one decimal place, that's approximately -0.7. [: re

In [7]:
import re

def parse_annotated_text_ocaml(text, include_step_markers=False):
    """Parse annotated reasoning text into structured steps.

    Two input layouts are supported:

    * Default: segments are delimited by a trailing annotation marker
      '[: type]', where type is one of plan, reasoning, recalling. The text
      of each entry is returned exactly as it appears (no stripping).
    * ``include_step_markers=True``: segments start with '[step]' and end with
      an annotation marker '[: type]'. Each segment is stripped, its trailing
      marker becomes the type and the remaining text (stripped) becomes the
      entry text.

    In both layouts, text that is not followed by an annotation marker is
    dropped.

    Args:
        text: Annotated reasoning text
        include_step_markers: Whether to split on [step] markers

    Returns:
        list: List of dictionaries with 'text' and 'type' keys

    Raises:
        TypeError: If input is not a string
    """
    if not isinstance(text, str):
        raise TypeError("Input must be a string")

    annotation_pattern = r'\[:\s*(plan|reasoning|recalling)\s*\]'

    if include_step_markers:
        # Splitting on [step] yields whole segments that still contain their
        # annotation marker, so the marker has to be extracted per segment
        # rather than pairing neighbouring segments as (text, type).
        segment_pattern = re.compile(r'(.*?)' + annotation_pattern, re.DOTALL)
        parsed_entries = []
        for segment in re.split(r'\[step\]', text):
            segment = segment.strip()
            if not segment:
                continue
            # fullmatch with a lazy prefix anchors the marker at the segment end
            match = segment_pattern.fullmatch(segment)
            if match is None:
                continue
            parsed_entries.append({
                "text": match.group(1).strip(),
                "type": match.group(2)
            })
        return parsed_entries

    # The capturing group makes re.split interleave text and annotation:
    # [text0, type0, text1, type1, ..., trailing_text]
    segments = re.split(annotation_pattern, text)
    return [
        {"text": segment_text, "type": annotation_text}
        for segment_text, annotation_text in zip(segments[::2], segments[1::2])
    ]


def parse_text(input_text):
    """Split text into segments, each terminated by a ``<tag>``.

    Args:
        input_text: Text with tags in format <tag>

    Returns:
        list: One dict per tag with 'type' (stripped tag name) and 'text'
              (stripped content preceding that tag). Non-empty text after the
              last tag is appended with type=None.
    """
    tagged = re.compile(r"(.*?)<([^>]+)>", re.DOTALL)
    matches = list(tagged.finditer(input_text))

    segments = [
        {"type": m.group(2).strip(), "text": m.group(1).strip()}
        for m in matches
    ]

    # Whatever follows the final tag (or the whole input when no tag matched)
    tail_start = matches[-1].end() if matches else 0
    remainder = input_text[tail_start:].strip()
    if remainder:
        segments.append({"type": None, "text": remainder})

    return segments

def get_nodes(question, reasoning_text, model=gemini_flash_2_5):
    """Extract and categorize reasoning nodes from model output.

    The reasoning text is split on newlines and regrouped into chunks of
    fewer than 2000 tokens (a single line longer than that becomes its own
    chunk). Each chunk is sent to the parser model together with
    PARSE_THOUGHTS_PROMPT, and the annotated reply is parsed into nodes.

    Args:
        question: Original question text
        reasoning_text: Model's reasoning process text
        model: Model to use for parsing

    Returns:
        list: Nodes with 'text' and 'type' fields, including root question node
    """
    reasoning_text_parts = reasoning_text.split("\n")

    # Aggregate parts into chunks of maximum 2000 tokens
    reasoning_text_parts_aggregated = []
    reasoning_text_aggregated = ""
    for part in reasoning_text_parts:
        if count_tokens(reasoning_text_aggregated) + count_tokens(part) < 2000:
            reasoning_text_aggregated += part + "\n"
        else:
            reasoning_text_parts_aggregated.append(reasoning_text_aggregated)
            reasoning_text_aggregated = part + "\n"
    # Drop the newline that was appended after the final part
    reasoning_text_parts_aggregated.append(reasoning_text_aggregated[:-1])

    # Add question as root node, then parse each chunk
    nodes_and_types_all = [{"text": question, "type": "root"}]
    for reasoning_text_part in reasoning_text_parts_aggregated:
        # An empty chunk arises when the very first line already exceeds the
        # budget, or when the reasoning text is empty; there is nothing to
        # annotate, so skip the model call instead of sending blank text.
        if not reasoning_text_part.strip():
            continue
        prompt = PARSE_THOUGHTS_PROMPT
        prompt += "\n" + f"Text extracted from a solution process:\n {reasoning_text_part}"
        response = model_call(prompt, model=model)['content']
        nodes_and_types_all += parse_annotated_text_ocaml(response)

    return nodes_and_types_all


### Execute Step Parsing

Parse reasoning steps from model outputs and categorize them.

In [8]:
import re

# Per-model containers. NOTE(review): nothing in this cell writes to these
# three dicts; they may be used by cells outside this view.
results_per_model = {}
ratios_per_model = {}
nodes_and_types_all_models = {}

# Process reasoning steps for each model
for model in [gemini_flash_2_5]:
    print('')
    print(f"Processing model: {model}")
    nodes_and_types_all = []
    # NOTE(review): `ratios` and `results` are initialised but never filled here.
    ratios = []
    results = []
    
    for parent, entry in tqdm(df.iterrows(), desc="Parsing reasoning steps"):
        q = entry['question']
        reasoning_text = entry['model_answer_reasoning']
        # First node returned is the root question; the rest are parsed steps
        nodes_and_types = get_nodes(q, reasoning_text, model=model)
        nodes_and_types_all.append(nodes_and_types)
        
        # Calculate statistics (excluding root question node)
        total_tokens = 0
        # NOTE(review): `total_text_in_nodes` is accumulated but not read;
        # the same concatenation is rebuilt with "".join below.
        total_text_in_nodes = ""
        for node in nodes_and_types[1:]:
            total_tokens += count_tokens(node['text'])
            total_text_in_nodes += node['text']
        
        # Store results in DataFrame
        # The node list is stored as a JSON string so it fits in a single cell.
        df.loc[parent, 'total_tokens_in_nodes'] = total_tokens
        df.loc[parent, 'model_answer_reasoning_collected'] = "".join([node['text'] for node in nodes_and_types[1:]])
        df.loc[parent, 'nodes_and_types_json'] = json.dumps(nodes_and_types)
        


Processing model: google/gemini-2.5-flash


Parsing reasoning steps: 1it [00:15, 15.68s/it]
Parsing reasoning steps: 1it [00:15, 15.68s/it]


In [9]:
# Save DataFrame with parsed reasoning steps
# The file name encodes model, dataset and sample count; the parsed nodes are
# already serialized as JSON strings in the 'nodes_and_types_json' column.
df.to_csv(f"dataset_w_answers_{analysed_model.replace('/','_')}_{dataset_to_use.replace('/','_')}_{str(num_samples)}_samples.csv", index=False)

In [10]:
# Load DataFrame with parsed reasoning steps
# Reads back the CSV written by the save cell (same file-name pattern),
# replacing the in-memory `df`.
df = pd.read_csv(f"dataset_w_answers_{analysed_model.replace('/','_')}_{dataset_to_use.replace('/','_')}_{str(num_samples)}_samples.csv")

In [11]:
# Display reasoning nodes for inspection
for nodes_json in df['nodes_and_types_json']:
    # Each row is decoded in turn (the column was written with json.dumps),
    # so json.loads is the matching decoder and avoids evaluating file
    # contents as Python code.
    nodes_and_types = json.loads(nodes_json)
    for node in nodes_and_types:
        if node['type'] == 'reasoning':
            print(node['text'])


 The increase is 150%, but 150% of what?
 It must be 150% of the original value before repairs, right?
 Because the repairs increased the value based on the original.
 Then he put in $50,000 for repairs, and that increased the value by 150% of the original value.


So, increase in value = 150% of $80,000.


150% of $80,000 is (150/100) * 80,000 = 1.5 * 80,000.


1.5 * 80,000 = 120,000.


So, the value increased by $120,000.


Therefore, the new value after repairs is original value plus increase: $80,000 + $120,000 = $200,000.


He spent money on buying and repairing: $80,000 + $50,000 = $130,000.
 But the problem doesn't say he sold it.
 It just asks for profit, but in the context, I think we assume he sells it at the new value.


It doesn't explicitly say he sold it, but flipping implies he sells it after renovation.
 So, I think we need to find the profit from the sale.


Total investment is purchase price plus repair cost: $80,000 + $50,000 = $130,000.


Selling price is the new va

In [12]:
# Decode the JSON-encoded node list of every row into Python objects
nodes_and_types_all = [
    json.loads(row['nodes_and_types_json'])
    for _, row in tqdm(df.iterrows(), desc="Loading parsed nodes")
]


Loading parsed nodes: 1it [00:00, 2906.66it/s]
Loading parsed nodes: 1it [00:00, 2906.66it/s]


In [None]:
def together_callout_alt(msg, model, thinking_prefix=None, max_tokens=1):
    """Alternative Together AI API call for counterfactual generation.

    Similar to together_callout but for generating the alternative reasoning
    steps: it returns the generated reasoning text directly as a string and
    backs off exponentially between failed attempts.

    Args:
        msg: User message/prompt
        model: Model identifier
        thinking_prefix: Optional list of reasoning steps to prepend; the
            steps are concatenated and placed after an opening ``<think>`` tag
        max_tokens: Maximum tokens to generate

    Returns:
        str: Generated reasoning text (the part after the last ``<think>``
            and before the first following ``</think>``)

    Raises:
        Exception: If API call fails after 10 retries; the last underlying
            error is attached as ``__cause__``
    """
    msgs = [{"role": "user", "content": msg}]
    if thinking_prefix:
        msgs.append({"role": "assistant", "content": f"<think>\n{''.join(thinking_prefix)}"})
        input_txt = tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=False, continue_final_message=True)
    else:
        input_txt = tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)

    max_attempts = 10
    last_error = None
    for attempt in range(max_attempts):
        try:
            response = client.completions.create(
                model=model,
                prompt=input_txt,
                max_tokens=max_tokens,
                temperature=0.0,
                seed=42,
                top_p=0.0,
                top_k=1,
                repetition_penalty=0.0,
            )
            assistant_reply = response.choices[0].text
            # Extract only the reasoning part
            reasoning_part = assistant_reply.split("<think>")[-1]
            # Remove any text after </think> if it exists
            reasoning_part = reasoning_part.split("</think>")[0]

            return reasoning_part
        except Exception as e:
            print(e)
            last_error = e
            # Exponential backoff between attempts. Sleeping after the final
            # attempt (2**9 seconds) would only postpone the failure.
            if attempt < max_attempts - 1:
                time.sleep(int(2**attempt))
    raise Exception("API call failed after 10 retries") from last_error

In [14]:
def evaluate_step_impact(question, steps):
    """Sequentially regenerate later steps with one earlier step removed.

    For each omitted index starting at 1, every target index that is at
    least two positions later -- and never below 4 -- is regenerated. The
    model is given all steps before the target except the omitted one as a
    reasoning prefix and may produce as many tokens as the original target
    step contains.

    Example with 11 entries (indices 0-10):
      - Omitting index 1: regenerate indices 4, 5, ..., 10
      - Omitting index 2: regenerate indices 4, 5, ..., 10
      - Omitting index 3: regenerate indices 5, 6, ..., 10

    Args:
        question: The original question
        steps: List of solution process steps in order. Index 0 is never
            omitted and always forms part of the prefix.

    Returns:
        list of dict: Each containing:
            - omitted_step_index: Index of the omitted step
            - target_step_index: Index of the regenerated step
            - original_step: Original content of the target step
            - alternative_step: List of regenerated alternatives
    """
    num_of_generations = 1
    step_count = len(steps)
    impact_records = []

    for omitted_idx in tqdm(range(1, step_count), desc="Evaluating step impact (sequential)"):
        # Targets begin two past the omitted step, but never before index 4
        first_target = max(omitted_idx + 2, 4)
        for target_idx in range(first_target, step_count):
            original_step = steps[target_idx]

            # Everything before the target, minus the omitted step
            thinking_prefix = steps[:omitted_idx] + steps[omitted_idx + 1:target_idx]
            tokens_needed = count_tokens(original_step)

            alt_steps = [
                together_callout_alt(msg=question, model=analysed_model,
                                     thinking_prefix=thinking_prefix, max_tokens=tokens_needed)
                for _ in range(num_of_generations)
            ]

            impact_records.append({
                "omitted_step_index": omitted_idx,
                "target_step_index": target_idx,
                "original_step": original_step,
                "alternative_step": alt_steps
            })
    return impact_records

def _process_single_pair_for_impact_evaluation(task_args):
    """Generate the alternatives for one (omitted step, target step) pair.

    Intended as the unit of work submitted to a thread pool.

    Args:
        task_args: Tuple of (omitted_idx, target_idx, question,
                   thinking_prefix_list, tokens_to_generate,
                   original_target_step_content, num_generations)

    Returns:
        dict: omitted/target indices, the original target step and the list
            of generated alternative steps
    """
    (
        omitted_idx,
        target_idx,
        question,
        thinking_prefix_list,
        tokens_to_generate,
        original_target_step_content,
        num_generations,
    ) = task_args

    generated = [
        together_callout_alt(msg=question, model=analysed_model,
                             thinking_prefix=thinking_prefix_list, max_tokens=tokens_to_generate)
        for _ in range(num_generations)
    ]

    return {
        "omitted_step_index": omitted_idx,
        "target_step_index": target_idx,
        "original_step": original_target_step_content,
        "alternative_step": generated
    }

def evaluate_step_impact_parallel(question, steps, num_workers=10):
    """Parallel version of evaluate_step_impact for improved performance.

    Builds one task per (omitted step, target step) pair -- omitted index
    from 1, target index at least two positions later and never below 4 --
    and runs the tasks on a thread pool. Results are returned in task order
    regardless of completion order.

    Args:
        question: The original question
        steps: List of solution process steps. Index 0 is never omitted and
            always forms part of the reasoning prefix.
        num_workers: Number of parallel workers

    Returns:
        list: One dict per task with 'omitted_step_index',
            'target_step_index', 'original_step' and 'alternative_step'
            (a list of generations). A task that raised yields the same keys
            with 'alternative_step' set to None plus an 'error' message, so
            consumers can read every record uniformly.
    """
    total_steps = len(steps)
    num_of_generations = 1

    # Prepare all tasks
    tasks_to_process = []
    for omitted_idx in range(1, total_steps):
        for target_idx in range(max(omitted_idx + 2, 4), total_steps):
            # Everything before the target, minus the omitted step
            thinking_prefix = steps[:omitted_idx] + steps[omitted_idx + 1:target_idx]
            original_step_content = steps[target_idx]
            tokens_needed = count_tokens(original_step_content)
            tasks_to_process.append((omitted_idx, target_idx, question, thinking_prefix,
                                   tokens_needed, original_step_content, num_of_generations))

    if not tasks_to_process:
        return []

    # Execute tasks in parallel
    results_list = [None] * len(tasks_to_process)
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
        future_to_index = {
            executor.submit(_process_single_pair_for_impact_evaluation, task_args): i
            for i, task_args in enumerate(tasks_to_process)
        }

        for future in tqdm(concurrent.futures.as_completed(future_to_index),
                          total=len(tasks_to_process), desc="Processing counterfactuals"):
            original_idx = future_to_index[future]
            try:
                results_list[original_idx] = future.result()
            except Exception as e:
                print(f"Error processing a task: {e}")
                failed_task = tasks_to_process[original_idx]
                # Keep the same keys as a successful record so later cells
                # that read 'original_step'/'alternative_step' do not fail
                # with KeyError on a failed task.
                results_list[original_idx] = {
                    "error": str(e),
                    "omitted_step_index": failed_task[0],
                    "target_step_index": failed_task[1],
                    "original_step": failed_task[5],
                    "alternative_step": None
                }
    return results_list

# Run the intervention analysis on every sample
alternative_steps_all_samples = []
for sample_i, sample_nodes in enumerate(tqdm(nodes_and_types_all, desc="Evaluating step impact for samples")):
    # Only the first 20 nodes (root question included) are analysed
    nodes_and_types_all_sample = sample_nodes[:20]
    steps = [node['text'] for node in nodes_and_types_all_sample]
    # The first node is the root, i.e. the question itself
    question = steps[0]

    # Counterfactual generations are produced on a thread pool
    alternative_steps_all = evaluate_step_impact_parallel(question, steps, num_workers=5)
    alternative_steps_all_samples.append(alternative_steps_all)


Processing counterfactuals: 100%|██████████| 152/152 [00:35<00:00,  4.34it/s]
Evaluating step impact for samples: 100%|██████████| 1/1 [00:35<00:00, 35.10s/it]

Evaluating step impact for samples: 100%|██████████| 1/1 [00:35<00:00, 35.10s/it]


In [27]:
alternative_steps_all


[{'omitted_step_index': 1,
  'target_step_index': 4,
  'original_step': ' It must be 150% of the original value before repairs, right?',
  'alternative_step': ' It must be 150% of the original value or something. Let me read',
  'equivalent': 'unsure'},
 {'omitted_step_index': 1,
  'target_step_index': 5,
  'original_step': ' Because the repairs increased the value based on the original.',
  'alternative_step': ' Let me read carefully.\n\nIt says: "This increased the',
  'equivalent': 'no'},
 {'omitted_step_index': 1,
  'target_step_index': 6,
  'original_step': "\n\nHe bought it for $80,000, so that's the original value.",
  'alternative_step': ' But let me read carefully.\n\nIt says: "This increased the value of the house by',
  'equivalent': 'no'},
 {'omitted_step_index': 1,
  'target_step_index': 7,
  'original_step': ' Then he put in $50,000 for repairs, and that increased the value by 150% of the original value.',
  'alternative_step': ' Then he puts in $50,000 in repairs, and th

In [15]:
# Persist the intervention results and the parsed nodes as JSON.
# Both file names share a suffix built from model, dataset and sample count.
run_tag = f"{analysed_model.replace('/','_')}_{dataset_to_use.replace('/','_')}_{str(num_samples)}_samples"

# Alternative steps for all samples
with open(f"model_answers/alternative_steps_all_samples_{run_tag}.json", "w") as f:
    json.dump(alternative_steps_all_samples, f)

# Nodes and types for all samples
with open(f"model_answers/nodes_and_types_all_samples_{run_tag}.json", "w") as f:
    json.dump(nodes_and_types_all, f)

In [16]:
# Read the intervention results and the parsed nodes back from JSON.
# Both file names share a suffix built from model, dataset and sample count.
run_tag = f"{analysed_model.replace('/','_')}_{dataset_to_use.replace('/','_')}_{str(num_samples)}_samples"

# Alternative steps for all samples
with open(f"model_answers/alternative_steps_all_samples_{run_tag}.json", "r") as f:
    alternative_steps_all_samples = json.load(f)

# Nodes and types for all samples
with open(f"model_answers/nodes_and_types_all_samples_{run_tag}.json", "r") as f:
    nodes_and_types_all = json.load(f)

## Measuring Step Impact

Assess the semantic similarity between original and alternative steps to quantify causal impact.

In [17]:
# Extract first alternative step from lists (single generation per intervention)
for alternative_steps_all in alternative_steps_all_samples:
    for step in alternative_steps_all:
        # Records for failed tasks carry no 'alternative_step' key, so use
        # .get; an empty list has no first element and is left as-is.
        candidates = step.get('alternative_step')
        if isinstance(candidates, list) and candidates:
            step['alternative_step'] = candidates[0]

In [None]:

# Prompt asking the judge model to rate, from 1 to 10, how similar an original
# step and its regenerated alternative are.
# NOTE(review): the literal is opened with four double quotes, so the prompt
# text itself starts with a stray `"` character (the backslash then joins the
# first line). Confirm whether this is intended before changing it, since
# editing the prompt changes the text sent to the model.
instruction_semantic_no_question = """"\
You are given two excerpts, each describing the same step from a solution processes—one is the original, and the other is an alternative version. Your task is to assess how similar these two steps are, particularly considering their meaning and role within the solution process.

Note: The alternative step ends abruptly. Take this into consideration when evaluating the similarity.

Respond with a single number from 1 to 10, where 1 means "completely different" and 10 means "semantically identical." Only reply with the number.
"""



In [19]:
import concurrent.futures

# Collects one list of judge responses per sample; the evaluation loop below
# appends to it in the order of alternative_steps_all_samples.
responses_semantic_all_samples = []

def semantic_similarity_worker(alternative_step_item, instruction_text):
    """Ask the judge model how similar an original step and its alternative are.

    Builds a single prompt made of the instruction followed by the two steps
    (each section terminated by a blank line) and sends it through
    ``model_call`` with the ``gemini_flash_2_5`` model and ``clean=True``.

    Args:
        alternative_step_item: Mapping providing the 'original_step' and
            'alternative_step' texts.
        instruction_text: Instruction placed at the start of the prompt.

    Returns:
        The 'content' entry of the ``model_call`` result.
    """
    sections = (
        instruction_text,
        "Original step: " + alternative_step_item['original_step'],
        "Alternative step: " + alternative_step_item['alternative_step'],
    )
    # Every section, including the last one, is followed by a blank line.
    prompt = "".join(section + "\n\n" for section in sections)
    judge_reply = model_call(prompt, model=gemini_flash_2_5, clean=True)
    return judge_reply['content']

# Score every intervention of every sample with the judge model, running up to
# 10 requests concurrently per sample.
for i, alternative_steps_all in enumerate(tqdm(alternative_steps_all_samples, desc="Processing samples")):
    step_count = len(alternative_steps_all)

    # Pre-sized so each reply lands at the position of the step it belongs to,
    # regardless of the order in which the calls finish.
    ordered_replies = [None] * step_count

    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as pool:
        # Remember which step position every submitted future corresponds to
        position_of = {}
        for position, step_item in enumerate(alternative_steps_all):
            submitted = pool.submit(semantic_similarity_worker, step_item, instruction_semantic_no_question)
            position_of[submitted] = position

        completed = tqdm(
            concurrent.futures.as_completed(position_of),
            total=step_count,
            desc=f"Sample {i+1} - Semantic Similarity",
            leave=False,
        )
        for done in completed:
            position = position_of[done]
            try:
                ordered_replies[position] = done.result()
            except Exception as e:
                # Best effort: a failed call is reported and recorded as an
                # error string in place of a score, so positions stay aligned.
                print(f"Error processing a semantic similarity task: {e}")
                ordered_replies[position] = f"Error: {e}"

    responses_semantic_all_samples.append(ordered_replies)


Processing samples: 100%|██████████| 1/1 [00:14<00:00, 14.94s/it]
Processing samples: 100%|██████████| 1/1 [00:14<00:00, 14.94s/it]


In [20]:
# Save semantic similarity responses
import os
import json

# exist_ok=True creates the output directory when it is missing and does
# nothing when it already exists, without a separate existence check.
os.makedirs("model_answers", exist_ok=True)

# File name encodes the analysed model, dataset and sample count ('/' -> '_')
with open(f"model_answers/responses_semantic_all_samples_{analysed_model.replace('/','_')}_{dataset_to_use.replace('/','_')}_{str(num_samples)}_samples.json", "w") as f:
    json.dump(responses_semantic_all_samples, f)


In [21]:
# Reload the stored judge responses for this model / dataset / sample count
responses_semantic_path = f"model_answers/responses_semantic_all_samples_{analysed_model.replace('/','_')}_{dataset_to_use.replace('/','_')}_{str(num_samples)}_samples.json"
with open(responses_semantic_path, "r") as f:
    responses_semantic_all_samples = json.load(f)

### Threshold-Based Classification

We classify step interventions based on semantic similarity scores to identify clear causal relationships:
- **Scores ≥ 8**: Steps are semantically equivalent (intervention had minimal impact)
- **Scores ≤ 2**: Steps are substantially different (intervention had strong causal impact)
- **Scores 3-7**: Ambiguous cases, excluded from analysis to ensure clarity

This conservative thresholding ensures we only consider interventions with unambiguous effects.

In [22]:
# Convert semantic similarity scores to categorical labels
responses_semantic_bool_all_samples = []
for responses_semantic in responses_semantic_all_samples:
    responses_semantic_bool = []
    for response in responses_semantic:
        try:
            response = int(response[:2])
        except:
            responses_semantic_bool.append('unsure')
            continue
        
        if response >= 8:
            responses_semantic_bool.append('yes')  # Equivalent steps
        elif response <= 2:
            responses_semantic_bool.append('no')   # Different steps
        else:
            responses_semantic_bool.append('unsure')  # Ambiguous, excluded from analysis
    responses_semantic_bool_all_samples.append(responses_semantic_bool)

responses_semantic_bool_all_samples

[['unsure',
  'no',
  'no',
  'yes',
  'yes',
  'yes',
  'yes',
  'unsure',
  'yes',
  'yes',
  'unsure',
  'unsure',
  'yes',
  'yes',
  'no',
  'unsure',
  'unsure',
  'no',
  'no',
  'yes',
  'yes',
  'yes',
  'yes',
  'yes',
  'yes',
  'unsure',
  'unsure',
  'unsure',
  'yes',
  'yes',
  'no',
  'no',
  'no',
  'no',
  'unsure',
  'unsure',
  'no',
  'unsure',
  'yes',
  'yes',
  'unsure',
  'unsure',
  'unsure',
  'unsure',
  'yes',
  'no',
  'unsure',
  'no',
  'unsure',
  'no',
  'yes',
  'yes',
  'yes',
  'yes',
  'unsure',
  'unsure',
  'unsure',
  'yes',
  'yes',
  'no',
  'unsure',
  'unsure',
  'unsure',
  'yes',
  'yes',
  'unsure',
  'yes',
  'unsure',
  'unsure',
  'unsure',
  'unsure',
  'yes',
  'no',
  'unsure',
  'no',
  'no',
  'unsure',
  'unsure',
  'yes',
  'yes',
  'unsure',
  'unsure',
  'yes',
  'yes',
  'no',
  'no',
  'yes',
  'unsure',
  'no',
  'yes',
  'unsure',
  'no',
  'unsure',
  'unsure',
  'yes',
  'no',
  'unsure',
  'yes',
  'no',
  'yes',
  'uns

In [26]:
# Display the labels of the first sample only
(responses_semantic_bool_all_samples[0])


['unsure',
 'no',
 'no',
 'yes',
 'yes',
 'yes',
 'yes',
 'unsure',
 'yes',
 'yes',
 'unsure',
 'unsure',
 'yes',
 'yes',
 'no',
 'unsure',
 'unsure',
 'no',
 'no',
 'yes',
 'yes',
 'yes',
 'yes',
 'yes',
 'yes',
 'unsure',
 'unsure',
 'unsure',
 'yes',
 'yes',
 'no',
 'no',
 'no',
 'no',
 'unsure',
 'unsure',
 'no',
 'unsure',
 'yes',
 'yes',
 'unsure',
 'unsure',
 'unsure',
 'unsure',
 'yes',
 'no',
 'unsure',
 'no',
 'unsure',
 'no',
 'yes',
 'yes',
 'yes',
 'yes',
 'unsure',
 'unsure',
 'unsure',
 'yes',
 'yes',
 'no',
 'unsure',
 'unsure',
 'unsure',
 'yes',
 'yes',
 'unsure',
 'yes',
 'unsure',
 'unsure',
 'unsure',
 'unsure',
 'yes',
 'no',
 'unsure',
 'no',
 'no',
 'unsure',
 'unsure',
 'yes',
 'yes',
 'unsure',
 'unsure',
 'yes',
 'yes',
 'no',
 'no',
 'yes',
 'unsure',
 'no',
 'yes',
 'unsure',
 'no',
 'unsure',
 'unsure',
 'yes',
 'no',
 'unsure',
 'yes',
 'no',
 'yes',
 'unsure',
 'unsure',
 'unsure',
 'unsure',
 'yes',
 'no',
 'unsure',
 'unsure',
 'yes',
 'yes',
 'unsure'

In [23]:
# Display classification statistics for each sample
# Print, for each sample, the share of interventions labelled 'yes'
# (equivalent) and 'no' (different); 'unsure' labels only count towards the total.
for responses_semantic_bool in responses_semantic_bool_all_samples:
    equivalent_count = responses_semantic_bool.count("yes")
    different_count = responses_semantic_bool.count("no")
    total = len(responses_semantic_bool)

    if total == 0:
        # A sample without any labels would otherwise raise ZeroDivisionError
        print("No interventions to summarise for this sample")
        print('')
        continue

    print(f"Equivalent ratio: {equivalent_count/total:.3f}")
    print(f"Different ratio: {different_count/total:.3f}")
    print('')


Equivalent ratio: 0.336
Different ratio: 0.230



In [24]:
# Annotate intervention results with equivalence labels and save
# Attach each label to the intervention at the same position within its sample,
# stored under the "equivalent" key of the intervention dictionary (in place).
for responses_semantic_bool, alternative_steps_all in zip(responses_semantic_bool_all_samples, alternative_steps_all_samples):
    for i, equivalence_label in enumerate(responses_semantic_bool):
        alternative_steps_all[i].update(equivalent=equivalence_label)

# Save final results
# NOTE(review): this file name is built from model_to_use and removes '/'
# entirely, whereas the files under model_answers/ use analysed_model and
# replace '/' with '_' - confirm the difference is intentional.
import json
final_results_path = f"alternative_steps_all_{model_to_use.replace('/','')}_{dataset_to_use.replace('/','')}.json"
with open(final_results_path, 'w') as f:
    json.dump(alternative_steps_all_samples, f)



