# Mirroring with Agents

### Main Components for the experiment

In [155]:
import joblib
import pandas as pd
from scipy.sparse import hstack
import os
from dotenv import load_dotenv
import os
from openai import OpenAI
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Load variables from a local .env file (if one exists) into the process
# environment.
load_dotenv()

# Credential passed to query_model(), which sends it to the DeepInfra
# OpenAI-compatible endpoint.
# NOTE(review): the key is a literal in the source even though load_dotenv()
# has just run; consider reading it from the environment so it is not committed.
api_key = "ge..."
# Client on the default OpenAI endpoint built with the key above.
# NOTE(review): `client` is not referenced anywhere in the visible part of the
# notebook — confirm whether it is still needed.
client = OpenAI(api_key=api_key)

In [156]:
# Every placeholder token that may appear inside an agent's prompt template.
ALL_PLACEHOLDERS = [
    "[INTERVENTION_PLACEHOLDER]",
    "[QUESTIONS_PLACEHOLDER]",
    "[ML_RATING_FEEDBACK_PLACEHOLDER]",
    "[LLM_FEEDBACK_PLACEHOLDER]"
]


def build_config(agent_dict, row, extra_placeholders=None):
    """Assemble the configuration dict consumed by ``query_model``.

    Args:
        agent_dict: Mapping with the keys ``"name"``, ``"description"`` and
            ``"prompt"`` describing one agent.
        row: Mapping (for example a DataFrame row) providing ``"intervention"``.
        extra_placeholders: Optional mapping of placeholder token to the text
            that should replace it. Keys that are not listed in
            ``ALL_PLACEHOLDERS`` are ignored. ``None`` (the default) means
            "no extra values".

    Returns:
        A dict with the keys ``"agent_type"``, ``"prompt"``,
        ``"description"``, ``"intervention"`` and ``"placeholders"``; the last
        one maps every token of ``ALL_PLACEHOLDERS`` to its replacement text
        (an empty string when nothing was supplied).
    """
    # Start with empty values so every known token is always present.
    placeholders = {ph: "" for ph in ALL_PLACEHOLDERS}

    # The intervention text is needed by every agent.
    placeholders["[INTERVENTION_PLACEHOLDER]"] = row["intervention"]

    # Overlay whatever the pipeline step supplied. A None default (instead of
    # a shared ``{}``) avoids the mutable-default-argument pitfall.
    for key, val in (extra_placeholders or {}).items():
        if key in placeholders:
            placeholders[key] = val

    return {
        "agent_type": agent_dict["name"],
        "prompt": agent_dict["prompt"],
        "description": agent_dict["description"],
        "intervention": row["intervention"],
        "placeholders": placeholders,
    }


In [157]:
#================ QUERYING THE MODEL

def query_model(config):
    """Fill an agent's prompt template and send it to the chat model.

    ``config`` is the dict produced by ``build_config``. Its
    ``"placeholders"`` mapping is updated in place so that the intervention
    token always carries ``config["intervention"]``. Every token is then
    substituted into the prompt; a token whose value is empty is replaced by
    a visible ``<MISSING ...>`` marker instead of disappearing silently.

    Returns:
        A ``(response, prompt)`` tuple: the text of the first completion
        choice and the fully substituted prompt that was sent.
    """
    # A new client is built on every call, pointed at the DeepInfra endpoint.
    deepinfra_client = OpenAI(
        api_key=api_key,
        base_url="https://api.deepinfra.com/v1/openai",
    )

    # Pull the fields out in one go; a missing key raises KeyError here.
    _agent_type, template, system_text, intervention, placeholders = (
        config[field]
        for field in ("agent_type", "prompt", "description", "intervention", "placeholders")
    )

    # The intervention token must always reflect the config's intervention.
    placeholders["[INTERVENTION_PLACEHOLDER]"] = intervention

    # Substitute each token, flagging the ones that have no value.
    filled_prompt = template
    for token, replacement in placeholders.items():
        if not replacement:
            replacement = f"<MISSING {token}>"
        filled_prompt = filled_prompt.replace(token, replacement)

    completion = deepinfra_client.chat.completions.create(
        model="meta-llama/Meta-Llama-3.1-405B-Instruct",
        messages=[
            {"role": "system", "content": system_text},
            {"role": "user", "content": filled_prompt},
        ],
        temperature=0.0,
    )

    return completion.choices[0].message.content, filled_prompt




In [158]:
from openai import OpenAI
# Second credential, read from the OPENAI_API_KEY environment variable
# (os.getenv returns None when the variable is not set).
api_key2 = os.getenv("OPENAI_API_KEY")
# Client on the default OpenAI endpoint; extract_CQ() uses it to turn free-text
# model output into a list of questions.
client2 = OpenAI(api_key=api_key2)
def _parse_question_list(response_text):
    """Turn the extractor model's raw reply into a list of question strings.

    Strategies are tried from strictest to loosest:

    1. Parse the reply (after unwrapping a Markdown code fence, if any) as a
       Python literal; if that fails, parse only the outermost ``[...]`` span.
       A result is accepted only when it is a list or tuple.
    2. If a bracketed span exists but is not a valid literal (typically an
       unescaped apostrophe inside a single-quoted item), split it on the
       quote-comma-quote boundaries between items, so commas *inside* a
       question do not cut it apart.
    3. Otherwise treat every non-empty line as one question.

    Args:
        response_text: Raw reply text; ``None`` is treated as an empty reply.

    Returns:
        A list of non-empty, whitespace-stripped question strings (possibly
        empty).
    """
    import ast
    import re

    cleaned = (response_text or "").strip()

    # Unwrap a Markdown code fence wherever it appears in the reply.
    fenced = re.search(r"```[A-Za-z]*\s*(.*?)```", cleaned, re.DOTALL)
    if fenced:
        cleaned = fenced.group(1).strip()

    start, end = cleaned.find("["), cleaned.rfind("]")
    has_brackets = start != -1 and end > start

    # 1. Strict parsing: the whole reply first, then only the bracketed span.
    candidates = [cleaned]
    if has_brackets and (start, end) != (0, len(cleaned) - 1):
        candidates.append(cleaned[start:end + 1])
    for candidate in candidates:
        try:
            parsed = ast.literal_eval(candidate)
        except (SyntaxError, ValueError, TypeError):
            continue
        if isinstance(parsed, (list, tuple)):
            questions = [str(item).strip() for item in parsed]
            return [question for question in questions if question]

    if has_brackets:
        # 2. Lenient split of the bracketed span on the boundaries *between*
        #    quoted items.
        content = cleaned[start + 1:end]
        pieces = re.split(r"""["']\s*,\s*["']""", content)
        if len(pieces) == 1 and content.strip()[:1] not in ("'", '"'):
            # Items are not quoted at all: a plain comma split is the only
            # separator information available.
            pieces = content.split(",")
    else:
        # 3. Last resort: one question per line.
        pieces = cleaned.split("\n")

    # Drop surrounding whitespace, quotes, separators and bullet dashes.
    questions = [piece.strip(" \t\r\n,'\"-") for piece in pieces]
    return [question for question in questions if question]


def extract_CQ(response):
    """Ask the extractor model for the critical questions found in ``response``.

    The text is embedded in an extraction prompt and sent through the
    module-level ``client2``; the reply is then parsed defensively, because
    the model does not always return a well-formed Python list.

    Args:
        response: Free-text model output that contains the questions.

    Returns:
        A list of question strings (possibly empty).
    """
    prompt = f"Extract the 9 critical questions from the text below, returning only a Python list of the questions without numbering or formatting. For example, if the text contains '1. What is your name?' just include 'What is your name?' in the list.\n\nText: {response}"

    completion = client2.chat.completions.create(
        messages=[
            {"role": "user", "content": prompt}
        ],
        model="o4-mini-2025-04-16"
    )

    return _parse_question_list(completion.choices[0].message.content)

In [159]:
#================ ML MODEL
# Directory holding the serialized artifacts.
# NOTE(review): absolute, machine-specific Windows path — the cell fails on any
# other machine unless this is edited.
load_dir = "D:\\My_working_area\\Masters\\Semester 2\\NLP804\\Project\\Critical_Question_generation\\models"
# Three artifacts used together by predict_usefulness(): the classifier, the
# text vectorizer, and the scaler for the structured features.
# NOTE(review): joblib.load unpickles arbitrary objects; only load files from a
# trusted source.
model = joblib.load(os.path.join(load_dir, "logistic_model_with_features.joblib"))
vectorizer = joblib.load(os.path.join(load_dir, "tfidf_vectorizer_with_bigrams.joblib"))
scaler = joblib.load(os.path.join(load_dir, "structured_feature_scaler.joblib"))

#--------------Features and prediction of ML
def predict_usefulness(intervention, question, features_dict):
    """Score one intervention/question pair with the loaded classifier.

    The intervention and the question are joined with a single space and
    vectorized with the module-level ``vectorizer``; ``features_dict`` becomes
    a one-row frame that is scaled with the module-level ``scaler``. Both
    parts are stacked side by side and passed to ``model.predict_proba``.

    Args:
        intervention: Intervention text.
        question: Candidate critical question.
        features_dict: Structured features, e.g. from ``compute_features``.

    Returns:
        A dict mapping ``"Invalid"``, ``"Unhelpful"`` and ``"Useful"`` to the
        first three class probabilities, rounded to 4 decimals.
        NOTE(review): this assumes the model's classes are ordered
        Invalid, Unhelpful, Useful — confirm against ``model.classes_``.
    """
    class_labels = ("Invalid", "Unhelpful", "Useful")

    # Text features for the concatenated pair.
    text_matrix = vectorizer.transform([intervention + " " + question])

    # Structured features as a single scaled row.
    struct_matrix = scaler.transform(pd.DataFrame([features_dict]))

    class_probs = model.predict_proba(hstack([text_matrix, struct_matrix]))[0]
    return {
        label: round(class_probs[position], 4)
        for position, label in enumerate(class_labels)
    }


def compute_features(intervention, question):
    """Derive the structured features for one intervention/question pair.

    Returns a dict with, in this order: the question's whitespace word count,
    its character count, a bag-of-words cosine similarity between the two
    texts (stored under ``"bm25_similarity"``; it is a cosine stand-in, not a
    real BM25 score), the share of distinct lowercase question tokens that
    also occur in the intervention, and ``"max_similarity"``, which currently
    repeats the cosine value. All ratios are rounded to 4 decimals.
    """
    n_words = len(question.split())
    n_chars = len(question)

    # Lexical overlap on lowercase, whitespace-separated tokens.
    question_vocab = set(question.lower().split())
    shared_vocab = set(intervention.lower().split()) & question_vocab
    overlap_ratio = len(shared_vocab) / max(len(question_vocab), 1)

    # Cosine similarity of raw term counts over a vocabulary fitted on just
    # these two texts; the small epsilon guards against a zero denominator.
    pair = [intervention, question]
    counts = CountVectorizer().fit(pair).transform(pair).toarray()
    intervention_vec, question_vec = counts[0], counts[1]
    norm_product = np.linalg.norm(intervention_vec) * np.linalg.norm(question_vec)
    cosine = np.dot(intervention_vec, question_vec) / (norm_product + 1e-10)

    return {
        "question_word_count": n_words,
        "question_char_count": n_chars,
        "bm25_similarity": round(cosine, 4),
        "word_overlap": round(overlap_ratio, 4),
        # Placeholder: same value as the cosine similarity for now.
        "max_similarity": round(cosine, 4),
    }

In [160]:
def reranker(intervention, list_of_cq):
    """Order candidate questions by their predicted "Useful" probability.

    Args:
        intervention: Intervention text the questions refer to.
        list_of_cq: Iterable of question strings, or a single string holding
            one question per line (blank lines are dropped).

    Returns:
        A ``(scored, questions)`` tuple: ``scored`` is a list of
        ``(question, useful_probability)`` pairs sorted from highest to lowest
        score, and ``questions`` is the same ordering without the scores.
    """
    # Accept a newline-separated string as a convenience.
    if isinstance(list_of_cq, str):
        list_of_cq = [line.strip() for line in list_of_cq.split('\n') if line.strip()]

    scored = [
        (
            candidate,
            predict_usefulness(
                intervention, candidate, compute_features(intervention, candidate)
            )["Useful"],
        )
        for candidate in list_of_cq
    ]

    # Highest usefulness first; the sort is stable, so ties keep input order.
    scored.sort(key=lambda pair: pair[1], reverse=True)

    return scored, [candidate for candidate, _ in scored]

### Dataset preparation

In [161]:
import json

# Load the agent definitions from prompt_agents.json (path is relative to the
# notebook's working directory).
with open("prompt_agents.json", "r") as file:
    data = json.load(file)


# Bind each agent to its own global so later cells can pass it to build_config().
# NOTE(review): agents are selected by position, so the order of the entries in
# the "agents" array of prompt_agents.json matters.
QC_LLM_generator = data["agents"][0]
QC_LLM_evaluator = data["agents"][1]
QC_LLM_improver_ml = data["agents"][2]
QC_LLM_improver_llm = data["agents"][3]
QC_LLM_improver_both = data["agents"][4]

# Each variable holds a dictionary with the keys "name", "description", and "prompt".
# Print every agent in full as a sanity check.
print("QC_LLM_generator:", QC_LLM_generator)
print("QC_LLM_evaluator:", QC_LLM_evaluator)
print("QC_LLM_improver_ml:", QC_LLM_improver_ml)
print("QC_LLM_improver_llm:", QC_LLM_improver_llm)
print("QC_LLM_improver_both:", QC_LLM_improver_both)

# Then print the generator's fields one by one.
print("\nGenerator Details:")
print("Name:", QC_LLM_generator["name"])
print("Description:", QC_LLM_generator["description"])
print("Prompt:", QC_LLM_generator["prompt"])

QC_LLM_improver_ml: {'name': 'Critical Question Improver (ML Rated)', 'description': 'Improves critical questions using ML rating feedback. It analyzes the questions, applies ML feedback, and produces a ranked list of improved questions.', 'prompt': "You are provided with the following:\n- An intervention: [INTERVENTION_PLACEHOLDER]\n- A list of critical questions with their usefulness score from 0-1: [ML_RATING_FEEDBACK_PLACEHOLDER]\n\n\n1. Carefully analyze each question's strengths and weaknesses based on the intervention and ML feedback.\n2. Revise each question to be clearer, more specific, and better at challenging key assumptions, exploring risks, or proposing alternative perspectives.\n3. Avoid vague, obvious, or out-of-scope questions.\n4. Integrate the ML feedback carefully for improvements.\n\nFinally:\n- Return a list of improved questions, ranked from strongest to weakest based on quality and depth.\n- Make sure the questions are open-ended and meaningful for critical anal

In [162]:
import spacy
import json
import pandas as pd

# Load dataset
# NOTE(review): absolute, machine-specific Windows path.
file_path = "D:\\My_working_area\\Masters\\Semester 2\\NLP804\\Project\\Critical_Question_generation\\data_splits\\testing_dataset.json"

with open(file_path, "r", encoding="utf-8") as file:
    data = json.load(file)

# Build one record per intervention (the individual reference questions stored
# in the dataset are not needed here).
preprocessed_data = []

for intervention_id, content in data.items():
    intervention_text = content.get("intervention", "")
    dataset = content.get("dataset", "")
    # De-duplicate the scheme names while keeping their first-seen order.
    # A set would also remove duplicates, but its iteration order for strings
    # can differ between interpreter runs, making the joined text
    # non-reproducible.
    scheme_types = list(dict.fromkeys(content.get("schemes", [])))

    preprocessed_data.append({
        "intervention_id": intervention_id,
        "intervention": intervention_text,
        "dataset": dataset,
        "schemes": ", ".join(scheme_types)
    })

# Convert to DataFrame and drop duplicates by intervention_id
df = pd.DataFrame(preprocessed_data).drop_duplicates(subset="intervention_id").reset_index(drop=True)

# Preview
df.head()


Unnamed: 0,intervention_id,intervention,dataset,schemes
0,MT_45,"MT: ""Claire’s absolutely right about that\nBut...",moral_maze_schemes,CauseToEffect
1,cd38_220_2,"cd38: ""The return flight is 6 ½ hours (plus co...",rrd,ERExample
2,TRUMP_183,"TRUMP: ""I do want to say that I was just endor...",US2016,"Values, PositionToKnow"
3,AngelComa__638,"AngelComa: ""Funny, I thought he did good\nThey...",us2016reddit,Ad hominem
4,TRUMP_114_2,"TRUMP: ""our country 's a mess\nit 's one thing...",US2016,"ExpertOpinion, VerbalClassification, GenericAd..."


In [163]:
# Number of rows in the frame (one row per intervention_id).
len(df)

38

In [164]:
# df = df.head(5)

In [165]:
import json
from sentence_transformers import SentenceTransformer
import numpy as np
import numpy
from collections import Counter
import sys
import argparse
import logging

from prompts_eval import *
from openai import OpenAI
import re
import os
from dotenv import load_dotenv
logger = logging.getLogger(__name__)
from evaluation import *
def evaluate_results(target_file, result_file_name, *, golden_path=None, threshold=0.6):
    """Evaluate a submission file and store the evaluation result as JSON.

    Args:
        target_file: Path of the submission JSON to evaluate.
        result_file_name: Path the evaluation result is written to.
        golden_path: Path of the reference dataset. ``None`` (the default)
            selects the project's testing split.
        threshold: Threshold forwarded to ``eval_func`` (default 0.6).

    ``eval_func`` is not defined in this notebook; presumably it is provided
    by one of the star imports above (``evaluation`` / ``prompts_eval``) —
    confirm.
    """
    if golden_path is None:
        # NOTE(review): absolute, machine-specific Windows path.
        golden_path = "D:\\My_working_area\\Masters\\Semester 2\\NLP804\\Project\\Critical_Question_generation\\data_splits\\testing_dataset.json"
    print("Starting", target_file)
    result = eval_func(threshold=threshold, golden_path=golden_path, submission_path=target_file)
    with open(result_file_name, 'w', encoding="utf-8") as o:
        json.dump(result, o, indent=4)

## Approach 1: Mirroring Agents, Evaluated with the ML Model Only
<img src="images/approach_ml_eval_only.png" alt="Approach 1: ML-only evaluation pipeline" width="200">

In [None]:

# Approach 1: generate questions, score them with the ML model, let the
# ML-rated improver agent rewrite them, re-rank, and keep the top three.
# The whole pipeline is repeated as independent runs (j = 1, 2); every run
# writes its own submission / analysis files and is evaluated once, after all
# interventions have been processed.
for j in range(1, 3):
    # f-strings: the run number is an int and cannot be concatenated with "+".
    target_file = f"critical_questions/Approach_1_ML_evaluator_llama405{j}.json"
    result_file_name_evaluation = f"evaluations/Evaluation_approach_1_ML_evaluator_llama405{j}.json"
    analysis_file = f"double_check/approach_1_ML_evaluator_llama405{j}.json"

    # Create the output folders up front so a missing directory cannot discard
    # the results after all the model calls have been made.
    for out_path in (target_file, result_file_name_evaluation, analysis_file):
        os.makedirs(os.path.dirname(out_path), exist_ok=True)

    # Fresh containers per run, so one run's records never leak into the next.
    results = {}
    results_to_analyze = []

    for i, row in df.iterrows():
        print("QUESTION=======", i)
        intervention = row["intervention"]

        # Step 1: Generate questions
        gen_config = build_config(QC_LLM_generator, row)
        generated_questions, prompt_generator = query_model(gen_config)

        # Step 2: Score the generated questions with the ML model.
        # (The LLM evaluator agent, QC_LLM_evaluator, is not used in this approach.)
        list_of_cq = extract_CQ(generated_questions)
        CQs_score, _ = reranker(intervention, list_of_cq)
        # Format the scored list into a prompt-friendly block
        formatted_scores = "\n".join(
            f"{rank + 1}. {cq.strip()}  \n   → Score: {score:.4f}"
            for rank, (cq, score) in enumerate(CQs_score)
        )

        # Step 3: Improve the questions using the ML scores as feedback
        improve_config = build_config(QC_LLM_improver_ml, row, {
            "[ML_RATING_FEEDBACK_PLACEHOLDER]": formatted_scores
        })
        improved_questions, prompt_improver = query_model(improve_config)

        # Step 4: Extract the improved questions from the model's reply
        list_answer = extract_CQ(improved_questions)

        # Step 5: Re-rank the improved questions
        list_ordered_score, list_ordered_cq = reranker(intervention, list_answer)

        # Step 6: Keep the three best questions. Slicing tolerates a reply that
        # yielded fewer than three questions instead of raising IndexError.
        top_cqs = list_ordered_cq[:3]
        if len(top_cqs) < 3:
            print(f"WARNING: only {len(top_cqs)} question(s) for {row['intervention_id']}")
        concatenated_answer = "\n".join([f"- {cq}" for cq in top_cqs])

        results_to_analyze.append({
            "intervention_id": row["intervention_id"],
            "intervention": row["intervention"],
            "prompt_generator": prompt_generator,
            "prompt_improver": prompt_improver,
            "generated_questions": generated_questions,
            "ML_results": formatted_scores,
            "improved_questions": improved_questions,
            "List_of_questions": list_answer,
            "list_of_questions_order_score": list_ordered_score,
            "list_of_questions_order": list_ordered_cq,
            "concatenated_answer": concatenated_answer
        })

        results[row["intervention_id"]] = {
            "intervention_id": row["intervention_id"],
            "intervention": row["intervention"],
            "dataset": row["dataset"],
            "schemes": [scheme.strip() for scheme in row["schemes"].split(",")],
            "cqs": [{"id": idx, "cq": cq} for idx, cq in enumerate(top_cqs)],
            "full_response": concatenated_answer
        }

    # Save this run's submission and its detailed trace, then evaluate it.
    with open(target_file, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    with open(analysis_file, "w", encoding="utf-8") as f:
        json.dump(results_to_analyze, f, ensure_ascii=False, indent=2)
    print("✅ Results saved")
    evaluate_results(target_file, result_file_name_evaluation)

✅ Results saved
Starting critical_questions/Approach_1_ML_evaluator_llama405.json


Batches: 100%|██████████| 1/1 [00:00<00:00,  4.58it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.34s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  7.89it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.12s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  8.95it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.08s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  7.12it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.19s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  6.75it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.21s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  3.96it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.09s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  8.46it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.65s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  8.93it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.70s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  8.99it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.70s/it]
Batches: 1

Similarity below threshold for 'What are the limitations of relying solely on a website to communicate a politician's plan, and how might this approach exacerbate existing issues of accessibility, transparency, and accountability in democratic processes?'. Using LLM fallback evaluation.
LLM fallback label: useful


Batches: 100%|██████████| 1/1 [00:00<00:00,  8.54it/s]
Batches: 100%|██████████| 2/2 [00:02<00:00,  1.45s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  8.64it/s]
Batches: 100%|██████████| 2/2 [00:02<00:00,  1.46s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  7.86it/s]
Batches: 100%|██████████| 2/2 [00:02<00:00,  1.50s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  8.89it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.28s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  7.31it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.29s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  8.58it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.26s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  8.67it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.03it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  8.27it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.06s/it]


Similarity below threshold for 'How does the text support or challenge the claim that the credit card company's advertisement is irresponsible, and what are the key points of contention?'. Using LLM fallback evaluation.
LLM fallback label: useful


Batches: 100%|██████████| 1/1 [00:00<00:00,  8.01it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.15s/it]


Similarity below threshold for 'How does the speaker's possession of the letter and its contents support their claim that the advertisement is irresponsible, and what specific aspects of the letter are most relevant to this claim?'. Using LLM fallback evaluation.
LLM fallback label: useful


Batches: 100%|██████████| 1/1 [00:00<00:00,  6.53it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.28it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  8.80it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.27it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  9.05it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.30it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  6.86it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.09s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  7.21it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.02s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  8.48it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.12s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  8.93it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.11it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  8.61it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.08s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  7.30it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.12it/s]
Batches: 1

Similarity below threshold for 'What are the potential risks and unintended consequences of relying on Arab and Kurdish partners to defeat ISIS in Raqqa, and how will these risks be mitigated?'. Using LLM fallback evaluation.
LLM fallback label: useful


Batches: 100%|██████████| 1/1 [00:00<00:00,  5.69it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.28s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00, 10.21it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.25s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00, 10.05it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.11s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  9.30it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.76s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00, 10.10it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.83s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00, 10.40it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.77s/it]


Similarity below threshold for 'How does the speaker’s understanding of "advantage" influence their concern about taking advantage of people, and what are the implications of this understanding for the proposed solution?'. Using LLM fallback evaluation.
LLM fallback label: useful


Batches: 100%|██████████| 1/1 [00:00<00:00, 10.20it/s]
Batches: 100%|██████████| 2/2 [00:02<00:00,  1.13s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  7.98it/s]
Batches: 100%|██████████| 2/2 [00:02<00:00,  1.10s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  8.76it/s]
Batches: 100%|██████████| 2/2 [00:02<00:00,  1.11s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  9.11it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.02s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  6.82it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.04it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  7.84it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.00it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  8.01it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.69s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  9.42it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.50s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  7.49it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.47s/it]
Batches: 1

Similarity below threshold for 'What specific policy changes or decisions, prior to the economic crisis, have been empirically proven to have directly contributed to the crisis, and how do these findings inform the claim that returning to those policies would have a similar effect?'. Using LLM fallback evaluation.
LLM fallback label: useful


Batches: 100%|██████████| 1/1 [00:00<00:00,  7.66it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.04it/s]


Similarity below threshold for 'What economic indicators, data, or research support the assertion that the United States is on the precipice of having a potentially much better economy, and how do these indicators address potential risks or challenges to this outlook?'. Using LLM fallback evaluation.
LLM fallback label: useful


Batches: 100%|██████████| 1/1 [00:00<00:00,  9.24it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.07it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  7.76it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.29s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  7.25it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.33s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  8.99it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.20s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  9.56it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.10it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  7.23it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.24it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  9.09it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.18it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  7.45it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.64s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  8.27it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.69s/it]
Batches: 1

Similarity below threshold for 'How might the author's involvement with online support forums and their willingness to work with the decision-making process influence their argument, and what steps can be taken to ensure that the decision-making process is transparent, inclusive, and unbiased?'. Using LLM fallback evaluation.
LLM fallback label: useful


Batches: 100%|██████████| 1/1 [00:00<00:00,  6.88it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.12s/it]


Similarity below threshold for 'What are the potential consequences of using physical presence and contribution to society as the sole criteria for determining access to coverage, and how might this approach impact the broader social contract?'. Using LLM fallback evaluation.
LLM fallback label: useful


Batches: 100%|██████████| 1/1 [00:00<00:00,  7.18it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.06s/it]


Similarity below threshold for 'How might the message be strengthened or weakened by addressing potential counterarguments or concerns about the feasibility and fairness of providing coverage to non-citizens, and what are the implications of ignoring or dismissing these concerns?'. Using LLM fallback evaluation.
LLM fallback label: useful


Batches: 100%|██████████| 1/1 [00:00<00:00,  8.34it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.20s/it]


Similarity below threshold for 'How might the tone and language used in the message, particularly the use of profanity, influence the reception and credibility of the argument, and what are the implications of using such language in public discourse?'. Using LLM fallback evaluation.
LLM fallback label: useful


Batches: 100%|██████████| 1/1 [00:00<00:00,  9.20it/s]
Batches: 100%|██████████| 1/1 [00:02<00:00,  2.21s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  7.26it/s]
Batches: 100%|██████████| 1/1 [00:02<00:00,  2.22s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  8.97it/s]
Batches: 100%|██████████| 1/1 [00:02<00:00,  2.34s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  7.49it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.75s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  8.96it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.81s/it]


Similarity below threshold for 'How does Trump's ability to negotiate with countries like Japan and Saudi Arabia demonstrate his own qualifications for the presidency, and what specific skills or experiences does he bring to these negotiations that would be relevant to the office?'. Using LLM fallback evaluation.
LLM fallback label: useful


Batches: 100%|██████████| 1/1 [00:00<00:00,  6.96it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.92s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  8.69it/s]
Batches: 100%|██████████| 2/2 [00:01<00:00,  1.20it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 10.56it/s]
Batches: 100%|██████████| 2/2 [00:01<00:00,  1.20it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  9.96it/s]
Batches: 100%|██████████| 2/2 [00:01<00:00,  1.29it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  8.33it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.39s/it]


Similarity below threshold for 'How will Clinton's plans to increase taxes on the wealthy and close corporate loopholes affect the middle class and small business owners, particularly in terms of potential changes to employment rates, economic growth, and income inequality?'. Using LLM fallback evaluation.
LLM fallback label: useful


Batches: 100%|██████████| 1/1 [00:00<00:00,  8.43it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.47s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  7.53it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.45s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  9.06it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.47s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  6.38it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.50s/it]


Similarity below threshold for 'Does the text's emphasis on the EpiPen's limitations as a safety measure overlook potential alternative emergency response strategies, such as improved air filtration systems or enhanced flight crew training, and what are the feasibility and potential impact of these alternatives?'. Using LLM fallback evaluation.
LLM fallback label: useful


Batches: 100%|██████████| 1/1 [00:00<00:00,  8.23it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.44s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  8.60it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.45s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  7.48it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.50s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  7.16it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.43s/it]


Similarity below threshold for 'Is Trump's criticism of Secretary Clinton and other politicians for not addressing economic issues earlier fair, considering the complexities of economic policy, the role of various factors in shaping the economy, and the trade-offs involved in policy decisions? What are the strengths and limitations of this criticism?'. Using LLM fallback evaluation.
LLM fallback label: useful
Distribution of the labels: Counter({'Useful': 82, 'LLM_useful': 15, 'Unhelpful': 14, 'Invalid': 3})
Distribution of the intervention punctuation: Counter({1.0: 25, 0.6666666666666666: 9, 0.3333333333333333: 4})
Overall punctuation 0.8508771929824562


In [None]:
import pandas as pd
import json

# Load your dataset (update path if necessary)
# NOTE(review): placeholder path. This rebinds the global `df`; the loop below
# expects the columns intervention_id, intervention, dataset and schemes.
df = pd.read_json("data/your_input_file.json")

# Initialize containers
final_results = {}
final_results_to_analyze = []
num_iterations = 2  # You can set to 2, 3, 4, etc.

# Main loop: generate once, then score -> improve `num_iterations` times,
# and finally re-rank and keep the top three questions.
for i, row in df.iterrows():
    print(f"🔁 Processing intervention {i} - ID: {row['intervention_id']}")
    # Bound before the refinement loop so the final re-ranking below does not
    # depend on that loop having run at least once.
    intervention = row["intervention"]

    gen_config = build_config(QC_LLM_generator, row)
    generated_questions, prompt_generator = query_model(gen_config)
    current_questions = extract_CQ(generated_questions)

    # Records of the last refinement round; they stay None only when
    # num_iterations is 0.
    formatted_scores = None
    prompt_improver = None
    improved_questions = None

    for iteration in range(1, num_iterations + 1):
        print(f"  ➤ Round {iteration} refinement")

        # Score the current questions and hand the scores to the improver.
        CQs_score, _ = reranker(intervention, current_questions)
        formatted_scores = "\n".join(
            f"{rank + 1}. {cq.strip()}  \n   → Score: {score:.4f}"
            for rank, (cq, score) in enumerate(CQs_score)
        )

        improve_config = build_config(QC_LLM_improver_ml, row, {
            "[ML_RATING_FEEDBACK_PLACEHOLDER]": formatted_scores,
            "[QUESTIONS_PLACEHOLDER]": "\n".join(current_questions)
        })
        improved_questions, prompt_improver = query_model(improve_config)
        current_questions = extract_CQ(improved_questions)

    list_ordered_score, list_ordered_cq = reranker(intervention, current_questions)

    # Slicing tolerates fewer than three questions instead of raising IndexError.
    top_cqs = list_ordered_cq[:3]
    if len(top_cqs) < 3:
        print(f"WARNING: only {len(top_cqs)} question(s) for {row['intervention_id']}")
    concatenated_answer = "\n".join([f"- {cq}" for cq in top_cqs])

    # Only the last round's prompt, scores and reply are recorded here.
    final_results_to_analyze.append({
        "intervention_id": row["intervention_id"],
        "intervention": row["intervention"],
        "prompt_generator": prompt_generator,
        "prompt_improver": prompt_improver,
        "generated_questions": generated_questions,
        "ML_results": formatted_scores,
        "improved_questions": improved_questions,
        "List_of_questions": current_questions,
        "list_of_questions_order_score": list_ordered_score,
        "list_of_questions_order": list_ordered_cq,
        "concatenated_answer": concatenated_answer,
        "final_round": num_iterations
    })

    final_results[row["intervention_id"]] = {
        "intervention_id": row["intervention_id"],
        "intervention": row["intervention"],
        "dataset": row["dataset"],
        "schemes": [scheme.strip() for scheme in row["schemes"].split(",")],
        "cqs": [{"id": idx, "cq": cq} for idx, cq in enumerate(top_cqs)],
        "full_response": concatenated_answer
    }

# Save final results
with open(f"critical_questions/final_output_{num_iterations}iters.json", "w", encoding="utf-8") as f:
    json.dump(final_results, f, ensure_ascii=False, indent=2)

with open(f"double_check/final_analysis_{num_iterations}iters.json", "w", encoding="utf-8") as f:
    json.dump(final_results_to_analyze, f, ensure_ascii=False, indent=2)

# Run evaluation
evaluate_results(
    f"critical_questions/final_output_{num_iterations}iters.json",
    f"evaluations/final_evaluation_{num_iterations}iters.json"
)
