# Mirroring with Agents

### Main Components for the experiment

In [140]:
import joblib
import pandas as pd
from scipy.sparse import hstack
import os
from dotenv import load_dotenv
import os
from openai import OpenAI
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Load variables from a local .env file into the process environment.
load_dotenv()
# NOTE(review): hardcoded credential. Prefer reading it from the environment
# (the CQ-extraction cell already does this with os.getenv("OPENAI_API_KEY"));
# the variable name to use for this key is not visible here — confirm.
# query_model() sends this same key to the DeepInfra endpoint.
api_key = "ge.."
# Client on the default OpenAI endpoint built from the key above.
# NOTE(review): not referenced by any cell visible in this notebook — confirm it is needed.
client = OpenAI(api_key=api_key)

In [141]:
# Placeholder tokens that an agent prompt template may contain.
ALL_PLACEHOLDERS = [
    "[INTERVENTION_PLACEHOLDER]",
    "[QUESTIONS_PLACEHOLDER]",
    "[ML_RATING_FEEDBACK_PLACEHOLDER]",
    "[LLM_FEEDBACK_PLACEHOLDER]"
]
def build_config(agent_dict, row, extra_placeholders=None):
    """Assemble the config dict that ``query_model`` consumes for one agent call.

    Args:
        agent_dict: Agent definition with the keys ``"name"``, ``"prompt"``
            and ``"description"``.
        row: Mapping (e.g. a DataFrame row) exposing ``row["intervention"]``.
        extra_placeholders: Optional mapping of placeholder token -> value.
            Only tokens listed in ``ALL_PLACEHOLDERS`` are applied; any other
            key is ignored. ``None`` (the default) means "no extras".

    Returns:
        dict with the keys ``agent_type``, ``prompt``, ``description``,
        ``intervention`` and ``placeholders`` (one entry per known token,
        empty string when no value was supplied).
    """
    # Start with empty values for all placeholders
    placeholders = {ph: "" for ph in ALL_PLACEHOLDERS}

    # Always add intervention
    placeholders["[INTERVENTION_PLACEHOLDER]"] = row["intervention"]

    # Add whatever else is needed from the pipeline.
    # `or {}` replaces the former mutable default argument and tolerates None.
    for key, val in (extra_placeholders or {}).items():
        if key in placeholders:
            placeholders[key] = val

    return {
        "agent_type": agent_dict["name"],
        "prompt": agent_dict["prompt"],
        "description": agent_dict["description"],
        "intervention": row["intervention"],
        "placeholders": placeholders
    }


In [142]:
#================ QUERYING THE MODEL

def query_model(config):
    """Fill an agent's prompt template and send it to the chat-completions API.

    Args:
        config: dict as produced by ``build_config`` with the keys ``prompt``,
            ``description``, ``intervention`` and ``placeholders``.
            The dict is not modified.

    Returns:
        tuple ``(response, prompt)``: the text of the first completion choice
        and the fully substituted user prompt that was sent.
    """
    # Named `llm_client` so the local does not shadow the `openai` package name.
    llm_client = OpenAI(
        api_key=api_key,
        base_url="https://api.deepinfra.com/v1/openai",
    )
    prompt = config["prompt"]
    description = config["description"]
    intervention = config["intervention"]
    # Work on a copy so the caller's config is left untouched.
    placeholders = dict(config["placeholders"])

    # Always enforce the intervention placeholder to be accurate
    placeholders["[INTERVENTION_PLACEHOLDER]"] = intervention

    # Replace placeholders in prompt; empty values become a visible marker.
    for placeholder, value in placeholders.items():
        # str() keeps str.replace from raising on non-string values.
        replacement = str(value) if value else f"<MISSING {placeholder}>"
        prompt = prompt.replace(placeholder, replacement)

    messages = [
        {"role": "system", "content": description},
        {"role": "user", "content": prompt}
    ]
    # Send request
    chat_completion = llm_client.chat.completions.create(
        model= "meta-llama/Meta-Llama-3.1-405B-Instruct",
        messages=messages,
        temperature=0.0
    )
    response = chat_completion.choices[0].message.content

    return response, prompt




In [143]:
# OpenAI is already imported in the first cell; repeated here so this cell is self-describing.
from openai import OpenAI
# API key read from the OPENAI_API_KEY environment variable (None when it is unset).
api_key2 = os.getenv("OPENAI_API_KEY")
# Client used by extract_CQ for the question-extraction request.
client2 = OpenAI(api_key=api_key2)
import ast
import re


def _literal_list(text):
    """Parse ``text`` as a Python list/tuple literal.

    Returns a list of the non-empty, stripped string forms of its items,
    or ``None`` when ``text`` is not such a literal.
    """
    try:
        parsed = ast.literal_eval(text)
    except (SyntaxError, ValueError, TypeError, MemoryError, RecursionError):
        return None
    if not isinstance(parsed, (list, tuple)):
        return None
    items = (str(item).strip() for item in parsed)
    return [item for item in items if item]


def _parse_question_list(response_text):
    """Turn the extractor's raw text answer into a list of question strings.

    Tries progressively more lenient strategies: a literal parse of the whole
    text, a literal parse after removing Markdown code fences, a literal parse
    of the outermost ``[...]`` span, a split on item boundaries inside the
    brackets, and finally one question per line.
    """
    if not response_text:
        return []

    parsed = _literal_list(response_text.strip())
    if parsed is not None:
        return parsed

    # Remove a surrounding Markdown code fence, if any.
    cleaned = response_text.strip()
    if cleaned.startswith("```python"):
        cleaned = cleaned.split("```python", 1)[1]
    if cleaned.startswith("```"):
        cleaned = cleaned.split("```", 1)[1]
    if cleaned.endswith("```"):
        cleaned = cleaned.rsplit("```", 1)[0]
    cleaned = cleaned.strip()

    parsed = _literal_list(cleaned)
    if parsed is not None:
        return parsed

    start, end = cleaned.find("["), cleaned.rfind("]")
    if start != -1 and end > start:
        # The list may be wrapped in prose ("Here are the questions: [...]").
        parsed = _literal_list(cleaned[start:end + 1])
        if parsed is not None:
            return parsed

        body = cleaned[start + 1:end]
        # Prefer splitting on quote-comma-quote boundaries so that commas
        # inside a question do not cut it apart; plain comma split otherwise.
        boundary = r"""["']\s*,\s*["']"""
        pieces = re.split(boundary, body) if re.search(boundary, body) else body.split(",")
        items = [piece.strip(" \t\r\n,'\"") for piece in pieces]
        return [item for item in items if item]

    # Last resort - one question per line, dropping bullets / numbering.
    items = []
    for line in cleaned.split("\n"):
        line = re.sub(r"^\s*(?:[-*]\s+|\d{1,2}[.)]\s+)", "", line)
        line = line.strip().strip("'\"-, ")
        if line:
            items.append(line)
    return items


def extract_CQ(response):
    """Ask the extraction model for the critical questions contained in ``response``.

    Args:
        response: Free-text LLM output that contains the critical questions.

    Returns:
        list of question strings (empty when nothing could be extracted).
    """
    # Send request
    prompt = f"Extract the 9 critical questions from the text below, returning only a Python list of the questions without numbering or formatting. For example, if the text contains '1. What is your name?' just include 'What is your name?' in the list.\n\nText: {response}"

    completion = client2.chat.completions.create(
        messages=[
            {"role": "user", "content": prompt}
        ],
        model="o4-mini-2025-04-16"
    )

    return _parse_question_list(completion.choices[0].message.content)

In [144]:
#================ ML MODEL
# Directory holding the serialized classifier, text vectorizer and feature scaler.
# NOTE(review): absolute, machine-specific path — consider a configurable directory.
load_dir = "D:\\My_working_area\\Masters\\Semester 2\\NLP804\\Project\\Critical_Question_generation\\models"
# joblib.load unpickles its input, so only load artefacts from a trusted source.
model, vectorizer, scaler = (
    joblib.load(os.path.join(load_dir, artifact_name))
    for artifact_name in (
        "logistic_model_with_features.joblib",
        "tfidf_vectorizer_with_bigrams.joblib",
        "structured_feature_scaler.joblib",
    )
)

#--------------Features and prediction of ML
def predict_usefulness(intervention, question, features_dict):
    """Score one (intervention, question) pair with the loaded classifier.

    The text part is the intervention and question joined by a space and
    passed through ``vectorizer``; the structured part is ``features_dict``
    turned into a one-row DataFrame and passed through ``scaler``. Both parts
    are stacked horizontally before ``model.predict_proba`` is called.

    Returns:
        dict mapping "Invalid", "Unhelpful" and "Useful" to the probabilities
        at positions 0, 1 and 2, each rounded to 4 decimals.
    """
    # NOTE(review): assumes predict_proba columns are ordered
    # Invalid / Unhelpful / Useful — confirm against model.classes_.
    labels = ("Invalid", "Unhelpful", "Useful")

    text_block = vectorizer.transform([" ".join((intervention, question))])
    struct_block = scaler.transform(pd.DataFrame([features_dict]))
    class_probs = model.predict_proba(hstack([text_block, struct_block]))[0]

    return {label: round(class_probs[pos], 4) for pos, label in enumerate(labels)}


def compute_features(intervention, question):
    """
    Derive the structured features that accompany the text input of the classifier.

    Returns a dict with, in this order:
      - question_word_count: number of whitespace-separated tokens in the question
      - question_char_count: length of the question string
      - bm25_similarity: cosine similarity of the two bag-of-words count vectors
        (used as a stand-in for BM25), rounded to 4 decimals
      - word_overlap: share of distinct lower-cased question tokens that also
        occur in the intervention, rounded to 4 decimals
      - max_similarity: placeholder, set to the same cosine similarity
    """
    # Lexical overlap on lower-cased whitespace tokens.
    intervention_vocab = set(intervention.lower().split())
    question_vocab = set(question.lower().split())
    shared_count = len(intervention_vocab & question_vocab)
    overlap_ratio = shared_count / max(len(question_vocab), 1)

    # Bag-of-words vectors over the joint vocabulary of both texts.
    bow = CountVectorizer().fit([intervention, question])
    intervention_vec, question_vec = bow.transform([intervention, question]).toarray()
    # The small epsilon keeps the division defined when a vector is all zeros.
    denominator = np.linalg.norm(intervention_vec) * np.linalg.norm(question_vec) + 1e-10
    cosine = np.dot(intervention_vec, question_vec) / denominator

    features = {
        "question_word_count": len(question.split()),
        "question_char_count": len(question),
        "bm25_similarity": round(cosine, 4),
        "word_overlap": round(overlap_ratio, 4),
    }
    # Max similarity placeholder: mirrors the cosine value.
    features["max_similarity"] = round(cosine, 4)
    return features

In [145]:
def reranker(intervention, list_of_cq):
    """Order candidate questions by their predicted "Useful" probability.

    Args:
        intervention: Text the questions refer to.
        list_of_cq: Iterable of questions, or a single newline-separated
            string (blank lines are dropped, each line is stripped).

    Returns:
        tuple ``(scored, ordered)``: ``scored`` is a list of
        ``(question, useful_probability)`` pairs sorted from highest to lowest
        score; ``ordered`` holds just the questions in that same order.
    """
    # Accept a newline-separated string as well as a list.
    if isinstance(list_of_cq, str):
        list_of_cq = [line.strip() for line in list_of_cq.split('\n') if line.strip()]

    scored = [
        (
            candidate,
            predict_usefulness(
                intervention, candidate, compute_features(intervention, candidate)
            )["Useful"],
        )
        for candidate in list_of_cq
    ]

    # Stable sort, best score first; ties keep their input order.
    scored.sort(key=lambda pair: pair[1], reverse=True)

    return scored, [candidate for candidate, _ in scored]

### Dataset preparation

In [146]:
import json

# Load the agent definitions. The encoding is given explicitly so the prompts
# are decoded the same way on every platform (the dataset loader also reads UTF-8).
with open("prompt_agents.json", "r", encoding="utf-8") as file:
    data = json.load(file)


# Extract each agent's details into individual variables.
# The agents are addressed by their position in the "agents" list.
QC_LLM_generator = data["agents"][0]
QC_LLM_evaluator = data["agents"][1]
QC_LLM_improver_ml = data["agents"][2]
QC_LLM_improver_llm = data["agents"][3]
QC_LLM_improver_both = data["agents"][4]

# Now, each variable holds a dictionary with keys: "name", "description", and "prompt".
# For example, you can print the details of each agent as follows:
print("QC_LLM_generator:", QC_LLM_generator)
print("QC_LLM_evaluator:", QC_LLM_evaluator)
print("QC_LLM_improver_ml:", QC_LLM_improver_ml)
print("QC_LLM_improver_llm:", QC_LLM_improver_llm)
print("QC_LLM_improver_both:", QC_LLM_improver_both)

# Example usage: print generator details individually
print("\nGenerator Details:")
print("Name:", QC_LLM_generator["name"])
print("Description:", QC_LLM_generator["description"])
print("Prompt:", QC_LLM_generator["prompt"])

QC_LLM_improver_ml: {'name': 'Critical Question Improver (ML Rated)', 'description': 'Improves critical questions using ML rating feedback. It analyzes the questions, applies ML feedback, and produces a ranked list of improved questions.', 'prompt': "You are provided with the following:\n- An intervention: [INTERVENTION_PLACEHOLDER]\n- A list of critical questions with their usefulness score from 0-1: [ML_RATING_FEEDBACK_PLACEHOLDER]\n\n\n1. Carefully analyze each question's strengths and weaknesses based on the intervention and ML feedback.\n2. Revise each question to be clearer, more specific, and better at challenging key assumptions, exploring risks, or proposing alternative perspectives.\n3. Avoid vague, obvious, or out-of-scope questions.\n4. Integrate the ML feedback carefully for improvements.\n\nFinally:\n- Return a list of improved questions, ranked from strongest to weakest based on quality and depth.\n- Make sure the questions are open-ended and meaningful for critical anal

In [147]:
import spacy
import json
import pandas as pd

# Load dataset
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
file_path = "D:\\My_working_area\\Masters\\Semester 2\\NLP804\\Project\\Critical_Question_generation\\data_splits\\testing_dataset.json"

with open(file_path, "r", encoding="utf-8") as file:
    data = json.load(file)

# Prepare processed data
preprocessed_data = []

# Process each intervention (ignore individual questions)
for intervention_id, content in data.items():
    intervention_text = content.get("intervention", "")
    dataset = content.get("dataset", "")
    # De-duplicate the schemes while keeping their first-seen order.
    # A set would make the joined string differ between kernel sessions,
    # because string hashing is randomized per interpreter run.
    scheme_types = list(dict.fromkeys(content.get("schemes", [])))

    preprocessed_data.append({
        "intervention_id": intervention_id,
        "intervention": intervention_text,
        "dataset": dataset,
        "schemes": ", ".join(scheme_types)
    })

# Convert to DataFrame and drop duplicates by intervention_id
df = pd.DataFrame(preprocessed_data).drop_duplicates(subset="intervention_id").reset_index(drop=True)

# Preview
df.head()


Unnamed: 0,intervention_id,intervention,dataset,schemes
0,MT_45,"MT: ""Claire’s absolutely right about that\nBut...",moral_maze_schemes,CauseToEffect
1,cd38_220_2,"cd38: ""The return flight is 6 ½ hours (plus co...",rrd,ERExample
2,TRUMP_183,"TRUMP: ""I do want to say that I was just endor...",US2016,"PositionToKnow, Values"
3,AngelComa__638,"AngelComa: ""Funny, I thought he did good\nThey...",us2016reddit,Ad hominem
4,TRUMP_114_2,"TRUMP: ""our country 's a mess\nit 's one thing...",US2016,"VerbalClassification, Example, CircumstantialA..."


In [148]:
# Number of rows in df (one row per intervention_id after de-duplication).
len(df)

38

In [149]:
# df = df.head(5)

In [150]:
import json
from sentence_transformers import SentenceTransformer
import numpy as np
import numpy
from collections import Counter
import sys
import argparse
import logging

from prompts_eval import *
from openai import OpenAI
import re
import os
from dotenv import load_dotenv
logger = logging.getLogger(__name__)
from evaluation import *
def evaluate_results(target_file, result_file_name, golden_path=None, threshold=0.6):
    """Evaluate a submission file and write the evaluation result as JSON.

    Args:
        target_file: Path of the submission JSON, passed to ``eval_func`` as
            ``submission_path``.
        result_file_name: Path of the JSON file the evaluation result is
            written to; its parent directory is created when missing.
        golden_path: Path of the reference dataset. Defaults to the project's
            ``testing_dataset.json`` when omitted.
        threshold: Value forwarded to ``eval_func`` as ``threshold``
            (default 0.6).

    Returns:
        None. The result is only written to ``result_file_name``.
    """
    if golden_path is None:
        golden_path = "D:\\My_working_area\\Masters\\Semester 2\\NLP804\\Project\\Critical_Question_generation\\data_splits\\testing_dataset.json"
    print("Starting", target_file)
    # eval_func is provided by one of the star imports above
    # (presumably `evaluation` — confirm).
    result = eval_func(threshold=threshold, golden_path=golden_path, submission_path=target_file)

    # Make sure the destination folder exists before writing.
    out_dir = os.path.dirname(result_file_name)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(result_file_name, 'w', encoding="utf-8") as o:
        json.dump(result, o, indent=4)

## Approach 2: Mirroring Agents and Evaluate with LLM only
<img src="images/approach_llm_eval_only.png" alt="Approach 2 pipeline: LLM-only evaluation" width="200">

In [None]:

# `results` is the submission payload keyed by intervention id;
# `results_to_analyze` keeps every intermediate artefact for manual inspection.
results = {}
results_to_analyze = []

target_file = "critical_questions/Approach_2_LLM_evaluator_llama405.json"
result_file_name_evaluation = "evaluations/Evaluation_approach_2_llm_evaluator_llama405.json"
analysis_file = "double_check/approach_2_LLM_evaluator_llama405.json"

# Create the output folders up front, so a missing directory fails before
# any model call is made rather than after the whole loop has run.
for out_path in (target_file, analysis_file, result_file_name_evaluation):
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

for i, row in df.iterrows():
    print("QUESTION=======",i)
    intervention = row["intervention"]

    # Step 1: Generate questions
    gen_config = build_config(QC_LLM_generator, row)
    generated_questions, prompt_generator = query_model(gen_config)

    # Step 2: Evaluate those questions with the LLM evaluator
    list_of_cq = extract_CQ(generated_questions)
    formatted_list_cq = "\n".join(f"{idx + 1}. {cq}" for idx, cq in enumerate(list_of_cq))

    eval_config = build_config(QC_LLM_evaluator, row, {
        "[QUESTIONS_PLACEHOLDER]": formatted_list_cq
    })
    eval_feedback, prompt_evaluator = query_model(eval_config)

    # Step 3: Improve questions with LLM feedback
    improve_config = build_config(QC_LLM_improver_llm, row, {
        "[QUESTIONS_PLACEHOLDER]": formatted_list_cq,
        "[LLM_FEEDBACK_PLACEHOLDER]": eval_feedback
    })
    improved_questions, prompt_improver = query_model(improve_config)

    # Step 4: Extract the improved questions from the free-text answer
    list_answer = extract_CQ(improved_questions)
    # Step 5: Rerank the questions with the ML usefulness model
    list_ordered_score, list_ordered_cq = reranker(intervention, list_answer)

    # Step 6: Keep the three best questions. Slicing (instead of indexing
    # [0], [1], [2]) avoids an IndexError when extraction yields fewer than three.
    top_cqs = list_ordered_cq[:3]
    if len(top_cqs) < 3:
        print(f"WARNING: only {len(top_cqs)} question(s) extracted for {row['intervention_id']}")
    concatenated_answer = "\n".join(f"- {cq}" for cq in top_cqs)

    results_to_analyze.append({
        "intervention_id": row["intervention_id"],
        "intervention": row["intervention"],
        "prompt_generator": prompt_generator,
        "prompt_evaluator": prompt_evaluator ,
        "prompt_improver": prompt_improver,
        "generated_questions": generated_questions,
        "LLM_feedback":eval_feedback,
        "improved_questions": improved_questions,
        "List_of_questions": list_answer,
        "list_of_questions_order_score": list_ordered_score,
        "list_of_questions_order":list_ordered_cq,
        "concatenated_answer": concatenated_answer
    })

    results[row["intervention_id"]] = {
        "intervention_id": row["intervention_id"],
        "intervention": row["intervention"],
        "dataset": row["dataset"],
        "schemes": [scheme.strip() for scheme in row["schemes"].split(",")],
        "cqs": [{"id": idx, "cq": cq} for idx, cq in enumerate(top_cqs)],
        "full_response": concatenated_answer
    }

with open(target_file, "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)
with open(analysis_file, "w", encoding="utf-8") as f:
    json.dump(results_to_analyze, f, ensure_ascii=False, indent=2)
print("✅ Results saved")
evaluate_results(target_file, result_file_name_evaluation)




✅ Results saved
Starting critical_questions/Approach_2_LLM_evaluator_llama405.json


Batches: 100%|██████████| 1/1 [00:00<00:00,  5.14it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.07s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  9.50it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.07s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00, 11.05it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.01s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00, 10.56it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.17s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  7.69it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.18s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  7.89it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.23s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  7.18it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.89s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00, 10.54it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.78s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  7.18it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.66s/it]

Distribution of the labels: Counter({'Useful': 9})
Distribution of the intervention punctuation: Counter({1.0: 3})
Overall punctuation 1.0



