### Limitation Generation with 9 Agents by Llama 3 8B

In [None]:
import pandas as pd
import time
import signal
import sys
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Wall-clock reference for the elapsed-time report printed at the end of the run.
start_time = time.time()

# Run state shared with the signal handler: how many rows have been started,
# and the DataFrame to dump if the job is interrupted (None until the loop sets it).
global_current_row = 0
global_df = None

def signal_handler(signum, frame):
    """Save the in-progress DataFrame when a termination signal arrives, then exit.

    Args:
        signum: Number of the received signal (only used in the log message).
        frame: Stack frame passed in by the ``signal`` module; unused.

    Raises:
        SystemExit: Always, with status 0, once the save attempt has finished.
    """
    print(f"\n⚠️ Received signal {signum}. Saving progress before termination...")
    if global_df is not None:
        # Snapshot named after the current row counter.
        emergency_file = f"emergency_save_{global_current_row}.csv"
        global_df.to_csv(emergency_file, index=False)
        print(f"  🚨 Emergency save completed: {emergency_file}")

        # Refresh the same file the processing loop uses for its checkpoints and
        # final results, so an interrupted run leaves that file up to date
        # instead of writing to an unrelated path.
        output_file = "df_neurips_limitations_multi_agent_9_agents.csv"
        global_df.to_csv(output_file, index=False)
        print(f"  🚨 Final output updated: {output_file}")

    print("  📊 Progress saved. Exiting gracefully...")
    sys.exit(0)

# Register signal handlers
# Route PBS termination (SIGTERM) and Ctrl+C (SIGINT) through the same save-and-exit handler.
for _handled_signal in (signal.SIGTERM, signal.SIGINT):
    signal.signal(_handled_signal, signal_handler)


In [None]:
# Checkpoint to load and the local directory used as its download cache.
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
cache_dir = "llama3_8b"

# 4-bit NF4 quantization with double quantization; compute dtype is float16.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

print("Loading Llama 3 8B model and tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=bnb_config,
    cache_dir=cache_dir,
)

# Reuse the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def truncate_prompt_for_model(prompt: str, max_length: int = 8000) -> str:
    """Shorten ``prompt`` so that it tokenizes to roughly ``max_length`` tokens at most.

    Prompts already within the limit are returned unchanged. Longer prompts are
    cut from the right, with two safeguards for hand-written Llama 3 chat prompts:

    * special tokens are kept when decoding, so markers such as
      ``<|start_header_id|>`` are not stripped from the surviving text;
    * a short trailing turn header (everything from the last ``<|eot_id|>``
      onward, e.g. the assistant header that cues generation) is preserved and
      the text before it is shortened instead.

    Args:
        prompt: Prompt text, possibly containing literal special-token markers.
        max_length: Maximum number of prompt tokens to keep.

    Returns:
        The original prompt, or a truncated version of it. The truncated text is
        decoded from token ids, so re-tokenizing it may differ from
        ``max_length`` by a token or two.
    """
    eot_marker = "<|eot_id|>"
    # A turn header is only a handful of tokens; a longer tail is real content
    # and is truncated like everything else.
    max_header_tokens = 32

    # The prompt spells out its own special tokens, so no extra BOS is added here.
    token_ids = tokenizer.encode(prompt, add_special_tokens=False)
    if len(token_ids) <= max_length:
        return prompt

    print(f"⚠️ Prompt token count = {len(token_ids)} exceeds limit ({max_length}). Truncating...")

    body, marker, header = prompt.rpartition(eot_marker)
    if marker:
        header_ids = tokenizer.encode(marker + header, add_special_tokens=False)
        if len(header_ids) <= min(max_header_tokens, max_length - 1):
            body_ids = tokenizer.encode(body, add_special_tokens=False)
            kept_body = tokenizer.decode(
                body_ids[:max_length - len(header_ids)],
                skip_special_tokens=False,
            )
            return kept_body + marker + header

    # No usable trailing header: plain head truncation, still keeping special tokens.
    return tokenizer.decode(token_ids[:max_length], skip_special_tokens=False)

In [None]:
def llama_generate(prompt, max_new_tokens=512):
    """Generate a completion for ``prompt`` with the loaded Llama 3 8B model.

    Args:
        prompt: Fully formatted prompt text. The prompt builders in this file
            write the Llama 3 chat markers (including the BOS marker) themselves.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded completion only: the prompt tokens are sliced off the
        output and special tokens are removed.
    """
    # Reserve room for the completion so prompt + generation fit the context window.
    context_window = getattr(model.config, "max_position_embeddings", None) or 8192
    input_budget = max(1, context_window - max_new_tokens)
    # A few tokens of slack absorb BOS and re-tokenization drift after truncation.
    truncated_prompt = truncate_prompt_for_model(
        prompt, max_length=max(1, min(8000, input_budget - 8))
    )

    # Only let the tokenizer add BOS when the prompt text does not already start with it;
    # otherwise the sequence would begin with two BOS tokens.
    bos_text = tokenizer.bos_token
    starts_with_bos = bool(bos_text) and truncated_prompt.startswith(bos_text)

    # Tokenize with attention mask; truncation here is only a safety net.
    inputs = tokenizer(
        truncated_prompt,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=input_budget,
        return_attention_mask=True,
        add_special_tokens=not starts_with_bos,
    ).to(model.device)

    # Llama 3 Instruct closes a turn with <|eot_id|>; stop on it as well as on EOS
    # so generation does not run on until max_new_tokens.
    terminators = [tokenizer.eos_token_id]
    eot_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")
    if (
        eot_id is not None
        and eot_id != tokenizer.unk_token_id
        and eot_id not in terminators
    ):
        terminators.append(eot_id)

    output = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.7,
        top_p=0.95,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=terminators,
    )
    prompt_token_count = inputs['input_ids'].shape[1]
    return tokenizer.decode(output[0][prompt_token_count:], skip_special_tokens=True)

In [None]:
# Agent-specific prompts
def get_extractor_prompt(paper_content: str) -> str:
    """Build the Llama 3 chat prompt for the Extractor agent.

    The system turn asks for limitations explicitly stated by the authors;
    the user turn carries ``paper_content`` and the prompt ends with an open
    assistant header.
    """
    system_turn = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are an expert in scientific literature analysis. Your task is to carefully read the provided scientific article
and extract all explicitly stated limitations as mentioned by the authors. Focus on sections such as Discussion, Conclusion, or
Limitations. List each limitation verbatim, including direct quotes where possible, and provide a brief context (e.g., what aspect of
the study the limitation pertains to). Ensure accuracy and avoid inferring or adding limitations not explicitly stated. If no limitations
are mentioned, state this clearly.

Workflow:

Plan: Outline which sections (e.g., Discussion, Conclusion, Limitations) to analyze and identify tools (e.g., text extraction) to
access the article content. Justify the selection of sections based on their likelihood of containing limitation statements.

Reasoning: Let's think step by step to ensure thorough and accurate extraction of limitations:
Step 1: Identify all sections in the article that may contain limitations. For example, the Discussion often includes limitations as
authors reflect on their findings, while a dedicated Limitations section is explicit.
Step 2: Use text extraction tools to retrieve content from these sections. Verify that the content is complete and accurate.
Step 3: Scan for explicit limitation statements, such as phrases like "a limitation of this study" or "we acknowledge that."
Document why each statement qualifies as a limitation.
Step 4: For each identified limitation, extract the verbatim quote (if available) and note the context (e.g., related to sample size,
methodology).
Step 5: Check for completeness by reviewing other potential sections (e.g., Conclusion) to ensure no limitations are missed.
Analyze: Use tools to extract and verify the article's content, focusing on explicit limitation statements. Cross-reference extracted
quotes with the original text to ensure accuracy.

Reflect: Verify that all relevant sections were checked and no limitations were missed. Consider whether any section might have been
overlooked and re-evaluate if necessary.
Continue: Do not terminate until all explicitly stated limitations are identified or confirmed absent.

Output Format:
Bullet points listing each limitation.
For each: Verbatim quote (if available), context (e.g., aspect of the study), and section reference.
If none: "No limitations explicitly stated in the article."

Tool Use:
Use text extraction tools to access and verify article content.
Do not assume content; retrieve it directly from the provided article.

Chain of Thoughts:
During the Reasoning step, document the thought process explicitly. For example:
"I selected the Discussion section because authors often discuss study constraints there."
"I found the phrase 'a limitation of this study' in the Limitations section, indicating an explicit limitation."
"I checked the Conclusion section to ensure no additional limitations were mentioned, confirming completeness."
This narrative ensures transparency and justifies each decision in the extraction process.

"""
    user_turn = f"""<|eot_id|><|start_header_id|>user<|end_header_id|>

Paper Content:
{paper_content}

Please extract and list the key limitations found in this paper. Be specific and provide clear reasoning for each limitation identified.

<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""
    return system_turn + user_turn

In [None]:
def get_analyzer_prompt(paper_content: str) -> str:
    """Build the Llama 3 chat prompt for the Analyzer agent.

    The system turn asks for limitations the authors did not state but that
    can be inferred from the article; the user turn carries ``paper_content``
    and the prompt ends with an open assistant header.
    """
    system_turn = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a critical scientific reviewer with expertise in research methodology and analysis. Your task is to analyze
the provided scientific article and identify potential limitations not explicitly stated by the authors. Focus on aspects such as study
design, sample size, data collection methods, statistical analysis, scope of findings, and underlying assumptions. For each inferred
limitation, provide a clear explanation of why it is a limitation and how it impacts the study's validity, reliability,
or generalizability. Ensure inferences are grounded in the article's content and avoid speculative assumptions.

Workflow:
Plan: Identify key areas (e.g., methodology, sample size, statistical analysis) to analyze and select tools (e.g., text analysis) to
verify article details. Justify the selection based on their potential to reveal limitations.
Reasoning: Let's think step by step to identify inferred limitations:

Step 1: Review the article's methodology to identify gaps (e.g., study design flaws, sampling issues).
Step 2: Use text analysis tools to extract relevant details (e.g., sample size, statistical methods).
Step 3: Evaluate each area for potential limitations, such as small sample size affecting generalizability or unaddressed assumptions.
Step 4: Document why each gap qualifies as a limitation and its impact on the study.
Step 5: Ensure all key areas are covered to avoid missing potential limitations.
Analyze: Critically evaluate the article, using tools to confirm content, and infer limitations based on methodological or analytical gaps.
Reflect: Assess whether inferred limitations are grounded in the article and relevant to its validity, reliability, or generalizability.
Re-evaluate overlooked areas if necessary.


Continue: Iterate until all potential inferred limitations are identified.
Output Format:
Bullet points listing each inferred limitation.
For each: Description, explanation, and impact on the study.

Tool Use:
Use text analysis tools to verify article content (e.g., methodology, results).
Avoid assumptions; base inferences on retrieved content.

Chain of Thoughts:
During the Reasoning step, document the thought process explicitly. For example:
"The methodology section mentions a convenience sample, which may limit generalizability."
"The statistical analysis lacks adjustment for confounders, potentially affecting validity."
"I checked the results section to ensure no additional gaps were missed."
This narrative ensures transparency and justifies each inferred limitation.

"""
    user_turn = f"""<|eot_id|><|start_header_id|>user<|end_header_id|>

Paper Content:
{paper_content}

Please provide a detailed analysis of the limitations in this research. Consider both obvious and subtle limitations that could affect the validity and applicability of the findings.

<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""
    return system_turn + user_turn

In [None]:
def get_reviewer_prompt(paper_content: str) -> str:
    """Build the Llama 3 chat prompt for the Reviewer agent.

    The system turn frames the model as an external open peer reviewer; the
    user turn carries ``paper_content`` and the prompt ends with an open
    assistant header.
    """
    system_turn = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are an expert in open peer review with a focus on transparent and critical evaluation of scientific research.
Your task is to review the provided scientific article from the perspective of an external peer reviewer. Identify potential limitations
that might be raised in an open review process, considering common critiques such as reproducibility, transparency, generalizability,
or ethical considerations. Leverage insights from similar studies or common methodological issues in the field by searching the web or
X posts for context, if needed.

Workflow:
Plan: Identify areas for review (e.g., reproducibility, transparency, ethics) and plan searches for external context
(e.g., similar studies, methodological critiques). Justify the selection based on peer review standards.
Reasoning: Let's think step by step to identify peer-review limitations:

Step 1: Select key areas for review (e.g., reproducibility, ethics) based on common peer review critiques.
Step 2: Use text analysis tools to extract relevant article details (e.g., methods, data reporting).
Step 3: Identify potential limitations, such as lack of transparency in data or ethical concerns, and justify using article content.
Step 4: Search web/X for external context (e.g., similar studies) to support limitations, rating source relevance
(high, medium, low, none).
Step 5: Synthesize findings, ensuring limitations align with peer review standards and are supported by article or external context.
Analyze: Critically review the article, integrating external context to identify limitations. Use tools to verify content and sources.
Reflect: Verify that limitations align with peer review standards and are supported by the article or external context.
Re-evaluate overlooked areas if necessary.

Continue: Iterate until all relevant peer-review limitations are identified.

Output Format:
Bullet points listing each limitation.
For each: Description, why it's a concern, and alignment with peer review standards.
Include citations for external sources in the format Source Name, if used.

Tool Use:
Use web/X search tools to find relevant literature or methodological critiques.
Use text analysis tools to verify article content.

Chain of Thoughts:
During the Reasoning step, document the thought process explicitly. For example:
"I selected reproducibility because peer reviewers often critique data availability."
"The article lacks a data sharing statement, which limits reproducibility."
"A web search revealed similar studies provide data openly, supporting this limitation."
This narrative ensures transparency and justifies each identified limitation.

"""
    user_turn = f"""<|eot_id|><|start_header_id|>user<|end_header_id|>

Paper Content:
{paper_content}

Please provide a critical review identifying the limitations and areas of concern in this research. Consider what a peer reviewer would highlight as weaknesses or areas needing improvement.

<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""
    return system_turn + user_turn

In [None]:
def get_citation_prompt(cited_papers: str) -> str:
    """Build the Llama 3 chat prompt for the Citation agent.

    Unlike the other agent prompts, the user turn carries ``cited_papers``
    (information about related papers) rather than the paper text itself.
    The prompt ends with an open assistant header.
    """
    system_turn = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are an expert scientific research assistant tasked with inferring potential limitations for an unspecified
current scientific article based solely on its cited papers.
You are given information from multiple cited papers, which are assumed to be referenced by the current article.
Your goal is to analyze these cited works and identify possible limitations that the current paper may have, by
comparing its presumed scope, methods, or results against the cited literature.
Because the input paper itself is not provided, you must reason from the cited papers alone, identifying what
gaps, stronger methods, broader coverage, or alternative results the cited works might expose in the hypothetical
current paper that cites them.

Objective:

Generate a list of scientifically grounded limitations that the current article might have, assuming it builds upon or is informed by the provided cited papers.

Each limitation should:

Be concise

Reference the relevant cited paper(s) by title

Clearly explain how the cited paper exposes a potential limitation

Be plausible and insightful based on common scientific reasoning

Workflow:
Plan:
Identify key insights, strengths, and scopes of the cited papers that could set a high bar or reveal blind spots
in a hypothetical citing article.

Reasoning: Let's think step by step to infer limitations:
Review each cited paper to extract its methodology, findings, and scope.
Ask: If a paper cited this work but did not adopt or address its insights, what limitation might arise?
Identify where the cited paper offers better methodology, broader scope, or contradicting findings.
Formulate each limitation as a plausible shortcoming of a hypothetical article that builds on—but possibly
underutilizes—these cited works.

Justify each limitation based on specific attributes of the cited paper (e.g., "more comprehensive dataset",
"stronger evaluation metric", etc.)

Analyze:
Develop a set of inferred limitations, each tied to specific cited paper(s) and grounded in logical comparison.

Reflect:
Ensure coverage of all relevant cited papers and validate that each limitation is scientifically plausible in
context.

Output Format:
Bullet points listing each limitation.
For each: Description, explanation, and reference to the cited paper(s) in the format Paper Title.

Tool Use (if applicable):

Use citation lookup tools or document content to extract accurate summaries.
Do not assume details about the input paper—focus only on drawing limitations based on differences, omissions,
or underuse of the cited works.

Chain of Thoughts:
During the Reasoning step, document the thought process explicitly. For example:
"I selected [Paper X] because it uses a more robust method than the current article."
"The current article's simpler method may limit accuracy compared to [Paper X]."
"I reviewed all cited papers to ensure no relevant gaps were missed."
This narrative ensures transparency and justifies each identified limitation.

"""
    user_turn = f"""<|eot_id|><|start_header_id|>user<|end_header_id|>

Cited Papers Information:
{cited_papers}

Please identify limitations that would be relevant for researchers who might cite this paper in future work.
Consider what limitations future authors might mention when discussing this paper's contribution to the field,
based on the cited papers context.

<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""
    return system_turn + user_turn

In [None]:
def get_theory_assumptions_prompt(paper_content: str) -> str:
    """Build the Llama 3 chat prompt for the Theory & Assumptions auditor.

    The system turn lists the auditing tasks; the user turn carries
    ``paper_content`` and the prompt ends with an open assistant header.
    """
    system_turn = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are an expert auditor specializing in mathematical rigor, theoretical validity, and assumption scrutiny. Your tasks are to:

Trace each formal claim from theoretical foundation to objective to algorithm, ensuring logical consistency.
Verify definitions, assumptions, and properties (e.g., convexity, smoothness, identifiability) for correctness and applicability.
Identify overly restrictive or unrealistic assumptions (e.g., applying finite-dimensional results to infinite-dimensional settings, VC bounds on large-scale models).
Flag proof gaps, undefined notation, missing terms, or invalid generalizations (e.g., 2D to 3D, finite to infinite).


"""
    user_turn = f"""<|eot_id|><|start_header_id|>user<|end_header_id|>

Paper Content:
{paper_content}

Provide limitations about THEORY & ASSUMPTIONS for the following content. Use only facts from the text.

<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""
    return system_turn + user_turn

In [None]:
def get_baselines_novelty_prompt(paper_content: str) -> str:
    """Build the Llama 3 chat prompt for the Baselines & Novelty auditor.

    The system turn lists the auditing tasks; the user turn carries
    ``paper_content`` and the prompt ends with an open assistant header.
    """
    system_turn = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are an expert auditor evaluating baseline comparisons, novelty, and comparative fairness. Your tasks are to:

Assess baseline coverage and fairness, verifying equivalence in parameters, data, tuning, random seeds, and metrics.
Ensure comparisons include state-of-the-art (SOTA) or standard methods relevant to the task (e.g., rate-distortion for codecs, SODA for specific settings).
Detect discrepancies or outdated results compared to cited prior work.
Evaluate novelty, identifying whether contributions are incremental, rebranded prior art, or truly novel, with evidence.
For qualitative comparisons, verify like-for-like evidence and fairness.

"""
    user_turn = f"""<|eot_id|><|start_header_id|>user<|end_header_id|>

Paper Content:
{paper_content}

Provide limitations about BASELINES, NOVELTY & COMPARABILITY for the following content.

<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""
    return system_turn + user_turn

In [None]:
def get_robustness_failure_prompt(paper_content: str) -> str:
    """Build the Llama 3 chat prompt for the Robustness & Failure Modes auditor.

    The system turn lists the auditing tasks; the user turn carries
    ``paper_content`` and the prompt ends with an open assistant header.
    """
    system_turn = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are an expert auditor focused on robustness, failure modes, and ablation studies. Your tasks are to:

Identify missing tests for edge cases (e.g., noise, adversarial inputs, distribution shifts, extreme parameter limits, long sequences, or 3D leakage).
Verify sensitivity analyses for normalization choices (e.g., batch normalization with small batches), key hyperparameters, and model components.
Ensure inclusion of simple heuristic baselines in selector or scheduler settings.
Confirm the presence of detailed failure case analyses with examples.


"""
    user_turn = f"""<|eot_id|><|start_header_id|>user<|end_header_id|>

Paper Content:
{paper_content}

Provide limitations about ROBUSTNESS, FAILURE MODES & ABLATIONS for the following content.

<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""
    return system_turn + user_turn

In [None]:
def get_reproducibility_compute_prompt(paper_content: str) -> str:
    """Build the Llama 3 chat prompt for the Reproducibility & Compute auditor.

    The system turn lists the auditing tasks; the user turn carries
    ``paper_content`` and the prompt ends with an open assistant header.
    """
    system_turn = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are an expert auditor verifying reproducibility, computational claims, and scalability. Your tasks are to:

Flag missing details on training/inference time, FLOPs, memory usage, hardware, batch sizes, random seeds, confidence intervals, dataset sizes, or code/data availability.
Compare pipeline overheads (e.g., video-specific training, pruning) against simple baselines for efficiency claims.
Evaluate scalability, checking if results are limited to small instances or include a cost model for larger scales.
Flag unsubstantiated claims of “efficiency” or “faster” performance without quantitative evidence.

"""
    user_turn = f"""<|eot_id|><|start_header_id|>user<|end_header_id|>

Paper Content:
{paper_content}

Provide limitations about REPRODUCIBILITY, COMPUTE & SCALABILITY for the following content.

<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""
    return system_turn + user_turn

In [None]:
def get_presentation_clarity_prompt(paper_content: str) -> str:
    """Build the Llama 3 chat prompt for the Presentation & Clarity auditor.

    The system turn lists the auditing tasks; the user turn carries
    ``paper_content`` and the prompt ends with an open assistant header.
    """
    system_turn = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are an expert auditor ensuring clarity, presentation integrity, and impact discussion. Your tasks are to:

Identify undefined terms, inconsistent notation, or mislabeled equations, figures, or tables.
Verify the inclusion of sufficient qualitative evidence (e.g., multi-view 3D visualizations, long-sequence demonstrations).
Flag issues in captions or legends (e.g., ambiguous labels, incorrect dataset references).
Ensure discussion of societal, ethical, or fairness impacts when relevant to the task or application.

"""
    user_turn = f"""<|eot_id|><|start_header_id|>user<|end_header_id|>

Paper Content:
{paper_content}

Provide limitations about PRESENTATION, CLARITY & IMPACT for the following content.

<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""
    return system_turn + user_turn

In [None]:
def get_merger_prompt(extractor_output: str, analyzer_output: str, reviewer_output: str, citation_output: str,
                     theory_output: str, baselines_output: str, robustness_output: str, reproducibility_output: str, presentation_output: str) -> str:
    """Build the Llama 3 chat prompt for the Master Coordinator (merger) agent.

    Args:
        extractor_output: Text produced by the Extractor agent.
        analyzer_output: Text produced by the Analyzer agent.
        reviewer_output: Text produced by the Reviewer agent.
        citation_output: Text produced by the Citation agent.
        theory_output: Text produced by the Theory & Assumptions agent.
        baselines_output: Text produced by the Baselines & Novelty agent.
        robustness_output: Text produced by the Robustness & Failure Modes agent.
        reproducibility_output: Text produced by the Reproducibility & Compute agent.
        presentation_output: Text produced by the Presentation & Clarity agent.

    Returns:
        A prompt whose system turn describes the synthesis task, whose user
        turn lists the nine agent outputs under labelled headings, and which
        ends with an open assistant header.
    """
    # The Goals list is numbered 1-4 consecutively (it previously skipped from 3 to 5).
    system_turn = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a **Master Coordinator**, an expert in scientific communication and synthesis. Your task is to integrate limitations provided by nine specialized agents:

**Original Agents:**
1. **Extractor** (explicit limitations from the article),
2. **Analyzer** (inferred limitations from critical analysis),
3. **Reviewer** (limitations from an open review perspective),
4. **Citation** (limitations based on cited papers).
5. **Theory & Assumptions** (mathematical correctness, assumptions, scope),
6. **Baselines & Novelty** (comparative evaluation, novelty assessment),
7. **Robustness & Failure Modes** (edge conditions, ablation studies),
8. **Reproducibility & Compute** (reproducibility claims, scalability),
9. **Presentation & Clarity** (clarity, integrity, impact discussion).

**Goals**:
1. Combine all limitations into a cohesive, non-redundant list.
2. Ensure each limitation is clearly stated, scientifically valid, and aligned with the article's content.
3. Prioritize critical limitations that affect the paper's validity and reproducibility.
4. Format the final list in a clear, concise, and professional manner, suitable for a scientific review or report.

**Workflow**:
1. **Plan**: Outline how to synthesize limitations, identify potential redundancies, and resolve discrepancies.
2. **Analyze**: Combine limitations, prioritizing critical ones, and verify alignment with the article.
3. **Reflect**: Check for completeness, scientific rigor, and clarity.
4. **Continue**: Iterate until the list is comprehensive, non-redundant, and professionally formatted.

**Output Format**:
- Numbered list of final limitations.


"""
    user_turn = f"""<|eot_id|><|start_header_id|>user<|end_header_id|>

Extractor Agent Analysis:
{extractor_output}

Analyzer Agent Analysis:
{analyzer_output}

Reviewer Agent Analysis:
{reviewer_output}

Citation Agent Analysis:
{citation_output}

Theory & Assumptions Agent Analysis:
{theory_output}

Baselines & Novelty Agent Analysis:
{baselines_output}

Robustness & Failure Modes Agent Analysis:
{robustness_output}

Reproducibility & Compute Agent Analysis:
{reproducibility_output}

Presentation & Clarity Agent Analysis:
{presentation_output}

Please merge these nine different perspectives on the paper's limitations into a comprehensive, well-organized analysis. Synthesize the insights, resolve any contradictions, and provide a unified view of the paper's limitations.

<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""
    return system_turn + user_turn

In [None]:
# Function to run a specific agent
def run_agent(agent_name: str, paper_content: str, agent_prompt_func, cited_papers: str = "") -> str:
    """Build one agent's prompt, generate its answer and return the stripped text.

    The agent named "Citation" has its prompt built from ``cited_papers``;
    every other agent's prompt is built from ``paper_content``. Any exception
    raised while building the prompt or generating is reported and turned into
    an ``"ERROR in <agent> agent: ..."`` string instead of propagating.
    """
    print(f"  Running {agent_name} agent...")
    try:
        # Citation works from the cited-paper information only, not the paper text.
        source_text = cited_papers if agent_name == "Citation" else paper_content
        prompt = agent_prompt_func(source_text)
        generated = llama_generate(prompt, max_new_tokens=512)
        print(f"  {agent_name} agent completed")
        return generated.strip()
    except Exception as err:
        print(f"  Error in {agent_name} agent: {err}")
        return f"ERROR in {agent_name} agent: {str(err)}"

# Load the DataFrame
# NOTE(review): df1 is not referenced anywhere else in the visible code, and this
# read is unguarded (a missing file raises here) — confirm it is still needed.
output_file1 = "df_neurips_limitations_multi_agent_1.csv"
df1 = pd.read_csv(output_file1)


print("Loading CSV file...")
try:
    df = pd.read_csv("df_neruips_21_22_final.csv")
    print(f"Successfully loaded CSV file with shape: {df.shape}")
except FileNotFoundError:
    print("Error: CSV file not found. Please check the file path.")
    # SystemExit directly: does not depend on the `exit` helper injected by `site`.
    raise SystemExit(1)
except Exception as e:
    print(f"Error loading CSV file: {e}")
    raise SystemExit(1)

# Check if required columns exist
required_columns = ['df_Abstract', 'df_Introduction', 'df_Related_Work', 'df_Methodology',
                   'df_Dataset', 'df_Conclusion', 'df_Experiment_and_Results',
                   'LLM_extracted_future_work', 'relevance_8_cited_in', 'relevance_8_cited_by']

missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    # Missing columns are tolerated (they contribute empty text below), but say so
    # instead of silently producing prompts with empty sections.
    print(f"Warning: Missing columns: {missing_columns}")
    print(f"Available columns: {df.columns.tolist()}")


def _cell_text(row, column):
    """Return the row's value for ``column`` as text; missing columns and NaN cells give ''."""
    value = row.get(column, '')
    # Empty CSV cells load as NaN, which would otherwise be formatted as "nan".
    return '' if pd.isna(value) else str(value)


# (label shown in the prompt, source column) in the order the sections are concatenated.
_COMBINED_SECTIONS = [
    ("Abstract", "df_Abstract"),
    ("Introduction", "df_Introduction"),
    ("Related_Work", "df_Related_Work"),
    ("Methodology", "df_Methodology"),
    ("Dataset", "df_Dataset"),
    ("Conclusion", "df_Conclusion"),
    ("Experiment_and_Results", "df_Experiment_and_Results"),
    ("LLM_extracted_future_work", "LLM_extracted_future_work"),
    ("Extra1", "df_col_2"),
    ("Extra2", "df_col_3"),
    ("Extra3", "df_col_4"),
    ("Extra4", "df_col_5"),
    ("Extra5", "df_col_6"),
    ("Extra6", "df_col_7"),
    ("Extra7", "df_col_8"),
]

# Create combined content column: one "Label: text" line per section.
df['combined'] = df.apply(
    lambda row: "\n".join(
        f"{label}: {_cell_text(row, column)}" for label, column in _COMBINED_SECTIONS
    ),
    axis=1
)

print(f"DataFrame shape: {df.shape}")
print("Processing all samples with multiple agents...")

# Initialize new columns for each agent (plus the merged result) as empty strings.
for _output_column in (
    'limitations_extractor',
    'limitations_analyzer',
    'limitations_reviewer',
    'limitations_citation_only',
    'limitations_theory_assumptions',
    'limitations_baselines_novelty',
    'limitations_robustness_failure',
    'limitations_reproducibility_compute',
    'limitations_presentation_clarity',
    'limitations_merged_final',
):
    df[_output_column] = ""

In [None]:
# Process each row with all agents
# Hand the DataFrame to the signal handler once; the loop mutates this same object
# in place, so the reference does not need refreshing on every iteration.
global_df = df
output_file = "df_neurips_limitations_multi_agent_9_agents.csv"

# NOTE: rows are read by position (iloc) and written by label (at); this assumes df
# still has the default 0..n-1 index.
for i in range(len(df)): # len(df)
    print("i is",i)
    global_current_row = i + 1  # Update global counter

    print(f"\n=== Processing row {i+1}/{len(df)} ===")
    row = df.iloc[i]
    paper_content = row['combined']

    # Get cited papers information for citation agent.
    # Empty CSV cells load as NaN; map them to '' so the prompt does not contain "nan".
    cited_in = row.get('relevance_8_cited_in', '')
    cited_in = '' if pd.isna(cited_in) else cited_in
    cited_by = row.get('relevance_8_cited_by', '')
    cited_by = '' if pd.isna(cited_by) else cited_by
    cited_papers = f"Papers cited by this article:\n{cited_in}\n\nPapers that cited this article:\n{cited_by}"

    # Run all nine agents for this row (every output is generated fresh here).
    extractor_output = run_agent("Extractor", paper_content, get_extractor_prompt)
    analyzer_output = run_agent("Analyzer", paper_content, get_analyzer_prompt)
    reviewer_output = run_agent("Reviewer", paper_content, get_reviewer_prompt)
    citation_output = run_agent("Citation", "", get_citation_prompt, cited_papers)
    theory_output = run_agent("Theory", paper_content, get_theory_assumptions_prompt)
    baselines_output = run_agent("Baselines", paper_content, get_baselines_novelty_prompt)
    robustness_output = run_agent("Robustness", paper_content, get_robustness_failure_prompt)
    reproducibility_output = run_agent("Reproducibility", paper_content, get_reproducibility_compute_prompt)
    presentation_output = run_agent("Presentation", paper_content, get_presentation_clarity_prompt)

    # Store individual agent outputs
    agent_outputs = {
        'limitations_extractor': extractor_output,
        'limitations_analyzer': analyzer_output,
        'limitations_reviewer': reviewer_output,
        'limitations_citation_only': citation_output,
        'limitations_theory_assumptions': theory_output,
        'limitations_baselines_novelty': baselines_output,
        'limitations_robustness_failure': robustness_output,
        'limitations_reproducibility_compute': reproducibility_output,
        'limitations_presentation_clarity': presentation_output,
    }
    for column, agent_text in agent_outputs.items():
        df.at[i, column] = agent_text

    # Merge all agent outputs
    print(f"  Running Master Coordinator agent...")
    try:
        merger_prompt = get_merger_prompt(extractor_output, analyzer_output, reviewer_output, citation_output,
                                         theory_output, baselines_output, robustness_output, reproducibility_output, presentation_output)
        merged_output = llama_generate(merger_prompt, max_new_tokens=512)
        df.at[i, 'limitations_merged_final'] = merged_output.strip()
        print("limitations_merged_final", df.at[i, 'limitations_merged_final'])
        print(f"  Master Coordinator agent completed")
    except Exception as e:
        # Best effort: record the failure in the output column and move on to the next row.
        print(f"  Error in Master Coordinator agent: {e}")
        df.at[i, 'limitations_merged_final'] = f"ERROR in Master Coordinator agent: {str(e)}"

    print(f"  Row {i+1} completed")

    # Save progress every 5 rows to prevent data loss (after rows 1, 6, 11, ...)
    if i % 5 == 0:
        df.to_csv(output_file, index=False)
        print(f"  ✅ Checkpoint saved at row {i+1}")

# Save results
df.to_csv(output_file, index=False)
print(f"\nResults saved to: {output_file}")

end_time = time.time()
elapsed = end_time - start_time
print(f"\nScript started at: {time.ctime(start_time)}")
print(f"Script ended at:   {time.ctime(end_time)}")
print(f"Total elapsed time: {elapsed:.2f} seconds ({elapsed/60:.2f} minutes)")

### Evaluation

Ground Truth Coverage

In [None]:
# Turn each merged-output text into a list of [number, text] pairs.
import re


def _split_merged_limitations(raw):
    """Parse a numbered list ("1. ...", blank line, "2. ...") into [[num, text], ...].

    Non-string input gives an empty list. Chunks that do not start with
    "<digits>." (e.g. an introductory sentence) are dropped.
    """
    if not isinstance(raw, str):
        return []

    # Items are separated by a blank line followed by the next item's number.
    chunks = re.split(r'\n\n(?=\d+\.)', raw.strip())
    parsed = (re.match(r'(\d+)\.\s*(.*)', chunk, re.S) for chunk in chunks)
    return [[int(hit.group(1)), hit.group(2).strip()] for hit in parsed if hit]


# Create the output column first so per-cell assignment of lists works.
df['limitations_merged_final_list'] = None

for row_idx in range(len(df)):
    df.at[row_idx, 'limitations_merged_final_list'] = _split_merged_limitations(
        df.at[row_idx, "limitations_merged_final"]
    )


In [None]:
# Turn each ground-truth text into a list of [number, text] pairs.

import re


def _split_ground_truth_limitations(raw):
    """Parse a numbered list ("1. ...", blank line, "2. ...") into [[num, text], ...].

    Non-string input gives an empty list. Chunks that do not start with
    "<digits>." are dropped.
    """
    if not isinstance(raw, str):
        return []

    # Items are separated by a blank line followed by the next item's number.
    chunks = re.split(r'\n\n(?=\d+\.)', raw.strip())
    parsed = (re.match(r'(\d+)\.\s*(.*)', chunk, flags=re.S) for chunk in chunks)
    return [[int(hit.group(1)), hit.group(2).strip()] for hit in parsed if hit]


# Create the output column first so per-cell assignment of lists works.
df['Lim_and_OR_ground_truth_list'] = None

for row_idx in range(len(df)):
    df.at[row_idx, 'Lim_and_OR_ground_truth_list'] = _split_ground_truth_limitations(
        df.at[row_idx, "Lim_and_OR_ground_truth_final"]
    )


In [None]:
# Pair every ground-truth item with every LLM-generated item, row by row

# start every row with its own empty list so the column holds Python objects
df['combined'] = [[] for _ in range(len(df))]

for i in range(len(df)):
    truth_items = df["Lim_and_OR_ground_truth_list"][i]
    generated_items = df["limitations_merged_final_list"][i]

    # Full cross product, ground-truth item varying slowest; nothing is truncated
    df.at[i, 'combined'] = [
        (truth, generated)
        for truth in truth_items
        for generated in generated_items
    ]

In [None]:
import os
import base64
import time
import pandas as pd
# from openai import AzureOpenAI, RateLimitError

import os
import time
from openai import OpenAI

# Set up OpenAI API
# The key is read from the OPENAI_API_KEY environment variable and is never written
# from this notebook: assigning '' to os.environ would clobber a real key that is
# already exported and build a client that can only fail on its first request.
_openai_api_key = os.environ.get("OPENAI_API_KEY")
if not _openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it before running the evaluation cells."
    )
client = OpenAI(api_key=_openai_api_key)

# Define the OpenAI streaming function for GPT-4o-mini
def run_critic_openai(prompt: str) -> str:
    """Send *prompt* to gpt-4o-mini as one user message and return the reply text.

    The completion is requested as a stream with temperature 0; the streamed
    pieces are concatenated and the result is stripped of surrounding whitespace.

    Args:
        prompt: Full text of the user message.

    Returns:
        The model's reply as a single string.
    """
    stream = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "user", "content": prompt}
        ],
        stream=True,
        temperature=0
    )

    pieces = []
    for chunk in stream:
        # A streamed chunk can arrive with an empty `choices` list (for example a
        # usage-only chunk); indexing [0] on it would raise IndexError mid-stream.
        if not chunk.choices:
            continue
        # delta.content is None on chunks that carry no text
        pieces.append(chunk.choices[0].delta.content or "")
    return "".join(pieces).strip()

# Ask the judge model once per DataFrame row about all of that row's pairs
import json

all_generated_summary = []
start_time = time.time()

# Fixed instruction text; the per-row "Pair k" blocks are appended after it
prompt_header = (
    "For each of the following pairs, answer “Yes” if List1 contains a topic or limitation\n"
    "from List2, or List2 contains a topic or limitation from from List1; otherwise answer “No”.\n"
    "Respond *only* with a JSON object mapping each Pair name to “Yes” or “No”.\n\n"
    "Pairs:\n"
)

llm_results = []
df['LLM_eval'] = ''
for idx in range(len(df)):
    print("idx is",idx)
    pairs = df.at[idx, 'combined']   # expected: list of (ground-truth item, generated item) tuples

    if isinstance(pairs, list) and pairs:
        # one named block per pair, numbered from 1
        pair_blocks = []
        for pair_no, (first, second) in enumerate(pairs, start=1):
            pair_blocks.append(f"Pair {pair_no}:\n  List1: {first}\n  List2: {second}")

        prompt = prompt_header + "\n".join(pair_blocks)

        # a single request covers every pair of this row
        resp_text = run_critic_openai(prompt)
        llm_results.append(resp_text)
        df.at[idx, 'LLM_eval'] = resp_text
    else:
        # nothing to judge for this row; 'LLM_eval' keeps its '' default
        llm_results.append(None)


In [None]:
import re

# Matches one '"Pair <n>": "Yes"' / '"Pair <n>": "No"' entry and captures both the
# pair number and the verdict (any letter case).
pattern = re.compile(r'"Pair\s*(\d+)"\s*:\s*"(Yes|No)"', re.IGNORECASE)


def parse_pair_verdicts(raw):
    """Return the verdicts found in *raw*, positioned by the pair number the model echoed.

    Element ``k - 1`` of the result is the verdict for "Pair k", normalised to
    'Yes' or 'No', or None when the reply has no entry for that pair. Keeping the
    pair number matters because the verdict list is later consumed by position:
    if the model omits or reorders a pair, a purely positional list would attach
    every following verdict to the wrong pair.

    Args:
        raw: The judge model's reply text; anything that is not a string gives [].

    Returns:
        A list whose length is the highest pair number seen (empty if none).
    """
    if not isinstance(raw, str):
        return []

    by_number = {}
    for number, verdict in pattern.findall(raw):
        # keep the first verdict if the reply repeats a pair
        by_number.setdefault(int(number), verdict.capitalize())

    if not by_number:
        return []
    return [by_number.get(k) for k in range(1, max(by_number) + 1)]


# one verdict list per DataFrame row, in row order
all_matches = [parse_pair_verdicts(df.at[idx, 'LLM_eval']) for idx in range(len(df))]


In [None]:
import pandas as pd

# Flatten df['combined'] into one record per (ground truth, generated) pair
rows = []
for idx, tuples in df['combined'].items():
    if not isinstance(tuples, list):
        continue

    # verdicts for this source row; rows without an entry get no verdicts at all
    matches = all_matches[idx] if idx < len(all_matches) else []

    # cut or pad the verdicts so there is exactly one (possibly None) per pair
    verdicts = matches[:len(tuples)] + [None] * (len(tuples) - len(matches))

    for (list1, list2), is_match in zip(tuples, verdicts):
        rows.append({
            'source_row':    idx,
            'Ground_Truth':  list1,
            'LLM_generated': list2,
            'is_match':      is_match,
        })

result_df = pd.DataFrame(rows)


In [None]:
import re

def extract_leading_number(x):
    """
    Return the item number carried by x, or None if there is none.

    A non-empty list is represented by its first element; any other value is
    used as is. An int is returned unchanged. A string yields its leading run of
    digits (leading whitespace allowed, a following dot optional) as an int.
    """
    val = x
    if isinstance(x, list) and x:
        val = x[0]

    if isinstance(val, int):
        return val

    if not isinstance(val, str):
        return None

    # accepts "123." as well as a bare "123"
    m = re.match(r'^\s*(\d+)(?:\.)?', val)
    return int(m.group(1)) if m else None

# extract into new columns
# Each cell of 'Ground_Truth' / 'LLM_generated' is passed to extract_leading_number;
# cells it cannot read a number from end up as missing values.
result_df['gt_number']        = result_df['Ground_Truth'].apply(extract_leading_number)
result_df['llm_gen_number']   = result_df['LLM_generated'].apply(extract_leading_number)


In [None]:
# ground truth coverage
from itertools import groupby

# A "section" is the run of consecutive rows that belong to one ground-truth item of
# one source row. The section key includes 'source_row' as well as 'gt_number':
# keyed on the number alone, a paper whose last item is numbered k followed by a
# paper whose first item is also numbered k would be merged into a single section
# and counted at most once.
section_keys = zip(result_df['source_row'], result_df['gt_number'])
keyed_verdicts = zip(section_keys, result_df['is_match'])

# Count the sections in which at least one pair was judged 'Yes'
match = 0
for _section, section_rows in groupby(keyed_verdicts, key=lambda item: item[0]):
    if any(verdict == 'Yes' for _key, verdict in section_rows):
        match += 1
print(match)


# Number of rows in each consecutive run of identical ground-truth entries

# A row opens a new run when its 'Ground_Truth' differs from the row before it;
# the running sum of those flags gives every run its own id.
ground_truth_col = result_df['Ground_Truth']
starts_new_run = ground_truth_col.ne(ground_truth_col.shift())
unique_blocks = starts_new_run.cumsum()

# Per run: how many rows have a non-missing 'gt_number'
ck = result_df.groupby(unique_blocks)['gt_number'].agg(['count'])

# Show the per-run counts
print("Number of unique consecutive 'ground_truth' texts and their counts:")
print(ck)


Measuring quality between matched pairs (NLP-based metrics)

Ground Truth: ground_truth || LLM_Generated limitation: llm_generated

In [None]:
# Rename the pair-table columns to the lowercase names the metric cells below read
# ('ground_truth', 'llm_generated').
# NOTE(review): in the cells visible in this file, 'Ground_Truth', 'LLM_generated' and
# 'is_match' are created on `result_df`, not on `df` — presumably `df` is rebound to the
# saved pair table in a step not shown here; confirm. DataFrame.rename ignores labels
# that are not present by default, so this cell does not fail if `df` is the wrong frame.
df.rename(columns={
    'Ground_Truth': 'ground_truth',
    'LLM_generated': 'llm_generated',
    # 'Is_same': 'is_match',
}, inplace=True)

In [None]:
# Keep only the pairs the judge explicitly marked as a match ('Yes', any case or padding).
# Filtering on "!= 'no'" would also keep rows whose verdict is missing (the judge returned
# fewer answers than pairs), and those would then be scored as if they were matches.
df_filtered = df[df['is_match'].str.strip().str.lower().eq('yes')]

In [None]:
# Renumber the kept rows 0..n-1 and discard the old labels. The later
# pd.concat(..., axis=1) with a freshly built metrics frame aligns on the index,
# so it depends on this default index.
df_filtered = df_filtered.reset_index(drop=True)

In [None]:
# BERTScore (all)
!pip3 -q install bert-score


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h

BERTScore for whole texts

In [None]:
import pandas as pd
from bert_score import BERTScorer

# One shared scorer so the roberta-large model is set up a single time
scorer = BERTScorer(model_type='roberta-large', lang="en")


def calculate_bertscore(row):
    """Return the BERTScore F1 between row['ground_truth'] and row['llm_generated'].

    Each call scores exactly one text pair, so the mean over the returned F1
    values is that single pair's F1 as a Python float.
    """
    # NOTE(review): BERTScorer.score is documented as score(cands, refs); the ground
    # truth is passed first here — confirm the order is intended. Only F1 is used.
    first_texts = [row['ground_truth']]
    second_texts = [row['llm_generated']]
    f1_values = scorer.score(first_texts, second_texts)[2]
    return f1_values.mean().item()


# Row-wise scoring; the result becomes the 'bert_score' column
df_filtered['bert_score'] = df_filtered.apply(calculate_bertscore, axis=1)


In [None]:
# Mean of the 'bert_score' column over all rows of df_filtered
average_bert_score = df_filtered['bert_score'].mean()

# Bare expression so the notebook displays the value
average_bert_score


In [None]:
!pip3 -q install rouge_score

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
import pandas as pd

# Initialize ROUGE scorer
rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Function to calculate similarity metrics for each row
def calculate_metrics(row):
    """Compute lexical similarity metrics between one ground-truth / generated pair.

    Args:
        row: Mapping (e.g. a DataFrame row) with string fields 'ground_truth'
            and 'llm_generated'.

    Returns:
        dict with the keys 'rouge1', 'rouge2', 'rougeL' (F-measures),
        'cosine_similarity' (bag-of-words counts), 'jaccard_similarity'
        (whitespace tokens) and 'bleu_score'.
    """
    reference = row['ground_truth']
    candidate = row['llm_generated']
    reference_tokens = reference.split()
    candidate_tokens = candidate.split()

    metrics = {}

    # ROUGE scores (ground truth is the target, generated text the prediction)
    rouge_scores = rouge_scorer.score(reference, candidate)
    metrics['rouge1'] = rouge_scores['rouge1'].fmeasure
    metrics['rouge2'] = rouge_scores['rouge2'].fmeasure
    metrics['rougeL'] = rouge_scores['rougeL'].fmeasure

    # Cosine Similarity
    # CountVectorizer raises ValueError when neither text yields a token (empty
    # vocabulary); treat that as zero similarity instead of aborting the whole
    # apply, the same way cosine_sim does for the keyword lists.
    try:
        vectors = CountVectorizer().fit_transform([reference, candidate]).toarray()
        metrics['cosine_similarity'] = cosine_similarity(vectors)[0, 1]
    except ValueError:
        metrics['cosine_similarity'] = 0.0

    # Jaccard Similarity over the sets of whitespace-separated tokens
    set1 = set(reference_tokens)
    set2 = set(candidate_tokens)
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    metrics['jaccard_similarity'] = intersection / union if union > 0 else 0

    # BLEU Score (single reference, no smoothing function supplied)
    metrics['bleu_score'] = sentence_bleu([reference_tokens], candidate_tokens)

    return metrics

# Apply the function to each row in the DataFrame; the result is a Series holding one
# metrics dict per row
metric_results = df_filtered.apply(calculate_metrics, axis=1)

# Expand the dictionaries into one column per metric. The new frame gets a default
# 0..n-1 index and concat(axis=1) aligns on the index, so df_filtered must carry the
# same default index (it is reset right after filtering). Re-running this cell appends
# the metric columns a second time.
metric_results_df = pd.DataFrame(metric_results.tolist())
df_filtered = pd.concat([df_filtered, metric_results_df], axis=1)


In [None]:
# Column-wise means of the lexical metrics, keyed by a display label
metric_labels = {
    'rouge1': 'Average ROUGE-1',
    'rouge2': 'Average ROUGE-2',
    'rougeL': 'Average ROUGE-L',
    'cosine_similarity': 'Average Cosine Similarity',
    'jaccard_similarity': 'Average Jaccard Similarity',
    'bleu_score': 'Average BLEU Score',
}
average_metrics = {
    label: df_filtered[column].mean()
    for column, label in metric_labels.items()
}

# Bare expression so the notebook displays the dict
average_metrics


Topic similarity

In [None]:
!pip3 -q install keybert

from keybert import KeyBERT

# A single KeyBERT instance shared by every extraction call
kw_model = KeyBERT()

# Coerce both text columns to plain strings; missing values become ""
for text_col in ('ground_truth', 'llm_generated'):
    df_filtered[text_col] = df_filtered[text_col].fillna("").astype(str)


def extract_keywords(text):
    """Return the single-word keywords KeyBERT reports for *text*.

    English stop words are excluded; only the first element of each returned
    entry (the keyword itself) is kept.
    """
    scored_keywords = kw_model.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 1),
        stop_words='english',
    )
    words = []
    for entry in scored_keywords:
        words.append(entry[0])
    return words


# Keyword lists for the ground-truth and the generated text of every row
df_filtered['ground_truth_words'] = df_filtered['ground_truth'].apply(extract_keywords)
df_filtered['LLM_generated_words'] = df_filtered['llm_generated'].apply(extract_keywords)


In [None]:
# Jaccard overlap between the two keyword lists of a row
def jaccard_similarity(row):
    """Return |A ∩ B| / |A ∪ B| for the row's two keyword lists (0 if both are empty)."""
    truth_words = set(row['ground_truth_words'])
    generated_words = set(row['LLM_generated_words'])

    all_words = truth_words | generated_words
    if not all_words:
        return 0

    shared_words = truth_words & generated_words
    return len(shared_words) / len(all_words)

# Apply Jaccard Similarity to each row (keyword lists, not the raw texts)
df_filtered['jaccard_similarity_topic'] = df_filtered.apply(jaccard_similarity, axis=1)
# Bare expression so the notebook displays the mean
df_filtered['jaccard_similarity_topic'].mean()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between the two keyword lists of a row
def cosine_sim(row):
    """Return the bag-of-words cosine similarity of the row's two keyword lists.

    Either list being empty (or only whitespace once joined) gives 0.0, as does
    a vectorizer that rejects the input with ValueError.
    """
    truth_text = ' '.join(row['ground_truth_words'])
    generated_text = ' '.join(row['LLM_generated_words'])

    # both sides need real content before vectorizing
    if not (truth_text.strip() and generated_text.strip()):
        return 0.0

    try:
        count_matrix = CountVectorizer().fit_transform([truth_text, generated_text])
        similarity = cosine_similarity(count_matrix.toarray())
        return similarity[0, 1]
    except ValueError:
        # the vectorizer found no usable vocabulary
        return 0.0

# Keyword-level cosine similarity for every row, followed by its mean over all rows
df_filtered['cosine_similarity_topic'] = df_filtered.apply(cosine_sim, axis=1)
mean_sim = df_filtered['cosine_similarity_topic'].mean()
print(mean_sim)


In [None]:
import re

# Collect every span written as **...** in a text
def extract_text_between_asterisks(text):
    """Return the contents of all non-overlapping '**...**' spans in *text*, in order.

    Matching is lazy, so each span ends at the nearest closing '**'; a span
    cannot cross a line break.
    """
    bold_span = re.compile(r'\*\*(.*?)\*\*')
    return bold_span.findall(text)

# Apply the function to both text columns; each new cell holds the list of
# '**...**' spans found in the corresponding text
df_filtered['ground_truth_extracted'] = df_filtered['ground_truth'].apply(extract_text_between_asterisks)
df_filtered['llm_generated_extracted'] = df_filtered['llm_generated'].apply(extract_text_between_asterisks)
