### Limitation Generation with 3 Agents by Llama 3 8B

In [None]:
import pandas as pd
import time
import signal
import sys
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

start_time = time.time()  # timestamp taken when this cell is executed

# Module-level state read by signal_handler when a termination signal arrives.
# global_df is the DataFrame the handler dumps to CSV; the handler skips saving while it is None.
global_df = None
# Row counter that the handler embeds in the emergency-save file name.
global_current_row = 0

def signal_handler(signum, frame):
    """Save progress to CSV and exit when a termination signal is received.

    Reads the module-level ``global_df`` and ``global_current_row``. When
    ``global_df`` is set, it is written to two places: an emergency file named
    after the current row and the regular output file. Each write is attempted
    independently, so a failure on one target (for example an unwritable
    directory) no longer prevents the other from being written.

    Args:
        signum: Number of the received signal (only used in the log message).
        frame: Current stack frame supplied by the ``signal`` module; unused.

    Raises:
        SystemExit: Always, with exit code 0, after the save attempts.
    """
    print(f"\n⚠️ Received signal {signum}. Saving progress before termination...")
    if global_df is not None:
        save_targets = (
            (f"/emergency_save_{global_current_row}.csv", "Emergency save completed"),
            ("df_neurips_limitations_3_agents.csv", "Final output updated"),
        )
        for target_path, success_label in save_targets:
            try:
                global_df.to_csv(target_path, index=False)
            except Exception as exc:
                # Best effort: we are shutting down, so report the failure and
                # still try the remaining target instead of aborting the handler.
                print(f"  ❌ Could not write {target_path}: {exc}")
            else:
                print(f"  🚨 {success_label}: {target_path}")

    print("  📊 Progress saved. Exiting gracefully...")
    sys.exit(0)

In [None]:
# Register signal handlers so that an interrupted run goes through signal_handler
# (which saves progress) instead of the default action.
signal.signal(signal.SIGTERM, signal_handler)  # PBS termination
signal.signal(signal.SIGINT, signal_handler)   # Ctrl+C

# Load Llama 3 8B model and tokenizer (4-bit quantized)
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
# Relative directory passed as cache_dir to both from_pretrained calls below.
cache_dir = "llama3_8b"
# Quantization settings handed to the model loader: 4-bit NF4 with double
# quantization, computing in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16"
)

print("Loading Llama 3 8B model and tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir)
# device_map="auto" leaves device placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    cache_dir=cache_dir,
    quantization_config=bnb_config,
    device_map="auto"
)

In [None]:
# Set pad token if not set: fall back to the EOS token, because llama_generate
# tokenizes with padding=True and therefore needs a pad token to exist.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def truncate_prompt_for_model(prompt: str, max_length: int = 8000) -> str:
    """Truncate ``prompt`` to at most ``max_length`` tokens.

    The prompt is tokenized with the module-level ``tokenizer``. If it fits, the
    original string is returned unchanged; otherwise the first ``max_length``
    tokens are decoded back to text.

    Args:
        prompt: Prompt text, possibly containing chat-template markers such as
            ``<|start_header_id|>`` / ``<|eot_id|>``.
        max_length: Maximum number of prompt tokens to keep.

    Returns:
        The original prompt, or its truncated prefix.
    """
    # add_special_tokens=False: count only the tokens of the prompt text itself,
    # without an automatically prepended BOS token.
    tokens = tokenizer.encode(prompt, return_tensors="pt", add_special_tokens=False)

    if tokens.shape[1] > max_length:
        print(f"⚠️ Prompt token count = {tokens.shape[1]} exceeds limit ({max_length}). Truncating...")
        truncated_tokens = tokens[:, :max_length]
        # Keep special tokens when decoding: dropping them would erase the
        # chat-template markers embedded in the prompt.
        # NOTE: truncation keeps the head of the prompt, so anything at the very
        # end (e.g. a trailing assistant header) is still cut off.
        return tokenizer.decode(truncated_tokens[0], skip_special_tokens=False)

    return prompt

In [None]:
def llama_generate(prompt, max_new_tokens=512):
    """Generate a completion for ``prompt`` with the loaded Llama 3 8B model.

    Args:
        prompt: Full prompt text. The prompt builders in this file already embed
            the Llama 3 chat markers, including the leading ``<|begin_of_text|>``.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded continuation only (prompt tokens are sliced off), with
        special tokens removed.
    """
    # Same cap the tokenizer call below enforces.
    # NOTE(review): assumes this is the model's context window — confirm against the model config.
    context_window = 8192
    # Reserve room for the tokens about to be generated so that prompt plus
    # completion stays within the window (never more than the previous 8000 cap).
    prompt_budget = max(1, min(8000, context_window - max_new_tokens))
    truncated_prompt = truncate_prompt_for_model(prompt, max_length=prompt_budget)

    # Only let the tokenizer add special tokens when the prompt does not already
    # start with BOS; otherwise the sequence would begin with two BOS tokens.
    bos_token = tokenizer.bos_token
    prompt_has_bos = bool(bos_token) and truncated_prompt.startswith(bos_token)

    # Tokenize with proper padding and attention mask
    inputs = tokenizer(
        truncated_prompt,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=context_window,
        return_attention_mask=True,
        add_special_tokens=not prompt_has_bos,
    ).to(model.device)

    # Stop on the tokenizer's EOS and, when the vocabulary has it, on the
    # end-of-turn token that closes every turn in the prompts built in this file.
    stop_ids = [tokenizer.eos_token_id]
    eot_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")
    if (
        isinstance(eot_id, int)
        and eot_id != tokenizer.unk_token_id
        and eot_id not in stop_ids
    ):
        stop_ids.append(eot_id)

    output = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.7,
        top_p=0.95,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=stop_ids,
    )
    # Drop the echoed prompt tokens; only the newly generated part is returned.
    prompt_len = inputs['input_ids'].shape[1]
    return tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True)

In [None]:
# Agent-specific prompts - each receives both paper_content and cited_papers
def get_extractor_prompt(paper_content: str, cited_papers: str) -> str:
    """Build the chat prompt for the Extractor agent.

    The result consists of a fixed system turn (instructions for extracting the
    limitations explicitly stated by the authors), a user turn carrying
    ``paper_content`` and ``cited_papers``, and an opened assistant turn.

    Args:
        paper_content: Text of the paper, inserted under "Paper Content:".
        cited_papers: Text inserted under "Cited Papers Information:".

    Returns:
        The complete prompt string.
    """
    # The system turn has no placeholders, so it is a plain string literal.
    system_turn = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are an expert in scientific literature analysis. Your task is to carefully read the provided scientific article
and extract all explicitly stated limitations as mentioned by the authors. Focus on sections such as Discussion, Conclusion, or
Limitations. List each limitation verbatim, including direct quotes where possible, and provide a brief context (e.g., what aspect of
the study the limitation pertains to). Ensure accuracy and avoid inferring or adding limitations not explicitly stated. If no limitations
are mentioned, state this clearly.

Workflow:

Plan: Outline which sections (e.g., Discussion, Conclusion, Limitations) to analyze and identify tools (e.g., text extraction) to
access the article content. Justify the selection of sections based on their likelihood of containing limitation statements.

Reasoning: Let's think step by step to ensure thorough and accurate extraction of limitations:
Step 1: Identify all sections in the article that may contain limitations. For example, the Discussion often includes limitations as
authors reflect on their findings, while a dedicated Limitations section is explicit.
Step 2: Use text extraction tools to retrieve content from these sections. Verify that the content is complete and accurate.
Step 3: Scan for explicit limitation statements, such as phrases like "a limitation of this study" or "we acknowledge that."
Document why each statement qualifies as a limitation.
Step 4: For each identified limitation, extract the verbatim quote (if available) and note the context (e.g., related to sample size,
methodology).
Step 5: Check for completeness by reviewing other potential sections (e.g., Conclusion) to ensure no limitations are missed.
Analyze: Use tools to extract and verify the article's content, focusing on explicit limitation statements. Cross-reference extracted
quotes with the original text to ensure accuracy.

Reflect: Verify that all relevant sections were checked and no limitations were missed. Consider whether any section might have been
overlooked and re-evaluate if necessary.
Continue: Do not terminate until all explicitly stated limitations are identified or confirmed absent.

Output Format:
Bullet points listing each limitation.
For each: Verbatim quote (if available), context (e.g., aspect of the study), and section reference.
If none: "No limitations explicitly stated in the article."

Tool Use:
Use text extraction tools to access and verify article content.
Do not assume content; retrieve it directly from the provided article.

Chain of Thoughts:
During the Reasoning step, document the thought process explicitly. For example:
"I selected the Discussion section because authors often discuss study constraints there."
"I found the phrase 'a limitation of this study' in the Limitations section, indicating an explicit limitation."
"I checked the Conclusion section to ensure no additional limitations were mentioned, confirming completeness."
This narrative ensures transparency and justifies each decision in the extraction process.

"""
    # User turn with the two inputs, followed by the opened assistant turn.
    user_turn = f"""<|eot_id|><|start_header_id|>user<|end_header_id|>

Paper Content:
{paper_content}

Cited Papers Information:
{cited_papers}

Please extract and list the key limitations found in this paper. Be specific and provide clear reasoning for each limitation identified.

<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""
    return system_turn + user_turn

In [None]:
def get_analyzer_prompt(paper_content: str, cited_papers: str) -> str:
    """Build the chat prompt for the Analyzer agent.

    The result consists of a fixed system turn (instructions for inferring
    limitations the authors did not state), a user turn carrying
    ``paper_content`` and ``cited_papers``, and an opened assistant turn.

    Args:
        paper_content: Text of the paper, inserted under "Paper Content:".
        cited_papers: Text inserted under "Cited Papers Information:".

    Returns:
        The complete prompt string.
    """
    # The system turn has no placeholders, so it is a plain string literal.
    system_turn = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a critical scientific reviewer with expertise in research methodology and analysis. Your task is to analyze
the provided scientific article and identify potential limitations not explicitly stated by the authors. Focus on aspects such as study
design, sample size, data collection methods, statistical analysis, scope of findings, and underlying assumptions. For each inferred
limitation, provide a clear explanation of why it is a limitation and how it impacts the study's validity, reliability,
or generalizability. Ensure inferences are grounded in the article's content and avoid speculative assumptions.

Workflow:
Plan: Identify key areas (e.g., methodology, sample size, statistical analysis) to analyze and select tools (e.g., text analysis) to
verify article details. Justify the selection based on their potential to reveal limitations.
Reasoning: Let's think step by step to identify inferred limitations:

Step 1: Review the article's methodology to identify gaps (e.g., study design flaws, sampling issues).
Step 2: Use text analysis tools to extract relevant details (e.g., sample size, statistical methods).
Step 3: Evaluate each area for potential limitations, such as small sample size affecting generalizability or unaddressed assumptions.
Step 4: Document why each gap qualifies as a limitation and its impact on the study.
Step 5: Ensure all key areas are covered to avoid missing potential limitations.
Analyze: Critically evaluate the article, using tools to confirm content, and infer limitations based on methodological or analytical gaps.
Reflect: Assess whether inferred limitations are grounded in the article and relevant to its validity, reliability, or generalizability.
Re-evaluate overlooked areas if necessary.

Continue: Iterate until all potential inferred limitations are identified.
Output Format:
Bullet points listing each inferred limitation.
For each: Description, explanation, and impact on the study.

Tool Use:
Use text analysis tools to verify article content (e.g., methodology, results).
Avoid assumptions; base inferences on retrieved content.

Chain of Thoughts:
During the Reasoning step, document the thought process explicitly. For example:
"The methodology section mentions a convenience sample, which may limit generalizability."
"The statistical analysis lacks adjustment for confounders, potentially affecting validity."
"I checked the results section to ensure no additional gaps were missed."
This narrative ensures transparency and justifies each inferred limitation.

"""
    # User turn with the two inputs, followed by the opened assistant turn.
    user_turn = f"""<|eot_id|><|start_header_id|>user<|end_header_id|>

Paper Content:
{paper_content}

Cited Papers Information:
{cited_papers}

Please provide a detailed analysis of the limitations in this research. Consider both obvious and subtle limitations that could affect the validity and applicability of the findings.

<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""
    return system_turn + user_turn

In [None]:
def get_reviewer_prompt(paper_content: str, cited_papers: str) -> str:
    """Build the chat prompt for the Reviewer agent.

    The result consists of a fixed system turn (instructions for reviewing the
    paper from an open-peer-review perspective), a user turn carrying
    ``paper_content`` and ``cited_papers``, and an opened assistant turn.

    Args:
        paper_content: Text of the paper, inserted under "Paper Content:".
        cited_papers: Text inserted under "Cited Papers Information:".

    Returns:
        The complete prompt string.
    """
    # The system turn has no placeholders, so it is a plain string literal.
    system_turn = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are an expert in open peer review with a focus on transparent and critical evaluation of scientific research.
Your task is to review the provided scientific article from the perspective of an external peer reviewer. Identify potential limitations
that might be raised in an open review process, considering common critiques such as reproducibility, transparency, generalizability,
or ethical considerations. Leverage insights from similar studies or common methodological issues in the field by searching the web or
X posts for context, if needed.

Workflow:
Plan: Identify areas for review (e.g., reproducibility, transparency, ethics) and plan searches for external context
(e.g., similar studies, methodological critiques). Justify the selection based on peer review standards.
Reasoning: Let's think step by step to identify peer-review limitations:

Step 1: Select key areas for review (e.g., reproducibility, ethics) based on common peer review critiques.
Step 2: Use text analysis tools to extract relevant article details (e.g., methods, data reporting).
Step 3: Identify potential limitations, such as lack of transparency in data or ethical concerns, and justify using article content.
Step 4: Search web/X for external context (e.g., similar studies) to support limitations, rating source relevance
(high, medium, low, none).
Step 5: Synthesize findings, ensuring limitations align with peer review standards and are supported by article or external context.
Analyze: Critically review the article, integrating external context to identify limitations. Use tools to verify content and sources.
Reflect: Verify that limitations align with peer review standards and are supported by the article or external context.
Re-evaluate overlooked areas if necessary.

Continue: Iterate until all relevant peer-review limitations are identified.

Output Format:
Bullet points listing each limitation.
For each: Description, why it's a concern, and alignment with peer review standards.
Include citations for external sources in the format Source Name, if used.

Tool Use:
Use web/X search tools to find relevant literature or methodological critiques.
Use text analysis tools to verify article content.

Chain of Thoughts:
During the Reasoning step, document the thought process explicitly. For example:
"I selected reproducibility because peer reviewers often critique data availability."
"The article lacks a data sharing statement, which limits reproducibility."
"A web search revealed similar studies provide data openly, supporting this limitation."
This narrative ensures transparency and justifies each identified limitation.

"""
    # User turn with the two inputs, followed by the opened assistant turn.
    user_turn = f"""<|eot_id|><|start_header_id|>user<|end_header_id|>

Paper Content:
{paper_content}

Cited Papers Information:
{cited_papers}

Please provide a critical review identifying the limitations and areas of concern in this research. Consider what a peer reviewer would highlight as weaknesses or areas needing improvement.

<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""
    return system_turn + user_turn

In [None]:
def get_merger_prompt(extractor_output: str, analyzer_output: str, reviewer_output: str) -> str:
    """Build the chat prompt for the merging ("Master Coordinator") step.

    The result consists of a fixed system turn (instructions for combining the
    three agents' findings into one non-redundant list), a user turn carrying
    the three agent outputs, and an opened assistant turn.

    Args:
        extractor_output: Text inserted under "Extractor Agent Analysis:".
        analyzer_output: Text inserted under "Analyzer Agent Analysis:".
        reviewer_output: Text inserted under "Reviewer Agent Analysis:".

    Returns:
        The complete prompt string.
    """
    # The system turn has no placeholders, so it is a plain string literal.
    system_turn = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a **Master Coordinator**, an expert in scientific communication and synthesis. Your task is to integrate limitations provided by three specialized agents:

**Agents:**
1. **Extractor** (explicit limitations from the article),
2. **Analyzer** (inferred limitations from critical analysis),
3. **Reviewer** (limitations from an open review perspective).

**Goals**:
1. Combine all limitations into a cohesive, non-redundant list.
2. Ensure each limitation is clearly stated, scientifically valid, and aligned with the article's content.
3. Prioritize critical limitations that affect the paper's validity and reproducibility.
4. Format the final list in a clear, concise, and professional manner, suitable for a scientific review or report.

**Workflow**:
1. **Plan**: Outline how to synthesize limitations, identify potential redundancies, and resolve discrepancies.
2. **Analyze**: Combine limitations, prioritizing critical ones, and verify alignment with the article.
3. **Reflect**: Check for completeness, scientific rigor, and clarity.
4. **Continue**: Iterate until the list is comprehensive, non-redundant, and professionally formatted.

**Output Format**:
- Numbered list of final limitations.
- For each: Clear statement, brief justification, and source in brackets (e.g., [Author-stated], [Inferred], [Peer-review-derived]).

"""
    # User turn with the three agent outputs, followed by the opened assistant turn.
    user_turn = f"""<|eot_id|><|start_header_id|>user<|end_header_id|>

Extractor Agent Analysis:
{extractor_output}

Analyzer Agent Analysis:
{analyzer_output}

Reviewer Agent Analysis:
{reviewer_output}

Please merge these three different perspectives on the paper's limitations into a comprehensive, well-organized analysis. Synthesize the insights, resolve any contradictions, and provide a unified view of the paper's limitations.

<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""
    return system_turn + user_turn


In [None]:
# Function to run a specific agent
def run_agent(agent_name: str, paper_content: str, cited_papers: str, agent_prompt_func) -> str:
    """Build one agent's prompt, generate a response and return it stripped.

    Any exception raised while building the prompt or generating is caught,
    logged, and reported through the return value instead of propagating.

    Args:
        agent_name: Label used in progress and error messages.
        paper_content: Passed as first argument to ``agent_prompt_func``.
        cited_papers: Passed as second argument to ``agent_prompt_func``.
        agent_prompt_func: Callable ``(paper_content, cited_papers) -> prompt``.

    Returns:
        The stripped model output, or an ``"ERROR in <agent_name> agent: ..."``
        string when something failed.
    """
    print(f"  Running {agent_name} agent...")
    try:
        prompt_text = agent_prompt_func(paper_content, cited_papers)
        generated = llama_generate(prompt_text, max_new_tokens=512)
        print(f"  {agent_name} agent completed")
        return generated.strip()
    except Exception as err:
        print(f"  Error in {agent_name} agent: {err}")
        return f"ERROR in {agent_name} agent: {err}"

# Load the input papers. A missing or unreadable file aborts the run with exit
# status 1. sys.exit is used rather than the interactive exit() helper, which is
# meant for the interpreter prompt and is not guaranteed to exist in every runtime.
print("Loading CSV file...")
try:
    df = pd.read_csv("df_neruips_21_22_final.csv")
except FileNotFoundError:
    print("Error: CSV file not found. Please check the file path.")
    sys.exit(1)
except Exception as e:
    print(f"Error loading CSV file: {e}")
    sys.exit(1)
else:
    print(f"Successfully loaded CSV file with shape: {df.shape}")

# Columns the pipeline expects in the CSV. A missing one only produces a warning
# here; the combined text below falls back to '' for absent columns.
required_columns = [
    'df_Abstract', 'df_Introduction', 'df_Related_Work', 'df_Methodology',
    'df_Dataset', 'df_Conclusion', 'df_Experiment_and_Results',
    'LLM_extracted_future_work', 'relevance_8_cited_in', 'relevance_8_cited_by',
]

missing_columns = [name for name in required_columns if name not in df.columns]
if missing_columns:
    print(f"Warning: Missing columns: {missing_columns}")
    print(f"Available columns: {df.columns.tolist()}")

# (label, source column) pairs in the order they appear in the combined text:
# the named sections first, then Extra1..Extra7 read from df_col_2..df_col_8.
_combined_fields = [
    ("Abstract", "df_Abstract"),
    ("Introduction", "df_Introduction"),
    ("Related_Work", "df_Related_Work"),
    ("Methodology", "df_Methodology"),
    ("Dataset", "df_Dataset"),
    ("Conclusion", "df_Conclusion"),
    ("Experiment_and_Results", "df_Experiment_and_Results"),
    ("LLM_extracted_future_work", "LLM_extracted_future_work"),
] + [(f"Extra{n}", f"df_col_{n + 1}") for n in range(1, 8)]

# One "Label: value" line per field, joined by newlines, for every row.
df['combined'] = df.apply(
    lambda row: "\n".join(
        f"{label}: {row.get(column, '')}" for label, column in _combined_fields
    ),
    axis=1,
)

print(f"DataFrame shape: {df.shape}")
print("Processing all samples with three agents...")

### RAG Settings

In [None]:
!pip3 -q install rank_bm25
!pip3 -q install llama-index-llms-langchain
!pip3 -q install langchain_community
!pip3 -q install llama_index
!pip3 -q install sentence_transformers
!pip3 -q install langchain

import os
import sys
import io
import time
import logging
import torch
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

from openai import OpenAI

In [None]:
# API key: keep a key that is already present in the environment and only fall
# back to the (empty) placeholder when none is set, instead of overwriting it.
os.environ.setdefault('OPENAI_API_KEY', '')
# Created here, before the `from langchain.llms import OpenAI` import below
# rebinds the name OpenAI.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# LangChain
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.docstore.document import Document
from langchain.schema import Document as LCDocument
from langchain.schema import SystemMessage, HumanMessage
from langchain.document_loaders import CSVLoader, DirectoryLoader, TextLoader, DataFrameLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.retrievers import BM25Retriever, EnsembleRetriever

# LlamaIndex
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, Settings
from llama_index.core.schema import Document as LIDoc
from llama_index.core.service_context import ServiceContext
from llama_index.core.service_context_elements.llm_predictor import LLMPredictor
from llama_index.embeddings.langchain import LangchainEmbedding

# Sentence Transformers
from sentence_transformers import SentenceTransformer

# Tokenizer
import tiktoken

# -------------------------------
# Token budget helpers
# -------------------------------
MAX_CONTEXT_TOKENS = 127_000
MODEL_NAME = "gpt-4o-mini"

def truncate_for_context(query: str, passages: list[str],
    max_tokens: int = MAX_CONTEXT_TOKENS,
    model: str = MODEL_NAME,
) -> list[str]:
    """Keep as many leading passages as fit in the token budget left by ``query``.

    Passages are taken in order. The first passage that does not fit is cut to
    the remaining budget (and dropped entirely if nothing remains); all later
    passages are discarded.

    Args:
        query: Text whose token count is subtracted from ``max_tokens``.
        passages: Candidate passages, in priority order.
        max_tokens: Total token budget for query plus passages.
        model: Model name used to select the tiktoken encoding.

    Returns:
        The passages that were kept, the last one possibly truncated.
    """
    encoder = tiktoken.encoding_for_model(model)
    remaining = max_tokens - len(encoder.encode(query, disallowed_special=()))
    selected = []
    for passage in passages:
        token_ids = encoder.encode(passage, disallowed_special=())
        if len(token_ids) <= remaining:
            selected.append(passage)
            remaining -= len(token_ids)
            continue
        # This passage overflows the budget: keep whatever prefix still fits, then stop.
        if remaining > 0:
            selected.append(encoder.decode(token_ids[:remaining]))
        break
    return selected

def count_tokens(text: str, model: str = MODEL_NAME) -> int:
    """Return the number of tokens in ``text`` under the tiktoken encoding for ``model``."""
    token_ids = tiktoken.encoding_for_model(model).encode(text, disallowed_special=())
    return len(token_ids)

def ensure_passages_within_budget(
    query: str,
    passages: list[str],
    max_tokens: int = MAX_CONTEXT_TOKENS,
    model: str = MODEL_NAME,
) -> list[str]:
    """Return ``passages`` unchanged if they fit the budget, otherwise truncate them.

    The size check counts the tokens of ``query`` concatenated with the passages
    joined by blank lines. Only when that exceeds ``max_tokens`` is
    ``truncate_for_context`` invoked.

    Args:
        query: Query text that shares the budget with the passages.
        passages: Candidate passages, in priority order.
        max_tokens: Total token budget.
        model: Model name used to select the tiktoken encoding.

    Returns:
        The original list, or the truncated list from ``truncate_for_context``.
    """
    joined_text = query + "\n\n".join(passages)
    token_total = count_tokens(joined_text, model=model)
    if token_total > max_tokens:
        print(f"Truncating context ({token_total} tokens)…")
        return truncate_for_context(query, passages, max_tokens=max_tokens, model=model)
    return passages

# -------------------------------
# Retriever setup
# -------------------------------
# Embedding model shared by every FAISS index built in make_retriever_for_docs.
hf_emb = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

def make_retriever_for_docs(docs, k=3):
    """Build an equally weighted FAISS + BM25 ensemble retriever over ``docs``.

    Args:
        docs: Documents to index; passed to both ``FAISS.from_documents`` and
            ``BM25Retriever.from_documents``.
        k: Number of results each underlying retriever is configured to return.

    Returns:
        An ``EnsembleRetriever`` combining both retrievers with weights 0.5 / 0.5.
    """
    # Vector retriever backed by a FAISS index embedded with hf_emb.
    dense_retriever = FAISS.from_documents(docs, hf_emb).as_retriever(
        search_kwargs={"k": k}
    )

    # Keyword retriever (BM25) over the same documents.
    sparse_retriever = BM25Retriever.from_documents(docs)
    sparse_retriever.k = k

    return EnsembleRetriever(
        retrievers=[dense_retriever, sparse_retriever],
        weights=[0.5, 0.5],
    )

# -------------------------------
# QA pipeline
# -------------------------------
# Chat model used by the RetrievalQA chain below; temperature 0.
chat_llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)

def generate_limitations(question, retriever):
    """Answer ``question`` with a RetrievalQA chain over ``retriever``.

    A new "stuff" chain (retrieved documents are placed directly into the
    prompt) is created on every call; source documents are not returned.

    Args:
        question: Query passed to the chain.
        retriever: Retriever supplying the context documents.

    Returns:
        The chain's answer with surrounding whitespace stripped.
    """
    chain = RetrievalQA.from_chain_type(
        llm=chat_llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=False,
    )
    answer = chain.run(question)
    return answer.strip()


In [None]:
from langchain.schema import SystemMessage, HumanMessage
from langchain.chat_models import ChatOpenAI
from sentence_transformers import SentenceTransformer
from langchain.embeddings import HuggingFaceEmbeddings
import os, sys, logging

# ─── API Key ─────────────────────────────────────────────
# Keep a key already provided by the environment; only fall back to the (empty)
# placeholder when none is set, instead of overwriting it.
os.environ.setdefault('OPENAI_API_KEY', '')

# ─── Logging ─────────────────────────────────────────────
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
# basicConfig normally installs a stdout handler already. Add one explicitly only
# when the root logger has none for the current sys.stdout, so records are not
# printed twice and re-running this cell does not stack more handlers.
_root_logger = logging.getLogger()
if not any(
    isinstance(h, logging.StreamHandler) and getattr(h, "stream", None) is sys.stdout
    for h in _root_logger.handlers
):
    _root_logger.addHandler(logging.StreamHandler(stream=sys.stdout))

# ─── Model + Embeddings ──────────────────────────────────
MODEL_NAME = "gpt-4o-mini"
MAX_CONTEXT_TOKENS = 127_000

chat_llm = ChatOpenAI(temperature=0, model_name=MODEL_NAME)

# Both embedding objects below load the same sentence-transformers checkpoint.
_EMBEDDING_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"

# SentenceTransformer (manual embeddings if needed)
embed_model = SentenceTransformer(_EMBEDDING_MODEL_ID)

# LangChain wrapper for embeddings
hf_emb = HuggingFaceEmbeddings(model_name=_EMBEDDING_MODEL_ID)

# ─── Helper to extract text from docs ─────────────────────
def get_doc_text(d):
    """Return the text of a document-like object.

    Looks for, in this order: a ``text`` attribute, a ``get_content()`` method,
    and a ``content`` attribute. The first one present wins.

    Args:
        d: Object exposing at least one of the three accessors.

    Returns:
        The value of the first accessor found.

    Raises:
        AttributeError: If ``d`` has none of the three accessors.
    """
    for accessor in ("text", "get_content", "content"):
        if hasattr(d, accessor):
            found = getattr(d, accessor)
            # get_content is a method and must be called; the other two are plain attributes.
            return found() if accessor == "get_content" else found
    raise AttributeError(f"No text attr on {type(d)}")

# ─── Input sections and reference suffixes ────────────────
# Names of the DataFrame columns that hold the paper's own sections.
main_cols = [
    "df_Abstract",
    "df_Introduction",
    "df_Related_Work",
    "df_Methodology",
    "df_Dataset",
    "df_Conclusion",
    "df_Experiment_and_Results"
]

# Section-name suffixes for reference material.
# NOTE(review): how these suffixes are turned into column names is not visible
# in this part of the file — confirm against the code that consumes ref_suffixes.
ref_suffixes = [
    "Introduction",
    "Related_Work",
    "Methodology",
    "Dataset",
    "Conclusion",
    "Experiment_and_Results",
    "Limitation",
    "Extra"
]

# ─── System Prompt ───────────────────────────────────────
# System message asking the model to generate limitations for the supplied passages.
# run_critic does not pick this up implicitly (its system_prompt parameter
# defaults to None), so it has to be passed explicitly.
system_prompt = """You are a helpful, respectful, and honest assistant
for generating limitations or shortcomings of a research paper.
Generate limitations or shortcomings for the following passages
from the scientific paper.
"""

# ─── Unified helper for LLM calls ─────────────────────────
def run_critic(prompt: str, *, system_prompt: str | None = None) -> str:
    """Send ``prompt`` to the module-level chat model and return its reply.

    Args:
        prompt: Content of the user message.
        system_prompt: Optional system message placed before the user message;
            omitted when ``None`` or empty.

    Returns:
        The content of the model's response with surrounding whitespace stripped.
    """
    # A falsy system prompt (None or "") contributes no message at all.
    leading = [SystemMessage(content=system_prompt)] if system_prompt else []
    conversation = [*leading, HumanMessage(content=prompt)]

    reply = chat_llm.invoke(conversation)
    return reply.content.strip()


In [None]:
# Instruction prompt for an LLM "Relevance Evaluation Agent": given the paper (the
# input query) and a list of retrieved chunks, it asks for a 1–10 relevance score
# plus a short justification per chunk, returned as a JSON array.
# NOTE(review): the text is written for exactly 10 chunks, while the retrieval
# loop later in this file configures its retrievers with k = 20 — confirm how
# the retrieved chunks are reduced or batched before this prompt is used.
Rel_Prompt = '''You are a Relevance Evaluation Agent, an expert in assessing the relevance of retrieved text chunks from a vector
database against an input query for the task of generating limitations of scientific articles. Your task is to evaluate the relevance
of 10 retrieved text chunks against an input query, which consists of a scientific paper (including sections: Abstract, Introduction,
Methodology, Related Work, Experiment and Results, Limitations, and Future Work) and its rewritten version. For each chunk, assign a
relevance score from 1 (least relevant) to 10 (most relevant) based on semantic and contextual alignment with the input query, and
provide a brief justification for the score.

Input:

Input Query: [The full text of the original scientific paper and its rewritten version]
Retrieved Text Chunks: A list of 10 text chunks, each with a unique identifier and content, formatted as:

Chunk 1: [chunk_id_1]: [retrieved_text_1]
Chunk 2: [chunk_id_2]: [retrieved_text_2]
Chunk 3: [chunk_id_3]: [retrieved_text_3]
Chunk 4: [chunk_id_4]: [retrieved_text_4]
Chunk 5: [chunk_id_5]: [retrieved_text_5]
Chunk 6: [chunk_id_6]: [retrieved_text_6]
Chunk 7: [chunk_id_7]: [retrieved_text_7]
Chunk 8: [chunk_id_8]: [retrieved_text_8]
Chunk 9: [chunk_id_9]: [retrieved_text_9]
Chunk 10: [chunk_id_10]: [retrieved_text_10]

Instructions:

Evaluate Relevance: For each of the 10 retrieved text chunks, assess its relevance to the input query based on semantic and contextual
alignment with the original and rewritten scientific paper. Consider how closely the chunk matches key concepts, arguments, or details
in the query.

Assign Relevance Score:
High Scores (8–10): The chunk has strong semantic and contextual alignment with the input query, closely matching key concepts or details.
Prioritize chunks containing limitations (e.g., study constraints, challenges) or methodological summaries (e.g., study design, methods),
boosting their score by 1–2 points if they align well with the query.

Medium Scores (4–7): The chunk has moderate semantic and contextual alignment, containing relevant but less central content (e.g., results,
general context, or partial methodological details).

Low Scores (1–3): The chunk has minimal or no semantic and contextual alignment, such as unrelated content, generic statements, or
off-topic information.

Prioritize Limitations and Methodology: Chunks explicitly discussing limitations (e.g., sample size, data constraints, scope issues) or
methodological summaries (e.g., study design, experimental setup) are highly relevant. Boost their score by 1–2 points if they align
well with the input query, compared to other relevant content.

Provide Justification: For each chunk, include a brief justification explaining the assigned score, referencing the chunk’s semantic and
contextual alignment with the input query and noting whether it contains limitations or methodological summaries.

Do Not Modify Text: Evaluate each chunk as provided, without modifying or paraphrasing the retrieved text.

Handle Irrelevant Chunks: If a chunk is unrelated to the input query or lacks meaningful content, assign a score of 1 with an appropriate
justification.

Workflow:
Plan: Review the input query (original and rewritten paper) and the 10 retrieved text chunks to understand their content and context.

Reasoning:
Step 1: For each chunk, identify its main topic or content (e.g., limitations, methodology, results, background).
Step 2: Compare the chunk’s content to the input query, assessing semantic and contextual alignment with the paper’s sections
(e.g., Limitations, Methodology).
Step 3: Assign a relevance score (1–10) based on alignment, prioritizing limitations and methodological summaries.
Step 4: Write a brief justification for the score, explaining the chunk’s relevance and any priority given to limitations or methodology.
Step 5: Verify the score and justification are accurate and consistent with the chunk’s content and the input query.

Analyze: Use text analysis tools to confirm semantic alignment (e.g., keyword matching for “limitation,” “constraint,” “methodology,” “sample size”) and assess relevance to the input query.
Reflect: Ensure scores and justifications are fair, consistent, and reflect the chunk’s alignment with the query, re-evaluating any ambiguous cases.
Continue: Iterate until all 10 chunks are evaluated with scores and justifications.

Tool Use:
Use text analysis tools to identify limitation-related or methodology-related keywords (e.g., “limited,” “constraint,” “sample size,” “methodology”) and assess semantic similarity between chunks and the input query.
Use semantic similarity checks to confirm alignment between the chunk and the query’s key concepts.

Chain of Thoughts: Document the reasoning process internally for each chunk. For example:
“This chunk mentions a small sample size, a limitation, and aligns closely with the query’s focus, so it receives a high score (9).”
“This chunk discusses results without addressing limitations or methodology, so it receives a medium score (6).”
“This chunk is generic and unrelated to the query’s specific content, so it receives a low score (1).”

Output Format: The output must be in strict JSON format, containing an array of 10 objects, one for each retrieved text chunk, with the
following structure for each object:
"Chunk_number": [Chunk number, e.g., "Chunk 1", "Chunk 2", ..., "Chunk 10"]
"relevance_score": [Integer from 1 to 10]
"justification": [Brief explanation of the score, referencing the chunk’s semantic and contextual alignment with the query and any emphasis on limitations or methodological summaries]

Example: Input: Input Query: [Full text of the original scientific paper and its rewritten version] Retrieved Text Chunks:

Chunk 1: chunk_001: The study was limited by a small sample size, which may affect generalizability.
Chunk 2: chunk_002: The experiment used a randomized controlled trial design to test the algorithm.
Chunk 3: chunk_003: The experiment achieved a 20% improvement in processing speed.
...
Chunk 10: chunk_010: Data processing is a key challenge in modern research.

Output: [ { "Chunk_number": "Chunk 1", "relevance_score": 9, "justification": "The chunk has strong semantic and contextual alignment with
the input query, explicitly discussing a limitation (small sample size), which is a high-priority element for limitation generation." },
{ "Chunk_number": "Chunk 2", "relevance_score": 8, "justification": "The chunk aligns well with the input query by describing the
methodological approach, a high-priority element, though it is slightly less central than limitations-related content." },
{ "Chunk_number": "Chunk 3", "relevance_score": 6, "justification": "The chunk has moderate semantic and contextual alignment,
discussing experimental results, but lacks focus on limitations or methodology, resulting in a mid-range score." },
...
{ "Chunk_number": "Chunk 10", "relevance_score": 3, "justification": "The chunk provides generic background information with minimal
semantic and contextual alignment to the input query’s specific concepts or arguments." } ] '''

### Measure the relevance score of each retrieved chunk against the input

In [None]:
import pandas as pd
from langchain.vectorstores import FAISS
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
from langchain.text_splitter import CharacterTextSplitter
import tiktoken

MODEL_NAME = "gpt-4o-mini"
MAX_CONTEXT_TOKENS = 127_000
hf_emb = HuggingFaceEmbeddings()
enc = tiktoken.encoding_for_model(MODEL_NAME)

# Output columns, created up front so every row has a cell to write into
df1["cited_by_top_20_raw"] = None
df1["cited_by_top_20_texts"] = None
df1["cited_by_top_20_meta"] = None
df1["retrieved_text_llm_asses"] = None

# Loop invariants: the splitter and the closing question are identical for every row.
splitter = CharacterTextSplitter(chunk_size=512, chunk_overlap=32)
question = (
    # A real line break separates the last chunk from the question
    # (the previous "\\n" put a literal backslash-n into the prompt).
    "\n\nOn a scale of 1–10, how relevant is each chunk to the above Input Query? "
    "Respond with JSON array with Chunk Number, Score, and Justification for each chunk."
)
question_len = count_tokens(question, model=MODEL_NAME)

# For each of the first 325 rows: index the citing papers, retrieve the 20 chunks
# closest to the paper, and ask the LLM to score them in two batches of 10.
for i, row in df1.iloc[:325].iterrows():
    print("index",i)

    # The paper text is needed both for retrieval and for the scoring prompt;
    # without it there is nothing to score against.
    example_query = row.get("response_string_neurips", "")
    if not isinstance(example_query, str) or not example_query.strip():
        continue

    cited_list = row.get("cited_by_full_text", [])
    if not isinstance(cited_list, (list, tuple)):
        # NaN / None / still-serialised string: no usable citing papers for this row
        print(f"Skipping row {i}: 'cited_by_full_text' is not a list")
        continue

    # ---------- Build one Document per abstract / section ---------------
    all_docs = []
    for j, cited_dict in enumerate(cited_list):
        if not isinstance(cited_dict, dict):
            continue
        meta = {
            "row_number": cited_dict.get("row_number", ""),
            "file_name": cited_dict.get("file_name", ""),
            "position": j,
        }

        abstract = cited_dict.get("abstractText", "")
        if isinstance(abstract, str) and abstract.strip():
            all_docs.append(Document(
                page_content="Abstract: " + abstract.strip(),
                metadata=dict(meta)  # each Document gets its own metadata dict
            ))

        sections = cited_dict.get("sections", [])
        if not isinstance(sections, (list, tuple)):
            sections = []
        for sec in sections:
            if not isinstance(sec, dict):
                continue
            # `or ""` tolerates an explicit None heading/text
            heading = (sec.get("heading") or "").strip()
            text = (sec.get("text") or "").strip()
            if text:
                combined = f"{heading}: {text}" if heading else text
                all_docs.append(Document(page_content=combined, metadata=dict(meta)))

    if not all_docs:
        continue

    # ---------- Chunk and index ---------------
    chunked_docs = splitter.split_documents(all_docs)
    print(f"Total number of chunks for row {i}: {len(chunked_docs)}")

    faiss_store = FAISS.from_documents(chunked_docs, hf_emb)
    faiss_r = faiss_store.as_retriever(search_kwargs={"k": 20})
    bm25_r = BM25Retriever.from_documents(chunked_docs)
    bm25_r.k = 20

    # Dense + lexical retrieval, equally weighted
    ensemble = EnsembleRetriever(
        retrievers=[faiss_r, bm25_r],
        weights=[0.5, 0.5]
    )

    # ---------- Retrieve with the row-specific query ---------------
    input_query = (
        f"Scientific paper:\n{example_query}\n\n"
        f"Rewritten version of scientific paper:\n{row['Input_Query_rewrite']}"
    )
    top20 = ensemble.get_relevant_documents(input_query)
    top20 = top20[:20]

    df1.at[i, "cited_by_top_20_raw"] = top20  # full Document objects (content + metadata)
    df1.at[i, "cited_by_top_20_texts"] = [doc.page_content for doc in top20]  # just the texts
    df1.at[i, "cited_by_top_20_meta"] = [doc.metadata for doc in top20]  # just the metadata

    # ---------- Batch LLM scoring ---------------
    all_llm_scores = []

    prefix = (
        f"{Rel_Prompt}\n\n"
        f"Input Query:\n{example_query}\n\n"
        "Here are up to 10 retrieved text chunks:\n"
    )
    prefix_len = count_tokens(prefix, model=MODEL_NAME)
    # Token budget left for chunk texts; the same for both batches of this row
    available = MAX_CONTEXT_TOKENS - prefix_len - question_len

    for batch_start in (0, 10):
        batch_docs = top20[batch_start:batch_start+10]
        if not batch_docs:
            continue  # skip empty batches
        batch_texts = [d.page_content for d in batch_docs]

        # Keep chunks in order until the token budget is exhausted
        kept, used = [], 0
        for p in batch_texts:
            toks = enc.encode(p, disallowed_special=())
            if used + len(toks) > available:
                break
            kept.append(p)
            used += len(toks)

        # Chunk numbers are global (1..20) so they map back to cited_by_top_20_texts
        chunks_list = "\n\n".join(
            f"Chunk {batch_start+idx+1}: {text}" for idx, text in enumerate(kept)
        )
        prompt = prefix + chunks_list + question

        raw = run_critic(prompt)
        all_llm_scores.append(raw)  # raw LLM reply; JSON is parsed in a later cell

    df1.at[i, "retrieved_text_llm_asses"] = all_llm_scores

    # Optional debug print
    if i in (0, 20):
        print(f"all_llm_scores for row {i}:", all_llm_scores)

df1.to_csv("df.csv",index=False)

In [None]:
import pandas as pd
import ast
import json
import re

# Placeholder: simulate loading df
# df = pd.read_csv("your_path.csv")

# Pre-create the column that will hold the high-score chunks.
# The df1.apply(extract_top_chunks, ...) call at the end of this cell overwrites it.
df1["top_chunks_texts"] = None

def extract_top_chunks(row, threshold=7):
    """Return the retrieved chunk texts the LLM scored at or above ``threshold``.

    Args:
        row: Mapping-like row (dict or pandas Series) with
            ``retrieved_text_llm_asses`` (list of raw LLM replies, or its
            string representation) and ``cited_by_top_20_texts`` (list of
            chunk texts, or its string representation).
        threshold: Minimum ``relevance_score`` for a chunk to be kept.

    Returns:
        List of chunk texts in the order the LLM reported them, or ``None``
        when the row cannot be parsed or no chunk qualifies.
    """
    def _as_list(cell):
        # Columns come back as strings after a CSV round-trip
        if isinstance(cell, str):
            try:
                cell = ast.literal_eval(cell)
            except (ValueError, SyntaxError, TypeError):
                return None
        return list(cell) if isinstance(cell, (list, tuple)) else None

    raw_list = _as_list(row.get("retrieved_text_llm_asses"))
    text_list = _as_list(row.get("cited_by_top_20_texts"))
    if raw_list is None or text_list is None:
        return None

    all_chunks = []
    for entry in raw_list:
        if not isinstance(entry, str):
            continue
        # Extract the JSON array, ignoring code fences or prose around it
        match = re.search(r"\[.*\]", entry, re.DOTALL)
        if not match:
            continue
        try:
            chunk_json = json.loads(match.group())
        except json.JSONDecodeError:
            continue

        for item in chunk_json:
            # Problems are handled per item so that one malformed entry does
            # not discard the remaining chunks of the same batch.
            if not isinstance(item, dict):
                continue
            try:
                score = float(item.get("relevance_score", 0))
            except (TypeError, ValueError):
                continue
            if not score >= threshold:
                continue
            # "Chunk 7" -> 7; the last run of digits is the chunk number
            digits = re.findall(r"\d+", str(item.get("Chunk_number", "")))
            if not digits:
                continue
            chunk_idx = int(digits[-1]) - 1
            if 0 <= chunk_idx < len(text_list):
                all_chunks.append(text_list[chunk_idx])

    return all_chunks or None

# Run extract_top_chunks on every row (axis=1) and store its per-row result
# in the 'top_chunks_texts' column.
df1["top_chunks_texts"] = df1.apply(extract_top_chunks, axis=1)


In [None]:
import ast
import pandas as pd
import numpy as np

def _parse_top_chunks(cell):
    """Turn one 'top_chunks_texts' cell into a list (or NaN when missing)."""
    if isinstance(cell, str):
        # String-serialised list, e.g. after a CSV round-trip
        return ast.literal_eval(cell)
    if isinstance(cell, (list, tuple)):
        # Already a real list (column still in memory): nothing to parse.
        # pd.notnull() on a list returns an array and literal_eval rejects
        # non-strings, so these cells must bypass both.
        return cell
    return np.nan


# Only apply ast.literal_eval to strings; lists pass through, missing values become NaN
df1['top_chunks_texts'] = df1['top_chunks_texts'].apply(_parse_top_chunks)

In [None]:
import ast
import pandas as pd

# Both columns hold string-serialised Python lists; a missing cell is treated
# as the empty list "[]" so that literal_eval can be applied to every value.
for _list_col in ('retrieved_text_llm_asses', 'top20_docs'):
    _serialised = df[_list_col].fillna('[]')
    df[_list_col] = _serialised.apply(ast.literal_eval)

In [None]:
# take chunk where relevance score 8 or more

def pick_high_relevance(row, threshold=8):
    """Select the docs whose same-position assessment scores at least ``threshold``.

    Args:
        row: Mapping-like row with ``top20_docs`` (list of docs) and
            ``retrieved_text_llm_asses`` (list of assessments). The i-th
            assessment is taken to describe the i-th doc.
        threshold: Minimum ``relevance_score`` for a doc to be kept.

    Returns:
        List of selected docs in their original order; empty when either
        cell is not a list or nothing qualifies.
    """
    docs  = row['top20_docs']
    asses = row['retrieved_text_llm_asses']
    if not isinstance(docs, (list, tuple)) or not isinstance(asses, (list, tuple)):
        return []

    def _qualifies(assessment):
        # Only dict assessments carry a score; LLM output may encode it as a
        # string ("8") or null, which must not abort the whole apply().
        if not isinstance(assessment, dict):
            return False
        try:
            return float(assessment.get('relevance_score', 0)) >= threshold
        except (TypeError, ValueError):
            return False

    # zip() pairs by position and stops at the shorter list, which also
    # guards against assessments that have no matching doc.
    return [doc for doc, assessment in zip(docs, asses) if _qualifies(assessment)]

# Row-wise selection (axis=1) with the default threshold of pick_high_relevance;
# the selected docs are stored per row in 'relevance_8_cited_in'.
df['relevance_8_cited_in'] = df.apply(pick_high_relevance, axis=1)


In [None]:
import ast
import pandas as pd

# The column stores string-serialised lists; missing cells are replaced by "[]"
# first so that literal_eval can be applied to every value.
_serialised_texts = df['cited_by_top_20_texts'].fillna('[]')
df['cited_by_top_20_texts'] = _serialised_texts.apply(ast.literal_eval)

In [None]:
# Parse the string-serialised assessment lists (missing cells count as "[]")
_serialised_asses = df['retrieved_text_llm_asses_cited_by'].fillna('[]')
df['retrieved_text_llm_asses_cited_by'] = _serialised_asses.apply(ast.literal_eval)

In [None]:
import json
import re

def extract_chunk_dicts(cell):
    """Flatten a list of raw LLM replies into one list of chunk dicts.

    Each reply is expected to contain a JSON array of chunk dicts, usually
    wrapped in a ```json ... ``` fence. Replies with text around the fence,
    or a fence without the ``json`` tag, are handled by falling back to the
    outermost ``[...]`` span.

    Args:
        cell: List (or tuple) of reply strings. Anything else (None, NaN,
            a bare string) yields an empty result.

    Returns:
        Flat list of all dicts found, in reply order.
    """
    if not isinstance(cell, (list, tuple)):
        return []

    out = []
    for s in cell:
        if not isinstance(s, str):
            continue

        # 1) remove the ```json fences
        s_clean = re.sub(r'^```json\s*', '', s.strip())
        s_clean = re.sub(r'```$',      '', s_clean.strip())

        # 2) parse the JSON; on failure retry on the outermost [...] span
        try:
            data = json.loads(s_clean)
        except json.JSONDecodeError:
            bracketed = re.search(r'\[.*\]', s, re.DOTALL)
            if not bracketed:
                continue
            try:
                data = json.loads(bracketed.group())
            except json.JSONDecodeError:
                continue

        # 3) if it's a list of dicts, extend; otherwise skip
        if isinstance(data, list):
            out.extend(d for d in data if isinstance(d, dict))
    return out

# Flatten each row's list of raw LLM replies into a list of chunk dicts and
# keep the result in a separate '_upd' column (the raw replies stay untouched).
df['retrieved_text_llm_asses_cited_by_upd'] = df['retrieved_text_llm_asses_cited_by'].apply(extract_chunk_dicts)


In [None]:
# take chunk where relevance score 8 or more

def pick_high_relevance(row, threshold=8):
    """Select the cited-by chunk texts whose same-position assessment scores at least ``threshold``.

    Args:
        row: Mapping-like row with ``cited_by_top_20_texts`` (list of chunk
            texts) and ``retrieved_text_llm_asses_cited_by_upd`` (flat list
            of assessment dicts). The i-th assessment is taken to describe
            the i-th text.
        threshold: Minimum ``relevance_score`` for a text to be kept.

    Returns:
        List of selected texts in their original order; empty when either
        cell is not a list or nothing qualifies.
    """
    docs  = row['cited_by_top_20_texts']
    asses = row['retrieved_text_llm_asses_cited_by_upd']
    if not isinstance(docs, (list, tuple)) or not isinstance(asses, (list, tuple)):
        return []

    def _qualifies(assessment):
        # Only dict assessments carry a score; LLM output may encode it as a
        # string ("8") or null, which must not abort the whole apply().
        if not isinstance(assessment, dict):
            return False
        try:
            return float(assessment.get('relevance_score', 0)) >= threshold
        except (TypeError, ValueError):
            return False

    # zip() pairs by position and stops at the shorter list, which also
    # guards against assessments that have no matching text.
    return [doc for doc, assessment in zip(docs, asses) if _qualifies(assessment)]

# Row-wise selection (axis=1) with the default threshold of pick_high_relevance;
# the selected chunk texts are stored per row in 'relevance_8_cited_by'.
df['relevance_8_cited_by'] = df.apply(pick_high_relevance, axis=1)


In [None]:
# Prompt template for the citation-based limitation agent.
# The text contains a `{cited_papers}` placeholder and Llama 3 chat-template
# markers (<|eot_id|>, <|start_header_id|>...), so it is presumably meant to be
# filled in before being sent to a Llama model.
# NOTE(review): the cell that builds `prompt` from this constant concatenates it
# as-is (placeholder unfilled) — confirm that is intended.
Citation_agent = '''You are an expert scientific research assistant tasked with inferring potential limitations for an unspecified
current scientific article based solely on its cited papers.
You are given information from multiple cited papers, which are assumed to be referenced by the current article.
Your goal is to analyze these cited works and identify possible limitations that the current paper may have, by
comparing its presumed scope, methods, or results against the cited literature.
Because the input paper itself is not provided, you must reason from the cited papers alone, identifying what
gaps, stronger methods, broader coverage, or alternative results the cited works might expose in the hypothetical
current paper that cites them.

Objective:

Generate a list of scientifically grounded limitations that the current article might have, assuming it builds upon or is informed by the provided cited papers.

Each limitation should:

Be concise

Reference the relevant cited paper(s) by title

Clearly explain how the cited paper exposes a potential limitation

Be plausible and insightful based on common scientific reasoning

Workflow:
Plan:
Identify key insights, strengths, and scopes of the cited papers that could set a high bar or reveal blind spots
in a hypothetical citing article.

Reasoning: Let's think step by step to infer limitations:
Review each cited paper to extract its methodology, findings, and scope.
Ask: If a paper cited this work but did not adopt or address its insights, what limitation might arise?
Identify where the cited paper offers better methodology, broader scope, or contradicting findings.
Formulate each limitation as a plausible shortcoming of a hypothetical article that builds on—but possibly
underutilizes—these cited works.

Justify each limitation based on specific attributes of the cited paper (e.g., "more comprehensive dataset",
"stronger evaluation metric", etc.)

Analyze:
Develop a set of inferred limitations, each tied to specific cited paper(s) and grounded in logical comparison.

Reflect:
Ensure coverage of all relevant cited papers and validate that each limitation is scientifically plausible in
context.

Output Format:
Bullet points listing each limitation.
For each: Description, explanation, and reference to the cited paper(s) in the format Paper Title.

Tool Use (if applicable):

Use citation lookup tools or document content to extract accurate summaries.
Do not assume details about the input paper—focus only on drawing limitations based on differences, omissions,
or underuse of the cited works.

Chain of Thoughts:
During the Reasoning step, document the thought process explicitly. For example:
"I selected [Paper X] because it uses a more robust method than the current article."
"The current article's simpler method may limit accuracy compared to [Paper X]."
"I reviewed all cited papers to ensure no relevant gaps were missed."
This narrative ensures transparency and justifies each identified limitation.

<|eot_id|><|start_header_id|>user<|end_header_id|>

Cited Papers Information:
{cited_papers}

Please identify limitations that would be relevant for researchers who might cite this paper in future work.
Consider what limitations future authors might mention when discussing this paper's contribution to the field,
based on the cited papers context.

<|eot_id|><|start_header_id|>assistant<|end_header_id|>

'''


Generate limitations using both the cited-in (referenced) papers and the cited-by (citing) papers

In [None]:
# Section columns that make up the full paper text, in the order they are emitted
cols_to_concat = [
    f"neurips_{section}"
    for section in (
        "Abstract",
        "Introduction",
        "Related_Work",
        "Methodology",
        "Dataset",
        "Conclusion",
        "Experiment_and_Results",
        "Extra",
    )
]


def concat_with_labels(row):
    """Join the non-empty section texts of ``row`` into one labelled string.

    Every column in ``cols_to_concat`` that holds a non-blank string becomes a
    "<Label>: <text>" paragraph, where the label is the column name without
    the "neurips_" prefix and with underscores turned into spaces. Paragraphs
    are separated by a blank line; other columns and value types are ignored.
    """
    labelled_values = (
        (name.replace("neurips_", "").replace("_", " "), row.get(name))
        for name in cols_to_concat
    )
    return "\n\n".join(
        f"{label}: {value.strip()}"
        for label, value in labelled_values
        if isinstance(value, str) and value.strip()
    )

# Build the labelled full-paper text for every row; the result is stored in the
# 'response_string' column.
df["response_string"] = df.apply(concat_with_labels, axis=1)

In [None]:
import re
import pandas as pd
import tiktoken

# Tokenization setup
encoding   = tiktoken.encoding_for_model("gpt-4o-mini")
max_tokens = 128000

def truncate_to_max_tokens(text: str, max_length: int) -> str:
    """Return ``text`` cut to at most ``max_length`` tokens of ``encoding``.

    Text that already fits is returned unchanged (no encode/decode round-trip).
    """
    # disallowed_special=() encodes special-token look-alikes that occur in
    # the text as ordinary text instead of raising, matching the chunk
    # tokenisation used in the relevance-scoring cell.
    tokens = encoding.encode(text, disallowed_special=())
    return encoding.decode(tokens[:max_length]) if len(tokens) > max_length else text


def _cell_to_texts(cell):
    """Normalise one 'relevance_8_*' cell into a list of text items."""
    if isinstance(cell, str):
        # A bare string is one text; iterating it would yield single characters
        return [cell] if cell.strip() else []
    if not isinstance(cell, (list, tuple)):
        return []  # None / NaN / anything that is not a collection of items
    # if an item is a dict with a 'text' key, grab that, otherwise str(item)
    return [
        itm['text'] if isinstance(itm, dict) and 'text' in itm else str(itm)
        for itm in cell
    ]


# Make sure the output column exists
df['citation_agent_in_by_8'] = ''

# Process each row. Labels from iterrows() are used for both reading and
# writing so that the loop does not depend on the index being 0..n-1.
for i, row in df.iterrows():
    print(f"Processing row {i}...")

    # 1) Texts selected from the papers this article cites / that cite it
    cited_in_block = "\n".join(_cell_to_texts(row.get('relevance_8_cited_in')))
    cited_by_block = "\n".join(_cell_to_texts(row.get('relevance_8_cited_by')))

    # 2) Build the combined prompt section
    combined_cited_input = (
        "Referenced papers:\n" + cited_in_block +
        "\n\nPapers who cited this paper:\n" + cited_by_block
    )

    # 3) Assemble the prompt.
    # NOTE(review): Citation_agent is prepended verbatim, i.e. with its
    # `{cited_papers}` placeholder unfilled and its Llama chat markers intact —
    # confirm this is what should be sent to azure_run_critic.
    input_paper = row['response_string']
    prompt = Citation_agent + (
        "You are an assistant tasked to generate limitations or shortcomings "
        "in a scientific article. Below is the input paper:\n"
        f"{input_paper}\n\n"
        " Below is the relevant text from both the papers "
        "that this article cites and those that cite it.\n\n"
        f"{combined_cited_input}\n\n"
        "Please generate limitations based on this information."
    )

    # 4) Truncate and call LLM; a failed call is recorded instead of aborting the run
    truncated = truncate_to_max_tokens(prompt, max_tokens)
    try:
        llm_summary = azure_run_critic(truncated)
    except Exception as e:
        print(f"Error at row {i}: {e}")
        llm_summary = "ERROR"

    df.at[i, "citation_agent_in_by_8"] = llm_summary


In [None]:
# Initialize new columns for each agent
df['limitations_extractor'] = ""
df['limitations_analyzer'] = ""
df['limitations_reviewer'] = ""
df['limitations_merged_final'] = ""

output_file = "df_neurips_limitations_3_agents.csv"
checkpoint_every = 5  # rows between two checkpoint saves

# The signal handler saves whatever `global_df` points to; df is modified in
# place, so binding it once is enough.
global_df = df

# Process each row with all agents. `row_num` is the 1-based progress counter,
# `i` the index label used for writing results back.
for row_num, (i, row) in enumerate(df.iterrows(), start=1):
    global_current_row = row_num  # read by the signal handler for the emergency file name

    print(f"\n=== Processing row {row_num}/{len(df)} ===")
    # NOTE(review): the evaluation section of this notebook rebuilds 'combined'
    # as a list of ground-truth/LLM pairs — confirm the column holds the paper
    # text when this cell runs.
    paper_content = row['combined']

    # Get cited papers information for all agents
    cited_in = row.get('relevance_8_cited_in', '')
    cited_by = row.get('relevance_8_cited_by', '')
    cited_papers = f"Papers cited by this article:\n{cited_in}\n\nPapers that cited this article:\n{cited_by}"

    # Run all three agents
    extractor_output = run_agent("Extractor", paper_content, cited_papers, get_extractor_prompt)
    analyzer_output = run_agent("Analyzer", paper_content, cited_papers, get_analyzer_prompt)
    reviewer_output = run_agent("Reviewer", paper_content, cited_papers, get_reviewer_prompt)

    # Store individual agent outputs
    df.at[i, 'limitations_extractor'] = extractor_output
    df.at[i, 'limitations_analyzer'] = analyzer_output
    df.at[i, 'limitations_reviewer'] = reviewer_output

    # Merge all agent outputs; a failure is recorded in the cell instead of stopping the run
    try:
        merger_prompt = get_merger_prompt(extractor_output, analyzer_output, reviewer_output)
        merged_output = llama_generate(merger_prompt, max_new_tokens=512)
        df.at[i, 'limitations_merged_final'] = merged_output.strip()
    except Exception as e:
        df.at[i, 'limitations_merged_final'] = f"ERROR in Master Coordinator agent: {str(e)}"

    print(f"  Row {row_num} completed")

    # Save progress after every `checkpoint_every` completed rows
    # (the previous `i % 5 == 0` test fired after rows 1, 6, 11, ...).
    if row_num % checkpoint_every == 0:
        df.to_csv(output_file, index=False)
        print(f"  ✅ Checkpoint saved at row {row_num}")

# Save results
df.to_csv(output_file, index=False)
print(f"\nResults saved to: {output_file}")

end_time = time.time()
elapsed = end_time - start_time
print(f"Elapsed time: {elapsed:.1f} s")


### Evaluation

Ground Truth coverage

In [None]:
# making lists of list 'master_agent' text
import re

# Output column: one list of [number, text] pairs per row
df['limitations_merged_final_list'] = None

for row_idx in range(len(df)):
    merged_text = df.at[row_idx, "limitations_merged_final"]

    numbered_items = []
    # Missing or non-string cells simply yield an empty list
    if isinstance(merged_text, str):
        # A new item starts at a blank line that is followed by "<digits>."
        for chunk in re.split(r'\n\n(?=\d+\.)', merged_text.strip()):
            hit = re.match(r'(\d+)\.\s*(.*)', chunk, re.S)
            if hit:
                # Chunks that do not start with a number (e.g. a preamble) are dropped
                numbered_items.append([int(hit.group(1)), hit.group(2).strip()])

    df.at[row_idx, 'limitations_merged_final_list'] = numbered_items


In [None]:
# making lists of list 'ground truth' text

import re

# Output column: one list of [number, text] pairs per row
df['Lim_and_OR_ground_truth_list'] = None

for row_idx in range(len(df)):
    gt_text = df.at[row_idx, "Lim_and_OR_ground_truth_final"]

    if isinstance(gt_text, str):
        # Items are separated by a blank line followed by "<digits>."
        pieces = re.split(r'\n\n(?=\d+\.)', gt_text.strip())
        hits = (re.match(r'(\d+)\.\s*(.*)', piece, flags=re.S) for piece in pieces)
        # Pieces that do not start with a number are skipped
        parsed = [[int(hit.group(1)), hit.group(2).strip()] for hit in hits if hit]
    else:
        # Missing / non-string ground truth -> no items
        parsed = []

    df.at[row_idx, 'Lim_and_OR_ground_truth_list'] = parsed


In [None]:
# Pair every ground-truth limitation with every LLM-generated limitation.
# Start from a fresh empty list per row so the column can hold list objects.
df['combined'] = [[] for _ in range(len(df))]

for i in range(len(df)):
    gt_items = df["Lim_and_OR_ground_truth_list"][i]
    llm_items = df["limitations_merged_final_list"][i]

    # Full cross product, ground truth varying slowest; all pairs are kept
    # (no truncation is applied).
    df.at[i, 'combined'] = [
        (gt_item, llm_item)
        for gt_item in gt_items
        for llm_item in llm_items
    ]

In [None]:
import os
import base64
import time
import pandas as pd
# from openai import AzureOpenAI, RateLimitError

import os
import time
from openai import OpenAI

# Set up OpenAI API.
# setdefault() keeps a key that is already exported in the environment; only
# when none is set does the (empty) placeholder below apply. Do not commit a
# real key here.
os.environ.setdefault('OPENAI_API_KEY', '')
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Define the OpenAI streaming function for GPT-4o-mini
def run_critic_openai(prompt: str):
    """Send ``prompt`` as a single user message to gpt-4o-mini and return the reply.

    The completion is streamed (temperature 0) and the content deltas are
    concatenated; the result is returned with surrounding whitespace stripped.
    """
    stream = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "user", "content": prompt}
        ],
        stream=True,
        temperature=0
    )
    pieces = []
    for chunk in stream:
        # Some stream chunks carry no choices; indexing [0] on them would raise
        # IndexError.
        if not chunk.choices:
            continue
        pieces.append(chunk.choices[0].delta.content or "")
    return "".join(pieces).strip()

# Bookkeeping for the batch-processing loop below.
# NOTE(review): `all_generated_summary` is not referenced again in this cell —
# confirm it is still needed.
all_generated_summary = []
# Restart the wall-clock timer (rebinds the `start_time` set at the top of the notebook).
start_time = time.time()

import json

llm_results = []
df['LLM_eval'] = ''

# One LLM call per row: all (ground truth, generated) pairs of that row are
# judged together and the raw reply is stored in 'LLM_eval'.
for idx in range(len(df)):
    print("idx is",idx)
    pairs = df.at[idx, 'combined']   # expected: list of (ground-truth item, generated item) tuples
    if not (isinstance(pairs, list) and pairs):
        # Nothing to compare for this row; 'LLM_eval' keeps its empty default
        llm_results.append(None)
        continue

    # Named-pairs block, numbered from 1
    pair_blocks = [
        f"Pair {number}:\n  List1: {first}\n  List2: {second}"
        for number, (first, second) in enumerate(pairs, start=1)
    ]
    formatted = "\n".join(pair_blocks)

    instructions = (
        "For each of the following pairs, answer “Yes” if List1 contains a topic or limitation\n"
        "from List2, or List2 contains a topic or limitation from from List1; otherwise answer “No”.\n"
        "Respond *only* with a JSON object mapping each Pair name to “Yes” or “No”.\n\n"
        "Pairs:\n"
    )
    prompt = instructions + f"{formatted}"

    resp_text = run_critic_openai(prompt)
    llm_results.append(resp_text)
    df.at[idx, 'LLM_eval'] = resp_text


In [None]:
import re
# Extract the 'Yes' / 'No' verdict of every pair from the raw LLM replies.
pattern = r'"Pair\s*\d+"\s*:\s*"(Yes|No)"'
# Same pattern, but also capturing the pair number so that verdicts can be
# placed by number instead of by order of appearance.
numbered_pattern = r'"Pair\s*(\d+)"\s*:\s*"(Yes|No)"'

# all_matches[row][k] is the verdict for "Pair k+1" of that row, or None when
# the reply has no (well-formed) verdict for it. Placing by number keeps later
# pairs aligned even if the model skips or garbles one answer.
all_matches = []
for idx in range(len(df)):
    raw = df.at[idx, 'LLM_eval']
    verdicts = []
    if isinstance(raw, str):
        by_number = {
            int(number): answer
            for number, answer in re.findall(numbered_pattern, raw)
            if int(number) >= 1
        }
        if by_number:
            verdicts = [by_number.get(k) for k in range(1, max(by_number) + 1)]
    all_matches.append(verdicts)


In [None]:
import pandas as pd

# Flatten df['combined'] into one record per (ground truth, generated) pair,
# attaching the LLM verdict found at the same position in all_matches.
rows = []
for idx, tuples in df['combined'].items():
    if not isinstance(tuples, list):
        continue
    # Verdicts parsed for this source row (empty when the row has none)
    matches = all_matches[idx] if idx < len(all_matches) else []

    for j, (list1, list2) in enumerate(tuples):
        rows.append({
            'source_row':    idx,
            'Ground_Truth':  list1,
            'LLM_generated': list2,
            # j-th verdict, or None if the reply had fewer verdicts than pairs
            'is_match':      matches[j] if j < len(matches) else None,
        })

result_df = pd.DataFrame(rows)


In [None]:
import re

def extract_leading_number(x):
    """Return the leading item number of ``x`` as an int, or None.

    A non-empty list is represented by its first element. An int is returned
    as is; a string yields its leading run of digits (optional leading
    whitespace, optional trailing dot). Everything else gives None.
    """
    val = x
    if isinstance(x, list) and x:
        val = x[0]

    if isinstance(val, int):
        return val
    if not isinstance(val, str):
        return None

    # matches "123." as well as a bare "123"
    m = re.match(r'^\s*(\d+)(?:\.)?', val)
    return int(m.group(1)) if m else None

# Store the item number of each side of the pair in its own column:
# 'gt_number' from 'Ground_Truth', 'llm_gen_number' from 'LLM_generated'.
result_df['gt_number']        = result_df['Ground_Truth'].apply(extract_leading_number)
result_df['llm_gen_number']   = result_df['LLM_generated'].apply(extract_leading_number)


In [None]:
# ground truth coverage

# A "section" is one ground-truth limitation of one source paper: a run of
# consecutive rows sharing the same (source_row, gt_number). The section counts
# as covered when at least one of its rows has is_match == 'Yes'.
# Keying on gt_number alone would merge the last section of one paper with the
# first section of the next whenever both carry the same number.
current_section = None
section_has_yes = False
match = 0

for source_row, gt_number, is_match in zip(
    result_df['source_row'], result_df['gt_number'], result_df['is_match']
):
    section = (source_row, gt_number)
    if section == current_section:
        # Still inside the same section
        if is_match == 'Yes':
            section_has_yes = True
    else:
        # A new section starts: settle the previous one first
        if section_has_yes:
            match += 1
        current_section = section
        section_has_yes = (is_match == 'Yes')

# Settle the last section after exiting the loop
if section_has_yes:
    match += 1
print(match)


# total number of unique ground truth

# Label consecutive runs of identical 'Ground_Truth' values with a running block id
unique_blocks = result_df['Ground_Truth'].ne(result_df['Ground_Truth'].shift()).cumsum()

# Group by these blocks and count the rows in each
ck = result_df.groupby(unique_blocks)['gt_number'].agg(['count'])

# Output the results
print("Number of unique consecutive 'ground_truth' texts and their counts:")
print(ck)


Measuring quality between matched pairs (NLP-based metrics)

Ground Truth: ground_truth || LLM_Generated limitation: llm_generated

In [None]:
# Rename the pair columns to the lower-case names used by the metric cells below.
# NOTE(review): 'Ground_Truth' / 'LLM_generated' are created on `result_df`
# earlier in this notebook — confirm that `df` holds that table at this point;
# rename() does not complain about columns that are absent.
df.rename(columns={
    'Ground_Truth': 'ground_truth',
    'LLM_generated': 'llm_generated',
    # 'Is_same': 'is_match',
}, inplace=True)

In [None]:
# Drop rows whose 'is_match' is 'no' (compared case-insensitively).
# NOTE(review): rows with a missing verdict are not equal to 'no' and are
# therefore kept as well — confirm that is intended.
df_filtered = df[df['is_match'].str.lower() != 'no']

In [None]:
# Renumber the remaining rows 0..n-1 (the old index is discarded); the later
# column-wise pd.concat of the metric columns lines up on this index.
df_filtered = df_filtered.reset_index(drop=True)

In [None]:
# BERTScore (all)
!pip3 -q install bert-score


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h

BERTScore for whole texts

In [None]:
import pandas as pd
from bert_score import BERTScorer

# One shared scorer instance (roberta-large, English), reused for every row
scorer = BERTScorer(model_type='roberta-large', lang="en")


def calculate_bertscore(row):
    """Return the BERTScore F1 between a row's 'ground_truth' and 'llm_generated' texts.

    Each row is scored as a single-pair batch, so the mean of the F1 tensor is
    simply that pair's F1, returned as a Python float.
    """
    # NOTE(review): ground truth is passed first and the generated text second;
    # verify this against the argument order BERTScorer.score expects.
    scores = scorer.score([row['ground_truth']], [row['llm_generated']])
    f1 = scores[2]  # third element of the returned triple
    return f1.mean().item()


# Score every row and keep the result in 'bert_score'
df_filtered['bert_score'] = df_filtered.apply(calculate_bertscore, axis=1)


In [None]:
# Calculate the average of the 'bert_score' column of df_filtered
average_bert_score = df_filtered['bert_score'].mean()

# Display the average (last expression of the cell)
average_bert_score


In [None]:
!pip3 -q install rouge_score

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
import pandas as pd

# Initialize ROUGE scorer.
# NOTE: this rebinds the imported module name `rouge_scorer` to a RougeScorer
# instance; the name is kept because calculate_metrics looks it up.
rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Function to calculate similarity metrics for each row
def calculate_metrics(row):
    """Compute lexical similarity metrics between 'ground_truth' and 'llm_generated'.

    Returns:
        dict with the keys 'rouge1', 'rouge2', 'rougeL' (F-measures),
        'cosine_similarity' (bag-of-words counts), 'jaccard_similarity'
        (whitespace-token sets) and 'bleu_score' (sentence-level BLEU).
    """
    reference = row['ground_truth']
    candidate = row['llm_generated']
    metrics = {}

    # ROUGE scores
    rouge_scores = rouge_scorer.score(reference, candidate)
    metrics['rouge1'] = rouge_scores['rouge1'].fmeasure
    metrics['rouge2'] = rouge_scores['rouge2'].fmeasure
    metrics['rougeL'] = rouge_scores['rougeL'].fmeasure

    # Cosine similarity of the word-count vectors
    try:
        vectors = CountVectorizer().fit_transform([reference, candidate]).toarray()
        metrics['cosine_similarity'] = cosine_similarity(vectors)[0, 1]
    except ValueError:
        # Empty vocabulary (both texts without usable tokens): no overlap,
        # handled the same way as in the topic-level cosine_sim helper.
        metrics['cosine_similarity'] = 0.0

    # Jaccard similarity of the whitespace-token sets
    reference_tokens = reference.split()
    candidate_tokens = candidate.split()
    set1 = set(reference_tokens)
    set2 = set(candidate_tokens)
    union = len(set1 | set2)
    metrics['jaccard_similarity'] = len(set1 & set2) / union if union > 0 else 0

    # BLEU score (single reference, default weights, no smoothing)
    metrics['bleu_score'] = sentence_bleu([reference_tokens], candidate_tokens)

    return metrics

# Apply the function to each row in the DataFrame
metric_results = df_filtered.apply(calculate_metrics, axis=1)

# Expand the dictionaries into separate columns. The metric frame reuses
# df_filtered's index so that the column-wise concat pairs up the right rows
# whatever that index looks like.
metric_results_df = pd.DataFrame(metric_results.tolist(), index=df_filtered.index)
df_filtered = pd.concat([df_filtered, metric_results_df], axis=1)


In [None]:
# Mean of every metric column of df_filtered, keyed by a display label
_metric_columns = [
    ('Average ROUGE-1', 'rouge1'),
    ('Average ROUGE-2', 'rouge2'),
    ('Average ROUGE-L', 'rougeL'),
    ('Average Cosine Similarity', 'cosine_similarity'),
    ('Average Jaccard Similarity', 'jaccard_similarity'),
    ('Average BLEU Score', 'bleu_score'),
]
average_metrics = {
    label: df_filtered[column].mean()
    for label, column in _metric_columns
}

# Show the averages (last expression of the cell)
average_metrics


Topic similarity

In [None]:
!pip3 -q install keybert

from keybert import KeyBERT

# Keyword model, created once and shared by extract_keywords
kw_model = KeyBERT()

# Keyword extraction needs plain strings: missing values become "" and
# everything else is cast to str.
for _text_col in ('ground_truth', 'llm_generated'):
    df_filtered[_text_col] = df_filtered[_text_col].fillna("").astype(str)


def extract_keywords(text):
    """Return the single-word English keywords KeyBERT extracts from ``text``.

    Only the first element of each returned entry (the keyword itself) is kept.
    """
    ranked = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 1), stop_words='english')
    return [entry[0] for entry in ranked]


# Keyword lists for both sides of every pair
df_filtered['ground_truth_words'] = df_filtered['ground_truth'].apply(extract_keywords)
df_filtered['LLM_generated_words'] = df_filtered['llm_generated'].apply(extract_keywords)


In [None]:
def jaccard_similarity(row):
    """Return the Jaccard similarity of a row's two keyword lists.

    The lists in 'ground_truth_words' and 'LLM_generated_words' are compared
    as sets: |intersection| / |union|, or 0 when both are empty.
    """
    gt_terms = set(row['ground_truth_words'])
    llm_terms = set(row['LLM_generated_words'])
    all_terms = gt_terms | llm_terms
    if not all_terms:
        return 0
    return len(gt_terms & llm_terms) / len(all_terms)

# Keyword-level Jaccard similarity for every row, stored in
# 'jaccard_similarity_topic'; the mean over all rows is the cell's output.
df_filtered['jaccard_similarity_topic'] = df_filtered.apply(jaccard_similarity, axis=1)
df_filtered['jaccard_similarity_topic'].mean()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def cosine_sim(row):
    """Return the cosine similarity between a row's two keyword lists.

    Each keyword list is joined into one space-separated text and both texts
    are turned into word-count vectors. The result is 0.0 when either text is
    blank or when vectorisation fails with a ValueError.
    """
    gt = ' '.join(row['ground_truth_words'])
    llm = ' '.join(row['LLM_generated_words'])

    # Nothing to compare if one side has no keywords
    if not (gt.strip() and llm.strip()):
        return 0.0

    try:
        counts = CountVectorizer().fit_transform([gt, llm]).toarray()
        return cosine_similarity(counts)[0, 1]
    except ValueError:
        # fallback if the vocabulary is still empty
        return 0.0

# Keyword-level cosine similarity for every row, then its mean over all rows
df_filtered['cosine_similarity_topic'] = df_filtered.apply(cosine_sim, axis=1)
mean_sim = df_filtered['cosine_similarity_topic'].mean()
print(mean_sim)


In [None]:
import re

def extract_text_between_asterisks(text):
    """Return every span of ``text`` enclosed in double asterisks (``**like this**``).

    Spans are matched non-greedily, left to right, and returned without the
    asterisks; an empty list means no span was found.
    """
    return [hit.group(1) for hit in re.finditer(r'\*\*(.*?)\*\*', text)]

# Collect the **bold** spans of both text columns into new list-valued columns
df_filtered['ground_truth_extracted'] = df_filtered['ground_truth'].apply(extract_text_between_asterisks)
df_filtered['llm_generated_extracted'] = df_filtered['llm_generated'].apply(extract_text_between_asterisks)
