### LLM agents to generate limitations (Agents: Extractor, Analyzer, Reviewer, Synthesizer)

In [None]:
import os
from openai import OpenAI
import time

# Shared OpenAI client used by the agent helpers below. The key is read from the
# environment; a missing OPENAI_API_KEY raises KeyError on this line.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# Names of the per-section columns that are pulled in for every row
SECTIONS = [
    "df_Abstract",
    "df_Introduction",
    "df_Related_Work",
    "df_Methodology",
    "df_Dataset",
    "df_Experiment_and_Results",
    "df_Conclusion",
]

# Helper that streams one prompt and returns the full text
def run_critic(prompt: str) -> str:
    """Send ``prompt`` as a single user message and return the streamed reply.

    The request goes through the module-level ``client`` using model
    ``gpt-4o-mini`` with ``temperature=0`` and ``stream=True``.

    Args:
        prompt: Complete prompt text, sent as the only message.

    Returns:
        All streamed content fragments concatenated, with leading and
        trailing whitespace stripped.
    """
    stream = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        stream=True,
        temperature=0,
    )
    fragments = []
    for chunk in stream:
        # A stream may contain chunks whose `choices` list is empty; indexing
        # choices[0] on such a chunk would raise IndexError, so skip them.
        if not chunk.choices:
            continue
        # delta.content can be None, hence the fallback to "".
        fragments.append(chunk.choices[0].delta.content or "")
    return "".join(fragments).strip()

# One dict per row of df_lim, holding the coordinator's fused limitation list under "final".
generated_limitations = []

# For every row: build the article text, run the three agents on it, then ask a
# coordinator prompt to merge their outputs. That is four run_critic calls per row.
# NOTE(review): df_lim.loc[idx, col] is label-based, so this assumes df_lim has a
# default 0..n-1 index — TODO confirm.
for idx in range(len(df_lim)):
    # build a list of "<column name>:\n<content>" strings, skipping missing sections
    pieces = []
    for sec in SECTIONS:
        # Spaces would be mapped to underscores here; the names currently listed in
        # SECTIONS contain no spaces, so col equals sec.
        col = sec.replace(" ", "_")    # e.g. "Related Work" → "Related_Work"
        if pd.notna(df_lim.loc[idx, col]):
            pieces.append(f"{sec}:\n{df_lim.loc[idx, col]}")

    # Sections are separated by a blank line; this blob is appended to each agent prompt.
    text_blob = "\n\n".join(pieces)

    # Agent 1 (Extractor): limitations explicitly stated by the authors.
    extractor_agent = run_critic(
        '''You are an expert in scientific literature analysis. Your task is to carefully read the provided scientific article and
        extract all explicitly stated limitations as mentioned by the authors. Focus on sections such as the Discussion, Conclusion.
        List each limitation verbatim, including direct quotes where possible, and provide
        a brief context (e.g., what aspect of the study the limitation pertains to). Ensure accuracy and avoid inferring or adding
        limitations not explicitly stated. If no limitations are mentioned, state this clearly. Output your findings in a structured
        format with bullet points.\n\n'''
        + text_blob
    )
    # Agent 2 (Analyzer): limitations inferred from the article's content.
    analyzer_agent = run_critic(
        '''You are a critical scientific reviewer with expertise in research methodology and analysis. Your task is to analyze the
        provided scientific article and identify potential limitations that are not explicitly stated by the authors. Focus on aspects
        such as study design, sample size, data collection methods, statistical analysis, scope of findings, and underlying assumptions.
        For each inferred limitation, provide a clear explanation of why it is a limitation and how it impacts the study’s validity,
        reliability, or generalizability. Ensure your inferences are grounded in the article’s content and avoid speculative assumptions.
        Output your findings in a structured format with bullet points, including a brief justification for each limitation.\n\n'''
        + text_blob
    )
    # Agent 3 (Reviewer): limitations from an open-peer-review perspective.
    # NOTE(review): the prompt invites a web/X search, but the call made by
    # run_critic passes no tools — presumably the model cannot search; confirm.
    reviewer_agent = run_critic(
        '''You are an expert in open peer review with a focus on transparent and critical evaluation of scientific research. Your task
        is to review the provided scientific article from the perspective of an external peer reviewer. Identify potential limitations
        that might be raised in an open review process, considering common critiques such as reproducibility, transparency,
        generalizability, or ethical considerations. If possible, leverage insights from similar studies or common methodological
        issues in the field (search the web or X posts if needed for context). For each limitation, explain why it would be a
        concern in an open review and how it aligns with peer review standards. Output your findings in a structured format with
        bullet points, ensuring each limitation is relevant to the article’s content.:\n\n'''
        + text_blob
    )

    # Master coordinator: fuses the three agent outputs. The prompt is built by
    # implicit concatenation of the instruction string with three f-strings; the
    # article text itself is not included in this prompt.
    coord_prompt = (
    '''You are a **Master Coordinator**. You are an expert in scientific communication and synthesis.
    Your task is to integrate the limitations provided by three other agents:
    1. The **Extractor** (explicit limitations from the article),
    2. The **Analyzer** (inferred limitations from critical analysis),
    3. The **Reviewer** (limitations from an open review perspective).

    Your goals are to:
    1. Combine all limitations into a cohesive list, removing redundancies.
    2. Ensure each limitation is clearly stated, scientifically valid, and aligned with the article’s content.
    3. Prioritize limitations explicitly mentioned by the authors, supplementing them with inferred or peer-review-based limitations only if they add value.
    4. Highlight discrepancies between the agents’ outputs and resolve them by cross-referencing the article.
    5. Format the final list in a clear, concise, and professional manner, suitable for inclusion in a scientific review or report.

    Provide a brief justification for each limitation, noting its source:
    - "Author-stated" (from Extractor),
    - "Inferred" (from Analyzer),
    - "Peer-review-derived" (from Reviewer).

    You will be given three lists of sub-limitations:\n\n'''
    f"**Extractor Agent**:\n{extractor_agent}\n\n"
    f"**Analyzer Agent**:\n{analyzer_agent}\n\n"
    f"**Reviewer Agent**:\n{reviewer_agent}\n\n"
    )
    final_lim = run_critic(coord_prompt)

    # Only the fused result is kept; the individual agent outputs are discarded.
    generated_limitations.append({
        # "Experiment": experiments_lim,
        # "Clarity": clarity_lim,
        # "Impact": impact_lim,
        "final": final_lim
    })
    # time.sleep(1)  # to avoid hitting rate limits


In [None]:
# Pull the "final" text out of every result dict ("" when the key is absent)
# and place the texts in a one-column DataFrame.
final_values = [entry.get("final", "") for entry in generated_limitations]
df_generated_limitations_2 = pd.DataFrame(
    final_values,
    columns=["generated_limitations_1"],
)

In [None]:
# split each generated text on the '**' markers, turning the string into a list of segments
def process_single_limitation(limitation_text: str) -> list[str]:
    """
    Cut the text at every '**' marker and keep the pieces at even positions
    (0, 2, 4, ...), i.e. the text outside each '**...**' pair.

    Each kept piece is stripped of surrounding whitespace; pieces that are
    empty after stripping are left out.
    """
    kept = []
    # Stepping by two walks only the even-indexed pieces of the split.
    for segment in limitation_text.split("**")[::2]:
        trimmed = segment.strip()
        if trimmed:
            kept.append(trimmed)
    return kept

# Replace each generated text in the column by its list of '**'-delimited segments
df_generated_limitations_2["generated_limitations_1"] = df_generated_limitations_2[
    "generated_limitations_1"
].apply(process_single_limitation)


In [None]:
# convert string to list
import ast

# Round-trip every cell through its string form: convert the value to str, then
# parse that string back into a Python object with ast.literal_eval.
# NOTE(review): if the cells are already lists of strings (as the '**' split step
# earlier in this file produces), this presumably yields an equal list — confirm
# whether the step is still needed. literal_eval raises on text that is not a
# valid Python literal.
df_generated_limitations_2['generated_limitations_1'] = (
    df_generated_limitations_2['generated_limitations_1']
      .astype(str)               # ensure it’s a string
      .apply(ast.literal_eval)   # safely evaluate Python literal
)


In [None]:
import ast
import pandas as pd

def enumerate_and_filter(cell):
    """
    Normalise a cell into a numbered list-of-lists.

    ``cell`` is either an iterable of items (strings and/or sublists) or a
    string holding the repr of such a list. The function:
      1. parses the string repr when possible; a string that is not a
         Python literal, or that parses to a scalar or plain string, is
         treated as one single item,
      2. wraps every bare item into a one-element sublist (via ``str``),
      3. drops empty sublists and sublists equal to ['-'],
      4. prefixes the first element of each remaining sublist with its
         1-based position ("1. ", "2. ", ...) and keeps any further
         elements of that sublist unchanged.

    Returns:
        A new list of lists.
    """
    # 1) Parse string repr if necessary
    if isinstance(cell, str):
        try:
            items = ast.literal_eval(cell)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # not a literal → treat the entire cell as one string
            items = [cell]
        else:
            # A literal such as "42" or "'abc'" parses fine but is not a
            # collection: iterating it would fail or yield single characters.
            if isinstance(items, (str, bytes)) or not hasattr(items, "__iter__"):
                items = [cell]
    else:
        items = cell

    # 2) Ensure list-of-lists
    sublists = [item if isinstance(item, list) else [str(item)] for item in items]

    # 3) Drop empty sublists (they have no element to number) and ['-'] placeholders
    kept = [
        sub
        for sub in sublists
        if sub and not (len(sub) == 1 and str(sub[0]).strip() == "-")
    ]

    # 4) Number the first element; the rest of each sublist is carried over
    return [[f"{position}. {sub[0]}", *sub[1:]] for position, sub in enumerate(kept, start=1)]

# Normalise and number the items of every row in the generated-limitations column
df_generated_limitations_2['generated_limitations_1'] = df_generated_limitations_2[
    'generated_limitations_1'
].apply(enumerate_and_filter)

# Discard the leading sublist of each row; a row that is empty or not a list becomes []
df_generated_limitations_2['generated_limitations_1'] = df_generated_limitations_2[
    'generated_limitations_1'
].apply(lambda rows: rows[1:] if isinstance(rows, list) and rows else [])


In [None]:
def process_single_limitation(limitation_text):
    """
    Split a block of limitations into one list of sentences per limitation.

    Limitations are separated by a blank line. A leading list number such
    as "1. " or "12. " is removed from each limitation, then the limitation
    is split into sentences on '.'; blank sentences and limitations without
    any sentence are skipped.

    Args:
        limitation_text: Text holding the limitations; any non-string value
            (e.g. a missing cell) yields an empty result.

    Returns:
        A list of lists of stripped sentence strings.
    """
    import re  # local import so this cell does not depend on an earlier `import re`

    if not isinstance(limitation_text, str):
        return []

    processed_limitations = []
    for limitation in limitation_text.split('\n\n'):
        # Strip only a genuine leading number. Cutting at the first '. ' would
        # also throw away the whole first sentence of an unnumbered paragraph.
        cleaned_limitation = re.sub(r'^\s*\d+\.\s+', '', limitation, count=1)

        # Split into sentences (using '.')
        sentences = [s.strip() for s in cleaned_limitation.split('.') if s.strip()]
        if sentences:
            processed_limitations.append(sentences)

    return processed_limitations

# Turn every ground-truth text into its list of per-limitation sentence lists.
# (The same step for the generated column is left switched off.)
# df_generated_limitations_2['generated_limitations_1'] = df_generated_limitations_2['generated_limitations_1'].apply(process_single_limitation)
df_lim['Lim_and_OR_ground_truth_final'] = df_lim[
    'Lim_and_OR_ground_truth_final'
].apply(process_single_limitation)

In [None]:
# add numbering of LLM generated limitations
def add_numbering_to_limitations(list_of_lists):
    """Prefix the first string of every non-empty sublist with its 1-based position.

    A value that is not a list is handed back untouched. Empty sublists are
    left out of the result, yet they still use up a position number.
    """
    if not isinstance(list_of_lists, list):
        return list_of_lists

    return [
        [f"{position}. {entry[0]}"] + entry[1:]
        for position, entry in enumerate(list_of_lists, start=1)
        if entry
    ]

# Number the ground-truth limitations in place.
# (The same step for the generated column is left switched off.)
df_lim['Lim_and_OR_ground_truth_final'] = df_lim[
    'Lim_and_OR_ground_truth_final'
].apply(add_numbering_to_limitations)
# df_generated_limitations_2['generated_limitations_1']  = df_generated_limitations_2['generated_limitations_1'] .apply(add_numbering_to_limitations)

In [None]:
# organize numbers
def renumber_limitations(limitations_list):
    """
    Renumber limitations sequentially (1, 2, 3, ...), keeping all other content.

    For every non-empty sublist, a leading "N." (optionally followed by ':')
    is removed from the first string and replaced by "<k>. : ", where k
    counts only the sublists that are actually renumbered. Entries that are
    not lists, or are empty lists, are kept as they are and do not consume a
    number.

    Args:
        limitations_list: List of sublists whose first item is a string.

    Returns:
        A new list with renumbered sublists, or the input itself when it is
        not a list.
    """
    import re  # local import: this cell runs before the file's later `import re`

    if not isinstance(limitations_list, list):
        return limitations_list

    # "N." at the start, optional whitespace, optional ':'. The (?!\d) keeps a
    # leading decimal such as "0.5" from being mistaken for list numbering.
    leading_number = re.compile(r'^\d+\.(?!\d)\s*(?::\s*)?')

    renumbered = []
    next_number = 1
    for sublist in limitations_list:
        if isinstance(sublist, list) and len(sublist) > 0:
            # Only the first item carries the number.
            content = leading_number.sub('', sublist[0], count=1)
            renumbered.append([f"{next_number}. : {content}"] + sublist[1:])
            next_number += 1
        else:
            renumbered.append(sublist)  # Keep non-list or empty entries

    return renumbered

# Renumber every row of the generated column. renumber_limitations already
# returns non-list values unchanged, so no extra type guard is needed here.
df_generated_limitations_2["generated_limitations_1"] = df_generated_limitations_2[
    "generated_limitations_1"
].apply(renumber_limitations)

In [None]:
import re

def remove_source_entries(entries):
    """
    Drop every sublist whose joined text mentions a source tag.

    The tags are 'author-stated', 'inferred' and 'peer-review-derived',
    matched as case-insensitive substrings of the sublist's items joined by
    spaces. Items of ``entries`` that are not lists are always kept, and a
    non-list ``entries`` is returned unchanged.
    """
    if not isinstance(entries, list):
        return entries

    source_tags = ('author-stated', 'inferred', 'peer-review-derived')

    def mentions_source(sublist):
        # Lower-case the joined text once, then look for any tag in it.
        joined = ' '.join(str(part) for part in sublist).lower()
        return any(tag in joined for tag in source_tags)

    return [
        item
        for item in entries
        if not (isinstance(item, list) and mentions_source(item))
    ]

# Strip source-tagged sublists from every row. remove_source_entries already
# returns non-list values unchanged, so no extra type guard is needed here.
df_generated_limitations_2["generated_limitations_1"] = df_generated_limitations_2[
    "generated_limitations_1"
].apply(remove_source_entries)

In [None]:
# remove future work
import re

def remove_future_entries(entries):
    """
    Return a new list without the sublists whose first item has 'future'
    somewhere between two '**' markers (case-insensitive).

    Empty sublists and items that are not lists are passed through as-is.
    """
    bold_future = r'\*\*.*future.*\*\*'
    return [
        item
        for item in entries
        if not (
            isinstance(item, list)
            and len(item) > 0
            and re.search(bold_future, item[0], re.IGNORECASE)
        )
    ]

# Drop "future work" sublists from each row of the generated column;
# cells that are not lists are left exactly as they are.
df_generated_limitations_2["generated_limitations_1"] = df_generated_limitations_2[
    "generated_limitations_1"
].apply(lambda cell: remove_future_entries(cell) if isinstance(cell, list) else cell)


# remove future work
import re

def remove_future_entries(entries):
    """
    Filter out sublists whose first item contains 'future' between two '**'
    markers, ignoring case; everything else is kept in order.

    NOTE: this re-declares the function of the same name defined earlier in
    this cell, with the same behaviour.
    """
    survivors = []
    for candidate in entries:
        # Non-list items and empty lists are never filtered.
        if not isinstance(candidate, list) or len(candidate) == 0:
            survivors.append(candidate)
            continue
        if re.search(r'\*\*.*future.*\*\*', candidate[0], re.IGNORECASE):
            continue
        survivors.append(candidate)
    return survivors

# Drop "future work" sublists from each row of the ground-truth column;
# cells that are not lists are left exactly as they are.
df_lim["Lim_and_OR_ground_truth_final"] = df_lim[
    "Lim_and_OR_ground_truth_final"
].apply(lambda cell: remove_future_entries(cell) if isinstance(cell, list) else cell)

def remove_here_are_the(entries):
    """
    Return a new list without the sublists whose first item, once stripped
    and lower-cased, begins with "1. here are the".

    Empty sublists and items that are not lists are passed through as-is.
    """
    def is_intro(item):
        # Only a non-empty list can be an intro line.
        if not isinstance(item, list) or len(item) == 0:
            return False
        return item[0].strip().lower().startswith("1. here are the")

    return [item for item in entries if not is_intro(item)]

# Drop the "1. Here are the ..." intro sublist from each ground-truth row;
# cells that are not lists are left exactly as they are.
df_lim["Lim_and_OR_ground_truth_final"] = df_lim[
    "Lim_and_OR_ground_truth_final"
].apply(lambda cell: remove_here_are_the(cell) if isinstance(cell, list) else cell)

In [None]:
# Give every row of 'combined3' its own empty list to start with
df_lim['combined3'] = [[] for _ in range(len(df_lim))]

# For each row, pair every ground-truth item with every generated item of the
# same row (full cross product, ground truth varying slowest).
# NOTE: all pairs are stored — no cap of 100 pairs is applied.
for i in range(len(df_lim)):
    list1 = df_lim["Lim_and_OR_ground_truth_final"][i]
    list2 = df_generated_limitations_2["generated_limitations_1"][i]
    combined_list = [(item1, item2) for item1 in list1 for item2 in list2]
    df_lim.at[i, 'combined3'] = combined_list

In [None]:
# at first, generate summary (index 0 to 412) (done)
import time
import os
from openai import OpenAI

start_time = time.time()
# The API key is taken from the environment as it is. Overwriting
# OPENAI_API_KEY with an empty string here would make every request below fail
# authentication, and a real key must not be pasted into the notebook.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
)

# One entry per row of df_lim; each entry is a list of
# ([answer], "list1", ground_truth_item, "list2", generated_item) tuples.
all_generated_summary = []

for i in range(len(df_lim)): # len(df_lim)
    generated_summary = []
    for description1, description2 in df_lim['combined3'][i]:
      # Ask the model whether the two items share a topic/limitation.
      # The prompt text is kept unchanged so results stay comparable.
      prompt = '''Check whether 'list2' contains a topic or limitation from 'list1' or 'list1' contains a topic or limitation from 'list2'.
      Your answer should be "Yes" or "No" \n. List 1:''' + str(description1) + "List2: " + str(description2)
      stream = client.chat.completions.create(
          # model="gpt-3.5-turbo",
          model="gpt-4o-mini",
          messages=[
              {
                  "role": "user",
                  "content": prompt,
              }
          ],
          stream=True,
          temperature=0  # Adjust the temperature as needed, max_tokens=150
      )

      fragments = []
      for chunk in stream:
          # A stream may contain chunks whose `choices` list is empty;
          # indexing choices[0] on such a chunk would raise IndexError.
          if not chunk.choices:
              continue
          fragments.append(chunk.choices[0].delta.content or "")
      summary_text = "".join(fragments)

      # The answer stays wrapped in a one-element list, as downstream code reads is_match[0].
      generated_summary.append(([summary_text], "list1", description1, "list2", description2))
    all_generated_summary.append(generated_summary)
    # time.sleep(1)

end_time = time.time()
print(f"Total runtime: {end_time - start_time:.2f} seconds")

In [None]:
# ground truth coverage
# Flatten all_generated_summary into one record per (ground truth, generated) pair.
data = []
row_num = 1  # 1-based position of the current sublist in all_generated_summary

for sublist in all_generated_summary:
    # Each tuple is (answer_list, "list1", ground_truth, "list2", llm_generated);
    # the two label strings are not needed in the table.
    data.extend(
        {
            'row_num': row_num,
            'is_match': is_match[0],
            'ground_truth': ground_truth,
            'llm_generated': llm_generated,
        }
        for is_match, list1_label, ground_truth, list2_label, llm_generated in sublist
    )
    row_num += 1

# Build the table from the records and save it
df4 = pd.DataFrame(data)
df4.to_csv("df_neurips_llm_agents.csv", index=False)


In [None]:
import re

# Update the function to handle lists in each row
def extract_first_number_from_list(row):
    """Return the leading integer of the first string in ``row`` that starts with digits.

    Strings are checked in order; ``None`` is returned when none of them
    begins with a digit.
    """
    # Lazy generator, so matching stops at the first string that starts with digits.
    leading_digits = (re.match(r'^(\d+)', text) for text in row)
    first_hit = next((hit for hit in leading_digits if hit), None)
    if first_hit is None:
        return None
    return int(first_hit.group(1))

# Tag every pair with the number that prefixes its ground-truth limitation
df4['section'] = df4['ground_truth'].apply(extract_first_number_from_list)

# Count the ground-truth limitations that have at least one "Yes" among their
# consecutive rows.
current_section = None   # (row_num, section) key of the limitation being scanned
section_has_yes = False
ck = 0

for index, row in df4.iterrows():
    # A limitation is identified by its paper (row_num) AND its number. Keying on
    # the number alone merges limitation N of one paper with limitation N of the
    # next paper whenever they are adjacent (e.g. two papers with a single
    # limitation each), which undercounts the matches.
    section_key = (row['row_num'], row['section'])

    # Accept "Yes", "yes", "Yes." and similar instead of requiring the exact string.
    is_yes = str(row['is_match']).strip().lower().startswith('yes')

    if section_key == current_section:
        section_has_yes = section_has_yes or is_yes
    else:
        # A new limitation starts: settle the previous one first
        if section_has_yes:
            ck += 1
        current_section = section_key
        section_has_yes = is_yes

# Settle the last limitation after the loop
if section_has_yes:
    ck += 1
print(ck)



# total number of unique ground truth

# A row opens a new block whenever its 'ground_truth' differs from the row before;
# the running sum of those flags gives every consecutive block its own id.
starts_new_block = df4['ground_truth'].ne(df4['ground_truth'].shift())
unique_blocks = starts_new_block.cumsum()

# Number of rows in each block
group_counts = df4.groupby(unique_blocks)['ground_truth'].agg(['count'])

# Report
print("Number of unique consecutive 'ground_truth' texts and their counts:")
print(group_counts)


3195
Number of unique consecutive 'ground_truth' texts and their counts:
              count
ground_truth       
1                17
2                17
3                17
4                17
5                17
...             ...
5518             20
5519             20
5520             20
5521             20
5522             20

[5522 rows x 1 columns]


In [None]:
3195/5522

0.5785947120608476