### Evaluation of Extractor Agent

In [None]:
import re

# Parse every row's raw extractor output into a list of [title, quote, context]
# entries and store it in 'extractor_agent_list'. Non-string cells become [].
df['extractor_agent_list'] = None

# The same four patterns are applied to every row, so compile them once.
bullet_splitter = re.compile(r'\n\s*-\s+\*\*')
title_pattern = re.compile(r'\*\*(.*?)\*\*:')
quote_pattern = re.compile(r'\*\*Quote\*\*:\s*"([^"]+)"')
context_pattern = re.compile(r'\*\*Context\*\*:\s*(.*?)(?:\n|$)', re.S)

for row_idx in range(len(df)):
    raw = df.at[row_idx, "extractor_agent"]

    lim_list = []
    if isinstance(raw, str):
        # Each limitation starts with "- **" on its own line. Any text that precedes
        # the first bullet is returned by the split as well and is parsed like an entry.
        for chunk in bullet_splitter.split(raw.strip()):
            chunk = chunk.strip()
            if chunk:
                # The split consumed the opening "**"; put it back before matching.
                chunk = "**" + chunk

                found_title = title_pattern.search(chunk)
                found_quote = quote_pattern.search(chunk)
                found_context = context_pattern.search(chunk)

                lim_list.append([
                    found_title.group(1).strip() if found_title else "Unknown Title",
                    found_quote.group(1).strip() if found_quote else "No quote found",
                    found_context.group(1).strip() if found_context else "No context found",
                ])

    df.at[row_idx, 'extractor_agent_list'] = lim_list


In [None]:
# Remove the first parsed entry from every non-empty list in 'extractor_agent_list';
# empty lists and non-list cells are left untouched.
# NOTE: this cell is not idempotent - every re-run strips one more entry per row.
df['extractor_agent_list'] = df['extractor_agent_list'].apply(
    lambda entries: entries[1:] if isinstance(entries, list) and len(entries) > 0 else entries
)

In [None]:
import re

# Turn each row's numbered ground-truth text into a list of [number, text] pairs,
# stored in 'Ground_Truth_Lim_OPR_list'. Non-string cells become [].
df['Ground_Truth_Lim_OPR_list'] = None

for row_idx in range(len(df)):
    raw = df.at[row_idx, "Ground_Truth_Lim_OPR"]

    lim_list = []
    if isinstance(raw, str):
        # Drop the optional "Merged Limitations:" header before splitting.
        raw = raw.replace("Merged Limitations:", "").strip()

        # A new item begins at every newline that is followed by "<digits>. ".
        numbered_items = (
            re.match(r'(\d+)\.\s*(.*)', piece.strip(), flags=re.S)
            for piece in re.split(r'\n(?=\d+\.\s)', raw)
        )
        # Pieces that do not start with "<digits>." are skipped.
        lim_list = [
            [int(item.group(1)), item.group(2).strip()]
            for item in numbered_items
            if item
        ]

    df.at[row_idx, 'Ground_Truth_Lim_OPR_list'] = lim_list


In [None]:
# Pair every ground-truth limitation with every LLM-extracted limitation.
# For each row, 'combined' holds the full cross product as (ground_truth, llm_item)
# tuples, ground truth varying slowest. No truncation is applied.
df['combined'] = [[] for _ in range(len(df))]

for i in range(len(df)):
    gt_items = df["Ground_Truth_Lim_OPR_list"][i]
    llm_items = df["extractor_agent_list"][i]

    df.at[i, 'combined'] = [
        (gt_item, llm_item)
        for gt_item in gt_items
        for llm_item in llm_items
    ]

In [None]:
import os
import base64
import time
import pandas as pd

import os
import time
from openai import OpenAI

# Set up OpenAI API
# The key is taken from the OPENAI_API_KEY environment variable and is never written
# into the notebook. Assigning a literal here would both leak the secret when the
# notebook is shared and overwrite a key that is already exported in the environment.
if not os.environ.get("OPENAI_API_KEY"):
    # Fail at setup time with a clear message instead of at the first API request.
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in the environment before starting the kernel."
    )
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# Define the OpenAI streaming function for GPT-4o-mini
def run_critic_openai(prompt: str) -> str:
    """Send `prompt` as a single user message to gpt-4o-mini and return the reply text.

    The request goes through the module-level `client` with streaming enabled and
    temperature 0. The streamed text deltas are concatenated in arrival order and the
    result is returned with leading/trailing whitespace stripped.

    Chunks that carry no choices, or whose delta has no text content, contribute
    nothing to the result.
    """
    stream = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "user", "content": prompt}
        ],
        stream=True,
        temperature=0
    )

    pieces = []
    for chunk in stream:
        # A streamed chunk may arrive with an empty `choices` list; indexing [0]
        # on it would raise IndexError and discard the text collected so far.
        if not chunk.choices:
            continue
        pieces.append(chunk.choices[0].delta.content or "")
    return "".join(pieces).strip()

# Batch loop: one judge call per DataFrame row.
all_generated_summary = []
start_time = time.time()

import json

# Fixed instruction text; the per-row list of pairs is appended after "Pairs:".
judge_instructions = (
    "For each of the following pairs, answer “Yes” if List1 contains a topic or limitation\n"
    "from List2, or List2 contains a topic or limitation from from List1; otherwise answer “No”.\n"
    "Respond *only* with a JSON object mapping each Pair name to “Yes” or “No”.\n\n"
    "Pairs:\n"
)

llm_results = []
df['extractor_eval_new_gt'] = ''
for idx in range(len(df)):
    print("idx is", idx)
    pairs = df.at[idx, 'combined']   # expected: list of (ground_truth, llm_item) tuples

    if isinstance(pairs, list) and pairs:
        # Pairs are numbered from 1 in the order they appear in 'combined'.
        pair_blocks = [
            f"Pair {number}:\n  List1: {first}\n  List2: {second}"
            for number, (first, second) in enumerate(pairs, start=1)
        ]
        prompt = judge_instructions + "\n".join(pair_blocks)

        # All pairs of a row are judged in a single request.
        resp_text = run_critic_openai(prompt)
        llm_results.append(resp_text)
        df.at[idx, 'extractor_eval_new_gt'] = resp_text
    else:
        # Nothing to judge for this row; its column value stays ''.
        llm_results.append(None)



In [None]:
# Write the current DataFrame (all columns, without the index) to a CSV file
# in the working directory.
df.to_csv("df_neruips_21_22_final.csv",index=False)

In [None]:
import re
# Extract the "Yes"/"No" verdicts from each raw judge response.
#
# The pair number is captured together with the verdict, and every verdict is stored
# at index (pair number - 1). Pairs the model left out stay None, so the verdicts
# remain aligned with the order of df['combined'] even when the response skips or
# reorders pairs. When the response lists every pair once and in order, the result
# is the plain list of verdicts.
pattern = r'"Pair\s*(\d+)"\s*:\s*"(Yes|No)"'

all_matches = []
for idx in range(len(df)):
    raw = df.at[idx, 'extractor_eval_new_gt']
    if not isinstance(raw, str):
        all_matches.append([])
        continue

    verdict_by_pair = {}
    for number_text, verdict in re.findall(pattern, raw):
        pair_number = int(number_text)
        if pair_number >= 1:
            # If the model repeats a pair, its first answer is kept.
            verdict_by_pair.setdefault(pair_number, verdict)

    highest_pair = max(verdict_by_pair, default=0)
    matches = [verdict_by_pair.get(n) for n in range(1, highest_pair + 1)]
    all_matches.append(matches)


In [None]:
# Sanity check: all_matches receives one entry per DataFrame row, so this
# should equal len(df).
len(all_matches)

325

In [None]:
import pandas as pd

# Flatten df['combined'] into one record per (ground truth, LLM item) pair and attach
# the verdict found at the same position in that row's all_matches entry
# (None when no verdict exists at that position).
rows = []
for idx, tuples in df['combined'].items():
    if not isinstance(tuples, list):
        continue

    # Verdict list for this source row; empty when idx is beyond all_matches.
    verdicts = all_matches[idx] if idx < len(all_matches) else []

    rows.extend(
        {
            'source_row': idx,
            'List1':      gt_item,
            'List2':      llm_item,
            'is_match':   verdicts[pos] if pos < len(verdicts) else None,
        }
        for pos, (gt_item, llm_item) in enumerate(tuples)
    )

# Give the two list columns their final names.
result_df = pd.DataFrame(rows).rename(
    columns={
        'List1': 'Ground_Truth',
        'List2': 'LLM_generated'
    }
)


In [None]:
# Number the LLM-generated items within each ground-truth limitation: the counter
# restarts at 1 whenever the (source_row, ground-truth number) key changes between
# consecutive rows of result_df.
#
# The ground-truth number is read from the first element of the 'Ground_Truth' entry
# rather than from a 'gt_number' column, so this cell runs on a fresh kernel in
# notebook order. Including source_row in the key prevents two adjacent source rows
# that both contain ground-truth item N from being counted as one group.
current_gt = None
counter = 1

# Collected first, then assigned to the DataFrame in one step.
llm_gen_numbers = []

for idx, row in result_df.iterrows():
    ground_truth = row['Ground_Truth']
    if isinstance(ground_truth, (list, tuple)) and ground_truth:
        gt_number = ground_truth[0]
    else:
        gt_number = None
    gt = (row['source_row'], gt_number)

    if gt != current_gt:
        # New ground-truth group: restart the numbering.
        current_gt = gt
        counter = 1
    llm_gen_numbers.append(counter)
    counter += 1

# Assign the list back to the DataFrame
result_df['llm_gen_number'] = llm_gen_numbers

In [None]:
import re

def extract_leading_number(x):
    """
    Return the integer that leads `x`, or None when there is none.

    A non-empty list is represented by its first element; any other value is used
    as given. An int is returned unchanged. A string yields its leading run of
    digits (after optional whitespace, with or without a trailing dot) as an int.
    Everything else gives None.
    """
    candidate = x
    if isinstance(x, list) and x:
        candidate = x[0]

    # Already a number: nothing to parse.
    if isinstance(candidate, int):
        return candidate

    if not isinstance(candidate, str):
        return None

    # Accepts "123." as well as a bare "123" at the start of the text.
    leading = re.match(r'^\s*(\d+)(?:\.)?', candidate)
    return int(leading.group(1)) if leading else None

# extract into new columns
result_df['gt_number'] = result_df['Ground_Truth'].apply(extract_leading_number)

# Derive 'llm_gen_number' from the text only when the column does not exist yet, so
# that a numbering already stored in result_df['llm_gen_number'] is not overwritten.
if 'llm_gen_number' not in result_df.columns:
    result_df['llm_gen_number'] = result_df['LLM_generated'].apply(extract_leading_number)



In [None]:
# Write the pair-level judge table (result_df, without the index) to a CSV file
# in the working directory.
result_df.to_csv("df_llm_as_a_judge_extactor_lim_or.csv",index=False)

In [None]:
# ground truth coverage
#
# Count the ground-truth limitations that received at least one "Yes" verdict.
# A section is a run of consecutive rows sharing the same (source_row, gt_number).
# Keying on source_row as well keeps item N of one source row from being merged with
# item N of the next source row when their rows are adjacent, which would undercount.

# Initialize variables
current_section = None
section_has_yes = False
match = 0

# Iterate through the DataFrame
for index, row in result_df.iterrows():
    section = (row['source_row'], row['gt_number'])
    is_yes = row['is_match'] == 'Yes'

    if section == current_section:
        # Still inside the same section: remember any 'Yes' seen so far.
        section_has_yes = section_has_yes or is_yes
    else:
        # A new section starts; settle the one that just ended.
        if section_has_yes:
            match += 1
        current_section = section
        section_has_yes = is_yes

# The last section is not followed by a change, so settle it here.
if section_has_yes:
    match += 1
print(match)


# total number of unique ground truth

# A block starts wherever 'Ground_Truth' differs from the row directly above it;
# the running sum of those change flags gives each consecutive block its own id.
previous_ground_truth = result_df['Ground_Truth'].shift()
block_starts = result_df['Ground_Truth'].ne(previous_ground_truth)
unique_blocks = block_starts.cumsum()

# Per block, count the 'gt_number' entries.
ck = result_df.groupby(unique_blocks)['gt_number'].agg(['count'])

# Output the results
print("Number of unique consecutive 'ground_truth' texts and their counts:")
print(ck)


### Evaluation (faithfulness, soundness, importance), ext+anly+rev+cit (rel 8 then rel text with input)

In [None]:
import os
import time
from openai import OpenAI

# Set up OpenAI API
# The key is taken from the OPENAI_API_KEY environment variable and is never written
# into the notebook. Assigning a literal here would both leak the secret when the
# notebook is shared and overwrite a key that is already exported in the environment.
if not os.environ.get("OPENAI_API_KEY"):
    # Fail at setup time with a clear message instead of at the first API request.
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in the environment before starting the kernel."
    )
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# Define the OpenAI streaming function for GPT-4o-mini
def run_critic_openai(prompt: str) -> str:
    """Send `prompt` as a single user message to gpt-4o-mini and return the reply text.

    The request goes through the module-level `client` with streaming enabled and
    temperature 0. The streamed text deltas are concatenated in arrival order and the
    result is returned with leading/trailing whitespace stripped.

    Chunks that carry no choices, or whose delta has no text content, contribute
    nothing to the result.
    """
    stream = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "user", "content": prompt}
        ],
        stream=True,
        temperature=0
    )

    pieces = []
    for chunk in stream:
        # A streamed chunk may arrive with an empty `choices` list; indexing [0]
        # on it would raise IndexError and discard the text collected so far.
        if not chunk.choices:
            continue
        pieces.append(chunk.choices[0].delta.content or "")
    return "".join(pieces).strip()


In [None]:
# First-stage judge prompt: scores the generated limitations against the paper text.
# It is filled with str.format using two named fields:
#   {input_text}            - the paper content
#   {generated_limitations} - the limitations to be evaluated
# The fields must keep these exact names; with any other placeholder spelling
# str.format leaves the text unchanged and the judge never sees the inputs.
evaluation_prompt_template = '''
You are an expert reviewer. Evaluate the quality of the generated limitations based on the following three criteria: Faithfulness,
Soundness, and Importance. For each criterion, assign a score between 1 and 5 and provide a short justification.

Faithfulness = The generated limitations should accurately represent the paper’s content and findings, avoiding any introduction
of misinformation or contradictions to the original concepts, methodologies or results presented.
– 5 points: Perfect alignment with the original content and findings, with no misinformation or contradictions. Fully reflects the
paper’s concepts, methodologies, and results
accurately.
– 4 points: Mostly aligns with the original content but contains minor inaccuracies or slight
misinterpretations. These do not significantly
affect the overall understanding of the paper’s
concepts or results.
– 3 points: Generally aligns with the original
content but includes several minor inaccuracies or contradictions. Some elements may
not fully reflect the paper’s concepts or results,
though the overall understanding is mostly intact.
– 2 points: Noticeable misalignment with the
original content, with multiple inaccuracies
or contradictions that could mislead readers.
Some key aspects of the paper’s concepts or
results are misrepresented.
– 1 point: Introduces significant misalignment
by misrepresenting issues that do not exist in
the paper. Creates considerable misinformation and contradictions that distort the original
content, concepts, or results.

Soundness = The generated limitations should be detailed and specific, with suggestions or critiques that are practical, logically
coherent, and purposeful. It should clearly address relevant aspects of the paper and offer insights that can genuinely improve the
research.
– 5 points: Highly detailed and specific, with
practical, logically coherent, and purposeful
suggestions. Clearly addresses relevant aspects and offers insights that substantially improve the research.
– 4 points: Detailed and mostly specific, with
generally practical and logically sound suggestions. Addresses relevant aspects well but may
lack depth or novelty in some areas.
– 3 points: Detailed and specific but with some
issues in practicality or logical coherence. Suggestions are somewhat relevant and offer partial improvements.
– 2 points: Somewhat vague or lacking in specificity, with suggestions that have limited practicality or logical coherence. Addresses
relevant aspects only partially and provides minimal improvement.
– 1 point: Lacks detail and specificity, with impractical or incoherent suggestions. Fails to
effectively address relevant aspects or offer
constructive insights for improvement.

Importance =  The generated limitations should
address the most significant issues that impact the
paper’s main findings and contributions. They
should highlight key areas where improvements
or further research are needed, emphasizing their
potential to enhance the research’s relevance and
overall impact.
– 5 points: Addresses critical issues that substantially impact the paper’s findings and contributions. Clearly identifies major areas for
significant improvement or further research,
enhancing the research’s relevance and overall
impact.
– 4 points: Identifies meaningful issues that contribute to refining the paper’s findings and
methodology. While the impact is notable,
it does not reach the level of fundamentally
shaping future research directions.
– 3 points: Highlights important issues that offer some improvement to the current work but
do not significantly impact future research directions. Provides useful insights for refining
the paper but lacks broader implications for
further study.
– 2 points: Points out limitations with limited
relevance to the paper’s overall findings and
contributions. Suggestions offer marginal improvements but fail to address more substantial
gaps in the research.
– 1 point: Focuses on trivial issues, such as minor errors or overly detailed aspects. Does not
address substantive issues affecting the paper’s
findings or contributions, limiting its overall
relevance and impact.

Input:
Input Paper: {input_text}
LLM Generated Limitations: {generated_limitations}

Please evaluate the **Generated Limitations** based on the **Input Paper Content** and return your response strictly in the following JSON format:

Faithfulness: rating: , explanation:,
Soundness:    rating: , explanation:,
Importance:   rating: , explanation:

'''


In [None]:
# Score each row's generated limitations against its paper text with the LLM judge.
# The raw judge response is stored in 'ext_analy_result'. Rows with a missing paper
# text or missing limitations, and rows whose judge call raised, are stored as None.
df['ext_analy_result'] = ''
for i in range(len(df)): # len(df)
    print("i is",i)
    input_text = df.at[i, 'response_string_neurips']
    generated_limitations = df.at[i, 'master_agent_ext_analy_rev']

    if pd.isna(input_text) or pd.isna(generated_limitations):
        # Record the skip in the result column itself. The separate `results` list is
        # no longer created, so appending to it here raised NameError.
        df.at[i, "ext_analy_result"] = None
        continue

    prompt = evaluation_prompt_template.format(
        input_text=input_text.strip(),
        generated_limitations=generated_limitations.strip()
    )

    try:
        # NOTE(review): azure_run_critic is not defined in the visible part of this
        # notebook - confirm it is in scope. If it is not, the NameError is caught
        # below and every row ends up as None.
        result = azure_run_critic(prompt)
    except Exception as e:
        print(f"Error at row {i}: {e}")
        result = None

    df.at[i, "ext_analy_result"] = result

# df.to_csv("/media/ibrahim/Extreme SSD/Limitations Data/NeurIPS_new/evaluations/df_neurips_21_22_eval.csv",index=False)

In [None]:
import pandas as pd
import json
import re

# Read the three ratings out of each stored judge response, then print column means.
df['faithfulness_score'] = None
df['soundness_score'] = None
df['importance_score'] = None

# Criterion key in the judge's JSON -> destination column, in assignment order.
score_columns = {
    'Faithfulness': 'faithfulness_score',
    'Soundness': 'soundness_score',
    'Importance': 'importance_score',
}

for idx, val in df['ext_analy_result'].items():
    if pd.isna(val):
        continue

    try:
        # Strip Markdown code fences before handing the text to the JSON parser.
        clean_json = re.sub(r'```json|```', '', val).strip()
        parsed = json.loads(clean_json)

        for criterion, column in score_columns.items():
            df.at[idx, column] = parsed[criterion]['rating']

    except Exception as e:
        # A response that is not valid JSON, or lacks a key, is reported and skipped.
        print(f"Row {idx} failed to parse:", e)
        continue

avg_faith, avg_sound, avg_imp = (
    df[column].mean() for column in score_columns.values()
)

print(f"Average Faithfulness: {avg_faith:.2f}")
print(f"Average Soundness:   {avg_sound:.2f}")
print(f"Average Importance:  {avg_imp:.2f}")


Average Faithfulness: 4.00
Average Soundness:   3.00
Average Importance:  3.99


In [None]:
# Second-stage judge prompt: asks the model to review an initial rating by comparing
# the LLM-generated limitations with the ground-truth limitations.
# It is filled with str.format using five named fields:
#   {input_text}            - ground-truth limitations
#   {generated_limitations} - LLM-generated limitations
#   {faith}, {sound}, {imp} - the initial Faithfulness / Soundness / Importance scores
# NOTE: this rebinds `evaluation_prompt_template`; any earlier template stored under
# the same name is no longer reachable after this cell runs.
evaluation_prompt_template = '''
You are an expert reviewer. I am providing the LLM-generated limitations, the ground truth limitations, and a score from an initial assessment.

Your job is to **review the initial rating**, compare the **LLM-generated limitations** against the **ground truth**, and adjust the
score **if necessary** for each of the following criteria: Faithfulness, Soundness, and Importance.

Each criterion should have a rating from 1 to 5 (see scale below), along with a short justification.

Faithfulness = The generated limitations should accurately represent the paper’s content and findings, avoiding any introduction
of misinformation or contradictions to the original concepts, methodologies or results presented.
– 5 points: Perfect alignment with the original content and findings, with no misinformation or contradictions. Fully reflects the
paper’s concepts, methodologies, and results
accurately.
– 4 points: Mostly aligns with the original content but contains minor inaccuracies or slight
misinterpretations. These do not significantly
affect the overall understanding of the paper’s
concepts or results.
– 3 points: Generally aligns with the original
content but includes several minor inaccuracies or contradictions. Some elements may
not fully reflect the paper’s concepts or results,
though the overall understanding is mostly intact.
– 2 points: Noticeable misalignment with the
original content, with multiple inaccuracies
or contradictions that could mislead readers.
Some key aspects of the paper’s concepts or
results are misrepresented.
– 1 point: Introduces significant misalignment
by misrepresenting issues that do not exist in
the paper. Creates considerable misinformation and contradictions that distort the original
content, concepts, or results.

Soundness = The generated limitations should be detailed and specific, with suggestions or critiques that are practical, logically
coherent, and purposeful. It should clearly address relevant aspects of the paper and offer insights that can genuinely improve the
research.
– 5 points: Highly detailed and specific, with
practical, logically coherent, and purposeful
suggestions. Clearly addresses relevant aspects and offers insights that substantially improve the research.
– 4 points: Detailed and mostly specific, with
generally practical and logically sound suggestions. Addresses relevant aspects well but may
lack depth or novelty in some areas.
– 3 points: Detailed and specific but with some
issues in practicality or logical coherence. Suggestions are somewhat relevant and offer partial improvements.
– 2 points: Somewhat vague or lacking in specificity, with suggestions that have limited practicality or logical coherence. Addresses relevant
aspects only partially and provides minimal
improvement.
– 1 point: Lacks detail and specificity, with impractical or incoherent suggestions. Fails to
effectively address relevant aspects or offer
constructive insights for improvement.

Importance = The generated limitations should
address the most significant issues that impact the
paper’s main findings and contributions. They
should highlight key areas where improvements
or further research are needed, emphasizing their
potential to enhance the research’s relevance and
overall impact.
– 5 points: Addresses critical issues that substantially impact the paper’s findings and contributions. Clearly identifies major areas for
significant improvement or further research,
enhancing the research’s relevance and overall
impact.
– 4 points: Identifies meaningful issues that contribute to refining the paper’s findings and
methodology. While the impact is notable,
it does not reach the level of fundamentally
shaping future research directions.
– 3 points: Highlights important issues that offer some improvement to the current work but
do not significantly impact future research directions. Provides useful insights for refining
the paper but lacks broader implications for
further study.
– 2 points: Points out limitations with limited
relevance to the paper’s overall findings and
contributions. Suggestions offer marginal improvements but fail to address more substantial
gaps in the research.
– 1 point: Focuses on trivial issues, such as minor errors or overly detailed aspects. Does not
address substantive issues affecting the paper’s
findings or contributions, limiting its overall
relevance and impact.

Input:
Ground Truth Limitations: {input_text}

LLM Generated Limitations: {generated_limitations}

Initial Scores:
- Faithfulness: {faith}
- Soundness: {sound}
- Importance: {imp}

Please evaluate the **LLM Generated Limitations** based on the **Ground Truth** and return your response strictly in the following JSON format:

Faithfulness: rating: , explanation:,
Soundness:    rating: , explanation:,
Importance:   rating: , explanation:
'''


In [None]:
import json
import re

# Second pass: ask the judge to revise the initial scores against the ground truth.
# The raw response text is kept in 'adjusted_score_ext_analy_rev_cit_with_rel_json'.
df['adjusted_score_ext_analy_rev_cit_with_rel_json'] = None

# Prompt field -> DataFrame column it is read from (read in this order).
prompt_sources = {
    'input_text': 'Ground_Truth_Lim_OPR',
    'generated_limitations': 'master_agent_ext_analy_rev',
    'faith': 'faithfulness_score',
    'sound': 'soundness_score',
    'imp': 'importance_score',
}

for i in range(len(df)): # len(df)
    print("i is",i)
    try:
        values = {field: df.at[i, column] for field, column in prompt_sources.items()}

        # Every field is required; rows with any missing value are left as None.
        if any(pd.isna(value) for value in values.values()):
            continue

        prompt = evaluation_prompt_template.format(
            input_text=values['input_text'].strip(),
            generated_limitations=values['generated_limitations'].strip(),
            faith=int(values['faith']),
            sound=int(values['sound']),
            imp=int(values['imp'])
        )

        df.at[i, 'adjusted_score_ext_analy_rev_cit_with_rel_json'] = azure_run_critic(prompt)

    except Exception as e:
        # Any failure while reading, formatting or calling the judge skips the row.
        print(f"Row {i} failed: {e}")
        continue


In [None]:
import re

# Pull the three adjusted ratings out of each stored judge response with regexes.
df['adjusted_faithfulness_score'] = None
df['adjusted_soundness_score'] = None
df['adjusted_importance_score'] = None

# Each pattern captures the integer that follows "<Criterion>": { "rating":
faith_re = re.compile(r'"Faithfulness"\s*:\s*{\s*"rating"\s*:\s*(\d+)', re.DOTALL)
sound_re = re.compile(r'"Soundness"\s*:\s*{\s*"rating"\s*:\s*(\d+)', re.DOTALL)
imp_re   = re.compile(r'"Importance"\s*:\s*{\s*"rating"\s*:\s*(\d+)', re.DOTALL)

# Destination column -> pattern that fills it.
score_patterns = {
    'adjusted_faithfulness_score': faith_re,
    'adjusted_soundness_score': sound_re,
    'adjusted_importance_score': imp_re,
}

for i in range(len(df)):
    raw_response = df.at[i, 'adjusted_score_ext_analy_rev_cit_with_rel_json']
    if pd.isna(raw_response):
        continue

    # Remove Markdown code fences and surrounding whitespace before searching.
    cleaned = re.sub(r"```json|```", "", raw_response).strip()

    # A column is only written when its pattern is found; otherwise it stays None.
    for column, score_pattern in score_patterns.items():
        found = score_pattern.search(cleaned)
        if found:
            df.at[i, column] = int(found.group(1))


In [None]:
# Column means of the adjusted scores, printed with two decimals.
avg_faith, avg_sound, avg_imp = (
    df[column].mean()
    for column in (
        'adjusted_faithfulness_score',
        'adjusted_soundness_score',
        'adjusted_importance_score',
    )
)

print(f"Average Faithfulness: {avg_faith:.2f}")
print(f"Average Soundness:   {avg_sound:.2f}")
print(f"Average Importance:  {avg_imp:.2f}")


Average Faithfulness: 3.10
Average Soundness:   3.15
Average Importance:  3.07


In [None]:
# Write the DataFrame, including the score columns added above, to a CSV file
# in the working directory (index omitted).
df.to_csv("df_neurips_21_22_eval.csv",index=False)