In [None]:
import pandas as pd
# Load the paper table from CSV. Later cells in this notebook add columns to `df`
# and overwrite this same file with `df.to_csv(...)`, so a re-run from the top
# starts from whatever was last saved.
df = pd.read_csv("df_neruips_21_22_final.csv")

In [None]:
# Paper sections that are merged into one text blob, in the order they should
# appear; every section lives in a column named "neurips_<Section>".
_section_names = (
    "Abstract",
    "Introduction",
    "Related_Work",
    "Methodology",
    "Dataset",
    "Conclusion",
    "Experiment_and_Results",
    "Extra",
)
cols_to_concat = [f"neurips_{section}" for section in _section_names]

# Create a new column 'response_string_neurips' with labeled concatenation
def concat_with_labels(row, cols=None):
    """Join the non-empty text sections of one record into a labelled string.

    Args:
        row: Mapping-like record supporting ``.get`` (a DataFrame row when
            used with ``df.apply(..., axis=1)``, or a plain dict).
        cols: Column names to include, in output order. Defaults to the
            module-level ``cols_to_concat`` so existing one-argument calls
            behave exactly as before.

    Returns:
        "Label: text" paragraphs separated by a blank line, where the label is
        the column name without the "neurips_" prefix and with underscores
        turned into spaces. Returns "" when no section holds text.
    """
    if cols is None:
        cols = cols_to_concat

    parts = []
    for col in cols:
        value = row.get(col)
        # Non-string cells (e.g. NaN for a missing section) are skipped
        if not isinstance(value, str):
            continue
        text = value.strip()
        if not text:
            continue
        label = col.replace("neurips_", "").replace("_", " ")
        parts.append(f"{label}: {text}")
    return "\n\n".join(parts)

# One labelled text blob per paper; the agent cells below read this column as
# the article text that is appended to each prompt.
df["response_string_neurips"] = df.apply(concat_with_labels, axis=1)

In [None]:
import os
import time
from openai import OpenAI

# Set up OpenAI API
# The key is read from the environment and never assigned here: the previous
# `os.environ['OPENAI_API_KEY'] = ''` overwrote a key that was already exported
# and encouraged pasting a secret into the notebook.
_openai_api_key = os.environ.get("OPENAI_API_KEY", "")
if not _openai_api_key:
    print("WARNING: OPENAI_API_KEY is not set; export it before calling run_critic_openai.")
client = OpenAI(api_key=_openai_api_key)


# Define the OpenAI streaming function for GPT-4o-mini
def run_critic_openai(prompt: str):
    """Send `prompt` as a single user message to gpt-4o-mini and return the reply.

    The completion is requested with ``stream=True`` and ``temperature=0``;
    the streamed content deltas are collected and joined.

    Args:
        prompt: Full prompt text.

    Returns:
        The assembled reply with leading/trailing whitespace stripped.
    """
    stream = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "user", "content": prompt}
        ],
        stream=True,
        temperature=0
    )
    pieces = []
    for chunk in stream:
        # A stream event may carry an empty `choices` list; indexing [0] on it
        # would raise IndexError and lose the text gathered so far.
        if not chunk.choices:
            continue
        # delta.content can be None, hence the `or ""`
        pieces.append(chunk.choices[0].delta.content or "")
    return "".join(pieces).strip()

import time

# Wall-clock timestamp taken when this cell runs; no cell visible in this part
# of the notebook reads `start` back.
start = time.time()

In [None]:
import pandas as pd
import tiktoken
from openai import OpenAI
import os

# Tokenization setup
encoding = tiktoken.encoding_for_model("gpt-4o-mini")

# The model's context window is shared by the prompt, the chat-message overhead
# and the reply. Truncating prompts to the full window (the previous value of
# 128000) still overflows it, so keep some headroom for the response.
CONTEXT_WINDOW = 128000
RESPONSE_RESERVE = 4096  # NOTE(review): tune to the longest reply you expect
max_tokens = CONTEXT_WINDOW - RESPONSE_RESERVE

def truncate_to_max_tokens(text: str, max_length: int) -> str:
    """Cap `text` at `max_length` tokens of the module-level `encoding`.

    Text that already fits is returned unchanged; longer text is cut after
    `max_length` tokens and decoded back into a string.
    """
    token_ids = encoding.encode(text)
    if len(token_ids) <= max_length:
        return text
    return encoding.decode(token_ids[:max_length])

In [None]:
# Instruction prefix for the Extractor agent: asks for limitations the authors
# state explicitly. It ends with a blank line because callers append the
# article text directly after it (see AGENT_BASE_PROMPTS usage below).
Extractor = '''You are an expert in scientific literature analysis. Your task is to carefully read the provided scientific article and
extract all explicitly stated limitations as mentioned by the authors. Focus on sections such as the Discussion, Conclusion.
List each limitation verbatim, including direct quotes where possible, and provide
a brief context (e.g., what aspect of the study the limitation pertains to). Ensure accuracy and avoid inferring or adding
limitations not explicitly stated. If no limitations are mentioned, state this clearly. Output your findings in a structured
format with bullet points.\n\n'''
# Instruction prefix for the Analyzer agent: asks for limitations that are
# inferred from the article rather than stated by the authors. The article text
# is appended after the trailing blank line. The continuation lines' leading
# spaces are part of the string that is sent to the model.
Analyzer = '''You are a critical scientific reviewer with expertise in research methodology and analysis. Your task is to analyze the
        provided scientific article and identify potential limitations that are not explicitly stated by the authors. Focus on aspects
        such as study design, sample size, data collection methods, statistical analysis, scope of findings, and underlying assumptions.
        For each inferred limitation, provide a clear explanation of why it is a limitation and how it impacts the study’s validity,
        reliability, or generalizability. Ensure your inferences are grounded in the article’s content and avoid speculative assumptions.
        Output your findings in a structured format with bullet points, including a brief justification for each limitation.\n\n'''

# Instruction prefix for the Reviewer agent: asks for limitations an external
# open-peer-review referee would raise. The article text is appended after the
# trailing blank line.
# NOTE(review): the text invites the model to search the web / X, but
# run_critic_openai in this notebook sends plain text with no tools attached —
# confirm whether the backend actually used for this agent can search.
Reviewer = '''You are an expert in open peer review with a focus on transparent and critical evaluation of scientific research. Your task
        is to review the provided scientific article from the perspective of an external peer reviewer. Identify potential limitations
        that might be raised in an open review process, considering common critiques such as reproducibility, transparency,
        generalizability, or ethical considerations. If possible, leverage insights from similar studies or common methodological
        issues in the field (search the web or X posts if needed for context). For each limitation, explain why it would be a
        concern in an open review and how it aligns with peer review standards. Output your findings in a structured format with
        bullet points, ensuring each limitation is relevant to the article’s content.:\n\n'''

# Instruction prefix for the Citation agent: asks for limitations derived from
# the papers the article cites. In the visible cells it is only referenced as
# the "Citation" entry of AGENT_BASE_PROMPTS (regeneration step); the
# "citation agent" cell builds its own shorter inline prompt instead.
# Unlike the other three prompts, this one does not end with a blank line, so
# text appended to it follows the closing sentence after a single space.
Citation = ''' You are an expert scientific research assistant tasked with generating limitations for a scientific article based on information from its cited papers. Use the provided cited paper information as broader context to inform your analysis. Your goal is to identify limitations for the current paper, focusing on shortcomings that align with the findings, methods, or scope of the cited papers. Prioritize limitations that could strengthen the current paper’s discussion or guide future research. Ensure limitations are concise, scientifically grounded, and tied to the cited papers’ context.

Workflow:
Plan: Identify relevant cited papers and plan how their findings/methods relate to the current article. Justify the selection based on potential gaps or differences.
Reasoning: Let’s think step by step to identify limitations:

Step 1: Review the cited papers to determine their relevance to the current article’s findings, methods, or scope.
Step 2: Use citation lookup tools to extract key details from cited papers (e.g., methodology, results).
Step 3: Identify gaps or differences between cited papers and the current article (e.g., advanced methods not adopted).
Step 4: Document each limitation, explaining how it stems from the cited papers and its relevance to the current study.
Step 5: Ensure all relevant cited papers are analyzed to capture potential limitations.
Analyze: Identify limitations based on gaps or differences between the cited papers and the current article, using tools to verify content.
Reflect: Ensure limitations are grounded in cited paper content and relevant to the current study. Re-evaluate overlooked papers if necessary.

Continue: Iterate until all relevant limitations are identified.

Output Format:
Bullet points listing each limitation.
For each: Description, explanation, and reference to the cited paper(s) in the format Paper Title.

Tool Use:
Use citation lookup tools to access and verify cited paper content.
Avoid assumptions; base limitations on retrieved cited paper information.

Chain of Thoughts:
During the Reasoning step, document the thought process explicitly. For example:
“I selected [Paper X] because it uses a more robust method than the current article.”
“The current article’s simpler method may limit accuracy compared to [Paper X].”
“I reviewed all cited papers to ensure no relevant gaps were missed.”
This narrative ensures transparency and justifies each identified limitation. '''

In [None]:
# Regeneration instructions: inserted between an agent's base prompt + input
# text and the judge's per-metric feedback when a low-scoring agent output is
# regenerated. The bracketed items ([strengths], [issues], [suggestions],
# [specify role, ...]) are sent literally; nothing in the visible cells
# substitutes them — the actual feedback is appended after this prompt.
Regenerate_PROMPT = '''

You are tasked with generating limitations based on feedback from the Judge Agent.
Feedback Structure:
Strengths: [strengths]
Issues: [issues]
Suggestions: [suggestions]
Task: Create a set of limitations that:

Builds upon the identified strengths to reinforce positive aspects.
Minimizes the impact of issues by addressing them constructively.
Incorporates suggestions to ensure actionable improvements.
Ensure the limitations are clear, concise, and aligned with your role as [specify role, e.g., a content generator, analyst, etc.].'''



In [None]:
# lim gen
# Run the three article-only agents on every paper and store each reply in its
# own column. The prompt prefixes are the module-level `Extractor`, `Analyzer`
# and `Reviewer` strings; the names used previously (`extractor_prompt`,
# `analyzer_prompt`, `reviewer_prompt`) are not defined in this notebook.
_article_agents = {
    "extractor_agent": Extractor,
    "analyzer_agent": Analyzer,
    "reviewer_agent": Reviewer,
}

for i in range(len(df)): # len(df)
    print(f"\nProcessing row {i}")
    paper_text = df.at[i, 'response_string_neurips']

    for out_col, agent_prompt in _article_agents.items():
        try:
            agent_output = azure_run_critic(agent_prompt + paper_text)
        except Exception as e:
            # Same convention as the citation and master loops: record the
            # failure for this row and keep the batch going.
            print(f"Error at row {i} ({out_col}): {e}")
            agent_output = "ERROR"
        df.at[i, out_col] = agent_output


In [None]:
# citation agent
import re
import pandas as pd
import tiktoken

# Tokenization setup
encoding   = tiktoken.encoding_for_model("gpt-4o-mini")

# The model's context window is shared by the prompt, the chat-message overhead
# and the reply. Truncating prompts to the full window (the previous value of
# 128000) still overflows it, so keep some headroom for the response.
CONTEXT_WINDOW   = 128000
RESPONSE_RESERVE = 4096  # NOTE(review): tune to the longest reply you expect
max_tokens = CONTEXT_WINDOW - RESPONSE_RESERVE

def truncate_to_max_tokens(text: str, max_length: int) -> str:
    """Return `text` limited to its first `max_length` tokens.

    Tokenisation uses the module-level `encoding`. When the text is already
    within the limit it is handed back untouched; otherwise the leading
    `max_length` tokens are decoded into the result.
    """
    token_ids = encoding.encode(text)
    fits = len(token_ids) <= max_length
    if fits:
        return text
    kept = token_ids[:max_length]
    return encoding.decode(kept)

def _cited_texts(cell):
    """Normalise one `relevance_8_cited_*` cell into a list of text snippets.

    `df` is loaded with `pd.read_csv`, so a cell is normally a string (the repr
    of a list) or NaN rather than a real list. Iterating such a value directly
    split strings into single characters and raised TypeError on NaN.

    Args:
        cell: Raw cell value (None, NaN, str, dict, or an iterable of items).

    Returns:
        A list of strings: the 'text' entry of dict items, `str(item)` for
        anything else; an empty list when the cell holds nothing.
    """
    import ast  # local import: only this helper needs it

    if cell is None or (isinstance(cell, float) and cell != cell):  # None / NaN
        return []
    if isinstance(cell, str):
        stripped = cell.strip()
        if not stripped:
            return []
        try:
            cell = ast.literal_eval(stripped)
        except (ValueError, SyntaxError):
            # Not a Python literal: treat the whole cell as one snippet
            return [stripped]
    if isinstance(cell, dict):
        cell = [cell]
    if isinstance(cell, (str, bytes)) or not hasattr(cell, "__iter__"):
        return [str(cell)]
    return [
        str(itm['text']) if isinstance(itm, dict) and 'text' in itm else str(itm)
        for itm in cell
    ]


# Make sure the output column exists
df['citation_agent_in_by_8'] = ''

# Process each row
for i in range(len(df)):
    print(f"Processing row {i}...")
    row = df.iloc[i]

    # 1) Collect all items from relevance_8_cited_in (papers this article references)
    cited_in_texts = _cited_texts(row.get('relevance_8_cited_in'))

    # 2) Collect all items from relevance_8_cited_by (papers that cite this article)
    cited_by_texts = _cited_texts(row.get('relevance_8_cited_by'))

    # 3) Build the combined prompt section
    cited_in_block = "\n".join(cited_in_texts)
    cited_by_block = "\n".join(cited_by_texts)

    combined_cited_input = (
        "Referenced papers:\n" + cited_in_block +
        "\n\nPapers who cited this paper:\n" + cited_by_block
    )

    # 4) Assemble the full prompt around the article text
    input_paper = df['response_string_neurips'][i]
    prompt = (
        "You are an assistant tasked to generate limitations or shortcomings "
        "in a scientific article. Below is the input paper:\n"
        f"{input_paper}\n\n"
        " Below is the relevant text from both the papers "
        "that this article cites and those that cite it.\n\n"
        f"{combined_cited_input}\n\n"
        "Please generate limitations based on this information."
    )

    # 5) Truncate and call LLM
    # NOTE(review): truncation cuts the END of the prompt, so an over-long row
    # loses the closing instruction first — consider trimming the cited text instead.
    try:
        # Tokenising sits inside the try so a tokenizer error only fails this row
        truncated = truncate_to_max_tokens(prompt, max_tokens)
        llm_summary = azure_run_critic(truncated)
    except Exception as e:
        print(f"Error at row {i}: {e}")
        llm_summary = "ERROR"

    df.at[i, "citation_agent_in_by_8"] = llm_summary

df.to_csv("df_neruips_21_22_final.csv",index=False)

In [None]:
import os
import time
from openai import OpenAI

# Set up OpenAI API
# The key is read from the environment and never assigned here: the previous
# `os.environ['OPENAI_API_KEY'] = ''` overwrote a key that was already exported
# and encouraged pasting a secret into the notebook.
_openai_api_key = os.environ.get("OPENAI_API_KEY", "")
if not _openai_api_key:
    print("WARNING: OPENAI_API_KEY is not set; export it before calling run_critic_openai.")
client = OpenAI(api_key=_openai_api_key)

# Define the OpenAI streaming function for GPT-4o-mini
def run_critic_openai(prompt: str):
    """Send `prompt` as a single user message to gpt-4o-mini and return the reply.

    The completion is requested with ``stream=True`` and ``temperature=0``;
    the streamed content deltas are collected and joined.

    Args:
        prompt: Full prompt text.

    Returns:
        The assembled reply with leading/trailing whitespace stripped.
    """
    stream = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "user", "content": prompt}
        ],
        stream=True,
        temperature=0
    )
    pieces = []
    for chunk in stream:
        # A stream event may carry an empty `choices` list; indexing [0] on it
        # would raise IndexError and lose the text gathered so far.
        if not chunk.choices:
            continue
        # delta.content can be None, hence the `or ""`
        pieces.append(chunk.choices[0].delta.content or "")
    return "".join(pieces).strip()


In [None]:
# Instruction prefix for the master/coordinator call. It ends with "**Input**: "
# because master_agent appends the four agents' outputs directly after it.
# NOTE(review): the text refers to verification/search tools; the helper calls
# visible in this notebook send a plain text prompt only — confirm the backend.
COORDINATOR_PROMPT = '''
    You are a **Master Coordinator**, an expert in scientific communication and synthesis. Your task is to integrate limitations provided
    by four agents:
    1. **Extractor** (explicit limitations from the article),
    2. **Analyzer** (inferred limitations from critical analysis),
    3. **Reviewer** (limitations from an open review perspective),
    4. **Citation** (limitations based on cited papers).

    **Goals**:
    1. Combine all limitations into a cohesive, non-redundant list.
    2. Ensure each limitation is clearly stated, scientifically valid, and aligned with the article’s content.
    3. Prioritize author-stated limitations, supplementing with inferred, peer-review, or citation-based limitations if they add value.
    4. Resolve discrepancies between agents’ outputs by cross-referencing the article and cited papers, using tools to verify content.
    5. Format the final list in a clear, concise, and professional manner, suitable for a scientific review or report, with citations for external sources.

    **Workflow** (inspired by SYS_PROMPT_SWEBENCH):
    1. **Plan**: Outline how to synthesize limitations, identify potential redundancies, and resolve discrepancies.
    2. **Analyze**: Combine limitations, prioritizing author-stated ones, and verify alignment with the article.
    3. **Reflect**: Check for completeness, scientific rigor, and clarity; resolve discrepancies using article content or tools.
    4. **Continue**: Iterate until the list is comprehensive, non-redundant, and professionally formatted.

    **Output Format**:
    - Numbered list of final limitations.
    - For each: Clear statement, brief justification, and source in brackets (e.g., [Author-stated], [Inferred], [Peer-review-derived], [Cited-papers]).
    - Include citations for external sources (e.g., web/X posts, cited papers) in the format [Source Name](ID).
    **Tool Use**:
    - Use text extraction tools to verify article content.
    - Use citation lookup tools to cross-reference cited papers.
    - Use web/X search tools to resolve discrepancies involving external context.

    **Input**: '''


import pandas as pd

# taking the self-feedback if it exists otherwise acutal one
# def master_agent(extractor_text, analyzer_text, reviewer_text, citation_text):
def master_agent(extractor_text, analyzer_text,reviewer_text,citation_text):
    """
    Merge the four agents' limitation texts into one coordinated list.

    Each text is placed under a bold "<Name> Agent" heading after
    COORDINATOR_PROMPT, the closing instruction is appended, and the assembled
    prompt is sent through azure_run_critic. Whatever that call returns is
    handed back unchanged.
    """
    agent_sections = [
        f"**Extractor Agent**:\n{extractor_text}\n\n",
        f"**Analyzer Agent**:\n{analyzer_text}\n\n",
        f"**Reviewer Agent**:\n{reviewer_text}\n\n",
        f"**Citation Agent**:\n{citation_text}\n\n",
        # An image-agent section was sketched here but is not part of the prompt
    ]
    closing_instruction = "Produce a single, numbered list of final limitations, noting each source in brackets."
    coord_prompt = "".join([COORDINATOR_PROMPT, *agent_sections, closing_instruction])
    return azure_run_critic(coord_prompt)

# Run the coordinator on every row and keep its reply both in `df` and in `results`
results = []

master_col = 'master_agent_ext_analy_rev_cit_with_rel'
# Column order matches master_agent's (extractor, analyzer, reviewer, citation) arguments
master_input_cols = (
    'extractor_agent',
    'analyzer_agent',
    'reviewer_agent',
    'selected_cited_text_related_with_input_lim',
)
df[master_col] = ''

for i in range(len(df)): # len(df)
    print("i is",i)
    # Reads stay outside the try so a missing column still surfaces as an error
    agent_texts = [df.at[i, col] for col in master_input_cols]

    try:
        result = master_agent(*agent_texts)
    except Exception as e:
        print(f"Error at row {i}: {e}")
        result = "ERROR"

    df.at[i, master_col] = result
    results.append(result)

# Persist the new column alongside everything else
df.to_csv("df_neruips_21_22_final.csv",index=False)

### Judge and Self Feedback

In [None]:
# Prompt for the Judge agent that scores the four agents' outputs.
# The four "[<name>_agent output]" placeholders in the "Input:" section are
# filled with str.replace by the judging loop; the "[...]" items inside the
# JSON template are examples for the model and are sent as-is.
# This is a normal (non-raw) string, so the "\n" in the "Output Format" line is
# a real newline in the text sent to the model.
JUDGE_PROMPT = ''' You are a Judge Agent, an expert in evaluating scientific text quality with a focus on limitation generation for
scientific articles. Your task is to assess the outputs of four agents—Extractor (explicit limitations from the article), Analyzer
(inferred limitations from critical analysis), Reviewer (peer-review limitations), and Citation (limitations based on cited papers).
For each agent’s output, assign a numerical score (0–100) and provide specific feedback based on defined criteria. The evaluation is
reference-free, relying on the output’s inherent quality and alignment with each agent’s role.

Evaluation Criteria:
Depth: How critical and insightful is the limitation? Does it reveal significant issues in the study’s design, findings, or implications?
(20% weight)



Originality: Is the limitation a generic critique or a novel, context-specific insight? (20% weight)

Actionability: Can researchers realistically address the limitation in future work? Does it provide clear paths for improvement?
(30% weight)

Topic Coverage: How broadly does the set of limitations cover relevant aspects (e.g., methodology, scope for Extractor/Analyzer; peer
review standards for Reviewer; cited paper gaps for Citation)? (30% weight)

Workflow: Plan: Review each agent’s role and expected output (Extractor: explicit limitations; Analyzer: inferred methodological gaps;
Reviewer: peer-review critiques; Citation: cited paper gaps). Identify tools (e.g., text analysis, citation lookup) to verify content
if needed.

Reasoning: Let’s think step by step to evaluate each output: Step 1: Read the agent’s output and confirm its alignment with the agent’s
role. Step 2: Assess each criterion (Depth, Originality, Actionability, Topic Coverage), noting strengths and weaknesses. Step 3: Assign
a score (0–10) for each criterion based on quality, then calculate the weighted total (0–100). Step 4: Generate feedback for each
criterion, specifying what was done well and what needs improvement. Step 5: Verify the evaluation by cross-checking with the article
or cited papers using tools, if necessary.

Analyze: Use tools to verify article or cited paper content to ensure accurate evaluation (e.g., confirm Extractor’s quotes, Citation’s
references). Reflect: Ensure the score and feedback are fair, consistent, and actionable. Re-evaluate if any criterion seems misjudged.
Continue: Iterate until the evaluation is complete for all agents.

Tool Use: Use text analysis tools to verify article content (e.g., Extractor’s quotes, Analyzer’s methodology). Use citation lookup
tools to confirm cited paper details (e.g., Citation’s references). Use web/X search tools to validate Reviewer’s external context,
if needed.

Chain of Thoughts: Document the evaluation process explicitly. For example: “The Extractor’s output identifies a limitation but
lacks critical insight, reducing Depth.” “The Analyzer’s limitation is generic, affecting Originality.” “The Reviewer’s output is
actionable but misses ethical considerations, limiting Topic Coverage.” This narrative ensures transparency and justifies the score
and feedback.

Scoring: For each criterion, assign a score (0–10) based on quality: 0–3: Poor (major issues, e.g., superficial, generic, not actionable,
narrow coverage). 4–6: Fair (moderate issues, e.g., somewhat insightful, partially actionable, incomplete coverage). 7–8: Good
(minor issues, e.g., mostly critical, slightly generic, broadly actionable). 9–10: Excellent (no issues, e.g., highly insightful,
novel, clearly actionable, comprehensive coverage).

Calculate the total score: Sum (criterion score × weight), where weights are Depth (0.2), Originality (0.2), Actionability (0.3),
Topic Coverage (0.3).

Example: Depth (8 × 0.2 = 1.6), Originality (7 × 0.2 = 1.4), Actionability (9 × 0.3 = 2.7), Topic Coverage (6 × 0.3 = 1.8),
Total = (1.6 + 1.4 + 2.7 + 1.8) × 10 = 75.

Input:
Extractor Agent: [extractor_agent output]
Analyzer Agent: [analyzer_agent output]
Reviewer Agent: [reviewer_agent output]
Citation Agent: [citation_agent output]

Output Format: The output must strictly be in JSON format, starting with ```json\n{...}.
For each agent (Extractor, Analyzer, Reviewer, Citation), provide a JSON object with the following structure:

{
  "agent": "[Agent Name]",
  "total_score": [Numerical score, 0–100],
  "evaluation": {
    "Depth": {
      "score": [0–10],
      "strengths": "[What was done well]",
      "issues": "[Problems identified]",
      "suggestions": "[How to improve]"
    },
    "Originality": {
      "score": [0–10],
      "strengths": "[What was done well]",
      "issues": "[Problems identified]",
      "suggestions": "[How to improve]"
    },
    "Actionability": {
      "score": [0–10],
      "strengths": "[What was done well]",
      "issues": "[Problems identified]",
      "suggestions": "[How to improve]"
    },
    "Topic_Coverage": {
      "score": [0–10],
      "strengths": "[What was done well]",
      "issues": "[Problems identified]",
      "suggestions": "[How to improve]"
    }
  }
}'''

In [None]:
import re
import json

def llm_assessment(agent_texts: dict,
                   agent_prompts: dict,
                   metrics=None):
    """
    Run the collective judge once and parse its per-agent evaluations.

    The judge prompt is JUDGE_PROMPT, then one "**<agent> Agent**" section per
    key of `agent_prompts` (text taken from `agent_texts`), then JUDGE_PROMPT
    again. The reply is searched for ```json fenced blocks; each block may hold
    a single agent object, a list of agent objects, or a dict whose values are
    agent objects.

    Args:
      agent_texts: maps agent name -> that agent's generated text.
      agent_prompts: maps agent name -> its prompt; only the keys are used here.
      metrics: accepted for backward compatibility; not used by the parsing.

    Returns:
      combined: dict mapping agent name to parsed JSON evaluation data (empty if parse failed)
      row_scores: dict with one "<agent>_score" key per requested agent and per
        parsed agent (total score, or None when the judge gave none)
      raw_response: the unparsed LLM output string
    """
    if metrics is None:
        metrics = ['Depth','Originality','Actionability','Topic_Coverage']

    # 1) Fire off the collective judge prompt
    # NOTE(review): JUDGE_PROMPT is sent both before and after the agent
    # sections, with its "[... output]" placeholders unfilled — confirm this
    # repetition is intended.
    raw_response = run_critic(
        JUDGE_PROMPT +
        "".join(f"**{agent} Agent**:\n{agent_texts[agent]}\n\n"
                for agent in agent_prompts) +
        JUDGE_PROMPT
    )

    # Every requested agent gets a key up front so callers always find
    # "<agent>_score", even when the judge skipped that agent.
    row_scores = {f"{agent}_score": None for agent in agent_prompts}

    # 2) Extract JSON-fenced blocks (tolerating any whitespace after the fence tag)
    blocks = re.findall(r"```json\s*(.*?)```", raw_response, re.DOTALL)

    combined = {}
    for b in blocks:
        try:
            parsed = json.loads(b)
        except json.JSONDecodeError:
            continue  # unparseable block: skip it, others may still be valid

        # Normalise the accepted payload shapes into a list of candidate entries;
        # a bare list previously crashed on `parsed.get`.
        if isinstance(parsed, dict) and "agent" in parsed:
            entries = [parsed]
        elif isinstance(parsed, dict):
            entries = list(parsed.values())
        elif isinstance(parsed, list):
            entries = parsed
        else:
            continue

        for entry in entries:
            if isinstance(entry, dict) and entry.get("agent"):
                combined[entry["agent"]] = entry

    # No usable agent entry at all: empty result plus the raw text for inspection
    if not combined:
        return {}, row_scores, raw_response

    # 3) Build row_scores from combined
    for agent, data in combined.items():
        row_scores[f"{agent}_score"] = data.get("total_score")

    return combined, row_scores, raw_response


In [None]:
import time
import pandas as pd

# Ensure required columns exist
required_cols = ['extractor_agent', 'analyzer_agent', 'reviewer_agent', 'citation_agent']
missing_cols = [col for col in required_cols if col not in df.columns]
if missing_cols:
    # A missing column is judged as empty text (row.get default below), so
    # make that visible instead of silently scoring blanks.
    print(f"WARNING: columns {missing_cols} are missing from df; the judge will see empty text for them")
all_generated_summary = []

start_time = time.time()

# Placeholder in JUDGE_PROMPT -> df column whose text replaces it
judge_placeholders = {
    "[extractor_agent output]": "extractor_agent",
    "[analyzer_agent output]": "analyzer_agent",
    "[reviewer_agent output]": "reviewer_agent",
    # Previously filled with the reviewer's text, so the Citation agent was
    # never actually judged on its own output.
    "[citation_agent output]": "citation_agent",
}

for idx, row in df.iterrows():
    print("idx",idx)

    # Prepare prompt with row-specific agent outputs
    prompt_filled = JUDGE_PROMPT
    for placeholder, col in judge_placeholders.items():
        agent_output = row.get(col, "")
        # str.replace needs a string; NaN/None cells (empty CSV fields) become ""
        if not isinstance(agent_output, str):
            agent_output = ""
        prompt_filled = prompt_filled.replace(placeholder, agent_output)

    try:
        result = azure_run_critic(prompt_filled)
    except Exception as e:
        result = f"ERROR: {e}"

    df.at[idx, "LLM_feedback_resp"]  = result

    all_generated_summary.append(result)


In [None]:
import pandas as pd
import json
import re

flattened_rows = []


def _judge_entries(parsed_json):
    """Return the candidate per-agent objects held in one parsed judge payload.

    Accepts a list of agent objects, a single agent object (a dict with an
    "agent" key) or a dict whose values are agent objects. Anything else
    yields None so the caller can tell "unrecognised" from "empty".
    """
    if isinstance(parsed_json, list):
        return parsed_json
    if isinstance(parsed_json, dict):
        if "agent" in parsed_json:
            # Single agent object: previously its values were iterated and lost
            return [parsed_json]
        return list(parsed_json.values())
    return None


for idx, raw_text in df['LLM_feedback_resp'].items():
    # 1. Extract every JSON block inside triple backticks (the judge may emit
    #    one block per agent, not just one block overall)
    blocks = re.findall(r"```json\s*(.*?)```", str(raw_text), re.DOTALL)
    if not blocks:
        continue

    row_data = {"row": idx}
    recognised = False

    for json_str in blocks:
        try:
            parsed_json = json.loads(json_str.strip())
        except json.JSONDecodeError:
            continue  # skip blocks that fail to parse

        # 2. Handle list of agents / single agent / dict of agents
        entries = _judge_entries(parsed_json)
        if entries is None:
            continue
        recognised = True

        # 3. Extract all fields from each agent
        for entry in entries:
            if not isinstance(entry, dict):
                continue

            agent = entry.get("agent")
            if not agent:
                continue

            row_data[f"{agent}_total_score"] = entry.get("total_score")

            evaluation = entry.get("evaluation", {})
            if not isinstance(evaluation, dict):
                continue
            for metric, values in evaluation.items():
                if not isinstance(values, dict):
                    continue  # malformed metric entry; nothing to flatten
                for aspect in ["score", "strengths", "issues", "suggestions"]:
                    col_name = f"{agent}_{metric}_{aspect}"
                    row_data[col_name] = values.get(aspect)

    # Rows whose blocks were all unparseable or unrecognised are left out
    if recognised:
        flattened_rows.append(row_data)

# Final DataFrame: one row per judged paper, "row" holding the label in `df`
df_flattened = pd.DataFrame(flattened_rows)


In [None]:
import pandas as pd

# ─── 1. Definitions ───

metrics = ["Depth", "Originality", "Actionability", "Topic_Coverage"]
agents  = ["Extractor", "Analyzer", "Reviewer", "Citation"]


AGENT_BASE_PROMPTS = {
    "Extractor": Extractor,
    "Analyzer":  Analyzer,
    "Reviewer":  Reviewer,
    "Citation":  Citation
}

# Cited-paper context columns, with the headings the citation-agent cell uses.
# NOTE(review): the original code seeded the Citation agent with
# `retrieved + system_prompt`, neither of which is defined in this notebook.
# Reusing the citation-agent cell's inputs is an assumption — confirm.
CITATION_CONTEXT_COLS = (
    ("Referenced papers", "relevance_8_cited_in"),
    ("Papers who cited this paper", "relevance_8_cited_by"),
)


def _citation_seed(source_row, article_text):
    """Article text followed by whatever cited-paper context `df` holds for the row."""
    parts = [article_text]
    for heading, col in CITATION_CONTEXT_COLS:
        if col not in df.columns:
            continue
        cell = df.at[source_row, col]
        if cell is None or (isinstance(cell, float) and cell != cell):  # None / NaN
            continue
        cell_text = cell if isinstance(cell, str) else str(cell)
        if cell_text.strip():
            parts.append(f"{heading}:\n{cell_text}")
    return "\n\n".join(parts)


# ─── 2. Ensure score columns are numeric ───
for agent in agents:
    for metric in metrics:
        col = f"{agent}_{metric}_score"
        # The judge may have omitted this agent/metric in every row, in which
        # case flattening never created the column.
        if col not in df_flattened.columns:
            df_flattened[col] = float("nan")
        df_flattened[col] = pd.to_numeric(df_flattened[col], errors="coerce")

# ─── 3. Initialize empty regenerated columns ───
for agent in agents:
    df_flattened[f"{agent}_regenerated"] = None
    # Results are also written to `df`; create the column there without
    # wiping values from an earlier run.
    if f"{agent}_regenerated" not in df.columns:
        df[f"{agent}_regenerated"] = None

# ─── 4. Iterate over rows ───
for i, row in df_flattened.iterrows():
    # `i` is only the position inside df_flattened. Rows without parseable
    # judge feedback were dropped during flattening, so the matching paper
    # must be looked up through the stored "row" label, not through `i`.
    source_row = df_flattened.at[i, "row"]
    print(f"Processing row {source_row}...")

    input_text = df.at[source_row, "response_string_neurips"]

    for agent in agents:
        feedback_parts = []

        for metric in metrics:
            score_col = f"{agent}_{metric}_score"
            score = row.get(score_col, None)

            # Only metrics scored below 5 (out of 10) trigger regeneration
            if pd.notna(score) and score < 5:
                strengths   = row.get(f"{agent}_{metric}_strengths", "")
                issues      = row.get(f"{agent}_{metric}_issues", "")
                suggestions = row.get(f"{agent}_{metric}_suggestions", "")

                feedback_parts.append(
                    f"{metric} Feedback:\n"
                    f"  Strengths: {strengths}\n"
                    f"  Issues: {issues}\n"
                    f"  Suggestions: {suggestions}"
                )

        if not feedback_parts:
            continue  # every metric was acceptable for this agent

        feedback_blob = "\n\n".join(feedback_parts)

        seed_text = _citation_seed(source_row, input_text) if agent == "Citation" else input_text

        full_prompt = (
            AGENT_BASE_PROMPTS[agent]
            + seed_text
            + "\n\n"
            + Regenerate_PROMPT
            + "\n\n"
            + feedback_blob
        )
        try:
            regenerated = azure_run_critic(full_prompt)
            print("regenerated",regenerated)
        except Exception as e:
            print(f"⚠️ Error on row {source_row}, agent {agent}: {e}")
            regenerated = f"ERROR: {e}"

        df.at[source_row, f"{agent}_regenerated"] = regenerated
        df_flattened.at[i, f"{agent}_regenerated"] = regenerated


### Ground Truth Coverage

Ground Truth: Lim_and_OR_ground_truth_final ||
LLM Generated Limitations: master_agent_ext_analy_rev_cit

In [None]:
# making lists of list 'master_agent' text
import re

# make sure the output column exists
df['master_agent_ext_analy_rev_cit_list'] = None

# Items are separated by a blank line followed by "<digits>."
_item_boundary = re.compile(r'\n\n(?=\d+\.)')
# "<number>. <text>", where the text may span several lines
_numbered_item = re.compile(r'(\d+)\.\s*(.*)', re.S)

for row_idx in range(len(df)):
    raw = df.at[row_idx, "master_agent_ext_analy_rev_cit"]

    lim_list = []
    # missing or non-string cells simply produce an empty list
    if isinstance(raw, str):
        for chunk in _item_boundary.split(raw.strip()):
            found = _numbered_item.match(chunk)
            if found:
                # chunks that do not start with a number (e.g. a preamble) are dropped
                lim_list.append([int(found.group(1)), found.group(2).strip()])

    df.at[row_idx, 'master_agent_ext_analy_rev_cit_list'] = lim_list


In [None]:
# making lists of list 'ground truth' text

import re

# ensure the output column exists
df['Lim_and_OR_ground_truth_list'] = None

# Items are separated by a blank line followed by "<digits>."
_gt_item_boundary = re.compile(r'\n\n(?=\d+\.)')
# "<number>. <text>", where the text may span several lines
_gt_numbered_item = re.compile(r'(\d+)\.\s*(.*)', flags=re.S)

for row_idx in range(len(df)):
    raw = df.at[row_idx, "Lim_and_OR_ground_truth_final"]

    lim_list = []
    # non-string cells simply produce an empty list
    if isinstance(raw, str):
        for chunk in _gt_item_boundary.split(raw.strip()):
            found = _gt_numbered_item.match(chunk)
            if found:
                # chunks that do not start with a number are dropped
                lim_list.append([int(found.group(1)), found.group(2).strip()])

    df.at[row_idx, 'Lim_and_OR_ground_truth_list'] = lim_list


In [None]:
# making combinations from 'ground truth' and llm generated text'

df['combined'] = [[] for _ in range(len(df))]

# Pair every ground-truth item with every generated item of the same row.
# The full cross product is stored; nothing is truncated.
for i in range(len(df)):
    gt_items = df["Lim_and_OR_ground_truth_list"][i]
    llm_items = df["master_agent_ext_analy_rev_cit_list"][i]

    # Ground-truth item varies slowest, matching the original nested loops
    df.at[i, 'combined'] = [
        (gt_item, llm_item)
        for gt_item in gt_items
        for llm_item in llm_items
    ]

In [None]:
import os
import base64
import time
import pandas as pd
# from openai import AzureOpenAI, RateLimitError

import os
import time
from openai import OpenAI

# Set up OpenAI API
# The key is read from the environment and never assigned here: the previous
# `os.environ['OPENAI_API_KEY'] = ''` overwrote a key that was already exported
# and encouraged pasting a secret into the notebook.
_openai_api_key = os.environ.get("OPENAI_API_KEY", "")
if not _openai_api_key:
    print("WARNING: OPENAI_API_KEY is not set; export it before calling run_critic_openai.")
client = OpenAI(api_key=_openai_api_key)

# Define the OpenAI streaming function for GPT-4o-mini
def run_critic_openai(prompt: str):
    """Send `prompt` as a single user message to gpt-4o-mini and return the reply.

    The completion is requested with ``stream=True`` and ``temperature=0``;
    the streamed content deltas are collected and joined.

    Args:
        prompt: Full prompt text.

    Returns:
        The assembled reply with leading/trailing whitespace stripped.
    """
    stream = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "user", "content": prompt}
        ],
        stream=True,
        temperature=0
    )
    pieces = []
    for chunk in stream:
        # A stream event may carry an empty `choices` list; indexing [0] on it
        # would raise IndexError and lose the text gathered so far.
        if not chunk.choices:
            continue
        # delta.content can be None, hence the `or ""`
        pieces.append(chunk.choices[0].delta.content or "")
    return "".join(pieces).strip()

# Batch-processing loop: one LLM call per paper, judging every
# (ground truth, generated) pair of that paper at once.
all_generated_summary = []
start_time = time.time()

import json

llm_results = []
df['LLM_eval_master_agent_ext_analy_rev_cit_list'] = ''
for idx in range(len(df)): # len(df)
    print("idx is",idx)
    pairs = df.at[idx, 'combined']   # assume this is List[Tuple[list, list]]
    if not isinstance(pairs, list) or not pairs:
        # Nothing to compare for this row; its output cell keeps the '' default
        llm_results.append(None)
        continue

    # build the named-pairs block in one go
    formatted = "\n".join(
        f"Pair {i+1}:\n  List1: {first}\n  List2: {second}"
        for i, (first, second) in enumerate(pairs)
    )

    prompt = (
        "For each of the following pairs, answer “Yes” if List1 contains a topic or limitation\n"
        "from List2, or List2 contains a topic or limitation from from List1; otherwise answer “No”.\n"
        "Respond *only* with a JSON object mapping each Pair name to “Yes” or “No”.\n\n"
        "Pairs:\n"
        f"{formatted}"
    )

    # single call per row; a failed call is recorded as "ERROR" (same
    # convention as the other agent loops) so one bad row cannot abort the batch
    try:
        resp_text = run_critic_openai(prompt)
    except Exception as e:
        print(f"Error at row {idx}: {e}")
        resp_text = "ERROR"
    llm_results.append(resp_text)

    df.at[idx, 'LLM_eval_master_agent_ext_analy_rev_cit_list'] = resp_text


In [None]:
import re
# extract all 'Yes', 'No'
pattern = r'"Pair\s*\d+"\s*:\s*"(Yes|No)"'
# Same shape as `pattern`, but also capturing the pair number, so each verdict
# can be stored at the position of the pair it belongs to.
numbered_pattern = r'"Pair\s*(\d+)"\s*:\s*"(Yes|No)"'

all_matches = []
for idx in range(len(df)):
    raw = df.at[idx, 'LLM_eval_master_agent_ext_analy_rev_cit_list']
    if not isinstance(raw, str):
        all_matches.append([])
        continue

    found = re.findall(numbered_pattern, raw)

    pairs = df.at[idx, 'combined'] if 'combined' in df.columns else None
    n_pairs = len(pairs) if isinstance(pairs, list) else 0

    if n_pairs == 0:
        # Pair count unknown: fall back to the order the verdicts appear in
        all_matches.append([verdict for _, verdict in found])
        continue

    # "Pair k" goes to slot k-1. If the model skips a pair, only that slot
    # stays None; the later verdicts no longer shift onto the wrong pairs.
    matches = [None] * n_pairs
    for pair_no, verdict in found:
        slot = int(pair_no) - 1
        if 0 <= slot < n_pairs and matches[slot] is None:
            matches[slot] = verdict
    all_matches.append(matches)


In [None]:
import pandas as pd

# Flatten df['combined'] into one record per (List1, List2) pair. Each record
# carries the index of the row it came from and the verdict stored at the same
# position in all_matches (None when no verdict is available for that position).
rows = []
for idx, tuples in df['combined'].items():
    if isinstance(tuples, list):
        row_matches = all_matches[idx] if idx < len(all_matches) else []
        rows.extend(
            {
                'source_row': idx,
                'List1':      list1,
                'List2':      list2,
                'is_match':   row_matches[j] if j < len(row_matches) else None,
            }
            for j, (list1, list2) in enumerate(tuples)
        )

# List1 is the ground-truth side of a pair, List2 the LLM-generated side.
result_df = pd.DataFrame(rows).rename(
    columns={'List1': 'Ground_Truth', 'List2': 'LLM_generated'}
)


In [None]:
import re

def extract_leading_number(x):
    """Return the integer that leads `x`, or None when there is none.

    A non-empty list is represented by its first element. An int is returned
    unchanged. A string yields its leading run of digits (leading whitespace
    allowed, an optional trailing dot ignored) as an int. Everything else,
    including an empty list, gives None.
    """
    candidate = x
    if isinstance(x, list) and x:
        candidate = x[0]

    if isinstance(candidate, int):
        return candidate
    if not isinstance(candidate, str):
        return None

    # Accepts both "123." and a bare "123" at the start of the string.
    found = re.match(r'^\s*(\d+)(?:\.)?', candidate)
    return int(found.group(1)) if found else None

# Leading item number of each side of a pair, stored as its own column.
result_df['gt_number'] = result_df['Ground_Truth'].map(extract_leading_number)
result_df['llm_gen_number'] = result_df['LLM_generated'].map(extract_leading_number)


In [None]:
# ground truth coverage
#
# Counts the ground-truth items that have at least one 'Yes' among their pairs.
# A "section" is a run of consecutive rows belonging to the same ground-truth
# item. The item is identified by (source_row, gt_number): gt_number alone
# restarts for every paper, so two consecutive papers whose neighbouring rows
# share a number would otherwise be merged into one section and counted once.

# Initialize variables
current_section = None
section_has_yes = False
match = 0

# Iterate through the DataFrame
for index, row in result_df.iterrows():
    section_key = (row['source_row'], row['gt_number'])
    # Check if we are still in the same section
    if section_key == current_section:
        # Check if there is a 'Yes' in 'is_match'
        if row['is_match'] == 'Yes':
            section_has_yes = True
    else:
        # We've reached a new section, check if the last section had a 'Yes'
        if section_has_yes:
            match += 1
        # Reset for new section
        current_section = section_key
        section_has_yes = (row['is_match'] == 'Yes')

# Check the last section after exiting the loop
if section_has_yes:
    match += 1
print(match)


# Total number of ground-truth items, taken as runs of identical consecutive text.

# A new run starts wherever 'Ground_Truth' differs from the row above; the running
# sum of those change flags gives every run its own id.
gt_text = result_df['Ground_Truth']
unique_blocks = (gt_text != gt_text.shift()).cumsum()

# One output row per run, with the count of non-null 'gt_number' values in it.
ck = result_df.groupby(unique_blocks)['gt_number'].agg(['count'])

print("Number of unique consecutive 'ground_truth' texts and their counts:")
print(ck)


Measuring quality between matched pairs (NLP-based metrics)

Ground Truth: ground_truth || LLM_Generated limitation: llm_generated

In [None]:
# Normalise the pair column names to the lower-case names used by the metric
# cells below ('ground_truth', 'llm_generated').
# NOTE(review): in the cells shown above, 'Ground_Truth', 'LLM_generated' and
# 'is_match' are created on `result_df`, not on `df`. rename() ignores labels
# that are not present, so this is a silent no-op unless `df` already holds the
# pair table -- confirm which frame is intended here.
df.rename(columns={
    'Ground_Truth': 'ground_truth',
    'LLM_generated': 'llm_generated',
    # 'Is_same': 'is_match',
}, inplace=True)

In [None]:
# Drop rows where the column 'is_match' is 'no' (case-insensitive). Rows whose
# 'is_match' is missing are kept, because NaN != 'no' evaluates to True.
#
# In the cells above the pair table (with 'is_match') is built in `result_df`,
# while `df` is the per-paper frame, so `df['is_match']` raises KeyError on a
# fresh run. `df` is still used when it already carries an 'is_match' column
# (e.g. a saved pair table was loaded into it), which keeps the original
# behaviour in that situation; otherwise fall back to `result_df`.
if 'is_match' in df.columns:
    pairs_df = df
else:
    pairs_df = result_df.rename(columns={
        'Ground_Truth': 'ground_truth',
        'LLM_generated': 'llm_generated',
    })
df_filtered = pairs_df[pairs_df['is_match'].str.lower() != 'no']

In [None]:
# Renumber the rows 0..n-1 after filtering. A later metric cell builds a new
# frame from a plain list and joins it with pd.concat(..., axis=1), which aligns
# on the index, so df_filtered needs a default index for the rows to line up.
df_filtered = df_filtered.reset_index(drop=True)

In [None]:
# BERTScore (all)
!pip3 -q install bert-score


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h

BERTScore for whole texts

In [None]:
import pandas as pd
from bert_score import BERTScorer

# Initialize the BERT scorer once and reuse it for every row; constructing it
# loads the 'roberta-large' checkpoint (see the download log in this cell's output).
scorer = BERTScorer(model_type='roberta-large', lang="en")

# Function to calculate BERTScore for each row using one loop
def calculate_bertscore(row):
    """Return the BERTScore F1 between a row's generated and ground-truth text.

    Args:
        row: Mapping/Series with the text fields 'ground_truth' (reference) and
            'llm_generated' (candidate).

    Returns:
        float: F1 of the single candidate/reference pair, scored with the
        module-level `scorer`.
    """
    # BERTScorer.score expects candidates first and references second. The
    # previous call passed them the other way round, which exchanges precision
    # and recall. F1 is symmetric in the two, so with the scorer's default
    # settings the returned value should be unchanged, but the order now matches
    # the API in case precision/recall are used later.
    _, _, f1 = scorer.score([row['llm_generated']], [row['ground_truth']])
    return f1.mean().item()  # single pair, so the mean is that pair's F1

# Apply the function to each row in the DataFrame (one scorer call per row).
# NOTE(review): 'ground_truth' / 'llm_generated' are only passed through
# fillna("").astype(str) in the later KeyBERT cell; presumably they are already
# clean strings at this point -- confirm, otherwise missing values reach the scorer.
df_filtered['bert_score'] = df_filtered.apply(calculate_bertscore, axis=1)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Calculate the average of the 'bert_score' column in df_filtered
average_bert_score = df_filtered['bert_score'].mean()

# Display the average (last expression of the cell)
average_bert_score


np.float64(0.8640587552331024)

In [None]:
!pip3 -q install rouge_score

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
import pandas as pd

# Initialize ROUGE scorer (ROUGE-1, ROUGE-2 and ROUGE-L, with stemming).
# NOTE: this rebinds the name `rouge_scorer` from the imported module to a
# RougeScorer instance; calculate_metrics below relies on that (it calls
# rouge_scorer.score). The module is not reachable under this name again until
# the import at the top of this cell is re-executed.
rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Function to calculate similarity metrics for each row
def _as_text(value):
    """Return `value` as a string, mapping None and NaN to the empty string."""
    if isinstance(value, str):
        return value
    if value is None or (isinstance(value, float) and value != value):
        return ""
    return str(value)


def calculate_metrics(row):
    """Compute lexical similarity metrics between a row's two texts.

    Args:
        row: Mapping/Series with the fields 'ground_truth' (reference) and
            'llm_generated' (candidate). Missing values (None/NaN) are treated
            as empty text instead of raising.

    Returns:
        dict with the keys 'rouge1', 'rouge2', 'rougeL' (F-measures),
        'cosine_similarity' (over token counts), 'jaccard_similarity' (over
        whitespace-token sets) and 'bleu_score'.
    """
    reference = _as_text(row['ground_truth'])
    generated = _as_text(row['llm_generated'])
    reference_tokens = reference.split()
    generated_tokens = generated.split()

    metrics = {}

    # ROUGE scores (reference first, prediction second)
    rouge_scores = rouge_scorer.score(reference, generated)
    metrics['rouge1'] = rouge_scores['rouge1'].fmeasure
    metrics['rouge2'] = rouge_scores['rouge2'].fmeasure
    metrics['rougeL'] = rouge_scores['rougeL'].fmeasure

    # Cosine Similarity. CountVectorizer raises ValueError when neither text
    # yields a token (empty vocabulary); score that case as 0.0, the same
    # fallback cosine_sim uses for the keyword lists.
    try:
        vectors = CountVectorizer().fit_transform([reference, generated]).toarray()
        metrics['cosine_similarity'] = cosine_similarity(vectors)[0, 1]
    except ValueError:
        metrics['cosine_similarity'] = 0.0

    # Jaccard Similarity
    set1 = set(reference_tokens)
    set2 = set(generated_tokens)
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    metrics['jaccard_similarity'] = intersection / union if union > 0 else 0

    # BLEU Score
    metrics['bleu_score'] = sentence_bleu([reference_tokens], generated_tokens)

    return metrics

# Apply the function to each row in the DataFrame; the result is a Series of dicts.
metric_results = df_filtered.apply(calculate_metrics, axis=1)

# Expand the dictionaries into separate columns. Reusing df_filtered's index keeps
# the rows aligned in the column-wise concat even if the index is not 0..n-1.
metric_results_df = pd.DataFrame(metric_results.tolist(), index=df_filtered.index)

# Drop metric columns left over from an earlier run of this cell so that
# re-running it replaces them instead of adding duplicate column names.
df_filtered = pd.concat(
    [df_filtered.drop(columns=metric_results_df.columns, errors="ignore"), metric_results_df],
    axis=1,
)


In [None]:
# Mean of every per-row metric column, keyed by a display label.
metric_labels = {
    'Average ROUGE-1': 'rouge1',
    'Average ROUGE-2': 'rouge2',
    'Average ROUGE-L': 'rougeL',
    'Average Cosine Similarity': 'cosine_similarity',
    'Average Jaccard Similarity': 'jaccard_similarity',
    'Average BLEU Score': 'bleu_score',
}
average_metrics = {
    label: df_filtered[column].mean()
    for label, column in metric_labels.items()
}

# Show the averages as the cell output
average_metrics


In [None]:
!pip3 -q install keybert

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/41.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.4/41.4 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h

Topic similarity

In [None]:
from keybert import KeyBERT

# Initialize KeyBERT model (default constructor arguments)
kw_model = KeyBERT()

# Ensure all entries are strings: missing values become "" and any remaining
# non-string value is converted with str().
df_filtered['ground_truth'] = df_filtered['ground_truth'].fillna("").astype(str)
df_filtered['llm_generated'] = df_filtered['llm_generated'].fillna("").astype(str)

# The keyword extraction itself is applied further down in this cell, after
# extract_keywords has been defined.


# Function to extract keywords using KeyBERT
def extract_keywords(text):
    """Return the single-word keywords KeyBERT extracts from `text`.

    Uses the module-level `kw_model` with unigram keyphrases and English stop
    words; only the first element of each returned item (the keyword) is kept.
    """
    scored_keywords = kw_model.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 1),
        stop_words='english',
    )
    keywords_only = []
    for item in scored_keywords:
        keywords_only.append(item[0])
    return keywords_only

# Keyword list for each side of a pair, one list per row.
df_filtered['ground_truth_words'] = df_filtered['ground_truth'].map(extract_keywords)
df_filtered['LLM_generated_words'] = df_filtered['llm_generated'].map(extract_keywords)


In [None]:
# Function to compute Jaccard Similarity
def jaccard_similarity(row):
    """Return |A ∩ B| / |A ∪ B| for a row's two keyword lists.

    A is built from row['ground_truth_words'], B from row['LLM_generated_words'].
    Returns 0 when both keyword lists are empty.
    """
    gt_words = set(row['ground_truth_words'])
    llm_words = set(row['LLM_generated_words'])
    shared = len(gt_words & llm_words)
    total = len(gt_words | llm_words)
    if total > 0:
        return shared / total
    return 0

# Apply Jaccard Similarity to each row (keyword-level overlap), then show the
# mean over all rows as the cell output.
df_filtered['jaccard_similarity_topic'] = df_filtered.apply(jaccard_similarity, axis=1)
df_filtered['jaccard_similarity_topic'].mean()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Function to compute Cosine Similarity with empty check
def cosine_sim(row):
    """Return the cosine similarity of a row's two keyword lists.

    Each list is joined into one space-separated document, both documents are
    turned into token-count vectors and compared. Returns 0.0 when either
    document is blank or when the vectorizer raises ValueError.
    """
    gt_doc, llm_doc = (
        ' '.join(row[column])
        for column in ('ground_truth_words', 'LLM_generated_words')
    )

    # Nothing to compare when either side has no keywords.
    if not (gt_doc.strip() and llm_doc.strip()):
        return 0.0

    try:
        counts = CountVectorizer().fit_transform([gt_doc, llm_doc]).toarray()
        return cosine_similarity(counts)[0, 1]
    except ValueError:
        # fallback if the vocabulary is still empty
        return 0.0

# Keyword-level cosine similarity per row, followed by its mean over all rows.
df_filtered['cosine_similarity_topic'] = df_filtered.apply(cosine_sim, axis=1)
mean_sim = df_filtered['cosine_similarity_topic'].mean()
print(mean_sim)


In [None]:
import re

# Function to extract text between double asterisks
def extract_text_between_asterisks(text):
    """Return every substring of `text` enclosed in a pair of '**' markers.

    Matching is non-greedy and does not cross line breaks; the result is a list
    of the enclosed strings in order of appearance (empty when there are none).
    """
    bold_span = re.compile(r'\*\*(.*?)\*\*')
    return bold_span.findall(text)

# Bold (**...**) spans of each text column, stored as a list per row.
df_filtered['ground_truth_extracted'] = df_filtered['ground_truth'].map(extract_text_between_asterisks)
df_filtered['llm_generated_extracted'] = df_filtered['llm_generated'].map(extract_text_between_asterisks)


### Evaluation: Faithfulness, Soundness, and Importance

In [None]:
# Prompt template for the LLM judge. It is filled with
# evaluation_prompt_template.format(input_text=..., generated_limitations=...).
# The template previously contained the literal text "[Input Paper]" and
# "[LLM Generated Limitations]" instead of format fields, so .format() inserted
# nothing and every request carried the same prompt without the paper or the
# limitations. The requested output is now spelled out as real JSON (the score
# parsing cells expect {"Faithfulness": {"rating": ...}, ...}); literal braces
# are doubled because the string goes through str.format.
evaluation_prompt_template = '''
You are an expert reviewer. Evaluate the quality of the generated limitations based on the following three criteria: Faithfulness,
Soundness, and Importance. For each criterion, assign a score between 1 and 5 and provide a short justification.

Faithfulness = The generated limitations should accurately represent the paper’s content and findings, avoiding any introduction
of misinformation or contradictions to the original concepts, methodologies or results presented.
– 5 points: Perfect alignment with the original content and findings, with no misinformation or contradictions. Fully reflects the
paper’s concepts, methodologies, and results
accurately.
– 4 points: Mostly aligns with the original content but contains minor inaccuracies or slight
misinterpretations. These do not significantly
affect the overall understanding of the paper’s
concepts or results.
– 3 points: Generally aligns with the original
content but includes several minor inaccuracies or contradictions. Some elements may
not fully reflect the paper’s concepts or results,
though the overall understanding is mostly intact.
– 2 points: Noticeable misalignment with the
original content, with multiple inaccuracies
or contradictions that could mislead readers.
Some key aspects of the paper’s concepts or
results are misrepresented.
– 1 point: Introduces significant misalignment
by misrepresenting issues that do not exist in
the paper. Creates considerable misinformation and contradictions that distort the original
content, concepts, or results.

Soundness = The generated limitations should be detailed and specific, with suggestions or critiques that are practical, logically
coherent, and purposeful. It should clearly address relevant aspects of the paper and offer insights that can genuinely improve the
research.
– 5 points: Highly detailed and specific, with
practical, logically coherent, and purposeful
suggestions. Clearly addresses relevant aspects and offers insights that substantially improve the research.
– 4 points: Detailed and mostly specific, with
generally practical and logically sound suggestions. Addresses relevant aspects well but may
lack depth or novelty in some areas.
– 3 points: Detailed and specific but with some
issues in practicality or logical coherence. Suggestions are somewhat relevant and offer partial improvements.
– 2 points: Somewhat vague or lacking in specificity, with suggestions that have limited practicality or logical coherence. Addresses
relevant aspects only partially and provides minimal improvement.
– 1 point: Lacks detail and specificity, with impractical or incoherent suggestions. Fails to
effectively address relevant aspects or offer
constructive insights for improvement.

Importance =  The generated limitations should
address the most significant issues that impact the
paper’s main findings and contributions. They
should highlight key areas where improvements
or further research are needed, emphasizing their
potential to enhance the research’s relevance and
overall impact.
– 5 points: Addresses critical issues that substantially impact the paper’s findings and contributions. Clearly identifies major areas for
significant improvement or further research,
enhancing the research’s relevance and overall
impact.
– 4 points: Identifies meaningful issues that contribute to refining the paper’s findings and
methodology. While the impact is notable,
it does not reach the level of fundamentally
shaping future research directions.
– 3 points: Highlights important issues that offer some improvement to the current work but
do not significantly impact future research directions. Provides useful insights for refining
the paper but lacks broader implications for
further study.
– 2 points: Points out limitations with limited
relevance to the paper’s overall findings and
contributions. Suggestions offer marginal improvements but fail to address more substantial
gaps in the research.
– 1 point: Focuses on trivial issues, such as minor errors or overly detailed aspects. Does not
address substantive issues affecting the paper’s
findings or contributions, limiting its overall
relevance and impact.

Input:
Input Paper: {input_text}
LLM Generated Limitations: {generated_limitations}

Please evaluate the **Generated Limitations** based on the **Input Paper Content** and return your response strictly in the following JSON format:

{{
  "Faithfulness": {{"rating": <integer 1-5>, "explanation": "<short justification>"}},
  "Soundness": {{"rating": <integer 1-5>, "explanation": "<short justification>"}},
  "Importance": {{"rating": <integer 1-5>, "explanation": "<short justification>"}}
}}

'''


In [None]:
import os
import base64
import time
import pandas as pd
from openai import AzureOpenAI, RateLimitError
from openai import OpenAI

# Set up OpenAI API
# Read the API key from the environment instead of assigning it in the notebook.
# The previous `os.environ['OPENAI_API_KEY'] = ''` overwrote any key that had
# already been exported for this session. Never paste a real key into a cell.
api_key = os.environ.get("OPENAI_API_KEY", "")
if not api_key:
    print("WARNING: OPENAI_API_KEY is not set; OpenAI requests will fail until it is exported.")
client = OpenAI(api_key=api_key)

# Define the OpenAI streaming function for GPT-4o-mini
def run_critic_openai(prompt: str) -> str:
    """Send `prompt` as a single user message and return the streamed reply.

    The request is made through the module-level `client` against the
    "gpt-4o-mini" chat model with `temperature=0` and `stream=True`; the text
    deltas of the stream are concatenated in arrival order.

    Args:
        prompt: Full text of the user message.

    Returns:
        The concatenated reply with leading/trailing whitespace stripped.
    """
    stream = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "user", "content": prompt}
        ],
        stream=True,
        temperature=0
    )
    pieces = []
    for chunk in stream:
        # A stream chunk can arrive with an empty `choices` list; indexing [0]
        # on it would raise IndexError and lose the whole reply.
        if not chunk.choices:
            continue
        # `delta.content` is None on chunks that carry no text.
        pieces.append(chunk.choices[0].delta.content or "")
    return "".join(pieces).strip()


Measuring Faithfulness, Soundness, and Importance between the input text (response_string_neurips) and the LLM-generated limitations (master_agent_ext_analy_rev_cit_with_rel)

In [None]:
# One entry per row: the judge's raw reply, or None when the row was skipped or
# the call failed. (The list was previously commented out while the loop still
# appended to it, which raised NameError on the first skipped row.)
results = []

# Use azure_run_critic when an earlier cell defined it. It is not defined in the
# cells around this one, in which case every call raised NameError inside the
# try below and was reported as a per-row error; fall back to run_critic_openai
# from the cell above.
critic = globals().get("azure_run_critic") or run_critic_openai

df['ext_analy_rev_cit_with_rel_result'] = ''
for i in range(len(df)): # len(df)
    print("i is",i)
    input_text = df.at[i, 'response_string_neurips']
    generated_limitations = df.at[i, 'master_agent_ext_analy_rev_cit_with_rel']

    # Nothing to evaluate when either text is missing.
    if pd.isna(input_text) or pd.isna(generated_limitations):
        results.append(None)
        continue

    prompt = evaluation_prompt_template.format(
        input_text=input_text.strip(),
        generated_limitations=generated_limitations.strip()
    )

    # Best effort per row: report the failure and continue with the next paper.
    try:
        result = critic(prompt)
    except Exception as e:
        print(f"Error at row {i}: {e}")
        result = None

    results.append(result)
    df.at[i, "ext_analy_rev_cit_with_rel_result"] = result


Extracting the Faithfulness, Soundness, and Importance score

In [None]:
import pandas as pd
import json
import re

# Initialize empty columns
df['faithfulness_score'] = None
df['soundness_score'] = None
df['importance_score'] = None

# Criterion name in the judge's reply -> column that receives its rating.
score_columns = {
    'Faithfulness': 'faithfulness_score',
    'Soundness': 'soundness_score',
    'Importance': 'importance_score',
}

for idx, row in df.iterrows():
    val = row['ext_analy_rev_cit_with_rel_result']

    # Rows that were never evaluated hold '' (the column's initial value) or
    # None; there is nothing to parse, so skip them without reporting an error.
    if not isinstance(val, str) or not val.strip():
        continue

    # Strip Markdown code fences the model may wrap around the JSON.
    clean_json = re.sub(r'```json|```', '', val).strip()

    try:
        parsed = json.loads(clean_json)
        ratings = {name: parsed[name]['rating'] for name in score_columns}
    except Exception as e:
        # Reply is not valid JSON (or lacks a key): fall back to the same regex
        # extraction the adjusted-score cell uses, and report only if that
        # cannot recover all three ratings either.
        ratings = {}
        for name in score_columns:
            found = re.search(r'"' + name + r'"\s*:\s*{\s*"rating"\s*:\s*(\d+)', clean_json)
            if found:
                ratings[name] = int(found.group(1))
        if len(ratings) < len(score_columns):
            print(f"Row {idx} failed to parse:", e)

    # Store ratings into new columns
    for name, column in score_columns.items():
        if name in ratings:
            df.at[idx, column] = ratings[name]

# The score columns have object dtype (None plus whatever the model returned),
# so coerce to numbers before averaging; unparseable values become NaN and are
# ignored by mean().
avg_faith = pd.to_numeric(df['faithfulness_score'], errors='coerce').mean()
avg_sound = pd.to_numeric(df['soundness_score'], errors='coerce').mean()
avg_imp = pd.to_numeric(df['importance_score'], errors='coerce').mean()

print(f"Average Faithfulness: {avg_faith:.2f}")
print(f"Average Soundness:   {avg_sound:.2f}")
print(f"Average Importance:  {avg_imp:.2f}")


Measuring score with Ground truth (Lim_and_OR_ground_truth_final) and LLM Generated response (master_agent_ext_analy_rev_cit_with_rel) by providing score from previous response (input and LLM generated text)

In [None]:
import json
import re

# Initialize result columns
df['adjusted_score_ext_analy_rev_cit_with_rel_json'] = None  # Optional: for storing raw JSON string

# Use azure_run_critic when an earlier cell defined it. It is not defined in the
# cells around this one, in which case every call raised NameError inside the
# try below and each row was reported as failed; fall back to run_critic_openai.
critic = globals().get("azure_run_critic") or run_critic_openai

for i in range(len(df)): # len(df)
    print("i is",i)
    try:
        input_text = df.at[i, 'Lim_and_OR_ground_truth_final']
        generated = df.at[i, 'master_agent_ext_analy_rev_cit_with_rel']
        faith = df.at[i, 'faithfulness_score']
        sound = df.at[i, 'soundness_score']
        imp = df.at[i, 'importance_score']

        # Only rows with both texts and all three first-pass scores are re-scored.
        if pd.isna(input_text) or pd.isna(generated) or pd.isna(faith) or pd.isna(sound) or pd.isna(imp):
            continue

        # NOTE(review): evaluation_prompt_template as defined in this notebook has
        # no {faith}/{sound}/{imp} fields; str.format accepts the extra keyword
        # arguments but they do not appear in the prompt. Confirm whether a
        # different template (one that shows the previous scores) was intended.
        prompt = evaluation_prompt_template.format(
            input_text=input_text.strip(),
            generated_limitations=generated.strip(),
            faith=int(faith),
            sound=int(sound),
            imp=int(imp)
        )

        result = critic(prompt)
        df.at[i, 'adjusted_score_ext_analy_rev_cit_with_rel_json'] = result  # Optional: Store full JSON output

    except Exception as e:
        # Best effort per row: report and move on to the next paper.
        print(f"Row {i} failed: {e}")
        continue



In [None]:
import re

# Start with empty score columns
for score_col in ('adjusted_faithfulness_score', 'adjusted_soundness_score', 'adjusted_importance_score'):
    df[score_col] = None

# One pattern per criterion: captures the integer that follows "rating" inside
# that criterion's object.
faith_re = re.compile(r'"Faithfulness"\s*:\s*{\s*"rating"\s*:\s*(\d+)', re.DOTALL)
sound_re = re.compile(r'"Soundness"\s*:\s*{\s*"rating"\s*:\s*(\d+)', re.DOTALL)
imp_re   = re.compile(r'"Importance"\s*:\s*{\s*"rating"\s*:\s*(\d+)', re.DOTALL)

extractors = (
    ('adjusted_faithfulness_score', faith_re),
    ('adjusted_soundness_score', sound_re),
    ('adjusted_importance_score', imp_re),
)

# Pull the three ratings out of every stored reply
for i in range(len(df)):
    raw_reply = df.at[i, 'adjusted_score_ext_analy_rev_cit_with_rel_json']
    if pd.isna(raw_reply):
        continue

    # Remove Markdown code fences around the reply before searching it
    cleaned = re.sub(r"```json|```", "", raw_reply).strip()

    # A column is only written when its pattern is found; otherwise it stays None.
    for score_col, score_re in extractors:
        hit = score_re.search(cleaned)
        if hit:
            df.at[i, score_col] = int(hit.group(1))


In [None]:
# Mean adjusted score per criterion, printed with two decimals.
avg_faith, avg_sound, avg_imp = (
    df[column].mean()
    for column in (
        'adjusted_faithfulness_score',
        'adjusted_soundness_score',
        'adjusted_importance_score',
    )
)

for label, value in (
    ("Average Faithfulness: ", avg_faith),
    ("Average Soundness:   ", avg_sound),
    ("Average Importance:  ", avg_imp),
):
    print(f"{label}{value:.2f}")
