Loading and processing data

In [None]:
# Re-import everything after code execution environment reset
import os
import json
import base64
import re
import pandas as pd
from tqdm import tqdm

# Define paths
# json_dir is the folder the per-paper JSON files are opened from below;
# image_dir is its "NeurIPS21_22" subfolder, which is listed to find the PNGs.
json_dir = "NeurIPS21_22_Image_data"
image_dir = os.path.join(json_dir, "NeurIPS21_22")

# Function to encode image to base64
def encode_image_to_base64(image_path):
    """Return the contents of the file at ``image_path`` as base64 text.

    The file is read in binary mode in one go; the base64 bytes are decoded
    to ``str`` so the result can be stored in a DataFrame or a data URL.
    Errors from ``open``/``read`` propagate to the caller.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")

# Function to extract figure ID from caption
def extract_fig_id_from_caption(caption):
    """Return the leading ``"Figure N"`` / ``"Table N"`` label of a caption.

    Args:
        caption: Caption text. Anything that is not a ``str`` (for example
            ``None`` from a JSON ``null``, or a NaN float) is treated as
            "no label" instead of raising ``TypeError`` inside ``re.match``.

    Returns:
        The matched prefix exactly as it appears in the caption (e.g.
        ``"Figure 3"``), or ``None`` when the caption does not start with
        ``Figure``/``Table`` followed by whitespace and digits.
    """
    if not isinstance(caption, str):
        return None
    match = re.match(r'^(Figure|Table)\s+\d+', caption)
    return match.group(0) if match else None

# Create new columns
# Each row gets its own fresh list per column; the loop below appends one
# entry per matched figure/table, so the four lists of a row stay aligned.
# `df` must already exist (it is created outside this cell).
df["caption"] = [[] for _ in range(len(df))]
df["fig_id"] = [[] for _ in range(len(df))]
df["image_path"] = [[] for _ in range(len(df))]
df["image_base64"] = [[] for _ in range(len(df))]

# List the image directory once instead of once per caption. Sorting makes
# "first matching file" reproducible (os.listdir order is arbitrary). A missing
# directory simply means no caption can be matched.
image_files = sorted(os.listdir(image_dir)) if os.path.isdir(image_dir) else []

# Iterate through each row in df
for idx, row in tqdm(df.iterrows(), total=len(df)):
    xml_file = row.get("xml_file_name")
    if not isinstance(xml_file, str) or not xml_file.endswith(".tei.xml"):
        print(f"Row {idx} skipped due to missing or invalid xml_file: {xml_file}")
        continue

    file_num = xml_file.replace("neurips.", "").replace(".tei.xml", "")
    print('file_num',file_num)
    # NOTE(review): the file name is "NeurIPS21_22neurips.<num>.json" exactly as
    # written here — confirm no separator is missing after "NeurIPS21_22".
    json_path = os.path.join(json_dir, f"NeurIPS21_22neurips.{file_num}.json")
    print('json_path',json_path)
    if not os.path.exists(json_path):
        continue

    # Best-effort: an unreadable or malformed JSON file skips the row, but only
    # I/O and parse errors are swallowed (a bare except would also eat Ctrl-C).
    try:
        with open(json_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError) as exc:
        print(f"Row {idx}: could not load {json_path}: {exc}")
        continue

    # Assumes the JSON is a list of dicts with a "caption" key — other shapes
    # are skipped item by item rather than crashing the whole run.
    for item in data:
        if not isinstance(item, dict):
            continue
        caption = item.get("caption", "")
        if not isinstance(caption, str):
            continue
        fig_id = extract_fig_id_from_caption(caption)
        if not fig_id:
            continue

        # Image files are named "fig_neurips.<num>-<Figure|Table><n>-<suffix>.png".
        # Plain prefix/suffix tests avoid the unescaped, unanchored regex the
        # glob pattern used to be turned into. The trailing "-" keeps
        # "Figure1" from matching "Figure10".
        prefix = f"fig_neurips.{file_num}-{''.join(fig_id.split())}-"
        matching_files = [
            name for name in image_files
            if name.startswith(prefix)
            and name.endswith(".png")
            and len(name) > len(prefix) + len(".png")
        ]

        if not matching_files:
            continue

        image_path = os.path.join(image_dir, matching_files[0])
        try:
            image_b64 = encode_image_to_base64(image_path)
        except OSError as exc:
            print(f"Row {idx}: could not read {image_path}: {exc}")
            continue

        df.at[idx, "caption"].append(caption)
        df.at[idx, "fig_id"].append(fig_id)
        df.at[idx, "image_path"].append(image_path)
        df.at[idx, "image_base64"].append(image_b64)

df.to_csv("df_neurips_limitation_and_OR_with_cited_data_image_lim.csv",index=False)

In [None]:
# Build one labelled text blob per paper from its extracted section columns.
# Each entry is (label written into the text, source column); order is the
# order in which the sections appear in the assembled string.
_section_columns = [
    ("Abstract", "neurips_Abstract"),
    ("Introduction", "neurips_Introduction"),
    ("Related Work", "neurips_Related_Work"),
    ("Methodology", "neurips_Methodology"),
    ("Dataset", "neurips_Dataset"),
    ("Conclusion", "neurips_Conclusion"),
    ("Experiment and Results", "neurips_Experiment_and_Results"),
    ("Other1", "neurips_Extra"),
]


def _join_sections(row):
    """Return "<label>: <value>" lines, one per section, each newline-terminated."""
    return "".join(f"{label}: {row[column]}\n" for label, column in _section_columns)


df['response_string_final_neurips'] = df.apply(_join_sections, axis=1)

In [None]:
import os
import base64
import re
import json
import time
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from langchain.text_splitter import CharacterTextSplitter
from rank_bm25 import BM25Okapi
from langchain.vectorstores import FAISS
from langchain.retrievers import BM25Retriever
from langchain.embeddings import HuggingFaceEmbeddings
from openai import AzureOpenAI
from openai import AzureOpenAI, RateLimitError
import re
from tqdm import tqdm
from openai import AzureOpenAI, RateLimitError
# ─── Azure OpenAI client setup ───────────────────────────────────────────
# `azure_endpoint` must be the bare resource URL. The SDK appends the
# "/openai/deployments/<model>/chat/completions" path and the api-version
# query on its own, so handing it the full request URL produces a malformed
# address. The default is therefore the resource root (same as the later
# gpt-4o cell), and a full URL supplied through ENDPOINT_URL is trimmed back
# to its root as well.
endpoint = os.getenv("ENDPOINT_URL", "https://scientific-lim-resource.openai.azure.com")
endpoint = endpoint.split("/openai/", 1)[0]
deployment = os.getenv("DEPLOYMENT_NAME", "gpt-4o-mini")
# Empty string when the key is not set; requests will then fail at call time.
subscription_key = os.getenv("AZURE_OPENAI_API_KEY", "")


client = AzureOpenAI(
    azure_endpoint=endpoint,
    api_key=subscription_key,
    api_version="2025-01-01-preview",
)


# Define figure-specific prompt
def get_figure_prompt(caption: str, context: str) -> str:
    """Build the extraction prompt for a figure caption.

    The prompt instructs the model to quote, without paraphrasing, the part
    of ``context`` that describes the figure whose caption is ``caption``,
    or to answer "None". Leading and trailing whitespace of the assembled
    prompt is stripped.
    """
    template = """
You are a highly skilled assistant for scientific document analysis. Your task is to identify relevant **figure** descriptions
**from the provided paragraph**, given a figure caption.

- Do not generate or paraphrase content.
- Do not make assumptions.
- Only extract the most relevant span or sentence(s) from the paragraph that describe the given figure caption.
- If no description is found, say "None".

Figure Caption: {caption}

Paragraph:
{context}
"""
    filled = template.format(caption=caption, context=context)
    return filled.strip()

# Define table-specific prompt
def get_table_prompt(caption: str, context: str) -> str:
    """Build the extraction prompt for a table caption.

    Same contract as the figure prompt, but worded for tables: the model is
    asked to quote the part of ``context`` that describes the table whose
    caption is ``caption``, or to answer "None". Leading and trailing
    whitespace of the assembled prompt is stripped.
    """
    template = """
You are a highly skilled assistant for scientific document analysis. Your task is to identify relevant **table** descriptions
**from the provided paragraph**, given a table caption.

- Do not generate or paraphrase content.
- Do not make assumptions.
- Only extract the most relevant span or sentence(s) from the paragraph that describe the given table caption.
- If no description is found, say "None".

Table Caption: {caption}

Paragraph:
{context}
"""
    filled = template.format(caption=caption, context=context)
    return filled.strip()

# Function to call LLM with the appropriate prompt
def find_relevant_figure_description(caption: str, context: str, retries: int = 1):
    """Ask the chat model which part of ``context`` describes ``caption``.

    The caption's leading "Figure N"/"Table N" label (if any) selects the
    table or figure prompt and is prepended to the answer as "<label>: ".

    Args:
        caption: Caption text of the figure or table.
        context: Paper text to search for the description.
        retries: Extra attempts allowed after a rate-limit error; each retry
            is preceded by a 60-second sleep.

    Returns:
        The model's stripped answer (label-prefixed when a label was found),
        "Error: RateLimit" when rate limiting persists through all attempts,
        or "Error: <message>" for any other exception. Nothing is raised.
    """
    label_match = re.match(r"^(Figure|Table)\s*\d+", caption)
    fig_label = "" if label_match is None else label_match.group(0)

    # Tables and figures get differently worded prompts.
    build_prompt = get_table_prompt if "table" in fig_label.lower() else get_figure_prompt
    messages = [{"role": "user", "content": build_prompt(caption, context)}]

    for attempt in range(retries + 1):
        try:
            completion = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=messages,
                temperature=0
            )
            answer = completion.choices[0].message.content.strip()
        except RateLimitError:
            # Out of attempts: report the failure as a string instead of raising.
            if attempt >= retries:
                return "Error: RateLimit"
            time.sleep(60)
        except Exception as e:
            return f"Error: {str(e)}"
        else:
            if fig_label:
                return fig_label + ": " + answer
            return answer

# Process the DataFrame: for every caption of a row, ask the LLM for the
# passage of the paper text that describes it, and store the answers in
# "fig_desc" (one entry per caption, in caption order).
import ast  # local to this cell: parses list-valued cells read back from CSV

# Rows before this position are skipped entirely (hard-coded resume point).
start_row = 120

# Ensure the column exists — the loop reads it before it ever writes it.
if "fig_desc" not in df.columns:
    df["fig_desc"] = [[] for _ in range(len(df))]

# Apply extraction
for pos in tqdm(range(start_row, len(df)), total=max(len(df) - start_row, 0)):
    # `pos` is a position; translate it to the index label so that the
    # label-based .at and the position-based .iloc address the same row even
    # when the index is not a plain 0..n-1 range.
    idx = df.index[pos]

    # Skip rows that already have filled fig_desc
    existing = df.at[idx, "fig_desc"]
    if isinstance(existing, list) and existing:
        continue  # already processed
    row = df.iloc[pos]
    captions = row.get("caption", [])
    context = row.get("response_string_final_neurips", "")

    if isinstance(captions, str):
        # The cell was serialised to text (e.g. by a CSV round trip). Parse it
        # with literal_eval rather than eval so file content is never executed.
        try:
            captions = ast.literal_eval(captions)
        except (ValueError, TypeError, SyntaxError):
            captions = []
        if not isinstance(captions, (list, tuple)):
            captions = []
    else:
        # NaN / None / other scalars are not iterable: treat as "no captions".
        try:
            captions = list(captions)
        except TypeError:
            captions = []

    descriptions = []
    for caption in captions:
        if caption and isinstance(context, str):
            desc = find_relevant_figure_description(caption, context)
            descriptions.append(desc)
        else:
            descriptions.append("None")

    df.at[idx, "fig_desc"] = descriptions

    # Checkpoint every fifth row so a crash loses little work.
    if pos % 5 == 0:
        df.to_csv("df_neurips_limitation_and_OR_with_cited_data_image_lim.csv",index=False)
        print(f"Saved interim CSV at row {pos}")

df.to_csv("df_neurips_limitation_and_OR_with_cited_data_image_lim.csv",index=False)

In [None]:
# Prompt for the image critic. It is filled with str.format, so the only
# braces allowed in the text are the {caption} and {description} placeholders.
# The image itself is sent as a separate message part, not through this text.
PROMPT_TEMPLATE = '''
You are a critical assistant specialized in analyzing scientific visualizations.

Your task is to examine the **figure (image)**, along with its **caption** and **textual description**, and identify potential **limitations, weaknesses, or concerns** relevant to scientific research quality.

Please evaluate the figure based on the following key aspects:

1. 🔬 **Methodological Flaws**:
   - Missing baselines or controls?
   - Poor experimental design or overfitting?
   - Too narrow in scope for generalization?

2. 📉 **Reproducibility or Generalization Issues**:
   - Missing error bars/confidence intervals?
   - Non-representative samples or cherry-picking?
   - No mention of repeated trials or statistical support?

3. 🧠 **Interpretability & Visual Clarity**:
   - Are axes, labels, legends clear?
   - Is the visual layout cluttered or confusing?
   - Is the figure colorblind-safe and easy to interpret?

4. ⚠️ **Bias & Misrepresentation**:
   - Is the axis scaled to exaggerate results?
   - Are negative/failure cases omitted?
   - Does the figure mislead through design?

5. 🧩 **Design & Consistency**:
   - Caption and image alignment?
   - Consistency with other figures?
   - Proper reference to figure in main text?

---

### Your Input:

- **Figure Caption**:
{caption}

- **Textual Description** (from the main article):
{description}

- **Attached Image**:
(base64 image supplied separately)

---

### Instructions:

Please return **concise bullet points** identifying actual limitations of the figure. Be factual and based on the image, caption, and description only. If no issue is detected, respond:
**“No major issues detected.”**

'''

In [None]:
import os
import base64
import time
import pandas as pd
from openai import AzureOpenAI, RateLimitError

# ─── Azure OpenAI Setup ─────────────────────────────────────────────
# Connection settings are read from the environment; the literals are the
# fallbacks used when a variable is not set.
endpoint = os.environ.get("ENDPOINT_URL", "https://scientific-lim-resource.openai.azure.com")
deployment = os.environ.get("DEPLOYMENT_NAME", "gpt-4o")
subscription_key = os.environ.get("AZURE_OPENAI_API_KEY", "")

# Rebinds `client` for the calls made in the rest of this cell.
client = AzureOpenAI(
    api_version="2025-01-01-preview",
    api_key=subscription_key,
    azure_endpoint=endpoint,
)

# ─── Run LLM Call ────────────────────────────────────────────────────
def azure_run_critic_with_image(prompt: str, image_base64: str, retries: int = 1):
    """Send ``prompt`` together with one PNG image to the chat model.

    Args:
        prompt: Text part of the user message.
        image_base64: Base64 text of the image; it is embedded in a
            ``data:image/png;base64,`` URL.
        retries: Extra attempts allowed after a rate-limit error; each retry
            is preceded by a message and a 60-second sleep.

    Returns:
        The model's stripped answer, or "Error: <message>" for any exception
        other than a rate-limit error.

    Raises:
        RateLimitError: when the rate limit is still hit on the last attempt.
    """
    text_part = {"type": "text", "text": prompt}
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/png;base64,{image_base64}"},
    }
    request = dict(
        model="gpt-4o",
        messages=[{"role": "user", "content": [text_part, image_part]}],
        temperature=0,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        stream=False,
    )

    for attempt in range(retries + 1):
        try:
            completion = client.chat.completions.create(**request)
            return completion.choices[0].message.content.strip()
        except RateLimitError:
            # Last attempt: let the caller see the rate-limit error.
            if attempt >= retries:
                raise
            print("Rate limit hit; sleeping 60s then retrying…")
            time.sleep(60)
        except Exception as e:
            return f"Error: {str(e)}"

# ─── Process DataFrame ──────────────────────────────────────────────
# For every (caption, description, image) triple of a row, ask the image
# critic for limitations and store the answers in "image_limitations".
import ast  # local to this cell: parses list-valued cells read back from CSV


def _as_list(cell):
    """Return a DataFrame cell as a list.

    Text cells (lists serialised by a CSV round trip) are parsed with
    ``ast.literal_eval``; without this, ``zip`` below would iterate the
    string character by character. Non-iterable cells such as NaN or None,
    and text that does not parse to a list/tuple, become ``[]``.
    """
    if isinstance(cell, str):
        try:
            parsed = ast.literal_eval(cell)
        except (ValueError, TypeError, SyntaxError):
            return []
        return list(parsed) if isinstance(parsed, (list, tuple)) else []
    try:
        return list(cell)
    except TypeError:
        return []


# Note: this resets the column for every row, so the whole frame is reprocessed.
df['image_limitations'] = [[] for _ in range(len(df))]

# `pos` counts rows for the checkpoint cadence; `idx` is the index label used
# for writing, which need not be an integer.
for pos, (idx, row) in enumerate(df.iterrows()):  # or use df.iloc[:N] for testing subset
    print("Idx is",idx)
    captions = _as_list(row.get("caption", []))
    image_b64s = _as_list(row.get("image_base64", []))
    descriptions = _as_list(row.get("fig_desc", []))

    limitations = []

    # zip stops at the shortest list, so unmatched trailing entries are ignored.
    for cap, desc, img_b64 in zip(captions, descriptions, image_b64s):
        if not (cap and desc and img_b64):
            limitations.append("None or missing data")
            continue

        # Build the prompt using both caption and description
        prompt = PROMPT_TEMPLATE.format(caption=cap, description=desc)

        # A persistent rate limit is raised by the call; record it as text so
        # one failing figure does not abort the run.
        try:
            output = azure_run_critic_with_image(prompt, img_b64)
        except Exception as e:
            output = f"Error: {str(e)}"

        limitations.append(output)

    # Update the DataFrame
    df.at[idx, 'image_limitations'] = limitations

    # Checkpoint every fifth row
    if pos % 5 == 0:
        df.to_csv("df_with_image_limitations.csv", index=False)
        print(f"Saved at row {idx}")

# Final save
df.to_csv("df_with_image_limitations.csv", index=False)
