RAG Settings

In [None]:
!pip3 -q install rank_bm25
!pip3 -q install llama-index-llms-langchain
!pip3 -q install langchain_community
!pip3 -q install llama_index
!pip3 -q install sentence_transformers
!pip3 -q install langchain

import os
import sys
import io
import time
import logging
import torch
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

from openai import OpenAI

In [None]:
# API key
# NOTE(review): this assignment replaces whatever OPENAI_API_KEY the process
# already had with an empty string; a real key has to be filled in here (or
# the line removed) before any OpenAI call can authenticate.
os.environ['OPENAI_API_KEY'] = ''
# `OpenAI` at this point is the class imported from the `openai` package above.
# The LangChain import block that follows rebinds the name `OpenAI` to
# `langchain.llms.OpenAI`, so this client must be created before that import.
# NOTE(review): `client` is not referenced again in the visible cells.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# LangChain
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.docstore.document import Document
from langchain.schema import Document as LCDocument
from langchain.schema import SystemMessage, HumanMessage
from langchain.document_loaders import CSVLoader, DirectoryLoader, TextLoader, DataFrameLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.retrievers import BM25Retriever, EnsembleRetriever

# LlamaIndex
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, Settings
from llama_index.core.schema import Document as LIDoc
from llama_index.core.service_context import ServiceContext
from llama_index.core.service_context_elements.llm_predictor import LLMPredictor
from llama_index.embeddings.langchain import LangchainEmbedding

# Sentence Transformers
from sentence_transformers import SentenceTransformer

# Tokenizer
import tiktoken

# -------------------------------
# Token budget helpers
# -------------------------------
MAX_CONTEXT_TOKENS = 127_000
MODEL_NAME = "gpt-4o-mini"

def truncate_for_context(query: str, passages: list[str],
    max_tokens: int = MAX_CONTEXT_TOKENS,
    model: str = MODEL_NAME,
) -> list[str]:
    """Keep as many leading passages as fit in the token budget left by ``query``.

    Args:
        query: Text whose token count is subtracted from ``max_tokens`` first.
        passages: Candidate passages, consumed in order.
        max_tokens: Total token budget shared by the query and the passages.
        model: Model name used to select the tiktoken encoding.

    Returns:
        The passages that fit, in their original order. The first passage that
        does not fit is cut down to the remaining token count (if any tokens
        remain) and everything after it is dropped.
    """
    encoder = tiktoken.encoding_for_model(model)
    remaining = max_tokens - len(encoder.encode(query, disallowed_special=()))

    selected: list[str] = []
    for passage in passages:
        token_ids = encoder.encode(passage, disallowed_special=())
        if len(token_ids) <= remaining:
            selected.append(passage)
            remaining -= len(token_ids)
            continue
        # This passage overflows the budget: keep only its leading tokens
        # (when there is room for any) and stop looking at later passages.
        if remaining > 0:
            selected.append(encoder.decode(token_ids[:remaining]))
        break
    return selected

def count_tokens(text: str, model: str = MODEL_NAME) -> int:
    """Return the number of tokens ``text`` occupies under ``model``'s tiktoken encoding.

    Special-token look-alikes in ``text`` are encoded as ordinary text
    (``disallowed_special=()``) rather than rejected.
    """
    token_ids = tiktoken.encoding_for_model(model).encode(text, disallowed_special=())
    return len(token_ids)

def ensure_passages_within_budget(
    query: str,
    passages: list[str],
    max_tokens: int = MAX_CONTEXT_TOKENS,
    model: str = MODEL_NAME,
) -> list[str]:
    """Return ``passages`` unchanged when they fit the budget, otherwise truncate them.

    The size check tokenizes the query followed directly by the passages joined
    with blank lines. When that total exceeds ``max_tokens`` a notice is printed
    and the work is delegated to ``truncate_for_context``.

    Args:
        query: Query text that shares the budget with the passages.
        passages: Candidate passages in priority order.
        max_tokens: Total token budget.
        model: Model name used to select the tokenizer.

    Returns:
        The original list object when within budget, else the truncated list.
    """
    combined = query + "\n\n".join(passages)
    total = count_tokens(combined, model=model)
    if total > max_tokens:
        print(f"Truncating context ({total} tokens)…")
        return truncate_for_context(query, passages, max_tokens=max_tokens, model=model)
    return passages

# -------------------------------
# Retriever setup
# -------------------------------
hf_emb = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

def make_retriever_for_docs(docs, k=3):
    """Build an equally weighted FAISS + BM25 ensemble retriever over ``docs``.

    Args:
        docs: Documents to index; passed to both ``FAISS.from_documents`` and
            ``BM25Retriever.from_documents``.
        k: Number of results each underlying retriever is asked to return.

    Returns:
        An ``EnsembleRetriever`` combining the two retrievers with weights 0.5/0.5.
    """
    # Embedding-based retriever backed by a FAISS index built with `hf_emb`.
    vector_retriever = FAISS.from_documents(docs, hf_emb).as_retriever(
        search_kwargs={"k": k}
    )

    # BM25 retriever over the same documents.
    keyword_retriever = BM25Retriever.from_documents(docs)
    keyword_retriever.k = k

    return EnsembleRetriever(
        retrievers=[vector_retriever, keyword_retriever],
        weights=[0.5, 0.5],
    )

# -------------------------------
# QA pipeline
# -------------------------------
chat_llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)

def generate_limitations(question, retriever):
    """Run ``question`` through a RetrievalQA chain and return the stripped answer.

    Args:
        question: Prompt/question passed to the chain's ``run`` method.
        retriever: Retriever the chain uses to fetch supporting documents.

    Returns:
        The chain's answer with surrounding whitespace removed.
    """
    chain_options = {
        "llm": chat_llm,
        "chain_type": "stuff",  # simplest: stuff docs into prompt
        "retriever": retriever,
        "return_source_documents": False,
    }
    answer = RetrievalQA.from_chain_type(**chain_options).run(question)
    return answer.strip()


In [None]:
from langchain.schema import SystemMessage, HumanMessage
from langchain.chat_models import ChatOpenAI
from sentence_transformers import SentenceTransformer
from langchain.embeddings import HuggingFaceEmbeddings
import os, sys, logging

# ─── API Key ─────────────────────────────────────────────
# NOTE(review): this overwrites any OPENAI_API_KEY already set in the
# environment with an empty string; supply a real key before running.
os.environ['OPENAI_API_KEY'] = ''

# ─── Logging ─────────────────────────────────────────────
# Send INFO-and-above log records to stdout.
# NOTE(review): basicConfig(stream=sys.stdout) and the StreamHandler added on
# the next line both target stdout on the root logger, so records may be
# printed twice — confirm whether the second handler is wanted.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

# ─── Model + Embeddings ──────────────────────────────────
# These re-declare MODEL_NAME, MAX_CONTEXT_TOKENS, chat_llm and hf_emb with the
# same values the earlier cell used, so this cell can be run on its own.
MODEL_NAME         = "gpt-4o-mini"
MAX_CONTEXT_TOKENS = 127_000

# temperature=0 is passed to the chat model used by run_critic().
chat_llm = ChatOpenAI(model_name=MODEL_NAME, temperature=0)

# SentenceTransformer (manual embeddings if needed)
# NOTE(review): `embed_model` is not referenced in the other visible cells.
embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# LangChain wrapper for embeddings (same model name as `embed_model`)
hf_emb = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# ─── Helper to extract text from docs ─────────────────────
def get_doc_text(d):
    """Return the text payload of a document-like object.

    The lookup order is: a ``text`` attribute, a ``get_content()`` method, a
    ``content`` attribute, and finally a ``page_content`` attribute. The last
    one covers the LangChain ``Document`` objects this notebook builds, which
    previously fell through to the error.

    Args:
        d: Any object exposing one of the attributes listed above.

    Returns:
        Whatever the first matching attribute/method yields.

    Raises:
        AttributeError: If ``d`` exposes none of the supported attributes.
    """
    if hasattr(d, "text"):
        return d.text
    if hasattr(d, "get_content"):
        return d.get_content()
    if hasattr(d, "content"):
        return d.content
    # Checked last so objects that already worked keep returning the same value.
    if hasattr(d, "page_content"):
        return d.page_content
    raise AttributeError(f"No text attr on {type(d)}")

# ─── Input sections and reference suffixes ────────────────
# Names of the per-section inputs, each prefixed with "df_".
# NOTE(review): presumably DataFrame column names — confirm against the data;
# this list is not referenced in the other visible cells.
main_cols = [
    "df_Abstract",
    "df_Introduction",
    "df_Related_Work",
    "df_Methodology",
    "df_Dataset",
    "df_Conclusion",
    "df_Experiment_and_Results"
]

# Section-name suffixes. Compared with `main_cols` (minus the "df_" prefix)
# this list omits "Abstract" and adds "Limitation" and "Extra".
# NOTE(review): not referenced in the other visible cells.
ref_suffixes = [
    "Introduction",
    "Related_Work",
    "Methodology",
    "Dataset",
    "Conclusion",
    "Experiment_and_Results",
    "Limitation",
    "Extra"
]

# ─── System Prompt ───────────────────────────────────────
system_prompt = """You are a helpful, respectful, and honest assistant
for generating limitations or shortcomings of a research paper.
Generate limitations or shortcomings for the following passages
from the scientific paper.
"""

# ─── Unified helper for LLM calls ─────────────────────────
def run_critic(prompt: str, *, system_prompt: str | None = None) -> str:
    """Send ``prompt`` to ``chat_llm`` and return the reply text, stripped.

    Args:
        prompt: Content of the human/user message.
        system_prompt: Optional system message placed before the user message;
            omitted entirely when falsy.

    Returns:
        ``response.content`` with leading and trailing whitespace removed.
    """
    preamble: list[SystemMessage | HumanMessage] = (
        [SystemMessage(content=system_prompt)] if system_prompt else []
    )
    conversation = [*preamble, HumanMessage(content=prompt)]
    reply = chat_llm.invoke(conversation)
    return reply.content.strip()


In [None]:
Rel_Prompt = '''You are a Relevance Evaluation Agent, an expert in assessing the relevance of retrieved text chunks from a vector
database against an input query for the task of generating limitations of scientific articles. Your task is to evaluate the relevance
of 10 retrieved text chunks against an input query, which consists of a scientific paper (including sections: Abstract, Introduction,
Methodology, Related Work, Experiment and Results, Limitations, and Future Work) and its rewritten version. For each chunk, assign a
relevance score from 1 (least relevant) to 10 (most relevant) based on semantic and contextual alignment with the input query, and
provide a brief justification for the score.

Input:

Input Query: [The full text of the original scientific paper and its rewritten version]
Retrieved Text Chunks: A list of 10 text chunks, each with a unique identifier and content, formatted as:

Chunk 1: [chunk_id_1]: [retrieved_text_1]
Chunk 2: [chunk_id_2]: [retrieved_text_2]
Chunk 3: [chunk_id_3]: [retrieved_text_3]
Chunk 4: [chunk_id_4]: [retrieved_text_4]
Chunk 5: [chunk_id_5]: [retrieved_text_5]
Chunk 6: [chunk_id_6]: [retrieved_text_6]
Chunk 7: [chunk_id_7]: [retrieved_text_7]
Chunk 8: [chunk_id_8]: [retrieved_text_8]
Chunk 9: [chunk_id_9]: [retrieved_text_9]
Chunk 10: [chunk_id_10]: [retrieved_text_10]

Instructions:

Evaluate Relevance: For each of the 10 retrieved text chunks, assess its relevance to the input query based on semantic and contextual
alignment with the original and rewritten scientific paper. Consider how closely the chunk matches key concepts, arguments, or details
in the query.

Assign Relevance Score:
High Scores (8–10): The chunk has strong semantic and contextual alignment with the input query, closely matching key concepts or details.
Prioritize chunks containing limitations (e.g., study constraints, challenges) or methodological summaries (e.g., study design, methods),
boosting their score by 1–2 points if they align well with the query.

Medium Scores (4–7): The chunk has moderate semantic and contextual alignment, containing relevant but less central content (e.g., results,
general context, or partial methodological details).

Low Scores (1–3): The chunk has minimal or no semantic and contextual alignment, such as unrelated content, generic statements, or
off-topic information.

Prioritize Limitations and Methodology: Chunks explicitly discussing limitations (e.g., sample size, data constraints, scope issues) or
methodological summaries (e.g., study design, experimental setup) are highly relevant. Boost their score by 1–2 points if they align
well with the input query, compared to other relevant content.

Provide Justification: For each chunk, include a brief justification explaining the assigned score, referencing the chunk’s semantic and
contextual alignment with the input query and noting whether it contains limitations or methodological summaries.

Do Not Modify Text: Evaluate each chunk as provided, without modifying or paraphrasing the retrieved text.

Handle Irrelevant Chunks: If a chunk is unrelated to the input query or lacks meaningful content, assign a score of 1 with an appropriate
justification.

Workflow:
Plan: Review the input query (original and rewritten paper) and the 10 retrieved text chunks to understand their content and context.

Reasoning:
Step 1: For each chunk, identify its main topic or content (e.g., limitations, methodology, results, background).
Step 2: Compare the chunk’s content to the input query, assessing semantic and contextual alignment with the paper’s sections
(e.g., Limitations, Methodology).
Step 3: Assign a relevance score (1–10) based on alignment, prioritizing limitations and methodological summaries.
Step 4: Write a brief justification for the score, explaining the chunk’s relevance and any priority given to limitations or methodology.
Step 5: Verify the score and justification are accurate and consistent with the chunk’s content and the input query.

Analyze: Use text analysis tools to confirm semantic alignment (e.g., keyword matching for “limitation,” “constraint,” “methodology,” “sample size”) and assess relevance to the input query.
Reflect: Ensure scores and justifications are fair, consistent, and reflect the chunk’s alignment with the query, re-evaluating any ambiguous cases.
Continue: Iterate until all 10 chunks are evaluated with scores and justifications.

Tool Use:
Use text analysis tools to identify limitation-related or methodology-related keywords (e.g., “limited,” “constraint,” “sample size,” “methodology”) and assess semantic similarity between chunks and the input query.
Use semantic similarity checks to confirm alignment between the chunk and the query’s key concepts.

Chain of Thoughts: Document the reasoning process internally for each chunk. For example:
“This chunk mentions a small sample size, a limitation, and aligns closely with the query’s focus, so it receives a high score (9).”
“This chunk discusses results without addressing limitations or methodology, so it receives a medium score (6).”
“This chunk is generic and unrelated to the query’s specific content, so it receives a low score (1).”

Output Format: The output must be in strict JSON format, containing an array of 10 objects, one for each retrieved text chunk, with the
following structure for each object:
"Chunk_number": [Chunk number, e.g., "Chunk 1", "Chunk 2", ..., "Chunk 10"]
"relevance_score": [Integer from 1 to 10]
"justification": [Brief explanation of the score, referencing the chunk’s semantic and contextual alignment with the query and any emphasis on limitations or methodological summaries]

Example: Input: Input Query: [Full text of the original scientific paper and its rewritten version] Retrieved Text Chunks:

Chunk 1: chunk_001: The study was limited by a small sample size, which may affect generalizability.
Chunk 2: chunk_002: The experiment used a randomized controlled trial design to test the algorithm.
Chunk 3: chunk_003: The experiment achieved a 20% improvement in processing speed.
...
Chunk 10: chunk_010: Data processing is a key challenge in modern research.

Output: [ { "Chunk_number": "Chunk 1", "relevance_score": 9, "justification": "The chunk has strong semantic and contextual alignment with
the input query, explicitly discussing a limitation (small sample size), which is a high-priority element for limitation generation." },
{ "Chunk_number": "Chunk 2", "relevance_score": 8, "justification": "The chunk aligns well with the input query by describing the
methodological approach, a high-priority element, though it is slightly less central than limitations-related content." },
{ "Chunk_number": "Chunk 3", "relevance_score": 6, "justification": "The chunk has moderate semantic and contextual alignment,
discussing experimental results, but lacks focus on limitations or methodology, resulting in a mid-range score." },
...
{ "Chunk_number": "Chunk 10", "relevance_score": 3, "justification": "The chunk provides generic background information with minimal
semantic and contextual alignment to the input query’s specific concepts or arguments." } ] '''

### Measure the relevance of each retrieved chunk against the input query

In [None]:
import pandas as pd
from langchain.vectorstores import FAISS
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
from langchain.text_splitter import CharacterTextSplitter
import tiktoken

# Model name and token budget used when assembling the scoring prompts below.
MODEL_NAME = "gpt-4o-mini"
MAX_CONTEXT_TOKENS = 127_000
# NOTE(review): no model_name is passed here, so this rebinds `hf_emb` to the
# wrapper's default embedding model instead of the all-MiniLM-L6-v2 model the
# earlier cells configured — confirm this is intended.
hf_emb = HuggingFaceEmbeddings()
# Tokenizer used to measure chunk sizes inside the scoring loop.
enc = tiktoken.encoding_for_model(MODEL_NAME)

# Initialize the output columns with None so per-row values (lists) can be
# written into individual cells later with `df1.at[...]`.
df1["cited_by_top_20_raw"] = None
df1["cited_by_top_20_texts"] = None
df1["cited_by_top_20_meta"] = None
df1["retrieved_text_llm_asses"] = None

# For every paper in df1: index the text of its citing papers, retrieve the
# top-20 chunks for the paper's query, and ask the LLM to score them in
# batches of 10. Results are written back into df1 and saved to df.csv.

# The splitter configuration never changes, so build it once for all rows.
splitter = CharacterTextSplitter(chunk_size=512, chunk_overlap=32)

# Iterate through df1
for i, row in df1.iterrows():
    print("index",i)
    cited_list = row.get("cited_by_full_text", [])
    if not isinstance(cited_list, (list, tuple)):
        # NaN / None / unparsed cell: treat as "no citing papers".
        cited_list = []
    all_docs = []

    for j, cited_dict in enumerate(cited_list):
        if not isinstance(cited_dict, dict):
            continue  # malformed entry; `j` still reflects its original position
        abstract = cited_dict.get("abstractText", "")
        sections = cited_dict.get("sections", [])
        row_num = cited_dict.get("row_number", "")
        file_name = cited_dict.get("file_name", "")

        if isinstance(abstract, str) and abstract.strip():
            all_docs.append(Document(
                page_content="Abstract: " + abstract.strip(),
                metadata={"row_number": row_num, "file_name": file_name, "position": j}
            ))

        # `or []` covers an explicit null stored under "sections".
        for sec in sections or []:
            if isinstance(sec, dict):
                # `or ""` covers explicit nulls; str() covers non-string values.
                heading = str(sec.get("heading") or "").strip()
                text = str(sec.get("text") or "").strip()
                if text:
                    combined = f"{heading}: {text}" if heading else text
                    all_docs.append(Document(
                        page_content=combined,
                        metadata={"row_number": row_num, "file_name": file_name, "position": j}
                    ))

    if not all_docs:
        continue

    # Use row-specific query
    example_query = row.get("response_string_neurips", "")
    rewritten_query = row['Input_Query_rewrite']
    # Skip rows with no query text at all. This has to look at the source
    # fields: the assembled query always contains the fixed labels, so testing
    # the assembled string for emptiness could never skip anything. Checking
    # here also avoids building an index that would not be used.
    if not any(isinstance(q, str) and q.strip() for q in (example_query, rewritten_query)):
        continue
    input_query = (
        f"Scientific paper:\n{example_query}\n\n"
        f"Rewritten version of scientific paper:\n{rewritten_query}"
    )

    # Split into chunks
    chunked_docs = splitter.split_documents(all_docs)
    print(f"Total number of chunks for row {i}: {len(chunked_docs)}")

    # Create retrievers
    faiss_store = FAISS.from_documents(chunked_docs, hf_emb)
    faiss_r = faiss_store.as_retriever(search_kwargs={"k": 20})
    bm25_r = BM25Retriever.from_documents(chunked_docs)
    bm25_r.k = 20

    # Ensemble Retriever
    ensemble = EnsembleRetriever(
        retrievers=[faiss_r, bm25_r],
        weights=[0.5, 0.5]
    )

    # Retrieve and keep at most 20 documents
    top20 = ensemble.get_relevant_documents(input_query)
    top20 = top20[:20]

    # Store the retrieval results in three parallel columns
    df1.at[i, "cited_by_top_20_raw"] = top20  # stores full Document objects (content + metadata)
    df1.at[i, "cited_by_top_20_texts"] = [doc.page_content for doc in top20]  # just the texts
    df1.at[i, "cited_by_top_20_meta"] = [doc.metadata for doc in top20]  # just the metadata

    # ---------- Batch LLM Scoring ---------------
    all_llm_scores = []

    prefix = (
        f"{Rel_Prompt}\n\n"
        f"Input Query:\n{example_query}\n\n"
        "Here are up to 10 retrieved text chunks:\n"
    )
    prefix_len = count_tokens(prefix, model=MODEL_NAME)

    # A real blank line separates the last chunk from the question. The
    # previous "\\n" put a literal backslash-n into the prompt, glued to the
    # end of the final chunk.
    question = (
        "\n\nOn a scale of 1–10, how relevant is each chunk to the above Input Query? "
        "Respond with JSON array with Chunk Number, Score, and Justification for each chunk."
    )
    question_len = count_tokens(question, model=MODEL_NAME)

    # Token budget left for chunk text; identical for every batch of this row.
    # Only the raw chunk text is counted, not the "Chunk N: " labels.
    available = MAX_CONTEXT_TOKENS - prefix_len - question_len

    for batch_start in range(0, len(top20), 10):
        batch_docs = top20[batch_start:batch_start+10]
        batch_texts = [d.page_content for d in batch_docs]

        # truncate to token budget: keep leading chunks until one does not fit
        kept, used = [], 0
        for p in batch_texts:
            toks = enc.encode(p, disallowed_special=())
            if used + len(toks) > available:
                break
            kept.append(p)
            used += len(toks)

        # Chunks are numbered globally (1..20) so a score can be mapped back
        # to its position in `cited_by_top_20_texts`.
        chunks_list = "\n\n".join(
            f"Chunk {batch_start+idx+1}: {text}" for idx, text in enumerate(kept)
        )
        prompt = prefix + chunks_list + question

        # Call the LLM to assess
        raw = run_critic(prompt)
        all_llm_scores.append(raw)  # raw reply; JSON is parsed in a later cell

    # Save LLM assessments (one raw reply per batch)
    df1.at[i, "retrieved_text_llm_asses"] = all_llm_scores

    # Optional debug print
    if i in (0, 20):
        print(f"all_llm_scores for row {i}:", all_llm_scores)

df1.to_csv("df.csv",index=False)

In [None]:
import pandas as pd
import ast
import json
import re

# Placeholder: simulate loading df
# df = pd.read_csv("your_path.csv")

# Prepare a new column to store high-score chunks
df1["top_chunks_texts"] = None

def _relevant_chunk_index(item, threshold):
    """Return the 0-based chunk index for ``item`` if its score meets ``threshold``, else None."""
    if not isinstance(item, dict):
        return None
    try:
        score = float(item.get("relevance_score", 0))
    except (TypeError, ValueError):
        return None  # null / non-numeric score
    if not score >= threshold:  # written this way so NaN is rejected too
        return None
    try:
        # "Chunk 7" -> 6
        return int(str(item.get("Chunk_number", "Chunk 0")).split()[-1]) - 1
    except (ValueError, IndexError):
        return None  # label does not end in an integer


def extract_top_chunks(row, threshold=7):
    """Collect the retrieved texts whose LLM relevance score is at least ``threshold``.

    Each entry of ``row["retrieved_text_llm_asses"]`` is a raw LLM reply that
    should contain a JSON array of objects with ``Chunk_number`` (e.g.
    ``"Chunk 3"``) and ``relevance_score``. The chunk number, minus one, is
    used as an index into ``row["cited_by_top_20_texts"]``.

    Malformed items are skipped individually. Previously a single bad item
    (unparsable chunk label, null or string score) discarded every remaining
    item of the same reply.

    Args:
        row: Mapping-like row (dict or pandas Series) with the two columns above.
            The assessments may be a list or its ``repr`` string.
        threshold: Minimum score (inclusive) for a chunk to be kept.

    Returns:
        List of selected chunk texts in reply order, or ``None`` when nothing
        qualifies or the inputs are missing/unparsable.
    """
    raw = row.get("retrieved_text_llm_asses")
    if isinstance(raw, str):
        # Cell was stringified (e.g. CSV round-trip); turn it back into a list.
        try:
            raw_list = ast.literal_eval(raw)
        except (ValueError, SyntaxError):
            return None
    elif isinstance(raw, list):
        raw_list = raw
    else:
        return None
    if not isinstance(raw_list, (list, tuple)):
        return None

    text_list = row.get("cited_by_top_20_texts", [])
    if not isinstance(text_list, list):
        return None

    all_chunks = []
    for entry in raw_list:
        if not isinstance(entry, str):
            continue
        # Grab everything from the first "[" to the last "]" so code fences
        # and surrounding prose are ignored.
        match = re.search(r"\[.*\]", entry, re.DOTALL)
        if not match:
            continue
        try:
            chunk_json = json.loads(match.group())
        except json.JSONDecodeError:
            continue
        for item in chunk_json:
            chunk_idx = _relevant_chunk_index(item, threshold)
            if chunk_idx is not None and 0 <= chunk_idx < len(text_list):
                all_chunks.append(text_list[chunk_idx])
    return all_chunks if all_chunks else None

# Apply to all rows: each cell of "top_chunks_texts" becomes whatever
# extract_top_chunks returns for that row (a list of texts, or None).
df1["top_chunks_texts"] = df1.apply(extract_top_chunks, axis=1)


In [None]:
import ast
import pandas as pd
import numpy as np

def _parse_top_chunks(cell):
    """Turn a stringified list back into a list; leave other values usable.

    Only strings are passed to ``ast.literal_eval``. Missing values (None or a
    float NaN) become ``np.nan``; anything else — notably a list produced
    in memory by ``extract_top_chunks`` — is returned unchanged. The previous
    lambda evaluated every non-null cell and therefore failed on real lists.
    """
    if isinstance(cell, str):
        return ast.literal_eval(cell)
    if cell is None or (isinstance(cell, float) and cell != cell):  # NaN != NaN
        return np.nan
    return cell


# Only apply ast.literal_eval to non-null strings
df1['top_chunks_texts'] = df1['top_chunks_texts'].apply(_parse_top_chunks)

In [None]:
import ast
import pandas as pd

# Both columns hold stringified Python literals. Missing cells are first
# replaced by the string "[]" so that every cell can be evaluated.
for _list_col in ('retrieved_text_llm_asses', 'top20_docs'):
    _filled = df[_list_col].fillna('[]')
    df[_list_col] = _filled.apply(ast.literal_eval)

In [None]:
# take chunk where relevance score 8 or more

def pick_high_relevance(row, threshold=8):
    """Return the docs whose same-position assessment scores at least ``threshold``.

    ``row['retrieved_text_llm_asses']`` and ``row['top20_docs']`` are matched by
    position. Assessments that are not dicts, or whose ``relevance_score`` is
    missing, null or not numeric, are treated as "not relevant"; previously a
    null or string score raised ``TypeError``. Assessments with no
    corresponding doc position are ignored.

    Args:
        row: Mapping-like row (dict or pandas Series) with the two columns above.
        threshold: Minimum score (inclusive).

    Returns:
        List of selected docs in their original order (possibly empty).
    """
    docs = row['top20_docs']
    asses = row['retrieved_text_llm_asses']

    selected = []
    # zip stops at the shorter sequence, which is the bad-index guard.
    for doc, assessment in zip(docs, asses):
        if not isinstance(assessment, dict):
            continue
        try:
            score = float(assessment.get('relevance_score', 0))
        except (TypeError, ValueError):
            continue
        if score >= threshold:
            selected.append(doc)
    return selected

# create a new column with the selected docs: one list per row, produced by
# pick_high_relevance with its default threshold
df['relevance_8_cited_in'] = df.apply(pick_high_relevance, axis=1)


In [None]:
import ast
import pandas as pd

# Convert the stringified cells of this column back into Python objects.
# NOTE(review): ast.literal_eval only accepts strings here, so this assumes
# every non-missing cell is still a string (e.g. freshly read from CSV).
df['cited_by_top_20_texts'] = (
    df['cited_by_top_20_texts']
      .fillna('[]')               # turn NaN → "[]"
      .apply(ast.literal_eval)    # now safe to eval
)

In [None]:
# Convert the stringified cells of this column back into Python objects.
# NOTE(review): the scoring cell writes a column named
# "retrieved_text_llm_asses" on df1; where the "_cited_by" column on `df`
# comes from is not visible here — confirm the rename/merge step.
df['retrieved_text_llm_asses_cited_by'] = (
    df['retrieved_text_llm_asses_cited_by']
      .fillna('[]')               # turn NaN → "[]"
      .apply(ast.literal_eval)    # now safe to eval
)

In [None]:
import json
import re

def extract_chunk_dicts(cell):
    """Flatten a cell of raw LLM replies into one list of chunk dicts.

    ``cell`` is expected to be a list of strings, each containing a JSON array
    of chunk dicts, usually wrapped in a ```json ... ``` fence. Each reply is
    parsed in two steps:

    1. Strip a leading ```json fence and a trailing ``` fence and parse the
       remainder as JSON.
    2. If that fails (for example the model added prose around the fence),
       fall back to parsing the span from the first "[" to the last "]".

    Replies that still cannot be parsed, non-string entries, and a ``cell``
    that is not a list/tuple (None, NaN, ...) contribute nothing.

    Returns:
        A flat list of every dict found in the parsed arrays, in order.
    """
    if not isinstance(cell, (list, tuple)):
        return []

    out = []
    for s in cell:
        if not isinstance(s, str):
            continue

        # 1) remove the ```json fences
        s_clean = re.sub(r'^```json\s*', '', s.strip())
        s_clean = re.sub(r'```$', '', s_clean.strip())

        # 2) parse the JSON, falling back to the outermost [...] span
        try:
            data = json.loads(s_clean)
        except json.JSONDecodeError:
            bracketed = re.search(r"\[.*\]", s, re.DOTALL)
            if bracketed is None:
                continue
            try:
                data = json.loads(bracketed.group())
            except json.JSONDecodeError:
                continue

        # 3) if it's a list, keep its dict members; otherwise skip
        if isinstance(data, list):
            out.extend(d for d in data if isinstance(d, dict))
    return out

# apply to your DataFrame: the "_upd" column holds, per row, the flat list of
# chunk dicts parsed out of the raw LLM replies
df['retrieved_text_llm_asses_cited_by_upd'] = df['retrieved_text_llm_asses_cited_by'].apply(extract_chunk_dicts)


In [None]:
# take chunk where relevance score 8 or more

def pick_high_relevance(row, threshold=8):
    """Return the cited-by texts whose same-position assessment scores at least ``threshold``.

    ``row['retrieved_text_llm_asses_cited_by_upd']`` (parsed chunk dicts) and
    ``row['cited_by_top_20_texts']`` are matched by position. A score that is
    missing, null or not numeric counts as "not relevant" instead of raising
    ``TypeError`` as it did before; numeric strings such as ``"9"`` are
    accepted. Assessments with no corresponding text position are ignored.

    Args:
        row: Mapping-like row (dict or pandas Series) with the two columns above.
        threshold: Minimum score (inclusive).

    Returns:
        List of selected texts in their original order (possibly empty).
    """
    docs = row['cited_by_top_20_texts']
    asses = row['retrieved_text_llm_asses_cited_by_upd']

    def _score_of(assessment):
        """Numeric relevance score of one assessment, or None if unusable."""
        if not isinstance(assessment, dict):
            return None
        try:
            return float(assessment.get('relevance_score', 0))
        except (TypeError, ValueError):
            return None

    # zip stops at the shorter sequence, which is the bad-index guard.
    scored = ((doc, _score_of(assessment)) for doc, assessment in zip(docs, asses))
    return [doc for doc, score in scored if score is not None and score >= threshold]

# create a new column with the selected docs: one list of cited-by texts per
# row, produced by pick_high_relevance with its default threshold
df['relevance_8_cited_by'] = df.apply(pick_high_relevance, axis=1)


In [None]:
Citation_agent = '''You are an expert scientific research assistant tasked with inferring potential limitations for an unspecified
current scientific article based solely on its cited papers.
You are given information from multiple cited papers, which are assumed to be referenced by the current article.
Your goal is to analyze these cited works and identify possible limitations that the current paper may have, by
comparing its presumed scope, methods, or results against the cited literature.
Because the input paper itself is not provided, you must reason from the cited papers alone, identifying what
gaps, stronger methods, broader coverage, or alternative results the cited works might expose in the hypothetical
current paper that cites them.

Objective:

Generate a list of scientifically grounded limitations that the current article might have, assuming it builds upon or is informed by the provided cited papers.

Each limitation should:

Be concise

Reference the relevant cited paper(s) by title

Clearly explain how the cited paper exposes a potential limitation

Be plausible and insightful based on common scientific reasoning

Workflow:
Plan:
Identify key insights, strengths, and scopes of the cited papers that could set a high bar or reveal blind spots
in a hypothetical citing article.

Reasoning: Let's think step by step to infer limitations:
Review each cited paper to extract its methodology, findings, and scope.
Ask: If a paper cited this work but did not adopt or address its insights, what limitation might arise?
Identify where the cited paper offers better methodology, broader scope, or contradicting findings.
Formulate each limitation as a plausible shortcoming of a hypothetical article that builds on—but possibly
underutilizes—these cited works.

Justify each limitation based on specific attributes of the cited paper (e.g., "more comprehensive dataset",
"stronger evaluation metric", etc.)

Analyze:
Develop a set of inferred limitations, each tied to specific cited paper(s) and grounded in logical comparison.

Reflect:
Ensure coverage of all relevant cited papers and validate that each limitation is scientifically plausible in
context.

Output Format:
Bullet points listing each limitation.
For each: Description, explanation, and reference to the cited paper(s) in the format Paper Title.

Tool Use (if applicable):

Use citation lookup tools or document content to extract accurate summaries.
Do not assume details about the input paper—focus only on drawing limitations based on differences, omissions,
or underuse of the cited works.

Chain of Thoughts:
During the Reasoning step, document the thought process explicitly. For example:
"I selected [Paper X] because it uses a more robust method than the current article."
"The current article's simpler method may limit accuracy compared to [Paper X]."
"I reviewed all cited papers to ensure no relevant gaps were missed."
This narrative ensures transparency and justifies each identified limitation.

<|eot_id|><|start_header_id|>user<|end_header_id|>

Cited Papers Information:
{cited_papers}

Please identify limitations that would be relevant for researchers who might cite this paper in future work.
Consider what limitations future authors might mention when discussing this paper's contribution to the field,
based on the cited papers context.

<|eot_id|><|start_header_id|>assistant<|end_header_id|>

'''


### Generate limitations using both cited-in and cited-by papers

In [None]:
# Section columns that are stitched together, in this order, into one text.
cols_to_concat = [
    "neurips_Abstract",
    "neurips_Introduction",
    "neurips_Related_Work",
    "neurips_Methodology",
    "neurips_Dataset",
    "neurips_Conclusion",
    "neurips_Experiment_and_Results",
    "neurips_Extra"
]


def concat_with_labels(row):
    """Join the non-empty section texts of ``row`` into one labelled string.

    For each column in ``cols_to_concat`` whose value is a non-blank string,
    emit ``"<Label>: <text>"`` where the label is the column name without the
    ``neurips_`` prefix and with underscores turned into spaces. Sections are
    separated by a blank line; non-string or blank values are skipped.
    """
    labelled_sections = []
    for col in cols_to_concat:
        value = row.get(col)
        if not isinstance(value, str):
            continue
        body = value.strip()
        if not body:
            continue
        label = col.replace("neurips_", "").replace("_", " ")
        labelled_sections.append(f"{label}: {body}")
    return "\n\n".join(labelled_sections)

# Store the labelled concatenation of each row's sections.
# NOTE(review): this writes "response_string" on `df`, while the retrieval cell
# reads "response_string_neurips" from `df1` — confirm which name is intended.
df["response_string"] = df.apply(concat_with_labels, axis=1)

In [None]:
import re
import pandas as pd
import tiktoken

# Tokenization setup
encoding   = tiktoken.encoding_for_model("gpt-4o-mini")
max_tokens = 128000

def truncate_to_max_tokens(text: str, max_length: int) -> str:
    """Return ``text`` cut down to at most ``max_length`` tokens.

    Args:
        text: Text to measure with the module-level ``encoding``.
        max_length: Maximum number of tokens to keep.

    Returns:
        ``text`` itself when it already fits, otherwise the decoded first
        ``max_length`` tokens.
    """
    # disallowed_special=() matches the other encode calls in this file: text
    # that merely looks like a special token (e.g. "<|endoftext|>" quoted in a
    # paper) is encoded as plain text instead of making encode() raise.
    tokens = encoding.encode(text, disallowed_special=())
    if len(tokens) <= max_length:
        return text
    return encoding.decode(tokens[:max_length])

# Make sure the output column exists
df['citation_agent_in_by_8'] = ''


def _chunk_texts(items):
    """Turn one cell of selected chunks into a list of strings.

    Dict items contribute their 'text' value; anything else is stringified.
    A cell that is not a list/tuple (None, NaN, ...) yields no texts.
    """
    if not isinstance(items, (list, tuple)):
        return []
    return [
        itm['text'] if isinstance(itm, dict) and 'text' in itm else str(itm)
        for itm in items
    ]


# Process each row. Iterating with iterrows() keeps the row data and the label
# used for the write-back in sync; the previous version read rows by position
# (iloc) but read/wrote other values by label, which only agree when the index
# is 0..n-1.
for i, row in df.iterrows():
    print(f"Processing row {i}...")

    # 1) Collect the texts of the high-relevance chunks from referenced papers
    cited_in_block = "\n".join(_chunk_texts(row.get('relevance_8_cited_in')))

    # 2) Collect the texts of the high-relevance chunks from citing papers
    cited_by_block = "\n".join(_chunk_texts(row.get('relevance_8_cited_by')))

    # 3) Build the combined prompt section
    combined_cited_input = (
        "Referenced papers:\n" + cited_in_block +
        "\n\nPapers who cited this paper:\n" + cited_by_block
    )

    # 4) Assemble the full prompt
    # NOTE(review): Citation_agent is concatenated as-is, so its literal
    # "{cited_papers}" placeholder and its "<|eot_id|>"/"<|start_header_id|>"
    # markers are sent to the model verbatim — confirm this is intended.
    input_paper = row['response_string']
    prompt = Citation_agent + (
        "You are an assistant tasked to generate limitations or shortcomings "
        "in a scientific article. Below is the input paper:\n"
        f"{input_paper}\n\n"
        " Below is the relevant text from both the papers "
        "that this article cites and those that cite it.\n\n"
        f"{combined_cited_input}\n\n"
        "Please generate limitations based on this information."
    )

    # 5) Truncate and call LLM
    truncated = truncate_to_max_tokens(prompt, max_tokens)
    try:
        # NOTE(review): azure_run_critic is not defined in the visible cells;
        # it must be provided elsewhere before this cell runs.
        llm_summary = azure_run_critic(truncated)
    except Exception as e:
        # Best effort: record the failure for this row and keep going.
        print(f"Error at row {i}: {e}")
        llm_summary = "ERROR"

    df.at[i, "citation_agent_in_by_8"] = llm_summary
