### measuring hallucination

In [None]:
import os
import pandas as pd
from openai import OpenAI

# Initialize the OpenAI client.
# Do not overwrite a key that is already exported in the environment: the
# empty string is only a placeholder fallback so the lookup below cannot raise
# KeyError. Export OPENAI_API_KEY before running this cell; never commit a
# real key into the notebook.
os.environ.setdefault('OPENAI_API_KEY', '')
client = OpenAI(api_key=os.environ['OPENAI_API_KEY'])

# Function to perform NLI classification via GPT
def check_nli(premise: str, hypothesis: str) -> str:
    """Ask gpt-4o-mini for the NLI relation between ``premise`` and ``hypothesis``.

    Args:
        premise: Reference text the hypothesis is checked against.
        hypothesis: Statement to classify relative to the premise.

    Returns:
        The model's reply, lower-cased and stripped of surrounding whitespace,
        quotes, backticks and periods. The prompt requests one of
        ``'entailment'``, ``'neutral'`` or ``'contradiction'``, but the reply
        is not validated here; an empty string is returned when the response
        carries no text.
    """
    messages = [
        {"role": "system", "content": (
            "You are a natural language inference (NLI) classifier. "
            "Given a premise and hypothesis, respond with exactly one word: "
            "'entailment', 'neutral', or 'contradiction'."
        )},
        {"role": "user", "content": f"Premise: {premise}\nHypothesis: {hypothesis}\nLabel:"}
    ]
    resp = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
        temperature=0
    )
    # The message content may be None; treat that as an empty reply rather
    # than crashing on ``None.strip()``.
    raw_label = resp.choices[0].message.content or ""
    # The system prompt shows the labels in single quotes, so the model may
    # echo them (or append a period). Strip that decoration so callers that
    # compare against the bare label words still match.
    return raw_label.strip(" \t\r\n'\"`.").lower()

# Apply NLI check across the DataFrame.
# One check_nli call per row: premise is the 'combined' text, hypothesis is
# the generated future-work text. A plain list is assigned (instead of a
# row-wise ``apply``) so that an empty frame simply yields an empty column.
df_rest['nli_label'] = [
    check_nli(premise, hypothesis)
    for premise, hypothesis in zip(
        df_rest['combined'], df_rest['RAG_generated_fw_from_paper']
    )
]

# Calculate hallucination rate (contradiction or neutral).
# Labels other than these two words (including any unexpected model reply)
# are not counted as hallucinations.
hallucination_mask = df_rest['nli_label'].isin(['neutral', 'contradiction'])
hallucination_count = hallucination_mask.sum()
total = len(df_rest)
# Guard against an empty frame so the rate is 0 rather than a division by zero.
hallucination_rate = hallucination_count / total if total else 0

# Print results
print(f"Total samples: {total}")
print(f"Hallucinations (neutral or contradiction): {hallucination_count}")
print(f"Hallucination rate: {hallucination_rate:.2%}")


### feasibility check

In [None]:
from openai import OpenAI
import os
import base64
import time
import pandas as pd

# Keep a key that is already exported in the environment; the empty string is
# only a placeholder fallback. Export OPENAI_API_KEY before running this cell
# and never commit a real key into the notebook.
os.environ.setdefault('OPENAI_API_KEY', '')
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Define the OpenAI streaming function for GPT-4o-mini
def run_critic_openai(prompt: str, retries: int = 0, retry_delay: float = 1.0) -> str:
    """Send ``prompt`` to gpt-4o-mini as one user message and return the streamed reply.

    Args:
        prompt: Full text of the user message.
        retries: Number of additional attempts made after a failed call.
            Defaults to 0 (a single attempt); negative values are treated as 0.
        retry_delay: Seconds to wait between attempts.

    Returns:
        The concatenated streamed text with surrounding whitespace removed.

    Raises:
        Exception: Whatever the final attempt raised, once all attempts fail.
    """
    attempts = max(retries, 0) + 1
    for attempt in range(1, attempts + 1):
        try:
            stream = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "user", "content": prompt}
                ],
                stream=True,
                temperature=0
            )
            pieces = []
            for chunk in stream:
                # Some stream chunks carry no choices; skip them instead of
                # failing on ``choices[0]``.
                if not chunk.choices:
                    continue
                # ``delta.content`` can be None on chunks without text.
                pieces.append(chunk.choices[0].delta.content or "")
            return "".join(pieces).strip()
        except Exception:
            # Broad on purpose: callers treat any failure of this call the
            # same way. Re-raise once the attempt budget is exhausted.
            if attempt == attempts:
                raise
            time.sleep(retry_delay)

# Bookkeeping for the batch-processing loop: a wall-clock reference taken
# before processing begins, and the list that accumulates generated summaries.
start_time = time.time()
all_generated_summary: list = []

In [None]:
# Prompt template for the feasibility judge. It has two positional ``{}``
# slots, filled via ``str.format``: first the paper content, then the
# future-work suggestion. The three sections are separated by blank lines.
feasibility_prompt_template = "\n\n".join([
    (
        "You are an expert reviewer. Below is the content of a research paper "
        "followed by a suggestion for future work. "
        "Evaluate whether the future work is executable in the context of the "
        "paper's methodology, dataset, or other components. "
        "Respond with exactly one word: 'feasible' or 'not feasible'."
    ),
    "Paper Content:\n{}",
    "Future Work Suggestion:\n{}",
])

# Judge every row's future-work suggestion against its paper content and store
# the verdict in a single output column. Possible values written:
# 'feasible', 'not feasible', 'unknown' (unrecognized reply),
# 'empty' (no usable suggestion) and 'error' (every attempt raised).
output_col = "feasibility_llm_judge_with_ref"
valid_labels = ("feasible", "not feasible")
max_attempts = 3  # one initial call plus two retries

# Initialize the output column
df_neurips[output_col] = None

# Process each row. Iterate over the actual index labels so ``.at`` also works
# when the frame does not carry a default 0..n-1 index (e.g. after filtering).
for i in df_neurips.index:
    paper_content = df_neurips.at[i, 'combined']
    future_work = df_neurips.at[i, 'RAG_generated_fw_from_paper']

    # Skip empty or invalid input. The marker goes into the same output column
    # as every other label, so skipped rows are visible alongside the verdicts.
    if not isinstance(future_work, str) or not future_work.strip():
        df_neurips.at[i, output_col] = "empty"
        continue

    prompt = feasibility_prompt_template.format(paper_content, future_work)

    # Stays "error" unless one of the attempts below returns a reply.
    label = "error"
    for _attempt in range(max_attempts):
        try:
            response = run_critic_openai(prompt)
        except Exception as e:
            print(f"[Error] Row {i} - {e}")
            continue
        # The prompt shows the labels in single quotes, so tolerate a reply
        # that echoes the quotes or ends with a period.
        label = response.strip(" \t\r\n'\"`.").lower()
        if label not in valid_labels:
            label = "unknown"
        break

    df_neurips.at[i, output_col] = label


# Summarize the judge's verdicts: how many rows were labeled 'feasible', and
# what share that is of the rows that received a valid verdict
# ('feasible' or 'not feasible'). Rows with any other label are excluded from
# the denominator.
judge_labels = df_neurips['feasibility_llm_judge_with_ref']

feasible_count = judge_labels.eq('feasible').sum()
print(f"Number of rows marked as feasible: {feasible_count}")

total_valid = judge_labels.isin(['feasible', 'not feasible']).sum()
if total_valid:
    feasible_rate = feasible_count / total_valid
else:
    # No valid verdicts at all: report 0 instead of dividing by zero.
    feasible_rate = 0
print(f"Feasible rate: {feasible_rate:.2%} ({feasible_count}/{total_valid})")