### Performance measurement

In [None]:
# Judge-prompt template for the LLM-as-judge evaluation.
# It asks for a 1-5 score plus a justification on five criteria (Coherence and
# Logic, Relevance and Accuracy, Readability and Style, Grammatical Correctness,
# Overall Impression) and for the answer to be returned as one JSON object keyed
# by criterion name.
# NOTE: the "<Author-Mentioned Future Work>" / "<LLM-Generated Future Work>"
# markers inside the template are never substituted; the evaluation loops append
# the two actual texts AFTER this template, so the markers are sent literally.
# NOTE(review): the Task section also asks for "a final overall score out of 5",
# which has no slot in the JSON skeleton -- replies may therefore carry an extra
# top-level key; confirm whether that is intended.
prompt = '''
Instructions: You are provided with two texts for each pair: one is Author-Mentioned Future Work and another is
LLM-Generated Future Work. Please read both texts carefully. After reviewing each text, assign a score from
1 to 5 for each criterion outlined below. The score should reflect how well the LLM-Generated Future Work compares to the
Author-Mentioned Future Work,
where 1 represents poor quality and 5 represents excellent quality that closely matches or even surpasses the Author-Mentioned Future Work
in some aspects.

Author-Mentioned Future Work:
<Author-Mentioned Future Work>

LLM-Generated Future Work:
<LLM-Generated Future Work>

Scoring Criteria:
Coherence and Logic:

5: The text is exceptionally coherent; the ideas flow logically and are well connected.

3: The text is coherent but may have occasional lapses in logic or flow.

1: The text is disjointed or frequently illogical.

Relevance and Accuracy:

5: The text is completely relevant to the topic and accurate in all presented facts.

3: The text is generally relevant with minor factual errors or slight deviations from the topic.

1: The text often strays off topic or includes multiple factual inaccuracies.

Readability and Style:

5: The text is engaging, well-written, and stylistically consistent with the Author-Mentioned Future Work
3: The text is readable but may lack flair or have minor stylistic inconsistencies.
1: The text is difficult to read or stylistically poor.

Grammatical Correctness:

5: The text is free from grammatical errors.
3: The text has occasional grammatical errors that do not impede understanding.
1: The text has frequent grammatical errors that hinder comprehension.

Overall Impression:
5: The text is of a quality that you would expect from a professional writer.
3: The text is acceptable but would benefit from further editing.
1: The text is of a quality that needs significant revision to be usable.

Task: For each text pair:
Rate the LLM-Generated Future Work on each criterion and provide a final overall score out of 5.
Provide a justification for each criterion score, highlighting strengths and weaknesses observed in the LLM-Generated Future Work
relative to the Author-Mentioned Future Work.
Present the scores and justifications in JSON format, structured as follows:

{ "Coherence and Logic": { "score": , "justification": "" }, "Relevance and Accuracy": { "score": , "justification": "" },
"Readability and Style": { "score": , "justification": "" }, "Grammatical Correctness": { "score": , "justification": "" },
"Overall Impression": { "score": , "justification": "" } }

'''

In [None]:
import getpass
import os
import time

import tiktoken  # used below for `enc`; was previously only imported in a later cell
from openai import OpenAI

# Never hardcode the API key, and never overwrite one that is already exported:
# read it from the environment and only prompt interactively when it is absent.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

client = OpenAI(
    api_key=os.environ["OPENAI_API_KEY"],
)

# Choose the encoding for your model, so local token counts match the model's.
enc = tiktoken.encoding_for_model("gpt-4o-mini")
# Token budget handed to truncate_to_limit (model's max context length in tokens).
# NOTE(review): the context window presumably also has to hold the reply and the
# chat-format overhead, so a body of exactly MAX_CTX tokens may still be rejected
# -- TODO confirm and reserve headroom if needed.
MAX_CTX = 128_000

def truncate_to_limit(text: str, max_tokens: int) -> str:
    """Cap `text` at `max_tokens` tokens of the module-level encoding `enc`.

    Args:
        text: The string to check.
        max_tokens: Maximum number of tokens to keep.

    Returns:
        `text` unchanged when it encodes to at most `max_tokens` tokens;
        otherwise the decoded string of its first `max_tokens` tokens.
    """
    token_ids = enc.encode(text)
    # Guard clause: short enough, hand back the very same string.
    if len(token_ids) <= max_tokens:
        return text
    # Too long: keep the leading tokens only and turn them back into text.
    return enc.decode(token_ids[:max_tokens])

# … your existing setup …

# One entry per row of df_rest, in row order; each entry is a one-element list
# holding the judge's raw reply (the parsing cell unwraps it with item[0]).
generated_summary = []

# Iterate over the two columns positionally. Looking values up with
# df_rest[col][i] for i in range(len(df_rest)) is a *label* lookup, which raises
# KeyError (or visits rows in a different order) whenever df_rest does not carry
# the default 0..n-1 index, e.g. after filtering.
author_texts = df_rest['LLM_extracted_future_work']
generated_texts = df_rest['RAG_generated_fw_from_paper']

for author_fw, generated_fw in zip(author_texts, generated_texts):
    # build the raw prompt body: judge template followed by the two texts
    raw_body = (
        prompt
        + "Author-Mentioned Future Work:\n" + author_fw
        + "\n\nLLM-Generated Future Work:\n" + generated_fw
    )

    # truncate if necessary
    body = truncate_to_limit(raw_body, MAX_CTX)

    # now send the request with the (possibly truncated) body
    stream = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": body}],
        stream=True,
        temperature=0.1,
    )

    pieces = []
    for chunk in stream:
        # Some streamed chunks can arrive without any choice; skip them instead
        # of failing on choices[0].
        if not chunk.choices:
            continue
        pieces.append(chunk.choices[0].delta.content or "")

    generated_summary.append(["".join(pieces)])


In [None]:
import re, json
import pandas as pd
# import ace_tools as tools

# 1) Flatten your list-of-lists (each entry is a one-element list with the raw reply)
flat_snippets = [item[0] for item in generated_summary]

# NOTE: replies that cannot be parsed are skipped, so df_gpt_judge21 may end up
# with fewer rows than generated_summary and is then no longer row-aligned with it.
records = []
for pos, snippet in enumerate(flat_snippets):
    # 2) Isolate the first "{" through the last "}"
    start = snippet.find('{')
    end   = snippet.rfind('}')
    if start == -1 or end < start:
        print(f"No JSON object found in snippet {pos}; skipping")
        continue
    json_str = snippet[start:end+1]

    # 3) Load it as-is first. Only when that fails, remove trailing commas
    #    before } or ] and retry -- applying the regex unconditionally could
    #    also rewrite commas that sit inside a justification string.
    try:
        data = json.loads(json_str)
    except json.JSONDecodeError:
        json_str = re.sub(r',\s*([}\]])', r'\1', json_str)
        try:
            data = json.loads(json_str)
        except json.JSONDecodeError as e:
            print("Failed to parse snippet:", e)
            print(json_str)
            continue

    # 4) Flatten into a single dict: "<Metric_Name>_score" / "<Metric_Name>_justification"
    flat = {}
    for metric, info in data.items():
        key = metric.replace(" ", "_")
        if isinstance(info, dict):
            flat[f"{key}_score"]         = info.get("score")
            flat[f"{key}_justification"] = info.get("justification")
        elif isinstance(info, (int, float)) and not isinstance(info, bool):
            # A bare number (e.g. an extra "final score" entry) has no .get();
            # keep it as a score instead of crashing with AttributeError.
            flat[f"{key}_score"] = info
        # any other value shape is ignored
    records.append(flat)

# 5) Build your DataFrame (one row per successfully parsed reply)
df_gpt_judge21 = pd.DataFrame(records)

# 7) Display
# tools.display_dataframe_to_user("Flattened Judge Metrics", df_gpt_judge)


In [None]:
import pandas as pd

# List of score columns produced by flattening the judge's JSON reply
score_cols = [
    'Coherence_and_Logic_score',
    'Relevance_and_Accuracy_score',
    'Readability_and_Style_score',
    'Grammatical_Correctness_score',
    'Overall_Impression_score'
]

# Print the average of each column.
# Scores come straight from model-written JSON, so they may be strings ("4") or
# missing; coerce to numbers first (unparseable values become NaN and are
# ignored by mean) instead of letting .mean() fail on mixed types.
for col in score_cols:
    avg = pd.to_numeric(df_gpt_judge21[col], errors="coerce").mean()
    print(f"{col}: {avg}")


### Self Feedback

In [None]:
# Base prompt for (re)generating long-term future-work directions from a paper.
# The self-feedback loop embeds this template in a larger request and appends the
# paper text after it, so the "[Insert the full text ...]" marker below is never
# substituted and is sent literally.
# NOTE: the variable name is misspelled ("futurw"); it is kept as-is because the
# regeneration loop refers to it by this exact name.
futurw_work_generation_prompt = '''
I want to generate future work directions for my research paper based on its entire content (all sections, including abstract, introduction,
background, methodology, results, discussion, etc.). Please analyze the paper and propose substantial, long-term research goals
that extend the current work in a meaningful way, advancing the field or addressing significant open challenges. Ensure the suggested
future work directions are ambitious, grounded in the paper’s content, and avoid trivial or short-term tasks (e.g., minor experiments,
parameter tuning, or small-scale tests). Each direction should be clearly linked to specific aspects of the paper (e.g., limitations,
findings, or discussed challenges) and propose innovative, impactful research objectives. If no suitable long-term future work can
be derived, clearly state: "No long-term future work directions could be derived from the paper." Provide the generated future work
directions in a concise, bulleted list, with each direction accompanied by a brief explanation of how it connects to the paper’s content.

Input Text (Paper Content): [Insert the full text or relevant sections of the paper here]

Output Format: Future Work Directions (Long-Term Goals)

[Future work direction]: [Brief explanation of how this direction connects to the paper’s content and why it is a substantial,
long-term goal.]

[Additional future work directions and explanations, if applicable.]

OR

No long-term future work directions could be derived from the paper.

'''

In [None]:
import getpass
import json
import os
import re
import unicodedata

import numpy as np
import pandas as pd
import tiktoken
from openai import OpenAI

# ── Setup & Helpers ────────────────────────────────────────────────────────────
# Never hardcode the API key, and never overwrite one that is already exported:
# read it from the environment and only prompt interactively when it is absent.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")
client = OpenAI(api_key=os.environ['OPENAI_API_KEY'])

# Tokenizer for the chat model used below, so local token counts match the model's.
enc     = tiktoken.encoding_for_model("gpt-4o-mini")
# Token budget applied by truncate_to_limit.
# NOTE(review): the context window presumably also has to hold the reply and the
# chat-format overhead, so a body of exactly MAX_CTX tokens may still be rejected
# -- TODO confirm and reserve headroom if needed.
MAX_CTX = 128_000

def truncate_to_limit(text: str, max_tokens=None) -> str:
    """Cap `text` at a token budget, measured with the module-level encoding `enc`.

    This definition replaces the earlier two-argument `truncate_to_limit`; the
    optional `max_tokens` keeps calls of the form `truncate_to_limit(text, limit)`
    working after this cell has run, instead of failing with a TypeError.

    Args:
        text: The string to check.
        max_tokens: Maximum number of tokens to keep. When None (the default),
            the module-level MAX_CTX is used, as before.

    Returns:
        `text` unchanged when it fits the budget; otherwise the decoded string
        of its leading tokens up to the budget.
    """
    limit = MAX_CTX if max_tokens is None else max_tokens
    toks = enc.encode(text)
    return enc.decode(toks[:limit]) if len(toks) > limit else text

# Typographic punctuation that has no Unicode decomposition and would otherwise
# be deleted by the ASCII encode step (turning e.g. "paper’s" into "papers").
_ASCII_PUNCTUATION = str.maketrans({
    "\u2018": "'", "\u2019": "'", "\u201a": "'",                          # single quotes
    "\u201c": '"', "\u201d": '"', "\u201e": '"',                          # double quotes
    "\u2010": "-", "\u2012": "-", "\u2013": "-", "\u2014": "-", "\u2212": "-",  # hyphens, dashes, minus
})


def to_ascii(text: str) -> str:
    """Return an ASCII-only approximation of `text`.

    Uses compatibility decomposition (NFKD) so that accented letters keep their
    base letter and ligatures / compatibility characters expand to plain
    characters ("ﬁ" -> "fi", "…" -> "...", no-break space -> space), then maps
    common typographic quotes and dashes to their ASCII counterparts. With
    canonical decomposition alone those characters were silently deleted,
    corrupting words in the text sent to the model.

    Args:
        text: Any string.

    Returns:
        A string containing only ASCII characters. Characters that still have no
        ASCII equivalent after the steps above are dropped.
    """
    decomposed = unicodedata.normalize("NFKD", text)
    return decomposed.translate(_ASCII_PUNCTUATION).encode("ascii", "ignore").decode("ascii")

# ── New robust JSON parser with fallback ────────────────────────────────────────
# Criterion names the judge prompt asks for; they define the columns that
# parse_json_reply always returns.
metrics = [
    "Coherence and Logic",
    "Relevance and Accuracy",
    "Readability and Style",
    "Grammatical Correctness",
    "Overall Impression"
]

def parse_json_reply(snippet: str) -> dict:
    """Return a flat dict of *_score and *_justification (NaN if missing).

    The reply text is searched for the span from its first "{" to its last "}",
    which is parsed as JSON. Each top-level entry "<Metric Name>" becomes
    "<Metric_Name>_score" and "<Metric_Name>_justification" (spaces replaced by
    underscores).

    Args:
        snippet: Raw text returned by the judge model.

    Returns:
        A dict that always holds a score and a justification key for every name
        in `metrics`, NaN where the reply did not supply a value. Entries for
        additional top-level keys found in the reply are included as well. When
        no JSON object can be found or parsed, every value is NaN.
    """
    flat = {}
    # initialize all to NaN so the result has the same keys for every reply
    for m in metrics:
        key = m.replace(" ", "_")
        flat[f"{key}_score"]         = np.nan
        flat[f"{key}_justification"] = np.nan

    # find JSON braces
    start, end = snippet.find('{'), snippet.rfind('}')
    if start < 0 or end < start:
        return flat

    js = snippet[start:end+1]

    # Parse as-is first; drop trailing commas only as a fallback, because the
    # regex would otherwise also rewrite commas inside justification strings.
    try:
        data = json.loads(js)
    except json.JSONDecodeError:
        try:
            data = json.loads(re.sub(r",\s*([}\]])", r"\1", js))
        except json.JSONDecodeError:
            return flat

    # fill in parsed values
    for m, info in data.items():
        key = m.replace(" ", "_")
        if isinstance(info, dict):
            flat[f"{key}_score"]         = info.get("score", np.nan)
            flat[f"{key}_justification"] = info.get("justification", np.nan)
        elif isinstance(info, (int, float)) and not isinstance(info, bool):
            # A bare number (e.g. an extra "final score" entry) has no .get();
            # keep it as a score instead of raising AttributeError.
            flat[f"{key}_score"] = info
        # any other value shape is ignored, leaving the NaN defaults in place
    return flat

# ── Identify your score/justification mapping ─────────────────────────────────
# Every df_rest column named "fw_opr_*_score", mapped to the column holding its
# justification (same name with "_score" replaced by "_justification").
# NOTE(review): presumably these are the first-pass judge results -- confirm
# against the code that adds the fw_opr_* columns to df_rest.
score_cols = [c for c in df_rest.columns if c.startswith("fw_opr_") and c.endswith("_score")]
just_cols  = {c: c.replace("_score", "_justification") for c in score_cols}


def _stream_chat(body: str, temperature: float) -> str:
    """Send `body` as a single user message to gpt-4o-mini and return the full streamed reply."""
    stream = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": body}],
        stream=True,
        temperature=temperature,
    )
    parts = []
    for chunk in stream:
        # Some streamed chunks can arrive without any choice; skip them instead
        # of failing on choices[0].
        if not chunk.choices:
            continue
        parts.append(chunk.choices[0].delta.content or "")
    return "".join(parts)


# ── Buffers for new columns ────────────────────────────────────────────────────
# Both lists receive exactly one entry per row of df_rest, in row order.
regenerated_fw   = []
evaluation_dicts = []

# ── Main Loop ─────────────────────────────────────────────────────────────────
for _, row in df_rest.iterrows():
    # 1) find any low scores ≤ 3 (non-numeric / missing scores coerce to NaN and never match)
    low = [
        c for c in score_cols
        if pd.to_numeric(row[c], errors="coerce") <= 3
    ]
    if not low:
        regenerated_fw.append(None)
        evaluation_dicts.append({})  # empty dict → all NaNs
        continue

    # 2) bullet-list their justifications
    bullets = "\n".join(f"- {row[just_cols[c]]}" for c in low)

    # 3) regeneration prompt: judge feedback, then the base prompt, then the paper text
    base_prompt = futurw_work_generation_prompt
    inp_text    = row["Input_Text"]
    regen_p = (
        "The previous future work suggestions had these issues:\n"
        f"{bullets}\n\n"
        "Please regenerate an improved set of future work suggestions "
        "addressing these points, using this as your base prompt:\n\n"
        f"{base_prompt}\n\n{inp_text}"
    )
    regen_body = to_ascii(truncate_to_limit(regen_p))

    # 4) stream regeneration
    regen_text = _stream_chat(regen_body, temperature=0.1).strip()

    # 5) evaluation prompt: judge template followed by the author text and the regenerated text
    author_str    = row["LLM_extracted_future_work"]
    eval_p = (
        prompt
        + "Author-Mentioned Future Work:\n" + author_str
        + "\n\nLLM-Generated Future Work:\n" + regen_text
    )
    eval_body = to_ascii(truncate_to_limit(eval_p))

    # NOTE(review): this judging call uses temperature=0 while the first-pass
    # judging loop uses 0.1 -- confirm the difference is intended before
    # comparing the two sets of scores.
    eval_text = _stream_chat(eval_body, temperature=0)

    # 6) parse JSON (with NaN fallback). Both buffers are appended only after
    #    both requests succeeded, so an error in the evaluation call cannot
    #    leave the two lists with different lengths.
    regenerated_fw.append(regen_text)
    evaluation_dicts.append(parse_json_reply(eval_text))
