# LLM As Judge 

In [None]:
# %reload_ext autoreload
# %autoreload 2

from src.LLM_as_judge.answer import V1_EXPLANATION_ANSWER_PROMPT

# Ask the judge model to explain any gap between the gold and predicted
# answers, storing the text in a new "answer_explanation" column.
# The explanation prompt imported at the top of this cell is passed here; the
# cell previously referenced V1_ANSWER_SYSTEM_PROMPT, which it never imports.
# NOTE(review): add_llm_explanation_column and train_df are not defined in the
# visible cells — presumably they come from an earlier cell; confirm.
train_df = add_llm_explanation_column(
    df=train_df,
    system_prompts_dict=V1_EXPLANATION_ANSWER_PROMPT,
    model_name="gpt-4",
    gold_col="gold_answer",
    pred_col="model_answer",
    new_col_name="answer_explanation",
)

train_df.head(2)


# Systematic way to add columns and prompts

In [None]:
# Chain-of-thought system prompt. The {pre_text}, {post_text} and {table}
# placeholders are filled per record with str.format.
# Every instruction sentence ends with "\n" so that adjacent literals do not
# fuse into one run-on sentence when Python concatenates them.
system_prompts_cot = {
    "system_prompt": (
        "You are a helpful assistant.\n"
        "Please think step-by-step to find the correct answer.\n"
        "Only provide the answer as a float, with no symbols except for - when required.\n"
        "Then provide only the final answer.\n\n"
        "=== PRE TEXT ===\n{pre_text}\n\n"
        "=== POST TEXT ===\n{post_text}\n\n"
        "=== TABLE ===\n{table}\n\n"
    )
}

In [None]:
import openai
import pandas as pd

def construct_main_messages(record: dict, system_prompts: dict) -> list:
    """
    Assemble the two-message chat payload for a single record.

    The template stored under system_prompts["system_prompt"] is filled with the
    record's "pre_text", "post_text" and "table" values (each falling back to an
    empty string when the key is absent) and becomes the system message. The
    record's "question" (or "" when absent) becomes the user message.
    """
    template = system_prompts["system_prompt"]

    # Collect the placeholder values in one place, then expand them into format().
    context_fields = {
        field: record.get(field, "")
        for field in ("pre_text", "post_text", "table")
    }

    system_turn = {"role": "system", "content": template.format(**context_fields)}
    user_turn = {"role": "user", "content": record.get("question", "")}
    return [system_turn, user_turn]


def process_records(records, system_prompts, model_name, prompt_style,
                    answer_col_name="model_answer"):
    """
    Query the chat model once per record and collect the replies in a DataFrame.

    Args:
        records: Iterable of dicts. Each must contain "id", "question",
            "gold_answer" and "table" (these are read with r[...]).
        system_prompts: Dict holding a "system_prompt" template. It is passed to
            construct_main_messages and also stored as-is in the "prompt" column.
        model_name: Model identifier forwarded to openai.chat.completions.create.
        prompt_style: Label written to the "prompt_style" column.
        answer_col_name: Name of the column that receives the model reply.
            Defaults to "model_answer", so existing callers see no change.

    Returns:
        A DataFrame with the columns id, question, gold_answer, table,
        <answer_col_name>, model, prompt_style and prompt (one row per record).

    A failed API call does not raise: the reply for that record is recorded as
    the string "Error: <exception>".
    """
    results = []
    for r in records:
        messages = construct_main_messages(r, system_prompts)
        try:
            response = openai.chat.completions.create(
                model=model_name,
                messages=messages
            )
            model_answer = response.choices[0].message.content.strip()
        except Exception as e:
            # Best-effort: keep the row and record the failure text instead.
            model_answer = f"Error: {e}"

        results.append({
            "id": r["id"],
            "question": r["question"],
            "gold_answer": r["gold_answer"],
            "table": r["table"],
            answer_col_name: model_answer,
            "model": model_name,
            "prompt_style": prompt_style,
            "prompt": system_prompts,
        })
    return pd.DataFrame(results)

In [None]:
from src.models import gpt_4o_mini

# Run the chain-of-thought prompt on the first two training records.
# system_prompts_cot (defined in the cell above) is sent so that the prompt
# actually used matches the "CoT" label written to the prompt_style column.
# NOTE(review): this cell previously sent V1_ANSWER_SYSTEM_PROMPT under the
# "CoT" label; switch back if that prompt was intended.
df_run_cot = process_records(
    records=train_data[:2],
    system_prompts=system_prompts_cot,
    model_name=gpt_4o_mini,
    prompt_style="CoT",
)

df_run_cot.head(2)

# Using o1 - The prompt structure is different 

In [None]:
# messages = [
#         {
#             "role": "user",
#             "content": f"""{Conversion_prompt}

#             QUESTION:
#             {question}
#             """

#             # Insert json relevant data somewhere, ask the question to help it answer the questions
#         }
#     ]

#     # 6. Call the OpenAI ChatCompletion endpoint
#     try:
#         response = openai.chat.completions.create(
#             model='o1-preview',
#             messages=messages

#         )
#         model_answer = response.choices[0].message.content
#     except Exception as e:
#         model_answer = f"Error: {e}"

# ---------------------------------------------------------------------

# Section 2 - Notes and questions 

# Final output - What will you show and send to Tomorro?


# Key Q's 


# 1. How will you run the best prompt on the Dev set? (The set will also be reduced to ~100 samples given costs.)
# 2. How will you improve model performance? (Criteria: systematic, repeatable, automatable?)
# 3. How will the evals be presented?
# 4. How will you track models and prompts?


### In notebook
1. Evals on 'accuracy' via exact-match with basic prompt and 4o.
2. Evals on program execution via exact match and specified criteria with basic prompt and 4o.
3. Improvements made via prompt-engineering (i) CoT & (ii) ReAct using 4o based on insights gathered in first experiments.
4. Improvements made via reasoning tooling to determine what differences were possible using reasoning models.
5. Fine-tuning to determine whether further improvements were possible.


### Submission 
- A link to a repo with the current code structure and accompanying info in README.md 



### Must-haves:
1. All items in notebook
2. Submission



### Should
1. ...


### Could 
1. .. 





import pandas as pd

# Run every prompt variant over the same records and collect one frame per
# variant, each carrying its own answer column.
# NOTE(review): v1_REACT_PROGRAM_SYSTEM_PROMPT starts with a lowercase "v",
# unlike V1_COT_PROGRAM_SYSTEM_PROMPT — confirm the name matches its definition.
prompt_variants = {
    "CoT": V1_COT_PROGRAM_SYSTEM_PROMPT,
    "ReAct": v1_REACT_PROGRAM_SYSTEM_PROMPT,
}

all_dfs = []
for style_name, prompt_dict in prompt_variants.items():
    # e.g. "CoT" => "cot_answer", "ReAct" => "react_answer"
    answer_col = f"{style_name.lower()}_answer"

    df_run = process_records(
        records=train_data[:2],
        system_prompts=prompt_dict,
        model_name=gpt_4o_mini,
        prompt_style=style_name,
    )
    # process_records writes the reply to "model_answer"; give it the
    # per-variant name here so the frames can later sit side by side.
    df_run = df_run.rename(columns={"model_answer": answer_col})

    # Keep the ID, the new answer column and the columns shared by all variants.
    keep_cols = ["id", answer_col]
    all_dfs.append(df_run[keep_cols + ["question", "gold_answer"]])

# Look at the first variant's frame once the loop has finished.
all_dfs[0]

# NOTE(review): master_df is not defined in the visible cells, and no
# "numeric_score" column is created above — presumably both come from another
# cell; confirm before running.
exact_match_score(master_df, "numeric_score")

# Tool calling 

In [None]:

# Function-calling schemas for the four arithmetic tools. Every tool takes two
# required numeric arguments, "a" and "b"; only the name and the description
# strings differ, so the schemas are generated from a small table.
tools = [
    {
        "type": "function",
        "function": {
            "name": tool_name,
            "description": summary,
            "parameters": {
                "type": "object",
                "properties": {
                    "a": {"type": "number", "description": a_description},
                    "b": {"type": "number", "description": b_description},
                },
                "required": ["a", "b"],
            },
        },
    }
    for tool_name, summary, a_description, b_description in (
        ("add", "Add two numbers together.",
         "The first number.", "The second number."),
        ("subtract", "Subtract one number from another.",
         "The first number.", "The second number."),
        ("multiply", "Multiply two numbers together.",
         "The first number.", "The second number."),
        ("divide", "Divide one number by another.",
         "The numerator.", "The denominator (cannot be zero)."),
    )
]

In [None]:

# Judge prompt that asks for a short explanation of the gap between a gold and
# a predicted numeric answer. The {pre_text}, {post_text} and {table}
# placeholders are meant to be filled with str.format.
# The template is written one output line per list item; empty items are the
# blank lines that separate the sections.
V1_EXPLANATION_ANSWER_PROMPT = {
    "system_prompt": "\n".join([
        "You are an explanation expert focusing on numeric discrepancies.",
        "You have a gold (correct) numeric answer and a predicted numeric answer.",
        "Your task is to compare these two answers and provide a short, clear explanation of why they might differ.",
        "If they match, simply confirm there is no discrepancy.",
        "",
        "=== PRE TEXT ===",
        "{pre_text}",
        "",
        "=== POST TEXT ===",
        "{post_text}",
        "",
        "=== HTML TABLE ===",
        "{table}",
        "",
        "Consider potential rounding, unit differences, or other reasons.",
        "Return only a brief explanation in plain text, no chain-of-thought.",
        "End of instructions.",
    ])
}




In [None]:
# Fill the explanation template with one record's context.
# NOTE(review): `row` is not defined in the visible cells — presumably a single
# record (dict-like, with an "html_table" key) set in another cell; confirm.
system_template = V1_EXPLANATION_ANSWER_PROMPT["system_prompt"]
system_msg = system_template.format(
    pre_text=row.get("pre_text", ""),
    post_text=row.get("post_text", ""),
    table=row.get("html_table", ""),
)

# Send the filled prompt plus the question, with the arithmetic tools attached
# and token log-probabilities requested.
# NOTE(review): this cell held only the tail of the call (the user message,
# logprobs and tools arguments), which did not parse. The call head, the model
# name and the system message below are reconstructed — confirm all three.
completion = openai.chat.completions.create(
    model="gpt-4o",
    messages=[
        {"role": "system", "content": system_msg},
        {"role": "user", "content": q},
    ],
    logprobs=True,
    tools=tools,
)

print(completion.choices[0].message)
print(completion.model)
print(completion.usage.completion_tokens_details.accepted_prediction_tokens)
print(completion.choices[0].logprobs)



train_data[1]
train_df.head()

In [None]:
# Inspect the item at index 1 of train_data (shown as the cell's output).
train_data[1]


In [None]:
# System prompt for the tool-calling experiment: instructions first, then one
# hard-coded record (pre_text / post_text / html_table) as context.
# The instruction header is written as plain sentences. Stray quote characters,
# a garbled "n\n" and a pasted sentence about "matching" answers (which has no
# referent in this task) were removed, and the dangling "where each new
# operation ..." clause now has a lead-in.
# The three record lines are kept exactly as they were.
prompt = """
Use the provided functions to answer questions.
Your task is to create a mathematical program to answer the question given the information in pre_text, post_text and html_table.
Write the program as a comma-separated sequence of operations, where each new operation references the previous result with #0, #1, etc.

For example, Program: subtract(7525,7344), divide(#0,7344)

You may have to use more than one tool.
Also respond with your final answer as a floating-point number.
'pre_text': 'fortron industries llc .fortron is a leading global producer of pps , sold under the fortron ae brand , which is used in a wide variety of automotive and other applications , especially those requiring heat and/or chemical resistance .fortron\'s facility is located in wilmington , north carolina .this venture combines the sales , marketing , distribution , compounding and manufacturing expertise of celanese with the pps polymer technology expertise of kureha america inc .cellulose derivatives strategic ventures .our cellulose derivatives ventures generally fund their operations using operating cash flow and pay dividends based on each ventures\' performance in the preceding year .in 2014 , 2013 and 2012 , we received cash dividends of $ 115 million , $ 92 million and $ 83 million , respectively .although our ownership interest in each of our cellulose derivatives ventures exceeds 20% ( 20 % ) , we account for these investments using the cost method of accounting because we determined that we cannot exercise significant influence over these entities due to local government investment in and influence over these entities , limitations on our involvement in the day-to-day operations and the present inability of the entities to provide timely financial information prepared in accordance with generally accepted accounting principles in the united states of america ( "us gaap" ) .2022 other equity method investments infraservs .we hold indirect ownership interests in several german infraserv groups that own and develop industrial parks and provide on-site general and administrative support to tenants .our ownership interest in the equity investments in infraserv affiliates are as follows : as of december 31 , 2014 ( in percentages ) .'
'post_text': 'research and development our businesses are innovation-oriented and conduct research and development activities to develop new , and optimize existing , production technologies , as well as to develop commercially viable new products and applications .research and development expense was $ 86 million , $ 85 million and $ 104 million for the years ended december 31 , 2014 , 2013 and 2012 , respectively .we consider the amounts spent during each of the last three fiscal years on research and development activities to be sufficient to execute our current strategic initiatives .intellectual property we attach importance to protecting our intellectual property , including safeguarding our confidential information and through our patents , trademarks and copyrights , in order to preserve our investment in research and development , manufacturing and marketing .patents may cover processes , equipment , products , intermediate products and product uses .we also seek to register trademarks as a means of protecting the brand names of our company and products .patents .in most industrial countries , patent protection exists for new substances and formulations , as well as for certain unique applications and production processes .however , we do business in regions of the world where intellectual property protection may be limited and difficult to enforce .confidential information .we maintain stringent information security policies and procedures wherever we do business .such information security policies and procedures include data encryption , controls over the disclosure and safekeeping of confidential information and trade secrets , as well as employee awareness training .trademarks .aoplus ae , aoplus ae2 , aoplus ae3 , ateva ae , avicor ae , britecoat ae , celanese ae , celanex ae , celcon ae , celfx 2122 , celstran ae , celvolit ae , clarifoil ae , duroset ae , ecovae ae , factor ae , fortron ae , gur ae , hostaform ae , impet ae , mowilith ae , nutrinova 
ae , qorus 2122 , riteflex ae , sunett ae , tcx 2122 , thermx ae , tufcor ae , vantage ae , vantageplus 2122 , vantage ae2 , vectra ae , vinamul ae , vitaldose ae , zenite ae and certain other branded products and services named in this document are registered or reserved trademarks or service marks owned or licensed by celanese .the foregoing is not intended to be an exhaustive or comprehensive list of all registered or reserved trademarks and service marks owned or licensed by celanese .fortron ae is a registered trademark of fortron industries llc. .',"
'html_table': '<table>\n  <thead>\n    <tr>\n      <th></th>\n      <th>as of december 31 2014 ( in percentages )</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr><td>infraserv gmbh & co . gendorf kg</td><td>39</td></tr>\n    <tr><td>infraserv gmbh & co . hoechst kg</td><td>32</td></tr>\n    <tr><td>infraserv gmbh & co . knapsack kg</td><td>27</td></tr>\n  </tbody>\n</table>',
"""
prompt



In [None]:
# Sample question for the record embedded in the prompt.
q = "what was the percentage growth of the cash dividends from 2012 to 2014?"
# Program string for that question, in the "#0 refers to the previous result" notation.
p = "subtract(115, 83), divide(#0, 83)"


In [None]:
from openai import OpenAI
# Create an assistant configured with the program-writing prompt and the
# arithmetic tools.
# NOTE(review): `completion` holds the object returned by assistants.create,
# not a chat completion, so the commented-out prints below would not apply to
# it as written. The name is kept in case other cells rely on it.
completion = openai.beta.assistants.create(
    model="gpt-4o",
    instructions=prompt,
    tools=tools,
)

# print(completion.choices[0].message)
# print(completion.model)
# print(completion.usage.completion_tokens_details.accepted_prediction_tokens)
# print(completion.choices[0].logprobs)

# Create a single thread and post the question to it. The cell used to call
# threads.create() twice and discard the first result.
thread = openai.beta.threads.create()
message = openai.beta.threads.messages.create(
    thread_id=thread.id,
    role="user",
    content=q,
)

print(message)



In [None]:
from openai import OpenAI

completion = openai.chat.completions.create(
  model="gpt-4o",
  messages=[
    {"role": "system", "content": prompt},