# LLM As Judge 

In [None]:
# %reload_ext autoreload
# %autoreload 2

from src.LLM_as_judge.answer import V1_EXPLANATION_ANSWER_PROMPT

# Call add_llm_explanation_column (defined outside this cell) and store the
# result back in train_df.
# The prompt passed is the one imported at the top of this cell; the previous
# argument, V1_ANSWER_SYSTEM_PROMPT, is not imported here, which left the
# imported V1_EXPLANATION_ANSWER_PROMPT unused.
# NOTE(review): confirm V1_EXPLANATION_ANSWER_PROMPT has the dict shape that
# the system_prompts_dict parameter expects.
train_df = add_llm_explanation_column(
    df=train_df,
    system_prompts_dict=V1_EXPLANATION_ANSWER_PROMPT,
    model_name="gpt-4",
    gold_col="gold_answer",
    pred_col="model_answer",
    new_col_name="answer_explanation",
)

# Show the first two rows to eyeball the result.
train_df.head(2)


# Systematic way to add columns and prompts

In [None]:
# Chain-of-thought system prompt template.
# The {pre_text}, {post_text} and {table} placeholders are filled in per record
# via str.format (see construct_main_messages).
system_prompts_cot = {
    "system_prompt": (
        "You are a helpful assistant.\n"
        "Please think step-by-step to find the correct answer.\n"
        # Each instruction ends with "\n": adjacent string literals are
        # concatenated as-is, so a missing newline fuses two sentences.
        "Only provide the answer as a float, with no symbols except for - when required.\n"
        "Then provide only the final answer.\n\n"
        "=== PRE TEXT ===\n{pre_text}\n\n"
        "=== POST TEXT ===\n{post_text}\n\n"
        "=== TABLE ===\n{table}\n\n"
    )
}

In [None]:
import openai
import pandas as pd

def construct_main_messages(record: dict, system_prompts: dict) -> list:
    """
    Assemble the two-message chat payload (system + user) for a single record.

    The template stored under system_prompts["system_prompt"] is filled in with
    the record's "pre_text", "post_text" and "table" values, and the record's
    "question" becomes the user message. Any of those keys that is missing from
    the record is replaced by an empty string.
    """
    template = system_prompts["system_prompt"]

    # Collect the placeholder values in one place, defaulting absent keys to "".
    placeholder_values = {
        field: record.get(field, "")
        for field in ("pre_text", "post_text", "table")
    }

    system_message = {
        "role": "system",
        "content": template.format(**placeholder_values),
    }
    user_message = {"role": "user", "content": record.get("question", "")}
    return [system_message, user_message]


def process_records(records, system_prompts, model_name, prompt_style):
    """
    Query the OpenAI chat completions endpoint once per record and tabulate the answers.

    Returns a DataFrame with one row per record and the columns
    [id, question, gold_answer, table, model_answer, model, prompt_style, prompt].
    If the API call (or reading its response) raises, the row's model_answer is
    the string "Error: <exception>" instead of an answer, and processing continues.
    """

    def _ask(chat_messages):
        # Best-effort call: a failure is recorded in the answer text rather
        # than aborting the whole run.
        try:
            completion = openai.chat.completions.create(
                model=model_name,
                messages=chat_messages
            )
            return completion.choices[0].message.content.strip()
        except Exception as exc:
            return f"Error: {exc}"

    rows = []
    for record in records:
        answer = _ask(construct_main_messages(record, system_prompts))
        row = {
            "id": record["id"],
            "question": record["question"],
            "gold_answer": record["gold_answer"],
            "table": record["table"],
            "model_answer": answer,
            "model": model_name,
            "prompt_style": prompt_style,
            # The full prompts dict is stored so each row records what produced it.
            "prompt": system_prompts,
        }
        rows.append(row)

    return pd.DataFrame(rows)

In [None]:
from src.models import gpt_4o_mini

# Run the chain-of-thought prompt defined in this section on the first two
# training records. system_prompts_cot carries the {pre_text}/{post_text}/{table}
# placeholders that construct_main_messages fills in, and it is the prompt the
# "CoT" prompt_style label stored in the results refers to.
df_run_cot = process_records(
    records=train_data[:2],
    system_prompts=system_prompts_cot,
    model_name=gpt_4o_mini,
    prompt_style="CoT",
)

# Show the rows produced by the run.
df_run_cot.head(2)

# Using o1 - The prompt structure is different 

In [None]:
# messages = [
#         {
#             "role": "user",
#             "content": f"""{Conversion_prompt}

#             QUESTION:
#             {question}
#             """

#             # Insert json relevant data somewhere, ask the question to help it answer the questions
#         }
#     ]

#     # 6. Call the OpenAI ChatCompletion endpoint
#     try:
#         response = openai.chat.completions.create(
#             model='o1-preview',
#             messages=messages

#         )
#         model_answer = response.choices[0].message.content
#     except Exception as e:
#         model_answer = f"Error: {e}"

# ---------------------------------------------------------------------

# Section 2 - Notes and questions 

# Final output - What will you show and send to Tomorro?


# Key Q's 


# 1. How will you run the best prompt on the Dev set? (The set will also be reduced to ~100 samples given costs.)
# 2. How will you improve model performance? (Criteria: systematic, repeatable, automatable?)
# 3. How will the evals be presented?
# 4. How will you track models and prompts?


### In notebook
1. Evals on 'accuracy' via exact-match with basic prompt and 4o.
2. Evals on program execution via exact match and specified criteria with basic prompt and 4o.
3. Improvements made via prompt-engineering (i) CoT & (ii) ReAct using 4o based on insights gathered in first experiments.
4. Improvements made via reasoning tooling to determine what differences were possible using reasoning models.
5. Fine-tuning to determine whether further improvements were possible.


### Submission 
- A link to a repo with the current code structure and accompanying info in README.md 



### Must-haves:
1. All items in notebook
2. Submission



### Should
1. ...


### Could 
1. .. 



