In [2]:
import sys
print(sys.executable)

/workspaces/Kaggle-Solution-Finder-Agent/app/.venv/bin/python


In [3]:
import os
from pathlib import Path

# 1) Redirect logs to eval_logs/
os.environ["LOGS_DIRECTORY"] = "eval_logs"

# 2) Make app/ importable
REPO_ROOT = Path.cwd().parent
sys.path.insert(0, str(REPO_ROOT / "app"))

from ingest import (read_repo_data,extract_completed_competitions,build_vector_index)
from search_agent import create_search_agent
from logs import log_interaction_to_file, LOG_DIR, evaluate_log_record, load_log_file, simplify_log_messages
from pydantic_ai import Agent
from logs import EvaluationChecklist

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
LOG_DIR.absolute()

PosixPath('/workspaces/Kaggle-Solution-Finder-Agent/eval/logs')

In [5]:
evaluation_prompt = """
Use this checklist to evaluate the quality of an AI agent's answer (<ANSWER>) to a user question (<QUESTION>).
    We also include the entire log (<LOG>) for analysis.

    For each item, check if the condition is met. 

    Checklist:

    - instructions_follow: The agent followed the user's instructions (in <INSTRUCTIONS>)
    - instructions_avoid: The agent avoided doing things it was told not to do  
    - answer_relevant: The response directly addresses the user's question  
    - answer_clear: The answer is clear and correct  
    - answer_citations: The response includes proper citations or sources when required  
    - completeness: The response is complete and covers all key aspects of the request
    - tool_call_search: Is the search tool invoked? 

    Output true/false for each check and provide a short explanation for your judgment.
""".strip()

from pydantic import BaseModel
from pydantic_ai import Agent

class EvaluationCheck(BaseModel):
    check_name: str
    justification: str
    check_pass: bool

class EvaluationChecklist(BaseModel):
    checklist: list[EvaluationCheck]
    summary: str
    
eval_agent = Agent(name = 'eval_agent',model='gpt-5-nano',
                       instructions=evaluation_prompt,
                       output_type=EvaluationChecklist)
                    
eval_agent

Agent(model=OpenAIChatModel(), name='eval_agent', end_strategy='early', model_settings=None, output_type=<class '__main__.EvaluationChecklist'>, instrument=None)

In [6]:
prompt_format = """
    <INSTRUCTIONS>{instructions}</INSTRUCTIONS>
    <QUESTION>{question}</QUESTION>
    <ANSWER>{answer}</ANSWER>
    <LOG>{log}</LOG>
    """.strip()
prompt_format

'<INSTRUCTIONS>{instructions}</INSTRUCTIONS>\n    <QUESTION>{question}</QUESTION>\n    <ANSWER>{answer}</ANSWER>\n    <LOG>{log}</LOG>'

In [7]:
import asyncio
import json

eval_set = []

# Load all log records
for log_file in LOG_DIR.glob('*.json'):
    if 'Query_Agent' not in log_file.name:
        continue
    log_record = load_log_file(log_file)
    eval_set.append(log_record)

eval_results = []

# Define an async function to handle the event loop
async def evaluate_logs():
    for log_record in eval_set:
        # Prepare the prompt using the log_record data
        log = json.dumps(simplify_log_messages(log_record["messages"]))
        prompt = prompt_format.format(
            instructions=log_record["system_prompt"],
            question=log_record["messages"][0]["parts"][0]["content"],
            answer=log_record["messages"][-1]["parts"][0]["content"],
            log=log
        )
        
        # Use run_in_executor to run eval_agent.run_sync in a separate thread/process
        eval_result = await asyncio.get_event_loop().run_in_executor(
            None,  # None means the default executor (i.e., use a thread pool)
            lambda: eval_agent.run_sync(prompt)  # Pass only the prompt to run_sync
        )
        
        eval_results.append((log_record, eval_result))

# Run the async function in the existing event loop
await evaluate_logs()

In [8]:
import pandas as pd
rows = []

for log_record, eval_result in eval_results:
    messages = log_record['messages']

    row = {
        'file': log_record['log_file'].name,
        'question': messages[0]['parts'][0]['content'],
        'answer': messages[-1]['parts'][0]['content']}
    
    if hasattr(eval_result, 'output') and hasattr(eval_result.output, 'checklist'):
        checks = {c.check_name: c.check_pass for c in eval_result.output.checklist}
    else:
        # If checklist is not available, handle it here
        checks = {}

    row.update(checks)
    rows.append(row)

df_evals = pd.DataFrame(rows)
df_evals

Unnamed: 0,file,question,answer,instructions_follow,instructions_avoid,answer_relevant,answer_clear,answer_citations,completeness,tool_call_search,uncertainty_explicitly_stated_in_strong_match_output
0,Query_Agent_20260120_024020_71f106.json,"If uncertainty exists, is it explicitly stated?",No strong matches were found for this query.,True,True,False,True,True,False,False,
1,Query_Agent_20260120_024356_59d1ac.json,Metric: Weighted Rowwise Root Mean Squared Error,### Strong Match\n\n1. **Competition Name:** O...,True,True,True,True,True,True,False,
2,Query_Agent_20260120_024910_0cc055.json,Are no required elements missing?,No strong matches were found for this query.,True,True,False,True,False,False,False,
3,Query_Agent_20260120_024721_78aa99.json,Does the answer address ALL explicit constrain...,"Yes, the answer addresses all explicit constra...",False,True,True,False,True,False,False,
4,Query_Agent_20260120_024039_86d9dd.json,"Are assumptions, inferences, or uncertainties ...",### Strong Match\n\n1. **Competition Name:** M...,False,False,False,False,False,False,True,
...,...,...,...,...,...,...,...,...,...,...,...
72,Query_Agent_20260120_024128_923022.json,Does the answer address ALL explicit constrain...,"Yes, the answer addresses all explicit constra...",False,True,True,True,True,False,False,
73,Query_Agent_20260120_024027_8d02ba.json,Are results correctly classified (Strong / Par...,No strong matches were found for this query.,True,True,True,True,True,True,True,
74,Query_Agent_20260120_024220_0d111d.json,Are all relevant items included?,### Strong Match\n\n1. **Competition Name:** E...,True,True,False,False,True,False,True,
75,Query_Agent_20260120_024853_1af440.json,Please provide the solutions for the Diabetic ...,### Strong Match\n\n1. **Competition Name:** D...,True,True,True,True,True,True,True,


In [10]:
eval_cols = [
    'instructions_follow',
    'instructions_avoid',
    'answer_relevant',
    'answer_clear',
    'answer_citations',
    'completeness',
    'tool_call_search']

df_evals[eval_cols] = df_evals[eval_cols].apply(lambda col: col.map({True: 1, False: 0}))
df_evals

Unnamed: 0,file,question,answer,instructions_follow,instructions_avoid,answer_relevant,answer_clear,answer_citations,completeness,tool_call_search,uncertainty_explicitly_stated_in_strong_match_output
0,Query_Agent_20260120_024020_71f106.json,"If uncertainty exists, is it explicitly stated?",No strong matches were found for this query.,1.0,1.0,0.0,1.0,1.0,0.0,0.0,
1,Query_Agent_20260120_024356_59d1ac.json,Metric: Weighted Rowwise Root Mean Squared Error,### Strong Match\n\n1. **Competition Name:** O...,1.0,1.0,1.0,1.0,1.0,1.0,0.0,
2,Query_Agent_20260120_024910_0cc055.json,Are no required elements missing?,No strong matches were found for this query.,1.0,1.0,0.0,1.0,0.0,0.0,0.0,
3,Query_Agent_20260120_024721_78aa99.json,Does the answer address ALL explicit constrain...,"Yes, the answer addresses all explicit constra...",0.0,1.0,1.0,0.0,1.0,0.0,0.0,
4,Query_Agent_20260120_024039_86d9dd.json,"Are assumptions, inferences, or uncertainties ...",### Strong Match\n\n1. **Competition Name:** M...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,
...,...,...,...,...,...,...,...,...,...,...,...
72,Query_Agent_20260120_024128_923022.json,Does the answer address ALL explicit constrain...,"Yes, the answer addresses all explicit constra...",0.0,1.0,1.0,1.0,1.0,0.0,0.0,
73,Query_Agent_20260120_024027_8d02ba.json,Are results correctly classified (Strong / Par...,No strong matches were found for this query.,1.0,1.0,1.0,1.0,1.0,1.0,1.0,
74,Query_Agent_20260120_024220_0d111d.json,Are all relevant items included?,### Strong Match\n\n1. **Competition Name:** E...,1.0,1.0,0.0,0.0,1.0,0.0,1.0,
75,Query_Agent_20260120_024853_1af440.json,Please provide the solutions for the Diabetic ...,### Strong Match\n\n1. **Competition Name:** D...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,


In [11]:
print('Evaluate performance of AI Agent')
df_evals[eval_cols].mean() * 100

Evaluate performance of AI Agent


instructions_follow    72.368421
instructions_avoid     92.105263
answer_relevant        51.315789
answer_clear           65.789474
answer_citations       90.789474
completeness           44.736842
tool_call_search       51.315789
dtype: float64