In [1]:
import sys
print(sys.executable)

/workspaces/Kaggle-Solution-Finder-Agent/app/.venv/bin/python


In [30]:
import os
from pathlib import Path

# 1) Redirect logs to eval_logs/
os.environ["LOGS_DIRECTORY"] = "eval_logs"

# 2) Make app/ importable
REPO_ROOT = Path.cwd().parent
sys.path.insert(0, str(REPO_ROOT / "app"))

from ingest import (read_repo_data,extract_completed_competitions,build_vector_index)
from search_agent import create_search_agent
from logs import log_interaction_to_file, LOG_DIR, evaluate_log_record, load_log_file, simplify_log_messages
from pydantic_ai import Agent
from logs import EvaluationChecklist

In [22]:
LOG_DIR.absolute()

PosixPath('/workspaces/Kaggle-Solution-Finder-Agent/eval/logs')

In [55]:
evaluation_prompt = """
 ## Prompt: Evaluator AI

Use the following checklist to evaluate the quality of an AI agent's response (<ANSWER>) to a user question (<QUESTION>) based on the log of interactions (<LOG>).

For each item, check if the condition is met:

1. **instructions_follow**: Did the AI follow the user's instructions as provided in <INSTRUCTIONS>?
2. **instructions_avoid**: Did the AI avoid doing things it was explicitly told not to do? (e.g., avoid summarizing or generating irrelevant information)
3. **answer_relevant**: Does the answer directly and specifically address the question asked? The answer should be clearly focused on the context of the competition and its characteristics (e.g., evaluation metric, model techniques, dataset).
4. **answer_clear**: Is the response clear, well-structured, and easy to understand? The answer should not be vague or ambiguous.
5. **answer_citations**: Are sources or references (if required) provided in the answer? For example, if a specific competition's results or discussions are referenced, proper citations should be included.
6. **completeness**: Does the response comprehensively cover all key aspects of the question? If the question asks for multiple details (e.g., modeling techniques and data handling), they should all be included.
7. **tool_call_search**: Did the agent call the search tool to retrieve relevant information or use the most appropriate dataset? Ensure the answer references the Kaggle competition directly if applicable.

### Output format:
Provide a **True/False** answer for each checklist item, followed by a brief explanation of your judgment.

#### Example output:
- **instructions_follow:** True — The agent adhered to all specified instructions and did not diverge from the task.
- **instructions_avoid:** True — The agent avoided generating irrelevant or generic questions.
- **answer_relevant:** True — The answer specifically addressed the question about the evaluation metric.
- **answer_clear:** True — The response was structured logically and easy to understand.
- **answer_citations:** True — Proper citations were included for the Kaggle competition’s evaluation metric and solution links.
- **completeness:** True — The answer covered all aspects of the question, including dataset details and model techniques.
- **tool_call_search:** True — The agent used the search tool to provide specific Kaggle discussion links relevant to the competition.
""".strip()

from pydantic import BaseModel
from pydantic_ai import Agent

class EvaluationCheck(BaseModel):
    check_name: str
    justification: str
    check_pass: bool

class EvaluationChecklist(BaseModel):
    checklist: list[EvaluationCheck]
    summary: str
    
eval_agent = Agent(name = 'eval_agent',model='gpt-5-nano',
                       instructions=evaluation_prompt,
                       output_type=EvaluationChecklist)
                    
eval_agent

Agent(model=OpenAIChatModel(), name='eval_agent', end_strategy='early', model_settings=None, output_type=<class '__main__.EvaluationChecklist'>, instrument=None)

In [56]:
prompt_format = """
    <INSTRUCTIONS>{instructions}</INSTRUCTIONS>
    <QUESTION>{question}</QUESTION>
    <ANSWER>{answer}</ANSWER>
    <LOG>{log}</LOG>
    """.strip()
prompt_format

'<INSTRUCTIONS>{instructions}</INSTRUCTIONS>\n    <QUESTION>{question}</QUESTION>\n    <ANSWER>{answer}</ANSWER>\n    <LOG>{log}</LOG>'

In [57]:
import asyncio
import json

eval_set = []

# Load all log records
for log_file in LOG_DIR.glob('*.json'):
    if 'Query_Agent' not in log_file.name:
        continue
    log_record = load_log_file(log_file)
    eval_set.append(log_record)

eval_results = []

# Define an async function to handle the event loop
async def evaluate_logs():
    for log_record in eval_set:
        # Prepare the prompt using the log_record data
        log = json.dumps(simplify_log_messages(log_record["messages"]))
        prompt = prompt_format.format(
            instructions=log_record["system_prompt"],
            question=log_record["messages"][0]["parts"][0]["content"],
            answer=log_record["messages"][-1]["parts"][0]["content"],
            log=log
        )
        
        # Use run_in_executor to run eval_agent.run_sync in a separate thread/process
        eval_result = await asyncio.get_event_loop().run_in_executor(
            None,  # None means the default executor (i.e., use a thread pool)
            lambda: eval_agent.run_sync(prompt)  # Pass only the prompt to run_sync
        )
        
        eval_results.append((log_record, eval_result))

# Run the async function in the existing event loop
await evaluate_logs()

In [58]:
import pandas as pd
rows = []

for log_record, eval_result in eval_results:
    messages = log_record['messages']

    row = {
        'file': log_record['log_file'].name,
        'question': messages[0]['parts'][0]['content'],
        'answer': messages[-1]['parts'][0]['content']}
    
    if hasattr(eval_result, 'output') and hasattr(eval_result.output, 'checklist'):
        checks = {c.check_name: c.check_pass for c in eval_result.output.checklist}
    else:
        # If checklist is not available, handle it here
        checks = {}

    row.update(checks)
    rows.append(row)

df_evals = pd.DataFrame(rows)
df_evals

Unnamed: 0,file,question,answer,instructions_follow,instructions_avoid,answer_relevant,answer_clear,answer_citations,completeness,tool_call_search,analysis_1_solution_implication
0,Query_Agent_20260119_184901_f9c973.json,What does the Categorization Accuracy metric m...,### Strong Match\n\n1. **Competition Name:** A...,True,True,False,False,True,False,True,
1,Query_Agent_20260119_185030_d89ee3.json,With only 1 solution submitted for the 'Cause-...,### Strong Match\n\n1. **Competition Name:** C...,,,,,,,,True
2,Query_Agent_20260119_185224_b07c06.json,What are the key factors contributing to custo...,### Strong Match\n\n1. **Competition Name:** G...,False,True,False,False,True,False,True,
3,Query_Agent_20260119_185320_393268.json,How does the title 'Image Matching Challenge 2...,### Strong Match\n\n1. **Competition Name:** I...,True,True,False,True,True,False,True,
4,Query_Agent_20260119_185034_d40338.json,What does the evaluation metric 58266_TpuGraph...,### Strong Match\n\n1. **Competition Name:** G...,True,True,False,False,True,False,True,
5,Query_Agent_20260119_184851_504c6a.json,How does the title 'Bike Sharing Demand' refle...,### Strong Match\n\n1. **Competition Name:** B...,False,True,False,False,True,False,True,
6,Query_Agent_20260119_184619_3b57c1.json,What are the key domains and topics covered in...,### Strong Match\n\n1. **Competition Name:** A...,True,True,False,True,True,False,True,
7,Query_Agent_20260119_184945_7e8e9a.json,Given that there are only 2 solutions submitte...,No strong matches were found for this query.,True,True,True,True,True,True,True,
8,Query_Agent_20260119_185054_ae1257.json,What specific challenges do participants face ...,### Strong Match\n\n1. **Competition Name:** G...,True,True,False,True,True,False,True,
9,Query_Agent_20260119_185333_943a59.json,What is the primary domain of the Image Matchi...,### Strong Match\n\n1. **Competition Name:** I...,True,True,False,True,True,False,True,


In [52]:
eval_cols = [
    'instructions_follow',
    'instructions_avoid',
    'answer_relevant',
    'answer_clear',
    'answer_citations',
    'completeness',
    'tool_call_search']

df_evals[eval_cols] = df_evals[eval_cols].apply(lambda col: col.map({True: 1, False: 0}))
df_evals

Unnamed: 0,file,question,answer,instructions_follow,instructions_avoid,answer_relevant,answer_clear,answer_citations,completeness,tool_call_search,tool_usage_confirmation
0,Query_Agent_20260119_181715_8e4388.json,How is the Mean Absolute Error calculated in t...,### Strong Match\n\n1. **Competition Name:** F...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,
1,Query_Agent_20260119_180645_3bf21b.json,What is the structure of the dataset provided ...,### Strong Match\n\n1. **Competition Name:** P...,1.0,1.0,0.0,1.0,1.0,0.0,1.0,
2,Query_Agent_20260119_181500_a465a8.json,What machine learning algorithms have proven m...,### Strong Match\n\n1. **Competition Name:** I...,0.0,1.0,0.0,1.0,1.0,0.0,1.0,
3,Query_Agent_20260119_181616_f2f1aa.json,How is the dataset structured for this competi...,### Strong Match\n\n1. **Competition Name:** S...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,
4,Query_Agent_20260119_181442_8aa145.json,How is the Weighted Multiclass Loss calculated...,### Strong Match\n\n1. **Competition Name:** P...,1.0,1.0,0.0,1.0,1.0,0.0,1.0,
5,Query_Agent_20260119_181347_636db2.json,What specific features are available in the da...,### Strong Match\n\n1. **Competition Name:** L...,1.0,1.0,0.0,1.0,1.0,0.0,1.0,
6,Query_Agent_20260119_181511_fead5a.json,What are the main features provided in the dat...,### Strong Match\n\n1. **Competition Name:** G...,1.0,1.0,0.0,1.0,1.0,0.0,1.0,
7,Query_Agent_20260119_181842_4c6cc6.json,What are some best practices to follow when tu...,### Strong Match\n\n1. **Competition Name:** i...,,,,,,,,True
8,Query_Agent_20260119_181018_6a3435.json,What best practices can participants follow to...,### Strong Match\n\n1. **Competition Name:** G...,1.0,1.0,0.0,1.0,1.0,0.0,1.0,
9,Query_Agent_20260119_181404_88516b.json,What are some effective strategies or techniqu...,### Strong Match\n\n1. **Competition Name:** D...,0.0,1.0,0.0,1.0,1.0,0.0,1.0,


In [59]:
print('Evaluate performance of AI Agent')
df_evals[eval_cols].mean() * 100

Evaluate performance of AI Agent


instructions_follow    69.230769
instructions_avoid     87.179487
answer_relevant         2.564103
answer_clear           76.923077
answer_citations       94.871795
completeness            2.564103
tool_call_search       94.871795
dtype: object