In [None]:
# In a new Colab notebook
# First, install the necessary libraries
!pip install langchain langchain_openai chromadb pypdf langchain-community

import asyncio
import json
import os
import re
from typing import List, Dict, Any

# Set up your OpenAI API key
# When google.colab is importable, the value returned by userdata.get('OPENAI_API_KEY')
# is exported as the OPENAI_API_KEY environment variable for the rest of the notebook.
try:
    from google.colab import userdata
    os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')
except ImportError:
    # Only a failed import is handled here: nothing is written to the
    # environment and the key is assumed to be configured already.
    print("Not in a Colab environment, assuming API key is set.")

In [None]:
# --- 1. Re-create a simplified version of our agent ---
# In a real project, you would import this from your agent.py file.
# For this notebook, we'll redefine it here to keep it self-contained.
from datetime import datetime
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.agents import tool, create_openai_tools_agent, AgentExecutor
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

# Define the tools
@tool
def get_current_time(tool_input: str = "") -> str:
    """Returns the current date and time."""
    # NOTE: the docstring above is kept verbatim; the @tool decorator presumably
    # exposes it to the model as the tool description, so it is runtime text.
    # `tool_input` is accepted but never read by the body.
    timestamp_layout = "%Y-%m-%d %H:%M:%S"
    current_moment = datetime.now()
    return current_moment.strftime(timestamp_layout)

@tool
def get_knowledge_from_library(query: str) -> str:
    """Searches a library about the Transformer architecture and attention."""
    # NOTE: the docstring above is kept verbatim; the @tool decorator presumably
    # exposes it to the model as the tool description, so it is runtime text.
    #
    # Parameters: query - free-text search string passed to the retriever.
    # Returns:    the page_content of up to 2 retrieved documents joined by
    #             "\n---\n", a short notice when nothing was retrieved, or an
    #             "Error accessing knowledge base: ..." message on failure.
    try:
        persist_directory = './chroma_db_rag'
        embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
        vector_store = Chroma(persist_directory=persist_directory, embedding_function=embedding_model)
        retriever = vector_store.as_retriever(search_kwargs={"k": 2})  # Get 2 docs
        # `invoke` is the supported retriever entry point; the previously used
        # get_relevant_documents() is deprecated.
        docs = retriever.invoke(query)
        if not docs:
            # An empty string gives the model nothing to reason about; say so explicitly.
            return "No relevant documents found in the knowledge base."
        return "\n---\n".join(doc.page_content for doc in docs)
    except Exception as e:
        # Report the failure as tool output instead of aborting the whole agent run.
        return f"Error accessing knowledge base: {e}"

# The same tool list is handed to both the agent factory and the executor below.
tools = [get_current_time, get_knowledge_from_library]

# Create the agent
# temperature=0 is presumably chosen to keep evaluation runs as repeatable as possible.
llm = ChatOpenAI(model="gpt-4o", temperature=0)
# Prompt layout: a fixed system message, the user's "{input}", then a placeholder
# named "agent_scratchpad" (presumably filled by the agent with its intermediate
# tool calls and results — see the LangChain agent docs).
prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful assistant."),
    ("human", "{input}"),
    MessagesPlaceholder(variable_name="agent_scratchpad"),
])
agent = create_openai_tools_agent(llm, tools, prompt)
# verbose=False keeps the executor from printing its own trace during evaluation.
agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=False)

In [None]:
# --- 2. Define the Test Set ---
# This is our "exam" for the agent. Each item has an input and the expected outcome.
# Fields of each case:
#   name              - label printed while running and used as the key in the results
#   input             - the question sent to the agent
#   expected_tool     - name of the tool the agent should call (None = no tool call expected)
#   expected_keywords - strings the judge is told the final answer should contain
test_set = [
    {
        "name": "Time Inquiry",
        "input": "What time is it right now?",
        "expected_tool": "get_current_time",
        "expected_keywords": [str(datetime.now().year), ":"] # Year (computed once, when this cell runs) and a colon
    },
    {
        "name": "RAG Inquiry",
        "input": "Explain self-attention in one sentence.",
        "expected_tool": "get_knowledge_from_library",
        "expected_keywords": ["self-attention", "sequence", "dependencies"]
    },
    {
        "name": "No Tool Inquiry",
        "input": "Write a short poem about AI.",
        "expected_tool": None, # Expect no tool to be called
        # NOTE(review): the judge prompt awards the keyword score only when ALL
        # keywords are present; a short poem containing all four of these words
        # seems unlikely — confirm this list is intended.
        "expected_keywords": ["robot", "learn", "code", "mind"]
    }
]

In [None]:
# --- 3. The LLM-as-a-Judge Evaluator ---
# Separate chat-model client used only for grading; it is configured with the
# same model name and temperature as the agent's `llm`.
judge_llm = ChatOpenAI(model="gpt-4o", temperature=0)

async def evaluate_agent_run(test_case: Dict[str, Any]) -> Dict[str, Any]:
    """
    Runs a single test case against the agent and uses an LLM to judge the result.

    Args:
        test_case: One entry of the test set. The keys "name", "input",
            "expected_tool" and "expected_keywords" are read.

    Returns:
        A dict with exactly the keys "tool_score", "answer_score" and
        "keyword_score". A key the judge omitted, or a value that cannot be
        read as a number, is reported as 0.
    """
    print(f"\n--- Running Test Case: {test_case['name']} ---")
    input_question = test_case["input"]

    # Use astream_events to capture the full trace of the agent's run and pull
    # out the two things the judge needs: which tools started, and the text
    # streamed by the chat model.
    final_answer = ""
    tool_calls = []
    async for event in agent_executor.astream_events({"input": input_question}, version="v1"):
        event_kind = event["event"]
        if event_kind == "on_tool_start":
            tool_calls.append(event["name"])
        elif event_kind == "on_chat_model_stream":
            piece = event["data"]["chunk"].content
            # Only plain text is accumulated; any other content type is skipped
            # instead of raising on string concatenation.
            if isinstance(piece, str):
                final_answer += piece

    print(f"Final Answer: {final_answer}")
    print(f"Tools Called: {tool_calls}")

    # Now, use the LLM to judge the performance
    evaluation_prompt = f"""
    You are an expert evaluator for AI agents. Your task is to score an agent's performance
    on a given task based on the provided criteria.

    TASK:
    - User's Question: "{input_question}"
    - Expected Tool to be Used: {test_case['expected_tool']}
    - Expected Keywords in Final Answer: {test_case['expected_keywords']}

    AGENT'S PERFORMANCE:
    - Tools Actually Called: {tool_calls}
    - Final Answer Provided: "{final_answer}"

    EVALUATION CRITERIA:
    1.  **Tool Correctness (Score 0-1):** Score 1 if the agent called the single expected tool, or no tool if none was expected. Score 0 otherwise.
    2.  **Answer Relevance (Score 0-1):** Score 1 if the final answer is relevant and directly addresses the user's question. Score 0 otherwise.
    3.  **Keyword Presence (Score 0-1):** Score 1 if the final answer contains ALL of the expected keywords. Score 0 otherwise.

    Please provide your evaluation as a JSON object with three keys: "tool_score", "answer_score", "keyword_score".
    Do not include any other text or explanation.
    """

    judge_response = await judge_llm.ainvoke(evaluation_prompt)
    scores = _parse_judge_scores(judge_response.content)

    print(f"Evaluation Scores: {scores}")
    return scores


def _parse_judge_scores(raw_reply: Any) -> Dict[str, Any]:
    """Turn the judge's raw reply into the three expected scores, defaulting each to 0."""
    parsed: Any = {}
    if isinstance(raw_reply, str):
        # Chat models frequently wrap JSON in a Markdown code fence even when
        # asked not to; unwrap it before parsing.
        fenced = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", raw_reply, re.DOTALL)
        payload = fenced.group(1) if fenced else raw_reply.strip()
        try:
            parsed = json.loads(payload)
        except json.JSONDecodeError:
            # Only a genuine parse failure falls back to zeros, and it is
            # reported rather than silently swallowed.
            print(f"Warning: judge reply was not valid JSON: {raw_reply!r}")
    if not isinstance(parsed, dict):
        parsed = {}

    scores: Dict[str, Any] = {}
    for key in ("tool_score", "answer_score", "keyword_score"):
        value = parsed.get(key, 0)
        # Keep real numbers untouched; convert booleans/strings when possible so
        # the averages computed by the caller cannot fail on a stray "1" or null.
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            try:
                value = float(value)
            except (TypeError, ValueError):
                value = 0
        scores[key] = value
    return scores

In [None]:
# --- 4. Run the Full Evaluation Suite ---
async def run_suite():
    """
    Runs every case in `test_set` through `evaluate_agent_run`, one after the
    other, and prints the average of each score across all cases.

    Results are keyed by the case's "name". Returns None; the summary is only
    printed. An empty test set prints a notice instead of dividing by zero, and
    a score missing from a result counts as 0.
    """
    results = {}
    for case in test_set:
        results[case["name"]] = await evaluate_agent_run(case)

    print("\n\n--- Evaluation Suite Complete ---")
    if not results:
        print("No results to display.")
        return

    case_count = len(results)

    def _average(metric: str) -> float:
        # .get(..., 0) keeps one incomplete result from aborting the whole summary.
        return sum(res.get(metric, 0) for res in results.values()) / case_count

    # Calculate and print average scores
    avg_tool_score = _average("tool_score")
    avg_answer_score = _average("answer_score")
    avg_keyword_score = _average("keyword_score")

    print(f"\nAverage Tool Correctness Score: {avg_tool_score:.2f}")
    print(f"Average Answer Relevance Score: {avg_answer_score:.2f}")
    print(f"Average Keyword Presence Score: {avg_keyword_score:.2f}")

In [None]:
# Run the async function
# Notebook kernels (IPython / Colab) accept `await` directly at the top level of
# a cell, which is what the last line of this cell does.
# A plain Python script has no top-level await; there the equivalent call would
# be asyncio.run(run_suite()).
import asyncio
# Module-level import of json, which evaluate_agent_run refers to when it parses the judge's reply.
import json
# asyncio.run(run_suite())  # script-only alternative. NOTE(review): expected to fail
# inside a notebook, where an event loop is presumably already running — confirm.
await run_suite()

In [None]:
# In a new Colab notebook

# =================================================================
# Cell 1: Setup and Dependencies
# =================================================================
print("--- Installing Dependencies ---")
!pip install langchain langchain_openai chromadb pypdf langchain-community -q

import os
import asyncio
import json
import re
from typing import List, Dict, Any
from datetime import datetime

# Set up your OpenAI API key
# When google.colab is importable, the value returned by userdata.get('OPENAI_API_KEY')
# is exported as the OPENAI_API_KEY environment variable for the rest of the notebook.
try:
    from google.colab import userdata
    os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')
    print("OpenAI API key loaded successfully.")
except ImportError:
    # Only a failed import is handled here: nothing is written to the
    # environment and the key is assumed to be configured already.
    print("Not in a Colab environment, assuming API key is set.")

# =================================================================
# Cell 2: Ingestion - Create the Knowledge Base
# This cell creates the './chroma_db_rag' directory needed by our RAG tool.
# =================================================================
print("\n--- Running Ingestion to Create Knowledge Base ---")
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter

# 1. Load the document
# The loader is pointed at the arXiv URL of the PDF rather than a local file.
loader = PyPDFLoader("https://arxiv.org/pdf/1706.03762.pdf")
documents = loader.load()

# 2. Split the document into chunks
# 1000-character chunks with a 200-character overlap between neighbours.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(documents)

# 3. Embed and Store
# The persisted collection is opened first and only filled when it is empty.
# Calling Chroma.from_documents on every run appends the same chunks again, so
# re-running this cell would leave duplicate entries for the retriever to return.
# Delete the './chroma_db_rag' directory to force a rebuild from scratch.
persist_directory = './chroma_db_rag'
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
vector_store = Chroma(persist_directory=persist_directory, embedding_function=embedding_model)
if vector_store.get(limit=1)["ids"]:
    print("--- Knowledge Base Already Populated; Skipping Re-Embedding ---")
else:
    vector_store.add_documents(chunks)
    print("--- Knowledge Base Created Successfully ---")


# =================================================================
# Cell 3: Agent and Evaluation Suite
# =================================================================
print("\n--- Defining Agent and Evaluation Suite ---")
from langchain_openai import ChatOpenAI
from langchain.agents import tool, create_openai_tools_agent, AgentExecutor
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

# --- 1. Define the Tools ---
@tool
def get_current_time(tool_input: str = "") -> str:
    """Returns the current date and time."""
    # NOTE: the docstring above is kept verbatim; the @tool decorator presumably
    # exposes it to the model as the tool description, so it is runtime text.
    # `tool_input` is accepted but never read by the body.
    # datetime's format-spec support delegates to strftime, so this yields
    # "YYYY-MM-DD HH:MM:SS" for the current local time.
    return f"{datetime.now():%Y-%m-%d %H:%M:%S}"

@tool
def get_knowledge_from_library(query: str) -> str:
    """Searches a library about the Transformer architecture and attention."""
    # NOTE: the docstring above is kept verbatim; the @tool decorator presumably
    # exposes it to the model as the tool description, so it is runtime text.
    #
    # Parameters: query - free-text search string passed to the retriever.
    # Returns:    the page_content of up to 2 retrieved documents joined by
    #             "\n---\n", a short notice when nothing was retrieved, or an
    #             "Error accessing knowledge base: ..." message on failure.
    try:
        persist_directory = './chroma_db_rag'
        embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
        vector_store = Chroma(persist_directory=persist_directory, embedding_function=embedding_model)
        retriever = vector_store.as_retriever(search_kwargs={"k": 2})
        # `invoke` is the supported retriever entry point; the previously used
        # get_relevant_documents() is deprecated.
        docs = retriever.invoke(query)
        if not docs:
            # An empty string gives the model nothing to reason about; say so explicitly.
            return "No relevant documents found in the knowledge base."
        return "\n---\n".join(doc.page_content for doc in docs)
    except Exception as e:
        # Report the failure as tool output instead of aborting the whole agent run.
        return f"Error accessing knowledge base: {e}"

# The same tool list is handed to both the agent factory and the executor below.
tools = [get_current_time, get_knowledge_from_library]
# temperature=0 is presumably chosen to keep evaluation runs as repeatable as possible.
llm = ChatOpenAI(model="gpt-4o", temperature=0)
# Prompt layout: a fixed system message, the user's "{input}", then a placeholder
# named "agent_scratchpad" (presumably filled by the agent with its intermediate
# tool calls and results — see the LangChain agent docs).
prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful assistant."),
    ("human", "{input}"),
    MessagesPlaceholder(variable_name="agent_scratchpad"),
])
agent = create_openai_tools_agent(llm, tools, prompt)
# verbose=False keeps the executor from printing its own trace during evaluation.
agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=False)

# --- 2. Define the Test Set ---
# Fields of each case:
#   name              - label printed while running and used as the key in the results
#   input             - the question sent to the agent
#   expected_tool     - name of the tool the agent should call (None = no tool call expected)
#   expected_keywords - strings the judge is told the final answer should contain
# The year in the first case is computed once, when this cell runs.
# NOTE(review): the judge prompt awards the keyword score only when ALL keywords
# are present; a short poem containing all four words of the last case seems
# unlikely — confirm that list is intended.
test_set = [
    {"name": "Time Inquiry", "input": "What time is it right now?", "expected_tool": "get_current_time", "expected_keywords": [str(datetime.now().year), ":"]},
    {"name": "RAG Inquiry", "input": "Explain self-attention in one sentence.", "expected_tool": "get_knowledge_from_library", "expected_keywords": ["self-attention", "sequence", "dependencies"]},
    {"name": "No Tool Inquiry", "input": "Write a short poem about AI.", "expected_tool": None, "expected_keywords": ["robot", "learn", "code", "mind"]}
]

# --- 3. The LLM-as-a-Judge Evaluator ---
# Separate chat-model client used only for grading; it is configured with the
# same model name and temperature as the agent's `llm`.
judge_llm = ChatOpenAI(model="gpt-4o", temperature=0)

def extract_json_from_string(text: str) -> Dict[str, Any]:
    """
    Extract a JSON object from a model reply.

    The reply is tried in three forms, in order: the contents of a Markdown
    code fence (with or without a ``json`` tag), the whole text, and the span
    from the first "{" to the last "}" (to tolerate prose around the object).

    Args:
        text: Raw reply text, e.g. the ``content`` of a chat-model response.

    Returns:
        The first candidate that parses to a JSON object, as a dict. An empty
        dict is returned (after printing an error) when the input is not a
        string, nothing parses, or the parsed value is not an object.
    """
    if not isinstance(text, str):
        print(f"Error: Could not decode JSON from the string: {text}")
        return {}

    candidates = []
    fenced = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, re.DOTALL | re.IGNORECASE)
    if fenced:
        candidates.append(fenced.group(1))
    candidates.append(text)
    first_brace, last_brace = text.find("{"), text.rfind("}")
    if first_brace != -1 and last_brace > first_brace:
        candidates.append(text[first_brace:last_brace + 1])

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        # Callers use .get() on the result, so only a JSON object is acceptable.
        if isinstance(parsed, dict):
            return parsed

    print(f"Error: Could not decode JSON from the string: {text}")
    return {}

async def evaluate_agent_run(test_case: Dict[str, Any]) -> Dict[str, Any]:
    """
    Runs a single test case against the agent and uses an LLM to judge the result.

    Args:
        test_case: One entry of the test set. The keys "name", "input",
            "expected_tool" and "expected_keywords" are read.

    Returns:
        A dict with exactly the keys "tool_score", "answer_score" and
        "keyword_score". A key the judge omitted, or a value that cannot be
        read as a number, is reported as 0.
    """
    print(f"\n--- Running Test Case: {test_case['name']} ---")
    input_question = test_case["input"]

    # Walk the agent's event stream and keep the two things the judge needs:
    # the names of the tools that were started and the text streamed by the
    # chat model.
    final_answer, tool_calls = "", []
    async for event in agent_executor.astream_events({"input": input_question}, version="v1"):
        event_kind = event["event"]
        if event_kind == "on_tool_start":
            tool_calls.append(event["name"])
        elif event_kind == "on_chat_model_stream":
            piece = event["data"]["chunk"].content
            # Only plain text is accumulated; any other content type is skipped
            # instead of raising on string concatenation.
            if isinstance(piece, str):
                final_answer += piece

    print(f"Final Answer: {final_answer}")
    print(f"Tools Called: {tool_calls}")

    # Grading prompt: the expectations from the test case, what the agent
    # actually did, and the three 0/1 criteria the judge must score.
    evaluation_prompt = f"""
    You are an expert evaluator for AI agents. Your task is to score an agent's performance
    on a given task based on the provided criteria.

    TASK:
    - User's Question: "{input_question}"
    - Expected Tool to be Used: {test_case['expected_tool']}
    - Expected Keywords in Final Answer: {test_case['expected_keywords']}

    AGENT'S PERFORMANCE:
    - Tools Actually Called: {tool_calls}
    - Final Answer Provided: "{final_answer}"

    EVALUATION CRITERIA:
    1.  **Tool Correctness (Score 0-1):** Score 1 if the agent called the single expected tool (or no tool if none was expected). Score 0 otherwise.
    2.  **Answer Relevance (Score 0-1):** Score 1 if the final answer is relevant and directly addresses the user's question. Score 0 otherwise.
    3.  **Keyword Presence (Score 0-1):** Score 1 if the final answer contains ALL of the expected keywords. Score 0 otherwise.

    Please provide your evaluation as a JSON object with three keys: "tool_score", "answer_score", "keyword_score".
    Do not include any other text or explanation. Wrap the JSON in a ```json code block.
    """

    judge_response = await judge_llm.ainvoke(evaluation_prompt)
    scores = extract_json_from_string(judge_response.content)
    # Missing keys default to 0 and every value is forced to a number so the
    # averages computed over these results cannot fail on a stray "1" or null.
    final_scores = {
        key: _coerce_score(scores.get(key, 0))
        for key in ("tool_score", "answer_score", "keyword_score")
    }
    print(f"Evaluation Scores: {final_scores}")
    return final_scores


def _coerce_score(value: Any) -> Any:
    """Return `value` as a number: numbers pass through, anything unreadable becomes 0."""
    if isinstance(value, bool):
        return int(value)
    if isinstance(value, (int, float)):
        return value
    try:
        return float(value)
    except (TypeError, ValueError):
        return 0

# --- 4. Run the Full Evaluation Suite ---
async def run_suite():
    """
    Evaluate every case in `test_set` sequentially and print the mean of each
    score across all cases.

    Results are keyed by the case's "name". Returns None; the summary is only
    printed. When there are no results a notice is printed instead.
    """
    results = {}
    for case in test_set:
        results[case["name"]] = await evaluate_agent_run(case)

    print("\n\n--- Evaluation Suite Complete ---")
    if len(results) == 0:
        print("No results to display.")
        return

    case_count = len(results)
    # All three means are computed before anything is printed.
    averages = {
        metric: sum(case_scores[metric] for case_scores in results.values()) / case_count
        for metric in ("tool_score", "answer_score", "keyword_score")
    }

    print(f"\nAverage Tool Correctness Score: {averages['tool_score']:.2f}")
    print(f"Average Answer Relevance Score: {averages['answer_score']:.2f}")
    print(f"Average Keyword Presence Score: {averages['keyword_score']:.2f}")

# =================================================================
# Cell 4: Execute the Suite
# Top-level `await` relies on the notebook kernel's support for awaiting at
# cell level; a plain Python script would need asyncio.run(run_suite()) instead.
# =================================================================
await run_suite()