# API Key

In [18]:
import os
from dotenv import load_dotenv

# Read variables from a local .env file (if any) into the process environment.
load_dotenv()
api_key = os.getenv("API_KEY")
if not api_key:
    # os.environ only accepts strings: assigning the None returned for a missing
    # variable would fail with an opaque TypeError, so fail early with a clear message.
    raise RuntimeError("API_KEY is not set; define it in the environment or in a .env file.")
# Re-export the key under the OPENAI_API_KEY name.
os.environ["OPENAI_API_KEY"] = api_key

# Execution Team

## Define Agents

In [None]:
from agent import ExecutionAgent

# One ExecutionAgent instance exposes every execution-team agent used in this notebook.
execution_agent = ExecutionAgent()

planner = execution_agent.planner
# executor = execution_agents.executor
executor = execution_agent.pipeline_executor # Trying out the pipeline executor
replanner = execution_agent.replanner
solver = execution_agent.solver

In [None]:
# response = planner.invoke({"user_input": [("user", "Please help me apply leave application.")]})
# for step in response.steps:
#     print(step)

In [None]:
import time

# Manual smoke test: send one hand-written step to the executor and time it.
# create_browser() runs first (presumably it starts the browser session the tool drives).
execution_agent.web_execution_tool.create_browser()

try:
    start_time = time.time()

    response = executor.invoke({"messages": [("user", "Step 1. function_name: 'navigate_with_url', parameters: '{\"url\":\"https://cis.ncu.edu.tw/iNCU/stdAffair/leaveRequest\"}'")]})
    print(response["messages"][-1].content)
    end_time = time.time()

    print(f"Execution time: {end_time - start_time} seconds")
finally:
    # The selenium controller's destructor is unreliable, so the containers have to be
    # cleaned explicitly at runtime -- in a finally block so a failing executor call
    # does not leave them behind.
    execution_agent.web_execution_tool.selenium_controller.clean_containers()

In [None]:
# Dump the whole executor result, not only the content of the last message.
print(response)

## Define Graph State

In [None]:
import operator
from typing import Annotated, List, Tuple, Any
from typing_extensions import TypedDict


class PlanExecute(TypedDict):
    """State schema passed between the nodes of the plan-and-execute graph."""

    # The user's request, as given in the initial inputs.
    input: str
    # Current list of plan steps; execute_step always runs plan[0].
    plan: List[str]
    # (task, result) tuples. operator.add is the annotated reducer, so the list a
    # node returns is presumably concatenated onto the stored one -- see LangGraph docs.
    past_steps: Annotated[List[Tuple], operator.add]
    # Final answer; should_end routes to the solver once this is non-empty.
    response: str
    # (agent name, payload) entries recorded by the planner, executor and replanner nodes.
    history: List[Tuple[str, Any]]

## Define Agent Node

In [None]:
async def plan_step(state: PlanExecute):
    """Ask the planner for a plan and record it in the history.

    Args:
        state: Current graph state; ``input`` and ``history`` are read.

    Returns:
        A partial state update holding the new ``plan`` and the extended ``history``.
    """
    # "user_input" corresponds to the {user_input} placeholder of the planner system prompt.
    plan = await planner.ainvoke({"user_input": [("user", state["input"])]})
    # Build a new list rather than appending in place, so the state object handed to
    # this node is never mutated; the graph only sees the returned update.
    history = [*state["history"], ("Planner", plan.steps)]

    return {
        "plan": plan.steps,
        "history": history,
    }

async def execute_step(state: PlanExecute):
    """Run the first step of the current plan with the executor agent.

    Args:
        state: Current graph state; ``plan`` and ``history`` are read.

    Returns:
        A partial state update with one ``(task, result)`` entry for ``past_steps``
        and the extended ``history``.
    """
    plan = state["plan"]
    plan_str = "\n".join(f"{i}. {step}" for i, step in enumerate(plan, start=1))
    # Only plan[0] is executed per call, which is why the prompt below always says step 1.
    task = plan[0]
    task_formatted = f"""For the following plan:
{plan_str}\n\nYou are tasked with executing step {1}, {task}."""
    # The react agent receives its input through the "messages" key.
    agent_response = await executor.ainvoke({"messages": [("user", task_formatted)]})
    result = agent_response["messages"][-1].content

    return {
        "past_steps": [(task, result)],
        # New list instead of an in-place append, so the incoming state is not mutated.
        "history": [*state["history"], ("Executor", (task, result))],
    }

async def replan_step(state: PlanExecute):
    """Let the replanner either finish with a response or produce an updated plan.

    Args:
        state: Current graph state; every key except ``history`` is forwarded
            to the replanner.

    Returns:
        A partial state update: ``response`` when the replanner's action is an
        ``execution_agent.Response``, otherwise the new ``plan``. Both variants
        carry the extended ``history``.
    """
    # Filter out the state fields the replanner does not need.
    temp_state = state.copy()
    temp_state.pop("history")

    output = await replanner.ainvoke(temp_state)
    action = output.action
    if isinstance(action, execution_agent.Response):
        return {
            "response": action.response,
            # New list instead of an in-place append, so the incoming state is not mutated.
            "history": [*state["history"], ("Replanner", action.response)],
        }

    return {
        "plan": action.steps,
        "history": [*state["history"], ("Replanner", action.steps)],
    }

async def solve_step(state: PlanExecute):
    """Print the recorded history and ask the solver for the final response.

    Returns the solver's message content as ``response`` together with the
    unchanged ``history``.
    """
    print("history:")
    recorded_history = state["history"]
    print(recorded_history)

    solver_input = {
        "user_input": state["input"],
        "planning_history": recorded_history,
    }
    answer = await solver.ainvoke(solver_input)

    return {"response": answer.content, "history": recorded_history}

def should_end(state: PlanExecute):
    """Name the node that follows the replanner.

    Returns ``"solver"`` once the state holds a non-empty ``response``;
    otherwise ``"executor"`` so the next plan step gets run.
    """
    has_final_response = bool(state.get("response"))
    return "solver" if has_final_response else "executor"

## Create Graph

In [None]:
from langgraph.graph import StateGraph, START, END

# Build the execution graph on top of the PlanExecute state schema.
execution_workflow = StateGraph(PlanExecute)

# Register one node per step function.
execution_workflow.add_node("planner", plan_step)
execution_workflow.add_node("executor", execute_step)
execution_workflow.add_node("replanner", replan_step)
execution_workflow.add_node("solver", solve_step)

# Fixed path: START -> planner -> executor -> replanner.
execution_workflow.add_edge(START, "planner")
execution_workflow.add_edge("planner", "executor")
execution_workflow.add_edge("executor", "replanner")
# After the replanner, should_end picks "executor" (keep going) or "solver" (finish).
execution_workflow.add_conditional_edges(
    "replanner",
    # Next, we pass in the function that will determine which node is called next.
    should_end,
    ["executor", "solver"],
)
execution_workflow.add_edge("solver", END)

execution_app = execution_workflow.compile() # This compiles it into a LangChain Runnable, meaning you can use it as you would any other runnable

In [None]:
from IPython.display import Image, display
from PIL import Image as PILImage
from io import BytesIO

# Render the compiled execution graph as PNG bytes via Mermaid.
graph_bytes = execution_app.get_graph(xray=True).draw_mermaid_png()

# Optional: save the rendered image to disk instead of only displaying it.
# output_file_path = "test.jpg"
# with BytesIO(graph_bytes) as byte_stream:
#     image = PILImage.open(byte_stream)
#     image.save(output_file_path, format="PNG")

# Show the image inline in the notebook.
display(Image(graph_bytes))

In [None]:
# import time
# import nest_asyncio
# nest_asyncio.apply()

# start_time = time.time()
# result = tool_dict["website_links_crawler"].invoke({"link": "https://pdc.adm.ncu.edu.tw/#&panel1-1"})
# # website_links_crawler("https://www.ncu.edu.tw/tw/")



# end_time = time.time()
# execution_time = end_time - start_time

# print(f"Execution time: {execution_time} seconds")

## Run App

In [None]:
import nest_asyncio
import time

import os

# Start every run from an empty log. The folder is created first so a fresh
# checkout without an Outputs/ directory does not fail here.
os.makedirs("Outputs", exist_ok=True)
with open("Outputs/execution_chat_log.txt", "w", encoding="utf-8") as f:
    f.write("")

def write_to_chat_log(content, path="Outputs/execution_chat_log.txt"):
    """Append ``content`` to the execution chat log.

    Args:
        content: Text to append; written as-is, no newline is added.
        path: Log file to append to. Defaults to the execution chat log.
    """
    # UTF-8 is set explicitly: logged text can contain non-ASCII characters,
    # which the platform default encoding may not be able to encode.
    with open(path, "a", encoding="utf-8") as f:
        f.write(content)

# Sample queries for this workflow:
# Who is the headmaster of National Central University in Taiwan?
# Summarize the content of the 111 Academic Affairs Regulations.
# Please help me gather information related to scholarship applications.
# Please help me fill out the leave application on the school website.
config = {"recursion_limit": 30}
inputs = {
    "input": "Please help me gather information related to scholarship applications.",
    "history": [], # Initialise the list that stores the history
}
write_to_chat_log(f"User Query:\n{inputs['input']}\n\n")

# tool_dict["create_browser"].invoke(input=None)

nest_asyncio.apply()
start_time = time.time()
# Each streamed event is iterated as {node name: state update returned by that node}.
# NOTE: top-level `async for` only works inside IPython/Jupyter.
async for event in execution_app.astream(inputs, config=config):
    for agent, state in event.items():
        if agent != "__end__":
            write_to_chat_log(f"{agent}:\n")

            # The history is left out of the log; every other key of the update is written.
            for key, value in state.items():
                if (key != "history"):
                    write_to_chat_log(f"{key}: {value}\n")
            
            write_to_chat_log("\n")
end_time = time.time()

execution_time = end_time - start_time
print(f"Execution time: {execution_time} seconds")
# del tools.selenium_controller
# del tools.selenium_controller

# Evaluation Team

## Define Agents

### Critic

In [None]:
from utils.factory import AgentFactory

# * Generates the evaluation criteria from the user input and the formulated plan
critic = AgentFactory.create_react_agent_with_yaml("Critic")

In [None]:
# response = critic.invoke({"messages": [("user", "Please evaluate the performance of execution team.")]})

# # 暫存評估標準，之後儲存到state內交給evaluator
# with open("Docs/evaluation_rubric.txt", "w") as f:
#     f.write(f"{response['messages'][-1].content}\n\n")

In [None]:
# print(response["messages"][-1].content)

In [None]:
# 查看調用工具情形
# for message in response["messages"]:
#     print(message)
#     if not message.content:
#         for item in message:
#             print(item)

### Evaluator

In [36]:
from tool import EvaluationTool
from utils.factory import AgentFactory
from typing_extensions import List, TypedDict
from pydantic import BaseModel, Field

# Structured-output schema handed to the Evaluator agent below; the docstring and the
# Field description are left untouched because they may be forwarded to the model.
class EvaluatorResponse(BaseModel):
    """Return a list of score for each step in order."""

    scores: List[float] = Field(
        description="A list of scores for each step in order.",
    )

evaluation_tool = EvaluationTool()
    
# * Evaluates how well the execution team performed its task, based on the evaluation framework it is given
# The agent is created with the evaluation tools and EvaluatorResponse as its response schema.
evaluator = AgentFactory.create_react_agent_with_yaml("Evaluator", evaluation_tool.tool_dict, EvaluatorResponse)

Evaluator_llm_config:
model: gpt-4.1-mini
temperature: 0

Evaluator_prompt: 
You are an Evaluator Agent in a multi-agent system tasked with assessing the performance of the Execution Team based on a predefined evaluation rubric.

You will be given a structured evaluation rubric created by the Critic Agent, with detailed expectations for each step of the original plan, including fallback evaluation rules for steps introduced later through replanning.

You should use read_execution_chat_log to get the chat log that captures the actual actions and responses produced by the Execution Team while executing a multi-step plan.

Your goal is to evaluate how well each executed step aligns with the rubric. This includes:
  - Identifying each step from the execution log and linking it to a rubric entry (original or fallback)
  - For each original plan step, refer to its corresponding **Evaluation Criteria** defined in the rubric to guide your scoring and analysis
  - Consider any **additional step

In [37]:
evaluation_rubric = """
Evaluation Rubric for Plan Steps Addressing Scholarship Application Information Gathering

Step 1: Identify the most relevant National Central University website or webpage related to scholarships.
- Step Objective: Locate the primary official webpage that contains scholarship information for National Central University.
- Linked Requirements: E1 - URL of scholarship information page.
- Expected Input/Output: Input - User query; Output - Accurate URL of the scholarship information page.
- Failure Indicators: Incorrect or unrelated URL; failure to identify an official or relevant page.
- Fallback Evaluation Rules: If a similar step identifies a different but still official and relevant scholarship page, evaluate based on URL relevance and authority.
- Evaluation Criteria:
  - Information Quality: URL must be official and directly related to scholarships.
  - Alignment with Requirements: Must correspond to E1 requirement.
  - Step Efficiency: Should be a single, clear URL without unnecessary multiple links.
  - Clarity of Expression: URL and page description should be clearly stated.

Step 2: Access and read the content of the identified scholarship information page.
- Step Objective: Retrieve and understand the content of the identified scholarship webpage.
- Linked Requirements: E2 - Content of the scholarship page.
- Expected Input/Output: Input - URL from Step 1; Output - Textual content of the page.
- Failure Indicators: Content is incomplete, inaccessible, or irrelevant.
- Fallback Evaluation Rules: If content is partially retrieved or from a slightly different page, evaluate completeness and relevance.
- Evaluation Criteria:
  - Information Quality: Content must be accurate and comprehensive.
  - Alignment with Requirements: Must fulfill E2 content retrieval.
  - Step Efficiency: Content retrieval should be direct without unnecessary detours.
  - Clarity of Expression: Content should be clearly presented or summarized.

Step 3: Evaluate whether the content provides comprehensive details about scholarship applications, including eligibility, deadlines, and application procedures.
- Step Objective: Assess sufficiency of the retrieved content regarding key scholarship application details.
- Linked Requirements: E3 - Sufficiency judgment.
- Expected Input/Output: Input - Content from Step 2; Output - Judgment on content sufficiency.
- Failure Indicators: Incorrect or vague sufficiency assessment.
- Fallback Evaluation Rules: Similar evaluations should consider completeness of key details.
- Evaluation Criteria:
  - Information Quality: Judgment must be based on thorough content analysis.
  - Alignment with Requirements: Must address eligibility, deadlines, and procedures.
  - Step Efficiency: Evaluation should be concise and focused.
  - Clarity of Expression: Judgment should be clearly stated with rationale.

Step 4: If the content is insufficient, find and list hyperlinks on the page that may lead to more detailed scholarship application information.
- Step Objective: Identify additional resources or links for more detailed scholarship information.
- Linked Requirements: E4 - List of relevant links.
- Expected Input/Output: Input - Insufficient content judgment; Output - List of relevant hyperlinks.
- Failure Indicators: Irrelevant or no links provided despite insufficiency.
- Fallback Evaluation Rules: Alternative link lists should be relevant and lead to detailed info.
- Evaluation Criteria:
  - Information Quality: Links must be relevant and credible.
  - Alignment with Requirements: Must correspond to E4.
  - Step Efficiency: List should be concise and focused.
  - Clarity of Expression: Links should be clearly described.

Step 5: Follow the most relevant link from Step 4 and read the new page content for additional scholarship application details.
- Step Objective: Retrieve additional scholarship information from the selected link.
- Linked Requirements: E5 - New page content.
- Expected Input/Output: Input - Selected link; Output - Content of the new page.
- Failure Indicators: Content is irrelevant, incomplete, or inaccessible.
- Fallback Evaluation Rules: If alternative links are followed, evaluate content relevance and completeness.
- Evaluation Criteria:
  - Information Quality: Content must add meaningful details.
  - Alignment with Requirements: Must fulfill E5.
  - Step Efficiency: Retrieval should be direct and purposeful.
  - Clarity of Expression: Content should be clearly presented.

Step 6: Extract and compile the final information about scholarship applications, including eligibility criteria, required documents, deadlines, and application process.
- Step Objective: Produce a comprehensive summary of scholarship application information.
- Linked Requirements: E6 - Final compiled information.
- Expected Input/Output: Input - Content from Steps 2 and/or 5; Output - Clear, complete summary.
- Failure Indicators: Missing key details, unclear or disorganized summary.
- Fallback Evaluation Rules: Alternative compilations should cover all key aspects comprehensively.
- Evaluation Criteria:
  - Information Quality: Summary must be accurate, complete, and detailed.
  - Alignment with Requirements: Must address all required scholarship application elements.
  - Step Efficiency: Summary should be concise yet comprehensive.
  - Clarity of Expression: Information should be well-structured and easy to understand.

This rubric supports detailed evaluation of each plan step's execution quality and alignment with the user's task of gathering scholarship application information.
"""

In [38]:
# Alternative: load the rubric from disk instead of using the inline string.
# with open('Docs/evaluation_rubric.txt', 'r') as file:
#     evaluation_rubric = file.read()

# The rubric text is sent to the evaluator as the user message.
response = evaluator.invoke({"messages": [("user", evaluation_rubric)]})

# Temporarily store the evaluation result; later it will be kept in the state and handed to the analyzer
# with open("evaluation_result.txt", "w") as f:
#     f.write(f"{response['messages'][-1].content}\n\n")

In [39]:
# Show the content of the evaluator's last message.
print(response["messages"][-1].content)

- Step ID or Summary: Identify the most relevant National Central University scholarship webpage
- Rubric Reference: Step 1
- Execution Summary: The execution team identified two URLs initially, one at admission.ncu.edu.tw which was relevant but later found inaccessible. Eventually, they found a detailed and official scholarship page at http://military.ncu.edu.tw/scholarship.php, which contained comprehensive scholarship information.
- Score: 1 (Fully Met)
- Justification: The final URL identified is an official National Central University page dedicated to scholarships, fulfilling the requirement for an official and relevant URL. The process was iterative but resulted in a clear, single URL with relevant content.
- Improvement Suggestions: The team could improve efficiency by verifying link accessibility earlier to avoid following inaccessible URLs.

- Step ID or Summary: Access and read the content of the identified scholarship information page
- Rubric Reference: Step 2
- Execution 

In [42]:
# Inspect the raw result, then the structured scores and their container/element types.
print(response)
print(response["structured_response"].scores)
print(type(response["structured_response"].scores))
print(type(response["structured_response"].scores[0]))

{'messages': [HumanMessage(content="\nEvaluation Rubric for Plan Steps Addressing Scholarship Application Information Gathering\n\nStep 1: Identify the most relevant National Central University website or webpage related to scholarships.\n- Step Objective: Locate the primary official webpage that contains scholarship information for National Central University.\n- Linked Requirements: E1 - URL of scholarship information page.\n- Expected Input/Output: Input - User query; Output - Accurate URL of the scholarship information page.\n- Failure Indicators: Incorrect or unrelated URL; failure to identify an official or relevant page.\n- Fallback Evaluation Rules: If a similar step identifies a different but still official and relevant scholarship page, evaluate based on URL relevance and authority.\n- Evaluation Criteria:\n  - Information Quality: URL must be official and directly related to scholarships.\n  - Alignment with Requirements: Must correspond to E1 requirement.\n  - Step Efficien

In [35]:
def compute_progress_rate(x: List[float]) -> float:
    """Return the best cumulative progress reached over the scored steps.

    For every prefix ``x[0..i]`` the progress is ``sum(x[0..i]) / len(x)``.
    The result is the largest of those values and is never below ``0.0``.

    Args:
        x: Per-step scores, in order.

    Returns:
        The maximum prefix progress, or ``0.0`` when ``x`` is empty.
    """
    from itertools import accumulate

    if not x:
        return 0.0

    # One running sum replaces the quadratic table of zero-padded prefixes that
    # was built before; the padding zeros never changed any of the sums.
    best_prefix_sum = max(accumulate(x))
    # Dividing after taking the maximum is equivalent because len(x) is a positive
    # constant. The 0.0 floor mirrors the starting value of the former running maximum.
    return max(0.0, best_prefix_sum / len(x))

# Example scores: the prefix sums peak at 2.5, so the printed rate is 2.5 / 6.
x = [1, 1, 0.5, 0, 0, 0]

# NOTE(review): "reate" looks like a typo for "rate"; the name is kept because other
# cells may reference it.
subgoal_progress_reate = compute_progress_rate(x)
print(subgoal_progress_reate)

# x_list = []
# for i in range(len(x)):
#     temp_list = []
#     for j in range(len(x)):
#         if j <= i:
#             temp_list.append(x[j])
#         else:
#             temp_list.append(0)
#     x_list.append(temp_list)
# print(x_list)

0.4166666666666667


In [None]:
# # 查看調用工具情形
# for message in response["messages"]:
#     print(message)
#     if not message.content:
#         for item in message:
#             print(item)

## Define Graph State

In [None]:
from typing_extensions import TypedDict

class Evaluation(TypedDict):
    """State schema shared by the nodes of the evaluation graph."""

    # Request text that critic_step sends to the critic.
    input: str
    # Rubric text produced by critic_step and sent to the evaluator.
    rubric: str
    # Content of the evaluator's last message, set by evaluator_step.
    result: str
    # NOTE(review): no node visible in this notebook writes this key -- confirm intended use.
    judgment: str

## Define Agent Node

In [None]:
async def critic_step(state: Evaluation):
    """Ask the critic for an evaluation rubric.

    Args:
        state: Current graph state; ``input`` is sent as the user message.

    Returns:
        A partial state update whose ``rubric`` is the content of the critic's
        last message.
    """
    response = await critic.ainvoke({"messages": [("user", state["input"])]})
    # Only the returned update is needed to store the rubric in the state, so the
    # incoming state object is no longer mutated as well.
    return {
        "rubric": response["messages"][-1].content,
    }

async def evaluator_step(state: Evaluation):
    """Run the evaluator on the rubric produced by the critic.

    Args:
        state: Current graph state; ``rubric`` is sent as the user message.

    Returns:
        A partial state update whose ``result`` is the content of the evaluator's
        last message.
    """
    response = await evaluator.ainvoke({"messages": [("user", state["rubric"])]})
    # Only the returned update is needed to store the result in the state, so the
    # incoming state object is no longer mutated as well.
    return {
        "result": response["messages"][-1].content,
    }

## Create Graph

In [None]:
from langgraph.graph import StateGraph, START, END
from IPython.display import Image, display

# Build the evaluation graph on top of the Evaluation state schema.
evaluation_workflow = StateGraph(Evaluation)

evaluation_workflow.add_node("critic", critic_step)
evaluation_workflow.add_node("evaluator", evaluator_step)

# Linear pipeline: START -> critic -> evaluator -> END.
evaluation_workflow.add_edge(START, "critic")
evaluation_workflow.add_edge("critic", "evaluator")
evaluation_workflow.add_edge("evaluator", END)

evaluation_app = evaluation_workflow.compile() # This compiles it into a LangChain Runnable, meaning you can use it as you would any other runnable

In [None]:
# Render the compiled evaluation graph via Mermaid and show it inline.
display(Image(evaluation_app.get_graph(xray=True).draw_mermaid_png()))

## Run App

In [None]:
import time

import os

# Start every run from an empty log. The folder is created first so a fresh
# checkout without an Outputs/ directory does not fail here.
os.makedirs("Outputs", exist_ok=True)
with open("Outputs/evaluation_chat_log.txt", "w", encoding="utf-8") as f:
    f.write("")

def write_to_chat_log(content, path="Outputs/evaluation_chat_log.txt"):
    """Append ``content`` to the evaluation chat log.

    Args:
        content: Text to append; written as-is, no newline is added.
        path: Log file to append to. Defaults to the evaluation chat log.
    """
    # UTF-8 is set explicitly: logged text can contain non-ASCII characters,
    # which the platform default encoding may not be able to encode.
    with open(path, "a", encoding="utf-8") as f:
        f.write(content)

# Query used for this run:
# Please evaluate the performance of execution team.
config = {"recursion_limit": 50}
inputs = {
    "input": "Please evaluate the performance of execution team.",
}
write_to_chat_log(f"Evaluation Query:\n{inputs['input']}\n\n")

start_time = time.time()
# Each streamed event is iterated as {node name: state update returned by that node}.
# NOTE: top-level `async for` only works inside IPython/Jupyter.
async for event in evaluation_app.astream(inputs, config=config):
    for agent, state in event.items():
        if agent != "__end__":
            write_to_chat_log(f"{agent}:\n")

            # Every key of the update except "history" is written to the log.
            for key, value in state.items():
                if (key != "history"):
                    write_to_chat_log(f"{key}: {value}\n")
            
            write_to_chat_log("\n")
end_time = time.time()

evaluation_time = end_time - start_time
print(f"Evaluation time: {evaluation_time:.2f} seconds")

# Evolution Team

## Define Agents

### Analyzer

In [None]:
# from langchain_openai import ChatOpenAI
# from langchain_core.prompts import ChatPromptTemplate

# analyzer_llm_config = agents_parameter["Analyzer"]["llm_config"]
# analyzer_system_prompt = agents_parameter["Analyzer"]["prompt"]

# analyzer_llm = ChatOpenAI(model=analyzer_llm_config["model"], temperature=analyzer_llm_config["temperature"])
# analyzer_prompt = ChatPromptTemplate.from_template(analyzer_system_prompt)

# analyzer = analyzer_prompt | analyzer_llm

# print("analyzer_llm_config:")
# for key, value in analyzer_llm_config.items():
#     print(f"{key}: {value}")
# print("analyzer_system_prompt: \n" + analyzer_system_prompt)

In [None]:
from utils.factory import AgentFactory

# Create the Analyzer agent through the factory, using the "Analyzer" entry name.
analyzer = AgentFactory.create_react_agent_with_yaml("Analyzer")

In [None]:
# response = analyzer.invoke({"messages": [("user", "Please analyze the evaluation result of the execution team.")]})

In [None]:
# print(response["messages"][-1].content)

### Prompt Optimizer

In [None]:
from utils.factory import AgentFactory
from pydantic import BaseModel, Field

# Schema for the optimizer's structured answer: one field with the rewritten system prompt.
class Optimization_Response(BaseModel):
    """Optimization response to user."""
    
    updated_agent_system_prompt: str = Field(
        description="The complete updated system prompt for the agent that is most responsible for the identified issue."
    )

# NOTE(review): Optimization_Response is not passed to the factory here, whereas the
# Evaluator is created with its response schema as the third argument -- confirm
# whether that is intentional.
prompt_optimizer = AgentFactory.create_react_agent_with_yaml("Prompt Optimizer")

In [None]:
analysis = """
analysis: All steps in the evaluation report were scored as Fully Met. There are some improvement suggestions mentioned, but none indicate clear underperformance or partial fulfillment of the task. Therefore, I will analyze the improvement suggestions to see if any step shows clear room for improvement that warrants responsibility attribution.

Step 1: URL identification was appropriate; suggestion is to justify URL choice more clearly. This is a Planner-related improvement.

Step 2: Content extraction was relevant; suggestion is to summarize content relevance explicitly. This relates to Executor's communication of results.

Step 3: Sufficiency assessment was accurate; suggestion is to state criteria explicitly. This is a Planner responsibility to define assessment criteria.

Step 4: Relevant links identified; suggestion to avoid non-functional links. This is an Executor detail in link selection.

Step 5: Redirecting search was efficient; suggestion to document rationale earlier. This relates to Replanner's decision-making transparency.

Step 6: Final extraction accurate; suggestion to include direct citation. This is Executor's presentation of results.

Additional replanning steps: Effective replanning; suggestion to document decision-making more explicitly. This is Replanner responsibility.

Summary of improvement suggestions:
- Planner: Justify URL choice, state sufficiency criteria explicitly
- Executor: Summarize content relevance, avoid non-functional links, include citations
- Replanner: Document replanning decisions more explicitly

None of these suggestions indicate failure or partial fulfillment, only room for clearer communication and documentation.

Hence, no step shows clear underperformance. The overall task outcome was successful with all steps fully met.

Final judgment:
- No agent caused underperformance.
- Minor improvements are distributed among Planner, Executor, and Replanner.
- Since the plan was solid and execution was correct, and replanning was effective, the overall responsibility is balanced.
- If forced to select the primary responsible agent for minor improvements, the Planner could be highlighted for improving clarity in plan justification and assessment criteria.

---

**Primary Responsible Agent**: Planner  
**Justification for Final Attribution**: The Planner could improve by explicitly justifying URL choices and clearly stating sufficiency criteria, which would enhance clarity and reduce ambiguity in the execution process. These foundational improvements would benefit the entire workflow.  
**Summary of Issues**: Minor suggestions for clearer documentation and communication in plan justification, content relevance assessment, and replanning rationale; no failures or partial completions."""
# response = prompt_optimizer.invoke({"messages": [("user", f"Analysis: \n{analysis}")]})

In [None]:
# print(response["messages"][-1].content)

In [None]:
# print(response)

## Define Graph State

In [None]:
from typing_extensions import TypedDict

class Evolution(TypedDict):
    """State schema shared by the nodes of the evolution graph."""

    # Request text that analyze_step sends to the analyzer.
    input: str
    # Content of the analyzer's last message; input of prompt_optimize_step.
    analysis: str
    # Content of the prompt optimizer's last message.
    result: str
    # updated_agent_system_prompt: str

## Define Agent Node

In [None]:
async def analyze_step(state: Evolution):
    """Send the state's ``input`` to the analyzer and keep its final message.

    Returns a partial state update whose ``analysis`` is the content of the
    analyzer's last message.
    """
    request = {"messages": [("user", state["input"])]}
    analyzer_output = await analyzer.ainvoke(request)
    final_message = analyzer_output["messages"][-1]
    # Returning the key is what stores the analysis in the state.
    return {"analysis": final_message.content}

async def prompt_optimize_step(state: Evolution):
    """Send the stored ``analysis`` to the prompt optimizer and keep its final message.

    Returns a partial state update whose ``result`` is the content of the
    optimizer's last message.
    """
    request = {"messages": [("user", state["analysis"])]}
    optimizer_output = await prompt_optimizer.ainvoke(request)
    final_message = optimizer_output["messages"][-1]
    # Only the final reply is stored; the structured updated_agent_system_prompt
    # field is not written to the state.
    return {"result": final_message.content}

## Create Graph

In [None]:
from langgraph.graph import StateGraph, START, END
from IPython.display import Image, display

# Build the evolution graph on top of the Evolution state schema.
evolution_workflow = StateGraph(Evolution)

evolution_workflow.add_node("analyzer", analyze_step)
evolution_workflow.add_node("prompt_optimizer", prompt_optimize_step)

# Linear pipeline: START -> analyzer -> prompt_optimizer -> END.
evolution_workflow.add_edge(START, "analyzer")
evolution_workflow.add_edge("analyzer", "prompt_optimizer")
evolution_workflow.add_edge("prompt_optimizer", END)

evolution_app = evolution_workflow.compile() # This compiles it into a LangChain Runnable, meaning you can use it as you would any other runnable

In [None]:
# Render the compiled evolution graph via Mermaid and show it inline.
display(Image(evolution_app.get_graph(xray=True).draw_mermaid_png()))

## Run App

In [None]:
import time

import os

# Start every run from an empty log. The folder is created first so a fresh
# checkout without an Outputs/ directory does not fail here.
os.makedirs("Outputs", exist_ok=True)
with open("Outputs/evolution_chat_log.txt", "w", encoding="utf-8") as f:
    f.write("")

def write_to_chat_log(content, path="Outputs/evolution_chat_log.txt"):
    """Append ``content`` to the evolution chat log.

    Args:
        content: Text to append; written as-is, no newline is added.
        path: Log file to append to. Defaults to the evolution chat log.
    """
    # UTF-8 is set explicitly: logged text can contain non-ASCII characters,
    # which the platform default encoding may not be able to encode.
    with open(path, "a", encoding="utf-8") as f:
        f.write(content)

# Query used for this run:
# Please analyze the evaluation result of the execution team.
config = {"recursion_limit": 50}
inputs = {
    "input": "Please analyze the evaluation result of the execution team.",
}
write_to_chat_log(f"Evolution Query:\n{inputs['input']}\n\n")

start_time = time.time()
# Each streamed event is iterated as {node name: state update returned by that node}.
# NOTE: top-level `async for` only works inside IPython/Jupyter.
async for event in evolution_app.astream(inputs, config=config):
    for agent, state in event.items():
        if agent != "__end__":
            write_to_chat_log(f"{agent}:\n")

            # Every key of the update except "history" is written to the log.
            for key, value in state.items():
                if (key != "history"):
                    write_to_chat_log(f"{key}: {value}\n")
            
            write_to_chat_log("\n")
end_time = time.time()

evolution_time = end_time - start_time
print(f"Evolution time: {evolution_time:.2f} seconds")