In [None]:
! pip install "pyautogen>=0.2.18"

In [37]:
import os
from datetime import datetime
from typing import Callable, Dict, Literal, Optional, Union

from typing_extensions import Annotated

from autogen import (
    Agent,
    AssistantAgent,
    ConversableAgent,
    GroupChat,
    GroupChatManager,
    UserProxyAgent,
    config_list_from_json,
    register_function,
)
from autogen.agentchat.contrib import agent_builder
from autogen.cache import Cache
from autogen.coding import DockerCommandLineCodeExecutor, LocalCommandLineCodeExecutor

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "REPLACE_ME")

LLM_CONFIG = {
    "config_list": [
        {"model": "gpt-4", "api_key": OPENAI_API_KEY}
    ]
}

In [38]:
task = """
Create a synthetic dataset for training text-to-code models. 
The dataset should include various types of natural language descriptions and their corresponding code snippets.
The code should be in Python, and the dataset should cover a range of programming concepts and tasks. 

Each entry in the dataset should consist of the following fields:
ID: A unique identifier for each entry.
Natural Language Description: A detailed and clear description of the programming task or problem.
Code: The corresponding Python code that solves the problem described.
Complexity: In a scale from 1 to 5 with 1 being very easy to 5 being very hard
"""
print(task)


Create a synthetic dataset for training text-to-code models. 
The dataset should include various types of natural language descriptions and their corresponding code snippets.
The code should be in Python, and the dataset should cover a range of programming concepts and tasks. 

Each entry in the dataset should consist of the following fields:
ID: A unique identifier for each entry.
Natural Language Description: A detailed and clear description of the programming task or problem.
Code: The corresponding Python code that solves the problem described.
Complexity: In a scale from 1 to 5 with 1 being very easy to 5 being very hard



## Approach 1. Two-agent chat with function call for task decomposition

In [42]:
# Create planner agent.
planner = AssistantAgent(
    name="planner",
    llm_config=LLM_CONFIG,
    system_message="You are a helpful AI assistant. You suggest a feasible plan "
    "for finishing a complex task of high quality data generation by decomposing it into 5-6 sub-tasks. "
    "If the plan is not good, detailed a clear, think about it again step-by-step and suggest a better plan. "
    
)

# Create a planner user agent used to interact with the planner.
planner_user = UserProxyAgent(
    name="planner_user",
    human_input_mode="NEVER",
    code_execution_config=False,
    llm_config=LLM_CONFIG
)

# The function for asking the planner.
def task_planner(question: Annotated[str, "Question to ask the planner."]) -> str:
    with Cache.disk(cache_seed=4) as cache:
        planner_user.initiate_chat(planner, message=question, max_turns=1, cache=cache)
    # return the last message received from the planner
    return planner_user.last_message()["content"]

# Create assistant agent.
assistant = AssistantAgent(
    name="assistant",
    system_message="You are a helpful AI assistant with experience in generating step-by-step plans for high quality synthetic datasets"
    "You can use the task planner to decompose the ask in request into sub-tasks."
    "Make sure your follow through the sub-tasks."
    "Give the user a final solution at the end."
    "Return TERMINATE only if the sub-tasks are completed.",
    llm_config=LLM_CONFIG
)

# Setting up code executor.
os.makedirs("planning", exist_ok=True)
# Use DockerCommandLineCodeExecutor to run code in a docker container.
# code_executor = DockerCommandLineCodeExecutor(work_dir="planning")
code_executor = LocalCommandLineCodeExecutor(work_dir="planning")

# Create user proxy agent used to interact with the assistant.
user_proxy = UserProxyAgent(
    name="user_proxy",
    human_input_mode="NEVER",
    llm_config=LLM_CONFIG,
    is_termination_msg=lambda x: "content" in x
    and x["content"] is not None
    and x["content"].rstrip().endswith("TERMINATE"),
    code_execution_config={"executor": code_executor},
)

# Register the function to the agent pair.
register_function(
    task_planner,
    caller=assistant,
    executor=user_proxy,
    name="task_planner",
    description="A task planner than can help you with decomposing a complex task into sub-tasks.",
)

# Use Cache.disk to cache LLM responses. Change cache_seed for different responses.
with Cache.disk(cache_seed=1) as cache:
    # the assistant receives a message from the user, which contains the task description
    user_proxy.initiate_chat(
        assistant,
        message=task,
        cache=cache,
    )

[33muser_proxy[0m (to assistant):


Create a synthetic dataset for training text-to-code models. 
The dataset should include various types of natural language descriptions and their corresponding code snippets.
The code should be in Python, and the dataset should cover a range of programming concepts and tasks. 

Each entry in the dataset should consist of the following fields:
ID: A unique identifier for each entry.
Natural Language Description: A detailed and clear description of the programming task or problem.
Code: The corresponding Python code that solves the problem described.
Complexity: In a scale from 1 to 5 with 1 being very easy to 5 being very hard


--------------------------------------------------------------------------------
[33massistant[0m (to user_proxy):

[32m***** Suggested tool call (call_gRBmkbbtJe7gThPQULJ5xqfd): task_planner *****[0m
Arguments: 
{
  "question": "How to create a synthetic dataset for training text-to-code models?"
}
[32m****************

# Approach 2. Group Chat

In [41]:
user_proxy = UserProxyAgent(
    name="Admin",
    system_message="A human admin. Give the task, and send instructions to writer to refine the request.",
    code_execution_config=False,
    human_input_mode="NEVER"
)

planner = AssistantAgent(
    name="Planner",
    system_message="""Planner. Given a task, please determine what information is needed to complete the task.
""",
    llm_config={"config_list": config_list, "cache_seed": None},
)

engineer = AssistantAgent(
    name="Data Engineer",
    llm_config={"config_list": config_list, "cache_seed": None},
    system_message="""Engineer. You design a step-by-step plan for a diverse dataset requested. If your plan does not loook good, analyze the problem, revisit your assumption, collect additional info you need, and think of a different approach to try.
""",
)

writer = AssistantAgent(
    name="Writer",
    llm_config={"config_list": config_list, "cache_seed": None},
    system_message="""Writer. Please write a detail step-by-step plan as designed by the data engineer. You will write it for a task based on previous chat history. Don't write any code.""",
)

groupchat = GroupChat(
    agents=[user_proxy, engineer, writer, planner],
    messages=[],
    max_round=20,
    speaker_selection_method="auto",
)
manager = GroupChatManager(groupchat=groupchat, llm_config={"config_list": config_list, "cache_seed": None})

# Use Cache.disk to cache LLM responses. Change cache_seed for different responses.
with Cache.disk(cache_seed=41) as cache:
    chat_history = user_proxy.initiate_chat(
        manager,
        message=task,
        cache=cache,
    )

[33mAdmin[0m (to chat_manager):


Create a synthetic dataset for training text-to-code models. 
The dataset should include various types of natural language descriptions and their corresponding code snippets.
The code should be in Python, and the dataset should cover a range of programming concepts and tasks. 

Each entry in the dataset should consist of the following fields:
ID: A unique identifier for each entry.
Natural Language Description: A detailed and clear description of the programming task or problem.
Code: The corresponding Python code that solves the problem described.
Complexity: In a scale from 1 to 5 with 1 being very easy to 5 being very hard


--------------------------------------------------------------------------------
[32m
Next speaker: Data Engineer
[0m
[33mData Engineer[0m (to chat_manager):

Creating a synthetic dataset for training text-to-code models involves careful planning to ensure the dataset is diverse, balanced, and of high quality. Here’s a ste

BadRequestError: Error code: 400 - {'error': {'message': "Invalid 'messages[2].name': string does not match pattern. Expected a string that matches the pattern '^[a-zA-Z0-9_-]+$'.", 'type': 'invalid_request_error', 'param': 'messages[2].name', 'code': 'invalid_value'}}

# Approach 3. Auto Build

In [None]:
AUTOBUILD_SYSTEM_MESSAGE = """You are a manager of a group of advanced experts, your primary objective is to delegate the resolution of tasks to other experts through structured dialogue and derive conclusive insights from their conversation summarization.
When a task is assigned, it's crucial to assess its constraints and conditions for completion. If feasible, the task should be divided into smaller, logically consistent subtasks. Following this division, you have the option to address these subtasks by forming a team of agents using the "autobuild" tool.
Upon the completion of all tasks and verifications, you should conclude the operation and reply "TERMINATE".
"""

user_proxy = UserProxyAgent(
    name="user_proxy",
    human_input_mode="NEVER",
    code_execution_config=False,
)

autobuild_assistant = AssistantAgent(
    name="Autobuild Assistant",
    llm_config={"config_list": config_list, "cache_seed": None},
)


def autobuild_reply(recipient, messages, sender, config):
    last_msg = messages[-1]["content"]
    builder = agent_builder.AgentBuilder(
        # config_file_or_env="/Users/ekzhu/autogen/OAI_CONFIG_LIST",
        builder_model="gpt-4-turbo",
        agent_model="gpt-4-turbo",
    )
    agent_list, agent_configs = builder.build(
        last_msg, default_llm_config={"config_list": config_list, "cache_seed": None}
    )
    # start nested chat
    nested_group_chat = GroupChat(
        agents=agent_list,
        messages=[],
    )
    manager = GroupChatManager(groupchat=nested_group_chat, llm_config={"config_list": config_list, "cache_seed": None})
    chat_res = agent_list[0].initiate_chat(
        manager, message=agent_configs.get("building_task", last_msg), summary_method="reflection_with_llm"
    )
    return True, chat_res.summary


autobuild_assistant.register_reply([Agent, None], autobuild_reply)

with Cache.disk(cache_seed=41) as cache:
    user_proxy.initiate_chat(autobuild_assistant, message=task, max_turns=1)

[33muser_proxy[0m (to Autobuild Assistant):


Create a synthetic dataset for training text-to-code models. 
The dataset should include various types of natural language descriptions and their corresponding code snippets.
The code should be in Python, and the dataset should cover a range of programming concepts and tasks. 

Each entry in the dataset should consist of the following fields:
ID: A unique identifier for each entry.
Natural Language Description: A detailed and clear description of the programming task or problem.
Code: The corresponding Python code that solves the problem described.
Complexity: In a scale from 1 to 5 with 1 being very easy to 5 being very hard


--------------------------------------------------------------------------------


JSONDecodeError: Expecting property name enclosed in double quotes: line 1 column 3 (char 2)