In [6]:
import autogen
import dotenv
import os

# Run python-dotenv's loader before the environment lookups below.
dotenv.load_dotenv()

# Credentials and endpoint come from the environment; each lookup yields None
# when the variable is not set.
GOOG_API_KEY = os.environ.get("GOOGLE_API_KEY")
OAI_BASE = os.environ.get("OPENAI_API_BASE")
OAI_KEY = os.environ.get("OPENAI_API_KEY")

# Single-entry config list for the Gemini vision model.
config_list = [
    {"model": "gemini-pro-vision", "api_key": GOOG_API_KEY, "api_type": "google"},
]

# Single-entry config list for GPT-4 Vision, using the base URL read above.
config_list_4v = [
    {"model": "gpt-4-vision-preview", "api_key": OAI_KEY, "base_url": OAI_BASE},
]

In [16]:
# Load all the tools
from tools import TOOLS
def generate_llm_config(tool):
    """Build a function-schema dict describing ``tool``.

    Args:
        tool: Object exposing ``name``, ``description`` and ``args`` attributes.

    Returns:
        A dict with the keys ``name`` (the tool name lower-cased, with each
        space replaced by an underscore), ``description`` and ``parameters``.
        ``parameters`` is an object schema whose ``properties`` is ``tool.args``
        when that is not ``None`` (an empty dict otherwise) and whose
        ``required`` list is always empty.
    """
    schema_name = "_".join(tool.name.lower().split(" "))
    schema_description = tool.description
    # Fall back to an empty mapping when the tool declares no arguments.
    arg_properties = {} if tool.args is None else tool.args

    return {
        "name": schema_name,
        "description": schema_description,
        "parameters": {
            "type": "object",
            "properties": arg_properties,
            "required": [],
        },
    }

# Settings passed to the LLM-backed agent: the GPT-4 Vision config entries and
# a timeout value of 120.
llm_config = dict(config_list=config_list_4v, timeout=120)

# Map each tool's name to its `_run` callable.
# NOTE(review): keys here are the raw `tool.name`, whereas generate_llm_config
# lower-cases the name and replaces spaces with underscores — confirm the two
# agree for every tool in TOOLS.
tools_map = dict((t.name, t._run) for t in TOOLS)

In [17]:
from autogen.agentchat.contrib.multimodal_conversable_agent import MultimodalConversableAgent

# Proxy agent that speaks for the user. human_input_mode is "NEVER" and the
# consecutive auto-reply limit is 0.
user_proxy = autogen.UserProxyAgent(
    name="User_proxy",
    system_message="A human admin.",
    human_input_mode="NEVER",  # the other mode to try here is "ALWAYS"
    max_consecutive_auto_reply=0,
)

# Register the tool callables on the proxy, keyed as in tools_map.
user_proxy.register_function(function_map=tools_map)

# Multimodal agent configured with llm_config; limited to 10 consecutive
# auto-replies.
image_agent = MultimodalConversableAgent(
    "Vision Bot",
    llm_config=llm_config,
    max_consecutive_auto_reply=10,
)

In [18]:
# Start the conversation: the proxy sends the prompt below to the vision agent.
# The image is referenced inline in the message text via an `<img URL>` tag.
# The message is a plain string literal (no `f` prefix): it has no
# placeholders, and a plain literal keeps any future `{` / `}` in the prompt or
# URL from being parsed as interpolation.
user_proxy.initiate_chat(
    image_agent,
    message="""
    Describe the image.
<img https://ik.imagekit.io/sfwall/NY5_J2A4Si6Z6.png?updatedAt=1702987667392>
""",
)

User_proxy (to Vision Bot):


    Describe the image.
<image>


--------------------------------------------------------------------------------
>>>>>>>> USING AUTO REPLY...
Vision Bot (to User_proxy):

The image shows a city street with high-rise buildings lining one side. The foreground

--------------------------------------------------------------------------------
