#🧪 Practical: Create an Agent that Accepts Text + Image + Tools

  📥 Image upload

  📝 Text input

  🛠️ Tools: Image Captioning + Question Answering

#🎯 Objectives
By the end of this practical, you will:

Upload an image and a text-based question together

Build an agent that:

Captions the image

Uses caption + question for question answering

Use LangGraph to orchestrate tools

Use Gemini 1.5 Flash (Free-tier) for all LLM work

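🧩 Tools Required

LangChain, langchain-google-genai, LangGraph, google-generativeai, and Pillow (all installed by the pip command in the next cell)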



In [1]:
!pip install langchain langchain-google-genai langgraph google-generativeai pillow



#🗂️ Folder Structure (Optional)

multimodal_agent/

├── app.py             # Main practical

├── sample_image.jpg   # Try your own image (the file name must match image_path in Step 6)


#✅ Step-by-Step Practical (app.py)

In [2]:
import base64
import getpass
import mimetypes
import os
from typing import TypedDict, Annotated, Union

from PIL import Image
from langchain_google_genai import ChatGoogleGenerativeAI
from langgraph.graph import StateGraph, END

# Provide your free-tier Gemini API key through the GOOGLE_API_KEY environment
# variable. It is prompted for (without echo) only when the variable is unset.
# Never hard-code a real key in a notebook or source file: any key that has
# been committed or shared should be revoked and replaced.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Gemini API key: ")


#🧠 Step 1: Define Shared State for Multimodal Agent

In [3]:
# Shared state handed from node to node in the graph. Declared with the
# functional TypedDict syntax: every field is a string carrying a short
# human-readable description as Annotated metadata.
MultiModalState = TypedDict(
    "MultiModalState",
    {
        "image_path": Annotated[str, "Path to input image"],
        "user_question": Annotated[str, "User's text-based question"],
        "image_caption": Annotated[str, "Caption generated from the image"],
        "answer": Annotated[str, "Answer to user's question"],
    },
)


#🧠 Step 2: Load Gemini 1.5 Flash (with Image Support)

In [4]:
# One shared Gemini chat model, reused by both the captioning and QA nodes.
gemini_options = {
    "model": "gemini-1.5-flash",
    "temperature": 0.4,
    # NOTE(review): by its name this flag folds system messages into human
    # turns; confirm against the langchain-google-genai documentation.
    "convert_system_message_to_human": True,
}
llm = ChatGoogleGenerativeAI(**gemini_options)


#📷 Step 3: Define Image Captioning Node

In [5]:
def image_caption_node(state: MultiModalState) -> MultiModalState:
    """Generate a one-sentence caption for the image referenced by the state.

    Reads the file at ``state['image_path']``, embeds it as a base64 data URL
    in a multimodal user message, and asks the shared ``llm`` to describe it.
    The caption is also printed for visibility while the graph runs.

    Args:
        state: Current graph state; ``image_path`` must name a readable
            image file.

    Returns:
        A copy of ``state`` with ``image_caption`` set to the model's reply.

    Raises:
        OSError: If the image file cannot be opened or read.
    """
    image_path = state["image_path"]

    # Derive the MIME type from the file extension so that .jpeg, upper-case
    # extensions and other image formats are labelled correctly. Anything that
    # cannot be identified as an image keeps the previous default of PNG.
    guessed_type, _ = mimetypes.guess_type(image_path)
    if guessed_type and guessed_type.startswith("image/"):
        mimetype = guessed_type
    else:
        mimetype = "image/png"

    with open(image_path, "rb") as img_file:
        image_data = img_file.read()

    # The image is passed to the model inline as a base64-encoded data URL.
    image_base64 = base64.b64encode(image_data).decode("utf-8")

    response = llm.invoke(
        [
            {"role": "user", "content": [
                {"type": "text", "text": "Describe this image in one sentence."},
                {"type": "image_url", "image_url": {"url": f"data:{mimetype};base64,{image_base64}"}}
            ]}
        ]
    )

    print("🖼️ Image Caption:", response.content)
    return {**state, "image_caption": response.content}

#❓ Step 4: Define QA Node Using Caption + User Question

In [7]:
def question_answer_node(state: MultiModalState) -> MultiModalState:
    """Answer the user's question from the previously generated caption.

    Builds a three-line text prompt (image description, user question, and an
    ``Answer:`` cue), sends it to the shared ``llm``, prints the reply, and
    stores it in the state. The image itself is not sent again; only the
    caption text is used.

    Args:
        state: Current graph state; reads ``user_question`` and
            ``image_caption``.

    Returns:
        A copy of ``state`` with ``answer`` set to the model's reply.
    """
    user_q = state["user_question"]
    description = state["image_caption"]

    prompt_lines = (
        f"Image Description: {description}",
        f"User Question: {user_q}",
        "Answer:",
    )
    reply = llm.invoke("\n".join(prompt_lines))

    answer_text = reply.content
    print("❓ Answer:", answer_text)

    updated_state = dict(state)
    updated_state["answer"] = answer_text
    return updated_state


#🔄 Step 5: Build LangGraph and Connect Nodes

In [8]:
def _build_multimodal_graph() -> StateGraph:
    """Wire the caption and QA nodes into a linear caption → qa → END graph."""
    builder = StateGraph(MultiModalState)

    # Register the two processing steps.
    builder.add_node("caption", image_caption_node)
    builder.add_node("qa", question_answer_node)

    # Execution starts at captioning, flows into QA, then terminates.
    builder.set_entry_point("caption")
    builder.add_edge("caption", "qa")
    builder.add_edge("qa", END)
    return builder


graph = _build_multimodal_graph()

# Compile the graph definition into a runnable app.
app = graph.compile()


#🚀 Step 6: Run the Agent on a Sample Image + Question

In [9]:
# Inputs for this run: swap in your own image file and question.
image_path = "sample_image.jpg"  # this file must exist in the working directory
user_question = "What is the person doing in this image?"

# Seed the shared state; the caption and answer start empty and are filled in
# by the graph nodes as the run progresses.
initial_state = MultiModalState(
    image_path=image_path,
    user_question=user_question,
    image_caption="",
    answer="",
)

# Execute the compiled graph from its entry point through to END.
final_state = app.invoke(initial_state)

print("\n✅ Final Output:")
print("Caption:", final_state["image_caption"])
print("Answer:", final_state["answer"])




🖼️ Image Caption: An adorable Golden Retriever puppy lies in the grass, looking directly at the camera.




❓ Answer: There is no person in the image.  The image only shows a Golden Retriever puppy.

✅ Final Output:
Caption: An adorable Golden Retriever puppy lies in the grass, looking directly at the camera.
Answer: There is no person in the image.  The image only shows a Golden Retriever puppy.
