### This is an AI agent that takes in both an image and a text then gives out a response

In [1]:
from typing import TypedDict, Optional
from langgraph.graph import StateGraph, END, START
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
from dotenv import load_dotenv
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import torch
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
load_dotenv()
groq_api_key = os.getenv("groq_api_key")

In [3]:
llm = ChatGroq(
    model_name="llama-3.3-70b-versatile",
    api_key=groq_api_key,
    temperature=0.2,
)

In [4]:
# BLIP Captioning Model (for images -> text)
device = "cuda" if torch.cuda.is_available() else "cpu"
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-base"
).to(device)
blip_model.eval()

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


BlipForConditionalGeneration(
  (vision_model): BlipVisionModel(
    (embeddings): BlipVisionEmbeddings(
      (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (encoder): BlipEncoder(
      (layers): ModuleList(
        (0-11): 12 x BlipEncoderLayer(
          (self_attn): BlipAttention(
            (dropout): Dropout(p=0.0, inplace=False)
            (qkv): Linear(in_features=768, out_features=2304, bias=True)
            (projection): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): BlipMLP(
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
          )
          (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
      )
    )
    (post_layernorm): LayerNorm((768,), eps=1e-0

In [5]:
def caption_image(image_path: str) -> str:
    """Generate a caption for the input image using BLIP."""
    image = Image.open(image_path).convert("RGB")
    inputs = processor(images=image, return_tensors="pt").to(device)
    out_ids = blip_model.generate(**inputs, max_new_tokens=64)
    caption = processor.decode(out_ids[0], skip_special_tokens=True)
    return caption

In [6]:
# ---- LangGraph State Definition ----
class AgentState(TypedDict):
    question: str
    image_path: Optional[str]
    text_answer: Optional[str]
    image_answer: Optional[str]

In [7]:

# ---- Workflow ----
workflow = StateGraph(AgentState)

In [8]:
def text_only_node(state: AgentState) -> AgentState:
    response = llm.invoke(state["question"])
    return {"text_answer": response.content}

def image_qna_node(state: AgentState) -> AgentState:
    caption = caption_image(state["image_path"])
    prompt = f"Image description: {caption}\nQuestion: {state['question']}"
    response = llm.invoke(prompt)
    return {"image_answer": response.content}

In [9]:
# def text_only_node(state: AgentState) -> AgentState:
#     prompt = ChatPromptTemplate.from_template(
#         "Answer the question: {question}"
#     )
#     chain = prompt | llm
#     response = chain.invoke({"question": state["question"]})
#     return {"answer": response.content}


In [10]:

# def image_qna_node(state: AgentState) -> AgentState:
#     caption = caption_image(state["image_path"])
#     prompt = ChatPromptTemplate.from_template(
#         "Image description: {caption}\nUser question: {question}\nAnswer concisely."
#     )
#     chain = prompt | llm
#     response = chain.invoke({"caption": caption, "question": state["question"]})
#     return {"answer": response.content} 

In [11]:
# Add nodes
workflow.add_node("text_only", text_only_node)
workflow.add_node("image_qna", image_qna_node)


<langgraph.graph.state.StateGraph at 0x23bd92849d0>

In [12]:

# --- Router logic (pure function for edges, not a node) ---
def router(state: AgentState) -> str:
    if state.get("image_path"):
        return "image_qna"
    return "text_only"


In [13]:

# Entry point
workflow.set_entry_point("text_only")  # dummy, will be replaced


<langgraph.graph.state.StateGraph at 0x23bd92849d0>

In [14]:

# Use START -> router conditional edges
workflow.add_conditional_edges(
    START,   # from graph entrypoint
    router,  # function returning a string
    {
        "text_only": "text_only",
        "image_qna": "image_qna"
    }
)

# End edges
workflow.add_edge("text_only", END)
workflow.add_edge("image_qna", END)


<langgraph.graph.state.StateGraph at 0x23bd92849d0>

In [15]:

# Compile
app = workflow.compile()



In [17]:
# ---- Test the usage ----
if __name__ == "__main__":
    # Text question
    result1 = app.invoke({"question": "What is an activation key in deep learning?"})
    print("Text Answer:", result1.get("text_answer"))

    # Image-related Q&A
    result2 = app.invoke({
        "question": "Interpret the Image of a graph given",
        "image_path": "bnb.jpg"
    })
    print("Image Answer:", result2.get("image_answer"))


Text Answer: In deep learning, an activation key, more commonly referred to as an activation function, is a mathematical function that is applied to the output of a neural network layer. The primary purpose of an activation function is to introduce non-linearity into the model, allowing it to learn and represent more complex relationships between the inputs and outputs.

Without activation functions, neural networks would only be able to learn linear relationships, which would limit their ability to model complex data. The activation function helps to:

1. **Introduce non-linearity**: By applying a non-linear transformation to the output of a layer, the model can learn to represent more complex relationships between the inputs and outputs.
2. **Control the output range**: Activation functions can help to control the range of the output values, which can be useful for certain types of problems, such as binary classification.
3. **Help with vanishing gradients**: Some activation function