In [1]:
from langchain.prompts import ChatPromptTemplate
from google import genai
from PIL import Image
from langchain_google_genai.llms import GoogleGenerativeAI
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains.retrieval import create_retrieval_chain
from langchain_ollama.embeddings import OllamaEmbeddings
from langchain.vectorstores import Chroma
from dotenv import load_dotenv
import os

# Load environment variables from a .env file; GOOGLE_API_KEY is read below.
load_dotenv()

# Image to describe; the path is relative to the notebook's working directory.
image = Image.open("family.jpeg")

# Ask Gemini for a plain-text description of the image.
client = genai.Client(api_key=os.getenv("GOOGLE_API_KEY"))
response = client.models.generate_content(
    model="gemini-2.0-flash",
    contents=["What is this image?", image],
)

# `response.text` can be None/empty when the model returns no text part.
# Fail here with a clear message instead of deep inside the text splitter
# in the next cell.
text = response.text
if not text:
    raise ValueError("Gemini returned no text description for the image")

In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Break the image description into small overlapping chunks for embedding.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=10)

# Split from `response.text` (the raw string returned by the Gemini call)
# rather than from `text`: this cell rebinds `text` to a list of chunks, so
# splitting `text` itself would fail the second time the cell is run.
text = text_splitter.split_text(response.text)
text

['The image shows a family of five lying on the',
 'on the ground in a park or outdoor setting. The',
 'The father is lying on the bottom, with one child',
 'one child sitting on his back. The mother and two',
 'and two other children are next to them, all',
 'them, all smiling at the camera. The ground is',
 'ground is covered with grass and fallen leaves,',
 'leaves, suggesting it is autumn. The background',
 'is blurred, with trees showing autumn colors.']

In [3]:

# Embed each chunk with the Ollama `nomic-embed-text` model and index the
# chunks in a Chroma vector store.
# NOTE(review): no ids or collection name are passed — confirm whether
# re-running this cell in the same kernel adds duplicate entries.
embeddings = OllamaEmbeddings(model="nomic-embed-text")
db = Chroma.from_texts(texts=text, embedding=embeddings)

# Retriever handle consumed by the retrieval chain built later.
ret = db.as_retriever()

# Quick sanity check of the index: chunks most similar to "autumn".
db.similarity_search("autumn")

[Document(metadata={}, page_content='leaves, suggesting it is autumn. The background'),
 Document(metadata={}, page_content='is blurred, with trees showing autumn colors.'),
 Document(metadata={}, page_content='ground is covered with grass and fallen leaves,'),
 Document(metadata={}, page_content='on the ground in a park or outdoor setting. The')]

In [5]:

# Gemini model that writes the final answer.
llm = GoogleGenerativeAI(model="gemini-2.0-flash")

# Prompt layout: {context} receives the retrieved description chunks and
# {input} receives the user's question.
IMAGE_QA_TEMPLATE = """
Based on the given image description explain the user's queries,
Make sure to explain it in detail.
Dont try to go beyond what is asked.
Image:
{context}
Query:
{input}
"""
prompt = ChatPromptTemplate.from_template(IMAGE_QA_TEMPLATE)

# Stuff the retrieved chunks into the prompt, then put the retriever in front
# so a single call performs both retrieval and answering.
combine = create_stuff_documents_chain(llm, prompt)
retrieval_chain = create_retrieval_chain(retriever=ret, combine_docs_chain=combine)

In [6]:
# Run the full pipeline for one question; the displayed result holds the
# input, the retrieved context documents and the generated answer.
question = "What is in the image?"
retrieval_chain.invoke({"input": question})

{'input': 'What is in the image?',
 'context': [Document(metadata={}, page_content='The image shows a family of five lying on the'),
  Document(metadata={}, page_content='leaves, suggesting it is autumn. The background'),
  Document(metadata={}, page_content='is blurred, with trees showing autumn colors.'),
  Document(metadata={}, page_content='them, all smiling at the camera. The ground is')],
 'answer': 'The user\'s query "What is in the image?" is asking for a description of the visual elements present in the provided image. Based on the image description you gave, the response should include that there is a family of five lying on leaves, indicating it\'s autumn. The background has blurred trees with autumn colors. The family is smiling at the camera.\n'}