### Imports

In [1]:
import os
import numpy as np
import json
from PIL import Image
from sklearn.preprocessing import normalize



In [3]:
import torch
import torchvision.transforms as transforms
from transformers import BlipProcessor, BlipForConditionalGeneration
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from langchain.schema import Document
from langchain.vectorstores import Chroma
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.embeddings.base import Embeddings
from transformers import CLIPProcessor, CLIPModel
from chromadb.config import Settings
from langchain_google_genai import ChatGoogleGenerativeAI

### Global Vars

In [5]:
# Torch device shared by the models below: CUDA when available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
# The Gemini API key is read from the GOOGLE_API_KEY environment variable so the
# secret is never stored in the notebook or its version history.
# NOTE(review): the key that was previously hard-coded in this cell has been
# exposed in plain text and should be revoked/rotated.
# When the variable is unset this is None; the LLM client is then presumably
# unable to authenticate — export GOOGLE_API_KEY before running the notebook.
api_key = os.environ.get("GOOGLE_API_KEY")

In [7]:
# CLIP model (moved to `device`) and its matching processor, both loaded from the
# same "openai/clip-vit-base-patch32" checkpoint. They are used by
# get_image_embedding() to turn an image into a feature vector.
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [8]:
# BLIP captioning processor and model (model moved to `device`), both loaded from
# "Salesforce/blip-image-captioning-base". They are used by generate_caption().
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

In [9]:
# Sentence-embedding model used by EmbeddingClass.embed_query() for the text part
# of the combined embedding.
sentence_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

### Loading Data

In [10]:
# Load the product JSON file into `products`; later cells iterate over it and
# read per-product keys such as 'name', 'specifications', 'qna' and 'reviews'.
json_path = 'flipkart_iphones_with_reviews_qna2.json'
with open(json_path, 'r', encoding='utf-8') as f:
    products = json.load(f)

In [11]:
from langchain.document_loaders import JSONLoader


# Load the same JSON file through LangChain's JSONLoader.
# NOTE(review): with jq_schema='.' and text_content=False the whole file appears
# to come back as raw JSON text in page_content — the next cell json.loads() it
# and iterates over the resulting list of products. Confirm against the loader docs.
loader = JSONLoader(
    file_path='flipkart_iphones_with_reviews_qna2.json',  
    jq_schema='.',               
    text_content=False,
    json_lines=False
)

# Documents consumed by the cell that builds `flipkart_docs_processed`.
product_docs = loader.load()



In [12]:
from langchain.docstore.document import Document
import json

# Build one LangChain Document per product; this list is what the BM25
# (keyword) retriever is created from further down.
flipkart_docs_processed = []

for raw_doc in product_docs:
    # page_content holds JSON text that decodes to a list of product records.
    individual_product = json.loads(raw_doc.page_content)

    # The outer loop variable has its own name so the inner loop no longer
    # shadows it.
    for product in individual_product:

        metadata = {
            "title": product['name'],
            # Position-based id, unique per product (previously every document
            # shared the same placeholder id).
            "id": f"product-{len(flipkart_docs_processed)}",
            "source": "Flipkart",
            "page": 1
        }

        # Flatten description, specifications, reviews and Q&A into a single
        # space-separated text blob.
        data = ' '.join([
            product['description'],
            ' '.join(product['specifications']),
            ' '.join(product['reviews']),
            ' '.join(product['qna'])
        ])

        flipkart_docs_processed.append(Document(page_content=data, metadata=metadata))


### Load and Process PDF Research Papers with Contextual Information (not needed for now: we only have JSON data)

### Functions

In [14]:
def generate_caption(image_path):
    """Generate a text caption for the image at `image_path` with the BLIP model.

    Args:
        image_path: Path of the image file to caption.

    Returns:
        The decoded caption string (special tokens stripped).
    """
    # Open the file in a context manager so the handle is released promptly;
    # convert() returns an independent in-memory RGB copy that outlives it.
    with Image.open(image_path) as source_image:
        image = source_image.convert("RGB")

    inputs = blip_processor(images=image, return_tensors="pt").to(device)

    # Inference only: no gradients needed.
    with torch.no_grad():
        output = blip_model.generate(**inputs)

    return blip_processor.decode(output[0], skip_special_tokens=True)

In [15]:
def get_image_embedding(image_path):
    """Compute the CLIP image-feature vector for the image at `image_path`.

    Args:
        image_path: Path of the image file to embed.

    Returns:
        A NumPy array with the image features, squeezed and moved to the CPU.
    """
    # Open the file in a context manager so the handle is released promptly;
    # convert() returns an independent in-memory RGB copy that outlives it.
    with Image.open(image_path) as source_image:
        image = source_image.convert("RGB")

    inputs = clip_processor(images=image, return_tensors="pt").to(device)

    # Inference only: no gradients needed.
    with torch.no_grad():
        features = clip_model.get_image_features(**inputs)

    return features.squeeze().cpu().numpy()

### Final Embedding Class


In [16]:
class EmbeddingClass(Embeddings):
    """Combined text + image embedder.

    Each embedding is the concatenation of the L2-normalized sentence
    embedding of the text and the L2-normalized CLIP embedding of the image.
    When no image is given, the image part is an all-zero vector so every
    embedding has the same length.

    Caption-based (BLIP) embedding is currently not part of the vector.
    """

    # Length of the zero placeholder used when no image is supplied. It must
    # equal the length of the vectors returned by get_image_embedding().
    # NOTE(review): 512 is assumed to match the CLIP checkpoint loaded in this
    # notebook — confirm if the checkpoint is changed.
    IMAGE_EMBEDDING_DIM = 512

    def embed_query(self, text, image_path=None):
        """Embed one text, optionally paired with an image.

        Args:
            text: Text to embed with the sentence model.
            image_path: Optional path of an image to embed with CLIP.

        Returns:
            1-D NumPy array: normalized text embedding followed by the
            normalized image embedding (zeros when `image_path` is falsy).
        """
        if image_path:
            image_embedding = get_image_embedding(image_path)
        else:
            image_embedding = np.zeros(self.IMAGE_EMBEDDING_DIM)

        text_embedding = sentence_model.encode(text, convert_to_tensor=True).cpu().numpy()

        # Normalize each part on its own so neither modality dominates purely
        # because of its scale.
        image_embedding = normalize(image_embedding.reshape(1, -1))[0]
        text_embedding = normalize(text_embedding.reshape(1, -1))[0]

        return np.concatenate((text_embedding, image_embedding))

    def embed_documents(self, texts, image_paths=None):
        """Embed several texts, each optionally paired with an image.

        Args:
            texts: Texts to embed.
            image_paths: Optional sequence of image paths (or None entries),
                one per text. Defaults to no images.

        Returns:
            List of combined embeddings, one per text.

        Raises:
            ValueError: If `image_paths` is given but its length differs from
                `texts` (zip would otherwise silently drop the extras).
        """
        texts = list(texts)
        if image_paths is None:
            image_paths = [None] * len(texts)
        else:
            image_paths = list(image_paths)
            if len(image_paths) != len(texts):
                raise ValueError(
                    f"image_paths has {len(image_paths)} entries but texts has {len(texts)}"
                )

        return [
            self.embed_query(text, image_path)
            for text, image_path in zip(texts, image_paths)
        ]

combined_embedder = EmbeddingClass()

### TEXT EMBEDDINGS
From the scraped data, I want to extract meaningful answers, so I will clean up the data, put it into a consistent format, and then embed it.


In [17]:
# Parallel lists, one entry per product, later upserted together into the
# vector store: embedding, metadata dict, string id and the embedded text.
overall_embeddings = []
metadata = []
ids = []
texts = []

for i, product in enumerate(products):
    image_path = product['image_path']
    # Each of these fields is joined line by line into one block of text.
    specifications = "\n".join(product['specifications'])
    questions = "\n".join(product['qna'])
    reviews = "\n".join(product['reviews'])

    # Text that gets embedded and stored: category, name, then the three
    # blocks above, separated by blank lines.
    text = "\n\n".join([
        f"Category: {product['category']}",
        f"Name: {product['name']}",
        specifications,
        questions,
        reviews
    ])


    # Combined text + image embedding for this product.
    embedding = combined_embedder.embed_query(text, image_path=image_path)

    overall_embeddings.append(embedding)
    # NOTE(review): .get() yields None for absent keys — confirm the vector
    # store accepts None metadata values.
    metadata.append({
        'rating': product.get('rating'),
        'price': product.get('price'),
        'name': product['name'],
        'category': product.get('category'),
    })
    # The id is the product's position in `products`, as a string.
    ids.append(str(i))
    texts.append(text)

    print(f"Processed product {i + 1}/{len(products)}: {product['name']}")

Processed product 1/240: Apple iPhone 8 (Gold, 64 GB)
Processed product 2/240: Apple iPhone 13 ((PRODUCT)RED, 128 GB)
Processed product 3/240: Apple iPhone XR (Yellow, 128 GB) (Includes EarPods, Power Adapter)
Processed product 4/240: Apple iPhone 7 Plus (Red, 128 GB)
Processed product 5/240: Apple iPhone 8 (Silver, 64 GB)
Processed product 6/240: Apple iPhone 5C (Yellow, 8 GB)
Processed product 7/240: Apple iPhone 11 (Black, 64 GB)
Processed product 8/240: Apple iPhone 7 Plus (Black, 128 GB)
Processed product 9/240: Apple iPhone 5C (Green, 8 GB)
Processed product 10/240: Apple iPhone 5C (Pink, 8 GB)
Processed product 11/240: Apple iPhone 11 (Green, 64 GB)
Processed product 12/240: Apple iPhone 6 (Silver, 16 GB)
Processed product 13/240: Apple iPhone 12 (Purple, 64 GB)
Processed product 14/240: Apple iPhone 6s Plus (Gold, 16 GB)
Processed product 15/240: Apple iPhone SE (Red, 128 GB) (Includes EarPods, Power Adapter)
Processed product 16/240: Apple iPhone 6 (Space Grey, 64 GB)
Processe

In [18]:

import shutil
# Delete any existing ./chroma_db directory (missing directory is ignored) so
# the store is rebuilt from scratch on every run.
shutil.rmtree("./chroma_db", ignore_errors=True)
# Chroma store "productsdb", persisted under ./chroma_db and configured with the
# combined text + image embedder.
vectordb = Chroma(
    collection_name="productsdb",
    embedding_function=combined_embedder,
    persist_directory="./chroma_db",
    client_settings=Settings(allow_reset=True)
)


  vectordb = Chroma(


In [19]:

# Sanity check: the four parallel lists should all have the same length.
print(len(ids))
print(len(overall_embeddings))
print(len(metadata))
print(len(texts))

# Write the precomputed embeddings straight into the underlying collection
# (private `_collection` attribute) together with ids, metadata and texts.
vectordb._collection.upsert(
    ids=ids,
    embeddings=overall_embeddings,
    metadatas=metadata,
    documents=texts,

)
# NOTE(review): the recorded output shows a warning pointing at this call —
# check whether persist() is still needed for the installed version.
vectordb.persist()


240
240
240
240


  vectordb.persist()


In [42]:
# from langchain_core.prompts import ChatPromptTemplate

# class RetrievalChain:
#     def __init__(self, llm, embedding_class, pre_fetched_docs=None):
#         self.llm = llm
#         self.embedder = embedding_class
#         self.pre_fetched_docs = pre_fetched_docs  

#         self.prompt = ChatPromptTemplate.from_template(
#             """You are an expert product assistant helping users with questions about a smartphone. 
#             You have to follow these rules:
#             - Use the provided product data, customer Q&A, and reviews ONLY to answer the user question accurately.
#             - Do NOT add information not present in the data.
#             - Provide clear, concise, and helpful answers.

#             Product Information available on online sources:
#             {context}

#             User Question:
#             {query}

#             Answer:"""
#         )

#     # def __call__(self, query, image_path=None):
        
#     #     # if self.pre_fetched_docs:
#     #     #     retrieved_docs = "\n\n".join(doc.page_content for doc in self.pre_fetched_docs)
#     #     #     sources = {"documents": [self.pre_fetched_docs]}  # mimic the return structure
#     #     # else:
#     #         sources = self._get_sources(query, image_path)
#     #         retrieved_docs = "\n\n".join(sources['documents'][0])

#     #     final_query = self.prompt.format_messages(
#     #         context=retrieved_docs,
#     #         query=query
#     #     )

#         # response = self.llm.invoke(final_query)



#         # return {
#         #     "answer": response.content if hasattr(response, 'content') else response,
#         #     "sources": sources
#         # }

    

#     def _get_sources(self, query, image_path):
#         query_embedding = self.embedder.embed_query(query, image_path=image_path)

#         results = vectordb._collection.query(
#             query_embeddings=[query_embedding],
#             n_results=5,
#             include=["documents", "metadatas"]
#         )

#         return results


class RetrievalChain:
    """Answer a product question with the LLM, grounded in retrieved documents.

    Context documents are either supplied by the caller (`top_docs`) or fetched
    from the module-level `vectordb` collection using the combined embedder.
    Only text is sent to the LLM: `image_path` influences which documents are
    retrieved, but the image itself is never part of the prompt.
    """

    def __init__(self, llm, embedding_class):
        """Store the LLM client and the embedder and set up the prompt template.

        Args:
            llm: Object with an `invoke(prompt)` method.
            embedding_class: Object with `embed_query(text, image_path=...)`.
        """
        self.llm = llm
        self.embedder = embedding_class

        self.prompt = """
        You are an expert product assistant helping users with questions about a smartphone. You have to follow these rules:
        - Use the provided product data, customer Q&A, and reviews ONLY to answer the user question accurately.
        - Do NOT add information not present in the data.
        - Provide clear, concise, and helpful answers.

        Product Information available on online sources:
        {context}

        User Question:
        {query}
        """

    def __call__(self, query, image_path=None, top_docs=None):
        """Answer `query` using `top_docs`, or a fresh vector search when None.

        Args:
            query: The user question.
            image_path: Optional image used only for the fallback retrieval.
            top_docs: Optional pre-selected documents (objects exposing
                `page_content` and `metadata`), e.g. from a reranking step.

        Returns:
            Dict with "answer" (the response's `content` when it has one,
            otherwise the raw response) and "sources" (a dict with "documents"
            and "metadatas", each a list containing one list).
        """
        if top_docs is not None:
            # Use the caller's documents and mirror the vector-store result
            # layout so "sources" has the same shape on both paths.
            contents = [doc.page_content for doc in top_docs]
            sources = {"documents": [contents],
                       "metadatas": [[doc.metadata for doc in top_docs]]}
        else:
            # Fallback: do a fresh retrieval.
            sources = self._get_sources(query, image_path)
            contents = sources['documents'][0]

        retrieved_docs = "\n\n".join(contents)
        final_query = self.prompt.format(context=retrieved_docs, query=query)
        response = self.llm.invoke(final_query)

        return {
            "answer": response.content if hasattr(response, 'content') else response,
            "sources": sources
        }

    def _get_sources(self, query, image_path=None, n_results=5):
        """Query the vector store with the combined embedding of the inputs.

        Args:
            query: Text to embed.
            image_path: Optional image to embed alongside the text.
            n_results: Number of nearest documents to request (default 5).

        Returns:
            The raw query result, or an empty
            {"documents": [[]], "metadatas": [[]]} when nothing came back.
        """
        query_embedding = self.embedder.embed_query(query, image_path=image_path)

        results = vectordb._collection.query(
            query_embeddings=[query_embedding],
            n_results=n_results,
            include=["documents", "metadatas"]
        )
        if results is None or "documents" not in results or not results["documents"]:
            print("Warning: No documents found in vector search.")
            return {"documents": [[]], "metadatas": [[]]}

        return results

    def get_vector_documents(self, query, image_path=None, n_results=5):
        """Returns vector-retrieved docs as a list of Document objects.

        Entries whose stored metadata is missing (None) get an empty dict, so
        building the Document does not fail on them.
        """
        results = self._get_sources(query, image_path, n_results=n_results)
        documents = results['documents'][0]
        # Tolerate a result without a "metadatas" section.
        metadata_rows = results.get('metadatas') or [[None] * len(documents)]
        return [
            Document(page_content=text, metadata=meta or {})
            for text, meta in zip(documents, metadata_rows[0])
        ]

        


In [95]:
class Pipeline:
    """Minimal question-answering pipeline: Gemini LLM + RetrievalChain.

    Note: a class with the same name is defined again further down in this
    notebook; whichever cell runs last determines what `Pipeline` refers to.
    """

    def __init__(self):
        """Create the LLM client and wire it to the shared combined embedder."""
        llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", google_api_key=api_key)
        self.llm = llm
        self.embedder = combined_embedder
        self.chain = RetrievalChain(llm, combined_embedder)

    def __call__(self, query, image_path=None):
        """Delegate to the retrieval chain and return its result unchanged."""
        return self.chain(query, image_path)

In [69]:
# Instantiate the pipeline (the Pipeline variant that takes no arguments).
pipeline = Pipeline()

In [74]:

# Example question and the image that accompanies it.
query= "Give details of the phone provided in the image . How is it different form iphone 7?"
img_path = 'flipkart_images/Apple iPhone 8 Plus Space Grey 64 GB.jpg'

In [71]:
# Run the pipeline; the result is a dict with "answer" and "sources".
result = pipeline(query, img_path)

In [72]:
# Show only the generated answer text.
print(result['answer'])

The phone is an Apple iPhone 8 with 64 GB ROM and a 4.7 inch Retina HD Display. It has a 12MP Rear Camera and a 7MP Front Camera. It runs on an A11 Bionic Chip with 64-bit Architecture, Neural Engine, Embedded M11 Motion Coprocessor and is iOS 13 Compatible. It has a brand warranty of 1 year.

In terms of differences from the iPhone 7, one answer states the major difference is the price and on a serious note - nothing much.


In [45]:
# result['sources']

{'documents': [[Document(metadata={}, page_content="Category: Smartphone\n\nName: Apple iPhone 7 (Silver, 256 GB)\n\n256 GB ROM11.94 cm (4.7 inch) Retina HD Display12MP Rear Camera | 7MP Front CameraA10 Fusion Chip with 64-bit Architecture and Embedded M10 Motion Co-processorBrand Warranty of 1 Year\n\nQ:how much time we play pubg on iPhone 7?A:Mobile battery is very very bad, discharges soonAnonymousCertified Buyer12923Report AbuseRead other answers\nQ:Is iPhone 7 256 GB available in jet black?A:Yes. Atleast when i bought it, it was available in jet black.Flipkart CustomerCertified Buyer438127Report Abuse\nQ:does it supports 3d touch ?A:Yes, iPhone 7 support 3d touch functionality.AnonymousCertified Buyer9326Report AbuseRead other answers\nQ:which is best iphone7 Plus  or Iphone 7..suggest me which is better to buy in 2018???A:iPhone 7AnonymousCertified Buyer27084Report Abuse\nQ:Does this support VOLTE of JIOA:yes.. without any hasslesAnonymousCertified Buyer489166Report Abuse\nQ:what

In [21]:
%pip install rank_bm25
from langchain.retrievers import BM25Retriever

# Keyword (BM25) retriever over the processed product documents, configured with k=5.
bm25_retriever = BM25Retriever.from_documents(documents=flipkart_docs_processed,k=5)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [22]:
# Similarity-search retriever over the Chroma store, configured with k=5.
# NOTE(review): this retriever is only given query text, so the image part of
# the combined query embedding is presumably the all-zero placeholder — confirm.
similarity_retriever = vectordb.as_retriever(search_type="similarity",search_kwargs={"k": 5})

In [23]:
from langchain.retrievers import BM25Retriever

# Rebuilds the BM25 retriever with the same arguments as the earlier cell that
# installs rank_bm25, so that cell's definition is simply replaced.
bm25_retriever = BM25Retriever.from_documents(documents=flipkart_docs_processed,k=5)

In [24]:
from langchain.retrievers import EnsembleRetriever
# Combine the BM25 and similarity retrievers with equal weights
# (reciprocal rank fusion).
ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, similarity_retriever],weights=[0.5, 0.5]
)

In [25]:
from langchain_community.cross_encoders import HuggingFaceCrossEncoder
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain.retrievers import ContextualCompressionRetriever

# Download an open-source reranker model: BAAI/bge-reranker-v2-m3.
reranker = HuggingFaceCrossEncoder(model_name="BAAI/bge-reranker-v2-m3")
# Compressor configured with top_n=5 on top of that reranker.
reranker_compressor = CrossEncoderReranker(model=reranker, top_n=5)
# Final retriever: uses the reranker to rerank the retrieval results coming
# from the ensemble retriever.
final_retriever = ContextualCompressionRetriever(
    base_compressor=reranker_compressor,
    base_retriever=ensemble_retriever
)

In [27]:
def ensemble_docs(vector_docs, hybrid_docs):
    """Merge two document lists, de-duplicating on `page_content`.

    Args:
        vector_docs: Documents from the vector search.
        hybrid_docs: Documents from the hybrid retriever.

    Returns:
        List of dicts {"doc", "from_vector", "from_hybrid"}, one per distinct
        page_content, ordered by first appearance (vector docs first). The
        flags record which list(s) contained that content.
    """
    # Seed with the vector hits; a repeated page_content keeps its original
    # position but the later document object wins.
    merged = {
        vdoc.page_content: {"doc": vdoc, "from_vector": True, "from_hybrid": False}
        for vdoc in vector_docs
    }

    for hdoc in hybrid_docs:
        existing = merged.get(hdoc.page_content)
        if existing is None:
            # Seen only by the hybrid retriever.
            merged[hdoc.page_content] = {"doc": hdoc, "from_vector": False, "from_hybrid": True}
            continue
        # Already present: just flag that the hybrid retriever found it too.
        existing["from_hybrid"] = True

    return [*merged.values()]


In [28]:
from sentence_transformers import CrossEncoder


# Module-level cross-encoder instance.
# NOTE(review): the Pipeline defined below builds its own CrossEncoder from a
# model name, so this instance may be unused — confirm before removing.
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')


def rerank_docs(ensemble, query, crossencoder, boost=0.15, top_k=5):
    """Rerank merged retrieval results with a cross-encoder.

    Args:
        ensemble: Iterable of dicts with a "doc" (object exposing
            `page_content`) and optional "from_vector" / "from_hybrid" flags.
        query: The user query each document is scored against.
        crossencoder: Object whose `predict(pairs)` returns one score per
            (query, text) pair.
        boost: Amount added to the score of documents found by both retrievers.
        top_k: Maximum number of documents to return.

    Returns:
        Up to `top_k` documents, highest score first; [] for empty input.
    """
    entries = list(ensemble)
    if not entries:
        # Nothing to score; avoid calling the model with an empty batch.
        return []

    # Score all documents in one batched call instead of one call per document.
    pairs = [(query, entry["doc"].page_content) for entry in entries]
    raw_scores = crossencoder.predict(pairs)

    scored = []
    for entry, raw in zip(entries, raw_scores):
        # Plain floats keep the sort key an ordinary scalar.
        score = float(raw)
        if entry.get("from_vector") and entry.get("from_hybrid"):
            score += boost
        scored.append((entry["doc"], score))

    scored.sort(key=lambda pair: pair[1], reverse=True)
    return [doc for doc, _ in scored[:top_k]]


In [43]:
class Pipeline:
    """Retrieval pipeline: vector search + hybrid retriever, reranked, then LLM.

    For each query the vector-store hits (text + optional image embedding) and
    the hybrid retriever hits (text only) are merged, reranked with a
    cross-encoder, and the top documents are handed to the RetrievalChain as
    context for the answer.
    """

    def __init__(self, final_retriever, crossencoder_model_name="cross-encoder/ms-marco-MiniLM-L-6-v2"):
        """Build the LLM client, the retrieval chain and the reranking model.

        Args:
            final_retriever: Retriever used for the hybrid (text-only) search.
            crossencoder_model_name: Name of the cross-encoder model to load
                for the final reranking step.
        """
        self.llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", google_api_key=api_key)
        self.embedder = combined_embedder
        self.chain = RetrievalChain(self.llm, self.embedder)
        self.crossencoder = CrossEncoder(crossencoder_model_name)
        self.final_retriever = final_retriever

    def __call__(self, query, image_path=None, top_k=5):
        """Answer `query`, optionally using `image_path` for the vector search.

        Args:
            query: The user question.
            image_path: Optional image path; only the vector search uses it.
            top_k: Number of reranked documents passed to the LLM as context.

        Returns:
            The RetrievalChain result dict ("answer" and "sources").
        """
        vector_docs = self.chain.get_vector_documents(query, image_path)

        # Prefer `invoke` when the retriever offers it (the recorded output
        # shows a warning for get_relevant_documents); fall back to the older
        # method for retrievers that lack it.
        retrieve = getattr(self.final_retriever, "invoke", None)
        if retrieve is None:
            retrieve = self.final_retriever.get_relevant_documents
        hybrid_docs = retrieve(query)

        ensemble = ensemble_docs(vector_docs, hybrid_docs)
        top_docs = rerank_docs(ensemble, query, self.crossencoder, top_k=top_k)
        return self.chain(query, image_path, top_docs)

In [44]:
# Instantiate the reranking pipeline with the compression retriever built above.
pipeline = Pipeline(final_retriever=final_retriever)


In [40]:

# Example question and the image that accompanies it.
query= "Give details of the phone provided in the image . How is it different form iphone 7?"
img_path = 'flipkart_images/Apple iPhone 8 Plus Space Grey 64 GB.jpg'

In [45]:
# Run the reranking pipeline; the result is a dict with "answer" and "sources".
result = pipeline(query, img_path)

  hybrid_docs = self.final_retriever.get_relevant_documents(query)


In [46]:
# Show only the generated answer text.
print(result['answer'])

I am sorry, but there was no image provided in the context. However, I can provide details for the following phones: Apple iPhone 7, Apple iPhone 8 and Apple iPhone 5C

Apple iPhone 7 (Silver, 256 GB):
- 256 GB ROM
- 11.94 cm (4.7 inch) Retina HD Display
- 12MP Rear Camera | 7MP Front Camera
- A10 Fusion Chip with 64-bit Architecture and Embedded M10 Motion Co-processor
- Brand Warranty of 1 Year

**Here are the differences between the iPhone 7 and iPhone 8 based on the provided data:**

Apple iPhone 8:
- A11 Bionic Chip with 64-bit Architecture, Neural Engine, Embedded M11 Motion Coprocessor
- Supports wireless charging
- iOS 13 Compatible

**Here are the details for the Iphone 5C**
- 8 GB ROM
- 10.16 cm (4 inch) Retina Display
- 8MP Rear Camera | 1.2MP Front Camera
- A6 Chip Processor
- 1 Year Manufacturer Warranty
- LTE
