### Imports

In [18]:
import os
import numpy as np
import json
from PIL import Image
from sklearn.preprocessing import normalize

In [19]:
import torch
import torchvision.transforms as transforms
from transformers import BlipProcessor, BlipForConditionalGeneration
from sentence_transformers import SentenceTransformer

In [20]:
from langchain.schema import Document
from langchain.vectorstores import Chroma
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.embeddings.base import Embeddings
from transformers import CLIPProcessor, CLIPModel
from chromadb.config import Settings
from langchain_google_genai import ChatGoogleGenerativeAI

### Global Vars

In [21]:
# Run every model on the GPU when one is visible to torch, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [22]:
# SECURITY: API keys must never be committed inside a notebook. Any key that was
# ever hard-coded in this cell should be treated as leaked and revoked.
# The key is read from the GOOGLE_API_KEY environment variable instead; set it
# before starting the kernel (e.g. `export GOOGLE_API_KEY=...`).
api_key = os.environ.get("GOOGLE_API_KEY")
if not api_key:
    # Warn early rather than failing later with an opaque authentication error.
    print("WARNING: GOOGLE_API_KEY is not set; calls to the Gemini LLM will fail to authenticate.")

In [23]:
# CLIP model and its matching processor are loaded from the same checkpoint;
# only the model is moved onto `device`.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"
clip_model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
clip_model = clip_model.to(device)
clip_processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)

In [24]:
# BLIP captioning processor and model share one checkpoint; the model is moved onto `device`.
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"
blip_processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT)
blip_model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT)
blip_model = blip_model.to(device)

In [25]:
# Sentence-transformer used to embed the text part of every document and query.
SENTENCE_CHECKPOINT = "sentence-transformers/all-MiniLM-L6-v2"
sentence_model = SentenceTransformer(SENTENCE_CHECKPOINT)

### Loading Data

In [26]:
# Load the scraped product records; the loop further down iterates over `products`
# and indexes each item by key, so it is presumably a list of dicts — confirm.
json_path = 'flipkart_iphones_with_reviews_qna2.json'
with open(json_path, mode='r', encoding='utf-8') as json_file:
    products = json.loads(json_file.read())

### Functions

In [27]:
def generate_caption(image_path):
    """Return a caption for the image at `image_path`, generated by the BLIP model.

    The image is opened with PIL, converted to RGB, preprocessed by
    `blip_processor` and passed to `blip_model.generate` with gradients disabled.
    The first generated sequence is decoded with special tokens stripped.
    """
    rgb_image = Image.open(image_path).convert("RGB")
    model_inputs = blip_processor(images=rgb_image, return_tensors="pt").to(device)

    # Pure inference: no autograd graph is needed for generation.
    with torch.no_grad():
        generated_ids = blip_model.generate(**model_inputs)

    return blip_processor.decode(generated_ids[0], skip_special_tokens=True)

In [28]:
def get_image_embedding(image_path):
    """Return the CLIP image features for the image at `image_path` as a NumPy array.

    The image is opened with PIL, converted to RGB and preprocessed by
    `clip_processor`. The features come from `clip_model.get_image_features`,
    are squeezed to drop size-1 dimensions, and are moved to the CPU.
    """
    rgb_image = Image.open(image_path).convert("RGB")
    model_inputs = clip_processor(images=rgb_image, return_tensors="pt").to(device)

    # Pure inference: gradients are not required.
    with torch.no_grad():
        features = clip_model.get_image_features(**model_inputs)

    return features.squeeze().cpu().numpy()

### Final Embedding Class


In [29]:
class EmbeddingClass(Embeddings):
    """Embedder that concatenates a text embedding with an image embedding.

    Each half is normalised separately with sklearn's `normalize` and the two are
    joined as `[text | image]`.
    """

    def embed_query(self, text, image_path = None):
        """Embed `text` and, optionally, the image at `image_path`.

        When `image_path` is falsy the image half is a zero vector of length 512
        (presumably the CLIP image feature size — confirm against the checkpoint).

        Returns:
            1-D NumPy array: normalised text embedding followed by the
            normalised image embedding.
        """
        if image_path:
            image_vec = get_image_embedding(image_path)
        else:
            image_vec = np.zeros(512)
        text_vec = sentence_model.encode(text, convert_to_tensor=True).cpu().numpy()

        # `normalize` works on 2-D input, so each vector is treated as a single row.
        text_unit = normalize(text_vec.reshape(1, -1))[0]
        image_unit = normalize(image_vec.reshape(1, -1))[0]

        return np.concatenate((text_unit, image_unit))

    def embed_documents(self, texts, image_paths=None):
        """Embed each text, pairing it with the image path at the same position.

        If `image_paths` is None every text is embedded without an image.
        Pairs are formed with `zip`, so extra items in the longer sequence are ignored.
        """
        paths = [None] * len(texts) if image_paths is None else image_paths

        vectors = []
        for doc_text, doc_image in zip(texts, paths):
            vectors.append(self.embed_query(doc_text, doc_image))
        return vectors


# Shared embedder instance used for both indexing and querying.
combined_embedder = EmbeddingClass()

### TEXT EMBEDDINGS
From the scraped data, I want to extract meaningful answers, so I will clean up the data, put it into a consistent format, and then embed it.


In [30]:
# Parallel lists: position k in each one describes the same product.
overall_embeddings, metadata, ids, texts = [], [], [], []

for i, product in enumerate(products):
    image_path = product['image_path']

    # Flatten each scraped field into one newline-separated string.
    specifications, questions, reviews = (
        "\n".join(product[field]) for field in ('specifications', 'qna', 'reviews')
    )

    # One document per product: header lines, then specs, Q&A and reviews.
    sections = [
        f"Category: {product['category']}",
        f"Name: {product['name']}",
        specifications,
        questions,
        reviews,
    ]
    text = "\n\n".join(sections)

    embedding = combined_embedder.embed_query(text, image_path=image_path)

    overall_embeddings.append(embedding)
    metadata.append(dict(
        rating=product.get('rating'),
        price=product.get('price'),
        name=product['name'],
        category=product.get('category'),
    ))
    ids.append(str(i))  # ids are stored as the string form of the product index
    texts.append(text)

    print(f"Processed product {i + 1}/{len(products)}: {product['name']}")

Processed product 1/240: Apple iPhone 8 (Gold, 64 GB)
Processed product 2/240: Apple iPhone 13 ((PRODUCT)RED, 128 GB)
Processed product 3/240: Apple iPhone XR (Yellow, 128 GB) (Includes EarPods, Power Adapter)
Processed product 4/240: Apple iPhone 7 Plus (Red, 128 GB)
Processed product 5/240: Apple iPhone 8 (Silver, 64 GB)
Processed product 6/240: Apple iPhone 5C (Yellow, 8 GB)
Processed product 7/240: Apple iPhone 11 (Black, 64 GB)
Processed product 8/240: Apple iPhone 7 Plus (Black, 128 GB)
Processed product 9/240: Apple iPhone 5C (Green, 8 GB)
Processed product 10/240: Apple iPhone 5C (Pink, 8 GB)
Processed product 11/240: Apple iPhone 11 (Green, 64 GB)
Processed product 12/240: Apple iPhone 6 (Silver, 16 GB)
Processed product 13/240: Apple iPhone 12 (Purple, 64 GB)
Processed product 14/240: Apple iPhone 6s Plus (Gold, 16 GB)
Processed product 15/240: Apple iPhone SE (Red, 128 GB) (Includes EarPods, Power Adapter)
Processed product 16/240: Apple iPhone 6 (Space Grey, 64 GB)
Processe

In [31]:

import shutil

# Start from a clean slate: remove any existing on-disk store before creating the collection.
persist_dir = "./chroma_db"
shutil.rmtree(persist_dir, ignore_errors=True)

vectordb = Chroma(
    embedding_function=combined_embedder,
    collection_name="productsdb",
    persist_directory=persist_dir,
    client_settings=Settings(allow_reset=True),
)


In [32]:

# Sanity check: the four parallel lists should all have the same length.
for batch in (ids, overall_embeddings, metadata, texts):
    print(len(batch))

# Write the precomputed vectors directly to the underlying collection.
vectordb._collection.upsert(
    documents=texts,
    metadatas=metadata,
    embeddings=overall_embeddings,
    ids=ids,
)
vectordb.persist()


240
240
240
240


In [33]:
class RetrievalChain:
    """Retrieve the closest product documents for a query and ask an LLM to answer from them.

    Args:
        llm: Object exposing ``invoke(prompt)``. Its return value is used as the
            answer, or its ``content`` attribute when it has one.
        embedding_class: Object exposing ``embed_query(text, image_path=...)``.
        collection: Optional object exposing a Chroma-style
            ``query(query_embeddings=..., n_results=..., include=...)`` method.
            When omitted, the notebook-global ``vectordb._collection`` is looked
            up at query time, which is the original behaviour.
        n_results: Number of documents to retrieve per query (default 5).
    """

    # The template keeps the exact whitespace of the original prompt.
    PROMPT_TEMPLATE = """
    You are an expert product assistant helping users with questions about a smartphone. You have to follow these rules:
    - Use the provided product data, customer Q&A, and reviews ONLY to answer the user question accurately.
    - Do NOT add information not present in the data.
    - Provide clear, concise, and helpful answers.

    Product Information available on online sources:
    {context}

    User Question:
    {query}
    """

    def __init__(self, llm, embedding_class, collection=None, n_results=5):
        self.llm = llm
        self.embedder = embedding_class
        self.collection = collection
        self.n_results = n_results
        self.prompt = self.PROMPT_TEMPLATE

    def __call__(self, query, image_path=None):
        """Answer `query`, optionally conditioned on the image at `image_path`.

        Returns:
            dict with ``"answer"`` (the LLM output) and ``"sources"`` (the raw
            result returned by the collection query).
        """
        sources = self._get_sources(query, image_path)
        # Only one query embedding is sent, so its documents are at index 0.
        retrieved_docs = "\n\n".join(sources['documents'][0])

        final_query = self.prompt.format(context=retrieved_docs, query=query)
        response = self.llm.invoke(final_query)

        return {
            "answer": response.content if hasattr(response, 'content') else response,
            "sources": sources,
        }

    def _get_sources(self, query, image_path):
        """Embed the query and return the collection's nearest-neighbour result."""
        query_embedding = self.embedder.embed_query(query, image_path=image_path)

        # Resolve the fallback lazily so a re-created global `vectordb` is picked up.
        collection = self.collection if self.collection is not None else vectordb._collection

        return collection.query(
            query_embeddings=[query_embedding],
            n_results=self.n_results,
            include=["documents", "metadatas"],
        )

In [34]:
class Pipeline:
    """End-to-end question answering: a Gemini chat model wired to a RetrievalChain."""

    def __init__(self):
        """Build the LLM, reuse the shared embedder and create the retrieval chain."""
        self.llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", google_api_key=api_key)
        self.embedder = combined_embedder
        self.chain = RetrievalChain(self.llm, self.embedder)

    def __call__(self, query, image_path=None):
        """Forward the query (and optional image path) to the chain and return its result."""
        return self.chain(query, image_path)

In [35]:
# Build the pipeline once; it is reused for every query below.
pipeline = Pipeline()

In [51]:
# Example multimodal request: a free-text question plus the path of a product image.
query= "Give details of the phone provided in the image . How is it different form iphone 7?"
img_path = 'flipkart_images/Apple iPhone 8 Plus Space Grey 64 GB.jpg'

In [52]:
# Run retrieval + generation; `result` is a dict with "answer" and "sources" keys.
result = pipeline(query, img_path)

In [53]:
# Show the LLM's answer text.
print(result['answer'])

The provided phone is the Apple iPhone 8. Here are its specifications:

*   64 GB ROM
*   11.94 cm (4.7 inch) Retina HD Display
*   12MP Rear Camera | 7MP Front Camera
*   A11 Bionic Chip with 64-bit Architecture, Neural Engine, Embedded M11 Motion Coprocessor
*   iOS 13 Compatible
*   Brand Warranty of 1 Year

As for the difference between iPhone 8 and iPhone 7, one buyer stated that there is not much difference between the two.


In [56]:
# Inspect the raw retrieval result (ids, documents, metadatas) behind the answer.
result['sources']

{'ids': [['27', '165', '4', '157', '173']],
 'embeddings': None,
 'documents': [['Category: Smartphone\n\nName: Apple iPhone 8 (Space Grey, 64 GB)\n\n64 GB ROM11.94 cm (4.7 inch) Retina HD Display12MP Rear Camera | 7MP Front CameraA11 Bionic Chip with 64-bit Architecture, Neural Engine, Embedded M11 Motion Coprocessor ProcessoriOS 13 CompatibleBrand Warranty of 1 Year\n\nQ:pubg support batteryA:noAmanCertified Buyer15086Report AbuseRead other answers\nQ:major difference with iphone 7???A:Price :)\nOn a serious note - nothing much.Charuchandra CCertified Buyer1499540Report Abuse\nQ:Which one is better I7 or I8?A:Dear brother,  I feel happy and proud to own I 8 . It’s really good ,. Worth buyAnonymousCertified Buyer378141Report Abuse\nQ:does it has better camera than mi a1A:no doubt about it. iPhone camera is far better.Vineet DavidCertified Buyer6819Report Abuse\nQ:iphone 8 has fingerprint sensor?A:Yes but you should have fingers to use this featureAnonymousCertified Buyer9631Report Abu