## Data Ingestion

In [14]:
db_path = "../db/ecommerce_products.db"

In [None]:
import json
def safe_json_loads(value):
    if not value:  
        return None
    try:
        return json.loads(value)
    except json.JSONDecodeError:
        cleaned = value.replace("'", '"')  # لو متخزن single quotes
        try:
            return json.loads(cleaned)
        except Exception:
            return value  


In [18]:
import sqlite3
import json
conn = sqlite3.connect(db_path)
conn.row_factory = sqlite3.Row
cursor = conn.cursor()
query = """
        SELECT id, product_id, pid, title, brand, category, sub_category,
               description, actual_price, selling_price, discount_percentage,
               average_rating, out_of_stock, seller, url, images, 
               product_details, crawled_at
        FROM products
        """
cursor.execute(query)
rows = cursor.fetchall()
        
products = []
for row in rows:
    product = {
                'id': row['id'],
                'product_id': row['product_id'],
                'pid': row['pid'],
                'title': row['title'],
                'brand': row['brand'],
                'category': row['category'],
                'sub_category': row['sub_category'],
                'description': row['description'],
                'actual_price': row['actual_price'],
                'selling_price': row['selling_price'],
                'discount_percentage': row['discount_percentage'],
                'average_rating': row['average_rating'],
                'out_of_stock': row['out_of_stock'],
                'seller': row['seller'],
                'url': row['url'],
                'images': safe_json_loads(row['images']) if row['images'] else None,
                'product_details': safe_json_loads(row['product_details']) if row['product_details'] else None,
                'crawled_at': row['crawled_at']
            }
    products.append(product)

In [19]:
products[0]

{'id': 1,
 'product_id': 'fa8e22d6-c0b6-5229-bb9e-ad52eda39a0a',
 'pid': 'TKPFCZ9EA7H5FYZH',
 'title': 'Solid Men Multicolor Track Pants',
 'brand': 'York',
 'category': 'Clothing and Accessories',
 'sub_category': 'Bottomwear',
 'description': '69% off',
 'actual_price': 2999,
 'selling_price': 921,
 'discount_percentage': 'Yorker trackpants made from 100% rich combed cotton giving it a rich look.Designed for Comfort,Skin friendly fabric,itch-free waistband & great for all year round use Proudly made in India',
 'average_rating': 3.9,
 'out_of_stock': 0,
 'seller': 'Shyam Enterprises',
 'url': 'https://www.flipkart.com/yorker-solid-men-multicolor-track-pants/p/itmd2c76aadce459?pid=TKPFCZ9EA7H5FYZH&lid=LSTTKPFCZ9EA7H5FYZHVYXWP0&marketplace=FLIPKART&srno=b_1_1&otracker=browse&fm=organic&iid=177a46eb-d053-4732-b3de-fcad6ff59cbd.TKPFCZ9EA7H5FYZH.SEARCH&ssid=utkd4t3gb40000001612415717799',
 'images': ['https://rukminim1.flixcart.com/image/128/128/jr3t5e80/track-pant/z/y/n/m-1005combo2-york

In [20]:
len(products)

30000

## Data Chunking

In [28]:
def create_page_content(product):
    content = f"Title: {product['title']}\n"
    
    if product['description']:
        content += f"Description: {product['description']}\n"
    
    content += f"Category: {product['category']}\n"
    content += f"Sub Category: {product['sub_category']}\n"
    
    if product['brand']:
        content += f"Brand: {product['brand']}\n"
    
    if product['selling_price']:
        content += f"Price: ₹{product['selling_price']}\n"
    
    if product['average_rating']:
        content += f"Rating: {product['average_rating']}/5\n"
    
    if product['seller']:
        content += f"Seller: {product['seller']}\n"
    
    if product['product_details']:
        content += "Product Details:\n"
        
        # Normalize product_details
        details = product['product_details']
        if isinstance(details, str):
            try:
                details = json.loads(details)  # حاول تفكها لو JSON
            except:
                details = [details]           # لو مجرد نص عادي
        
        if isinstance(details, dict):  # لو dict حوله لقائمة
            details = [details]
        
        if isinstance(details, list):  # لو قائمة dicts
            for detail in details:
                if isinstance(detail, dict):
                    for key, value in detail.items():
                        content += f"{key}: {value}\n"
                else:
                    content += f"- {detail}\n"
    
    return content


In [34]:
def create_metadata(product):
        metadata = {
            'product_id': product['product_id'],
            'pid': product['pid'],
            'category': product['category'],
            'sub_category': product['sub_category'],
            'brand': product['brand'],
            'selling_price': product['selling_price'],
            'average_rating': product['average_rating'],
            'out_of_stock': product['out_of_stock'],
            'seller': product['seller'],
            'images': product['images'],
            'url': product['url']
        }
        return metadata

In [35]:
def chunk_products(products):
        chunks = []
        
        for product in products:
            page_content = create_page_content(product)
            metadata = create_metadata(product)
            
            chunk = {
                'page_content': page_content,
                'metadata': metadata
            }
            chunks.append(chunk)
        
        return chunks

In [36]:
chunks = chunk_products(products)

In [37]:
len(chunks)

30000

In [38]:
chunks[0]

{'page_content': 'Title: Solid Men Multicolor Track Pants\nDescription: 69% off\nCategory: Clothing and Accessories\nSub Category: Bottomwear\nBrand: York\nPrice: ₹921\nRating: 3.9/5\nSeller: Shyam Enterprises\nProduct Details:\nStyle Code: 1005COMBO2\nClosure: Elastic\nPockets: Side Pockets\nFabric: Cotton Blend\nPattern: Solid\nColor: Multicolor\n',
 'metadata': {'product_id': 'fa8e22d6-c0b6-5229-bb9e-ad52eda39a0a',
  'pid': 'TKPFCZ9EA7H5FYZH',
  'category': 'Clothing and Accessories',
  'sub_category': 'Bottomwear',
  'brand': 'York',
  'selling_price': 921,
  'average_rating': 3.9,
  'out_of_stock': 0,
  'seller': 'Shyam Enterprises',
  'images': ['https://rukminim1.flixcart.com/image/128/128/jr3t5e80/track-pant/z/y/n/m-1005combo2-yorker-original-imafczg3xfh5qqd4.jpeg?q=70',
   'https://rukminim1.flixcart.com/image/128/128/jr58l8w0/track-pant/w/d/a/l-1005combo8-yorker-original-imafczg3pgtxgraq.jpeg?q=70'],
  'url': 'https://www.flipkart.com/yorker-solid-men-multicolor-track-pants/p

## Vectore store

In [None]:
from qdrant_client import QdrantClient

qdrant_client = QdrantClient(
    url="", 
    api_key="",
)

print(qdrant_client.get_collections())

collections=[]


In [70]:
from qdrant_client.http import models
qdrant_client.create_collection(
    collection_name="ecommerce_agent",
    vectors_config=models.VectorParams(
        size= 384,
        distance=models.Distance.COSINE
    )
)

True

In [71]:
qdrant_client.get_collections()

CollectionsResponse(collections=[CollectionDescription(name='ecommerce_agent')])

In [57]:
from langchain_community.embeddings import HuggingFaceEmbeddings

embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)


  embedding_model = HuggingFaceEmbeddings(


In [72]:
embedding_model

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False, 'architecture': 'BertModel'})
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [59]:

print(embedding_model.embed_query("hello world"))


[-0.034477200359106064, 0.031023219227790833, 0.0067349993623793125, 0.02610897459089756, -0.03936201333999634, -0.16030248999595642, 0.06692397594451904, -0.006441470701247454, -0.04745051637291908, 0.014758859761059284, 0.07087533175945282, 0.05552757531404495, 0.019193289801478386, -0.026251329109072685, -0.010109508410096169, -0.026940548792481422, 0.022307483479380608, -0.022226618602871895, -0.14969265460968018, -0.01749304309487343, 0.007676269859075546, 0.05435226485133171, 0.003254482988268137, 0.03172600269317627, -0.0846213772892952, -0.02940598875284195, 0.051595672965049744, 0.04812409356236458, -0.003314794274047017, -0.05827922374010086, 0.041969284415245056, 0.022210709750652313, 0.1281888633966446, -0.022338956594467163, -0.011656241491436958, 0.06292833387851715, -0.032876305282115936, -0.09122610837221146, -0.03117538057267666, 0.05269957706332207, 0.047034792602062225, -0.08420304954051971, -0.0300561785697937, -0.020744718611240387, 0.009517784230411053, -0.0037217

In [73]:
import uuid
from qdrant_client.http import models

collection_name = "ecommerce_agent"
points = []

for i, chunk in enumerate(chunks):
    embedding = embedding_model.embed_query(chunk['page_content'])

    point = models.PointStruct(
        id=str(uuid.uuid4()),
        vector=embedding,
        payload={
            'page_content': chunk['page_content'],
            'metadata': chunk['metadata']
        }
    )
    points.append(point)


In [74]:
len(points)

30000

In [75]:
points[0]

PointStruct(id='c4191a91-0b44-440d-8b63-ba2833b4a182', vector=[-0.032634928822517395, -0.004320177249610424, -0.033641185611486435, 0.02179153636097908, -0.04127698391675949, 0.02397557720541954, 0.08336371928453445, -0.017469165846705437, -0.06932464987039566, -0.001814628136344254, -0.05935072526335716, 0.007426005322486162, -0.008464987389743328, -0.021200666204094887, 0.010349545627832413, 0.033993206918239594, 0.053395189344882965, 0.004772611428052187, -0.026422375813126564, -0.0004505014221649617, 0.026870636269450188, -0.02919613942503929, -0.007550187408924103, -0.003944529686123133, -0.11305499821901321, -0.020931124687194824, 0.018435275182127953, 0.08014246821403503, -0.05535128712654114, -0.04475603625178337, -0.004381145350635052, 0.09799477458000183, 0.0822453424334526, -0.02043868415057659, 0.023853609338402748, -0.12567585706710815, 0.008281808346509933, -0.017353782430291176, -0.039107631891965866, 0.04244593530893326, -0.051334355026483536, -0.03935019671916962, -0.0

In [78]:
batch_size = 100
for i in range(0, len(points), batch_size):
    batch = points[i:i+batch_size]
    qdrant_client.upsert(
        collection_name=collection_name,
        points=batch
    )
    print(f"✅ Inserted batch {i//batch_size + 1}")

✅ Inserted batch 1
✅ Inserted batch 2
✅ Inserted batch 3
✅ Inserted batch 4
✅ Inserted batch 5
✅ Inserted batch 6
✅ Inserted batch 7
✅ Inserted batch 8
✅ Inserted batch 9
✅ Inserted batch 10
✅ Inserted batch 11
✅ Inserted batch 12
✅ Inserted batch 13
✅ Inserted batch 14
✅ Inserted batch 15
✅ Inserted batch 16
✅ Inserted batch 17
✅ Inserted batch 18
✅ Inserted batch 19
✅ Inserted batch 20
✅ Inserted batch 21
✅ Inserted batch 22
✅ Inserted batch 23
✅ Inserted batch 24
✅ Inserted batch 25
✅ Inserted batch 26
✅ Inserted batch 27
✅ Inserted batch 28
✅ Inserted batch 29
✅ Inserted batch 30
✅ Inserted batch 31
✅ Inserted batch 32
✅ Inserted batch 33
✅ Inserted batch 34
✅ Inserted batch 35
✅ Inserted batch 36
✅ Inserted batch 37
✅ Inserted batch 38
✅ Inserted batch 39
✅ Inserted batch 40
✅ Inserted batch 41
✅ Inserted batch 42
✅ Inserted batch 43
✅ Inserted batch 44
✅ Inserted batch 45
✅ Inserted batch 46
✅ Inserted batch 47
✅ Inserted batch 48
✅ Inserted batch 49
✅ Inserted batch 50
✅ Inserte

In [79]:
def search(query, limit=5):
        query_embedding = embedding_model.embed_query(query)
        
        results = qdrant_client.search(
            collection_name=collection_name,
            query_vector=query_embedding,
            limit=limit
        )
        
        return results

In [82]:
query = "is Full Sleeve avilable"
result = search(query)

  results = qdrant_client.search(


In [83]:
result

[ScoredPoint(id='47eb4b99-3bde-4752-85d3-dc06e14990c5', version=279, score=0.5745871, payload={'page_content': 'Title: Full Sleeve Solid Men Sweatshirt\nDescription: 65% off\nCategory: Clothing and Accessories\nSub Category: Winter Wear\nBrand: Pu\nPrice: ₹979\nRating: 3.8/5\nSeller: RetailNet\nProduct Details:\nColor: Blue\nFabric: Cotton Lycra Blend\nPattern: Solid\nNeck: Hooded\nSleeve: Full Sleeve\nStyle Code: 84477102Blue\nOccasion: Sports\nHooded: Yes\nReversible: No\nSuitable For: Western Wear\nFabric Care: Dry in Shade, Do Not Bleach, Do Not Iron on Print/Embroidery/Embellishment, Machine Wash as per Tag\n', 'metadata': {'product_id': '167a0109-db6a-5d41-b99c-0e03342b738a', 'pid': 'SWSF8CJBBMEFJYUD', 'category': 'Clothing and Accessories', 'sub_category': 'Winter Wear', 'brand': 'Pu', 'selling_price': 979, 'average_rating': 3.8, 'out_of_stock': 0, 'seller': 'RetailNet', 'images': ['https://rukminim1.flixcart.com/image/128/128/jq6y0sw0/sweatshirt/s/s/3/m-84477102blue-puma-origin