### Import Dependencies

In [1]:
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PayloadSchemaType, PointStruct, SparseVectorParams, Document, Prefetch, FusionQuery
from qdrant_client import models

import pandas as pd
import openai
import fastembed

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
qdrant_client = QdrantClient(url="http://localhost:6333")

### Create Qdrant collection for hybrid search

In [43]:
qdrant_client.create_collection(
    collection_name="Amazon-items-collection-01-hybrid-search-v2",
    vectors_config={
        "text-embedding-3-small": VectorParams(size=1536, distance=Distance.COSINE)
    },
    sparse_vectors_config={"bm25": SparseVectorParams(modifier=models.Modifier.IDF)},
)

True

In [44]:
qdrant_client.create_payload_index(
    collection_name="Amazon-items-collection-01-hybrid-search-v2",
    field_name="parent_asin",
    field_schema=PayloadSchemaType.KEYWORD,
)

UpdateResult(operation_id=2, status=<UpdateStatus.COMPLETED: 'completed'>)

### Embedding Functions

In [23]:
def get_embedding(text, model="text-embedding-3-small"):
    response = openai.embeddings.create(
        input=[text],
        model=model,
    )
    return response.data[0].embedding

In [24]:
def get_embeddings_batch(text_list, model="text-embedding-3-small", batch_size=100):
    if len(text_list) <= batch_size:
        response = openai.embeddings.create(input=text_list, model=model)
        return [embedding.embedding for embedding in response.data]

    all_embeddings = []
    counter = 1
    for i in range(0, len(text_list), batch_size):
        batch = text_list[i : i + batch_size]
        response = openai.embeddings.create(input=batch, model=model)
        all_embeddings.extend([embedding.embedding for embedding in response.data])
        print(f"Processed {counter * batch_size} of {len(text_list)}")
        counter += 1

    return all_embeddings

### Process and Embed Amazon Items Data

In [6]:
df_items = pd.read_json("../../data/meta_Electronics_2022_2023_with_category_rating_100_sample_1000.jsonl", lines=True)

In [7]:
df_items.head()

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,subtitle,author
0,Computers,ASUS Vivobook Go 12 L210 11.6‚Äù Ultra-Thin Lapt...,3.9,444,[Efficient Intel Celeron N4020 Processor 1.1 G...,[The ASUS Laptop L210MA is designed to help yo...,185.0,[{'thumb': 'https://m.media-amazon.com/images/...,"[{'title': 'ASUS Vivobook Go 11.6"" - Full Revi...",ASUS,"[Electronics, Computers & Accessories, Compute...",{'Standing screen display size': '11.6 Inches'...,B0B9JJ1D8Y,,,
1,Computers,G.Skill Trident Z5 NEO RGB Series (AMD Expo) 3...,4.8,145,"[Trident Z5 Neo RGB Series, designed for AMD X...",[],104.99,[{'thumb': 'https://m.media-amazon.com/images/...,[{'title': 'Trident Z5 & Trident Z5 RGB Series...,G.Skill,"[Electronics, Computers & Accessories, Compute...","{'RAM': '32 GB DDR5', 'Brand': 'G.Skill', 'Ser...",B0BF6ZQ8MY,,,
2,Computers,CORSAIR VENGEANCE SODIMM DDR5 RAM 8GB (1x8GB) ...,4.5,234,[Upgrade Your DDR5 Gaming or Performance Lapto...,[Upgrade your DDR5 gaming or performance lapto...,36.99,[{'thumb': 'https://m.media-amazon.com/images/...,"[{'title': 'Corsair Vengeance DDR5 32GB', 'url...",Corsair,"[Electronics, Computers & Accessories, Compute...","{'RAM': '8 GB DDR5', 'Memory Speed': '4800 MHz...",B09YF1L6Y2,,,
3,All Electronics,UGREEN M.2 NVMe SSD Enclosure 10Gbps USB 3.2 E...,4.5,349,[High-speed Transmission: This NVMe M.2 SSD en...,[],24.99,[{'thumb': 'https://m.media-amazon.com/images/...,[{'title': 'Taking A Look At The ORICO M.2 Dri...,UGREEN,"[Electronics, Computers & Accessories, Compute...",{'Product Dimensions': '4.92 x 1.61 x 0.55 inc...,B0BQ6SYQWL,,,
4,,Slim & Expandable Laptop Backpack 15 15.6 16 I...,4.6,3663,[Slim & Expandable Design: Slim style for ligh...,[],29.99,[{'thumb': 'https://m.media-amazon.com/images/...,[{'title': 'I got a Grey one A really slim bag...,ZINZ,"[Electronics, Computers & Accessories, Laptop ...","{'Brand': 'ZINZ', 'Item Weight': '1.59 pounds'...",B097CYFH1M,,,


In [8]:
len(df_items)

1000

In [9]:
def preprocess_description(row):
    return f"{row['title']}. {''.join(row['features'])} "

In [10]:
def extract_first_large_image(row):
    return row["images"][0].get("large", "")

In [11]:
df_items["description"] = df_items.apply(preprocess_description, axis=1)
df_items["image"] = df_items.apply(extract_first_large_image, axis=1)

In [12]:
data_to_embed = df_items[
    ["description", "image", "rating_number", "price", "average_rating", "parent_asin"]
].to_dict(orient="records")

In [13]:
data_to_embed

[{'description': "ASUS Vivobook Go 12 L210 11.6‚Äù Ultra-Thin Laptop, 2022 Version, Intel Celeron N4020, 4GB RAM, 64GB eMMC, Win 11 Home in S Mode with One Year of Office 365 Personal, L210MA-DS02. Efficient Intel Celeron N4020 Processor 1.1 GHz (4M Cache, up to 2.8 GHz).Voltage:19.0 volts11.6‚Äù HD (1366 x 768) Slim Display64GB eMMC Flash Storage and 4GB LPDDR4 RAMWindows 11 in S mode with One Year of Office 365 PersonalSlim and Portable: 0.7‚Äù thin and weighs only 2.31 lbs (battery included)USB 3.2 Gen 1 Type-C, USB 3.2 Gen 1 Type-A, HDMI (*USB Transfer speed may vary. Learn more at ASUS website)802.11ac Wi-Fi for speeds up to three times faster than 802.11nWindows 11 in S mode is a 100% app based version of Windows where applications are verified and tested for quality on the Microsoft store. If you want to install an app that isn't available in the Microsoft Store, you'll need to switch out of S mode for free, which is easy and fast. ",
  'image': 'https://m.media-amazon.com/image

In [14]:
text_to_embed = [data['description'] for data in data_to_embed]

In [15]:
text_to_embed

["ASUS Vivobook Go 12 L210 11.6‚Äù Ultra-Thin Laptop, 2022 Version, Intel Celeron N4020, 4GB RAM, 64GB eMMC, Win 11 Home in S Mode with One Year of Office 365 Personal, L210MA-DS02. Efficient Intel Celeron N4020 Processor 1.1 GHz (4M Cache, up to 2.8 GHz).Voltage:19.0 volts11.6‚Äù HD (1366 x 768) Slim Display64GB eMMC Flash Storage and 4GB LPDDR4 RAMWindows 11 in S mode with One Year of Office 365 PersonalSlim and Portable: 0.7‚Äù thin and weighs only 2.31 lbs (battery included)USB 3.2 Gen 1 Type-C, USB 3.2 Gen 1 Type-A, HDMI (*USB Transfer speed may vary. Learn more at ASUS website)802.11ac Wi-Fi for speeds up to three times faster than 802.11nWindows 11 in S mode is a 100% app based version of Windows where applications are verified and tested for quality on the Microsoft store. If you want to install an app that isn't available in the Microsoft Store, you'll need to switch out of S mode for free, which is easy and fast. ",
 'G.Skill Trident Z5 NEO RGB Series (AMD Expo) 32GB (2 x 16G

In [16]:
embeddings = get_embeddings_batch(text_to_embed)

Processed 100 of 1000
Processed 200 of 1000
Processed 300 of 1000
Processed 400 of 1000
Processed 500 of 1000
Processed 600 of 1000
Processed 700 of 1000
Processed 800 of 1000
Processed 900 of 1000
Processed 1000 of 1000


In [17]:
len(embeddings)

1000

In [18]:
pointstructs = []
i = 1
for embedding, data in zip(embeddings, data_to_embed):
    pointstructs.append(
        PointStruct(
            id=i,
            vector={
                "text-embedding-3-small": embedding,
                "bm25": Document(text=data["description"], model="Qdrant/bm25"),
            },
            payload=data,
        )
    )
    i += 1

In [19]:
pointstructs[0].vector

{'text-embedding-3-small': [0.00338378525339067,
  -0.0003883089520968497,
  -0.010100934654474258,
  -0.018039384856820107,
  0.012392273172736168,
  -0.04161383584141731,
  0.019249480217695236,
  0.041882745921611786,
  0.0055070542730391026,
  0.017479155212640762,
  -0.009120533242821693,
  0.006902025546878576,
  -0.07005107402801514,
  0.024022633209824562,
  0.031731389462947845,
  0.009423057548701763,
  0.009288602508604527,
  -0.01957441307604313,
  -0.026196323335170746,
  -0.03858859837055206,
  0.02624114230275154,
  0.005218536127358675,
  -0.003417399013414979,
  0.02545682154595852,
  0.03964182734489441,
  -0.021658465266227722,
  -0.02171448990702629,
  -0.04042614996433258,
  0.010302617214620113,
  -0.05077918618917465,
  0.003338966751471162,
  -0.019686458632349968,
  -0.012918888591229916,
  -0.029423244297504425,
  -0.04194997251033783,
  -0.052840832620859146,
  0.028638923540711403,
  -0.04800045117735863,
  0.0021120645105838776,
  0.009439864195883274,
  0.

In [20]:
qdrant_client.upsert(
    collection_name="Amazon-items-collection-01-hybrid-search-v2",
    points=pointstructs[0:500],
    wait=True
)

UpdateResult(operation_id=3, status=<UpdateStatus.COMPLETED: 'completed'>)

In [22]:
qdrant_client.upsert(
    collection_name="Amazon-items-collection-01-hybrid-search-v2",
    points=pointstructs[500:],
    wait=True
)

UpdateResult(operation_id=4, status=<UpdateStatus.COMPLETED: 'completed'>)

### Hybrid Retrieval

In [25]:
def retrieve_data(query, qdrant_client, k=5):
    query_embedding = get_embedding(query)
    results = qdrant_client.query_points(
        collection_name="Amazon-items-collection-01-hybrid-search-v2",
        prefetch=[
            Prefetch(
                query=query_embedding,
                using="text-embedding-3-small",
                limit=20
            ),
            Prefetch(
                query=Document(text=query, model="Qdrant/bm25"),
                using="bm25",
                limit=20
            )
        ],
        query=FusionQuery(fusion="rrf"),
        limit=k
    )

    retrieved_context_ids = []
    retrieved_context= []
    similiarity_scores = []
    retrieved_context_ratings = []

    for result in results.points:
        retrieved_context_ids.append(result.payload['parent_asin'])
        retrieved_context.append(result.payload['description'])
        similiarity_scores.append(result.score)
        retrieved_context_ratings.append(result.payload['average_rating'])

    return {
        "retrieved_context_ids": retrieved_context_ids,
        "retrieved_context": retrieved_context,
        "similiarity_scores": similiarity_scores,
        "retrieved_context_ratings": retrieved_context_ratings
    }

In [28]:
results = retrieve_data("I want to buy a tablet", qdrant_client, k=20)

In [30]:
results

{'retrieved_context_ids': ['B0C697WS8L',
  'B0BBFVQLVQ',
  'B0B2RK8PHB',
  'B07G3WVR5C',
  'B09TPD8R5F',
  'B0CF1B9QTM',
  'B09SW5L2PV',
  'B0B8PBZH45',
  'B0BCDYMF5L',
  'B0CC94G5J9',
  'B0BTHP8WP5',
  'B09TSQXTYZ',
  'B0B4SN9K1T',
  'B0BRM8DC9H',
  'B0BK935JV7',
  'B09TH8S2WT',
  'B09PTSR7TK',
  'B0BCF1BSXK',
  'B0BJSGG7M2',
  'B09QS7Z919'],
 'retrieved_context': ["2023 Newest Android 11.0 Tablet, 2 in 1 Tablet 10 Inch, 5G Dual WIFI Tablet with Keyboard, 64GB ROM + 6GB RAM, 256GB Expand, Quad-Core Processor, 2 Card Slot, 13MP Camera, WIFI/ Bluetooth/OTG - Silver. „ÄêPowerful System„ÄëAOYODKG 2023 android tablet uses Google's certified Android 11.0 operating system and can access Google Play, 10 inch tablet allowing you to download the applications you need. Equipped with quad-core 64-bit Cortex- A55 1.8-2.3Ghz, enables quicker app launch, smoother videos and better overall performance.„ÄêUltra-clean Record„ÄëThe android tablet with keyboard bundle high clear tablet PC come with 13mp 