In [1]:
from qdrant_client import QdrantClient
from qdrant_client.models import  Distance, VectorParams, PointStruct

import pandas as pd
import openai

  from .autonotebook import tqdm as notebook_tqdm


## Read the sampled dataset with Amazon inventory data

In [2]:
# Load the sampled item metadata. lines=True tells pandas to read the file as
# JSON Lines (one JSON record per line) instead of a single JSON document.
df_items= pd.read_json('../../data/meta_Electronics_2022_2023_with_category_ratings_100_sample_1000.jsonl', lines=True)

In [3]:
len(df_items)

1000

In [None]:
df_items.head()

In [None]:
# Peek at the first two (index, features) pairs; slice before materialising
# so only the previewed rows are turned into a list.
list(df_items["features"].head(2).items())

In [None]:
# Peek at the first (index, images) pair only.
list(df_items["images"].head(1).items())[0]

### Preprocess title and features

In [None]:
def make_description(row):
    """Build the text that gets embedded for one item.

    The title and every feature string are joined with single spaces, so the
    title is kept apart from the first feature instead of being glued to it.

    Args:
        row: Mapping-like row (for example a DataFrame row) providing a
            'title' value and a 'features' iterable of strings.

    Returns:
        The title followed by the features, space-separated. Only the title
        when the item has no features.
    """
    # Plain join instead of an f-string with nested double quotes: the nested
    # form only parses on Python 3.12+. str() keeps the f-string's tolerance
    # for a non-string title.
    return " ".join([str(row['title']), *row['features']])

In [None]:
def extract_large_image(row):
    """Return the 'large' entry of the item's first image, or '' if there is none.

    Args:
        row: Mapping-like row (for example a DataFrame row) whose 'images'
            value is a sequence of dicts.

    Returns:
        The value stored under 'large' in the first image dict. An empty
        string when the item has no images or the first image has no
        'large' key.
    """
    images = row['images']
    # Items without any image would otherwise raise IndexError on images[0];
    # use the same '' fallback already applied to a missing 'large' key.
    if not images:
        return ''
    return images[0].get('large', '')

In [None]:
# Add two derived columns to df_items in place. axis=1 makes apply() call the
# helper once per row.
# 'description' is the text that is embedded later; 'image' is the large image
# entry of the first image.
df_items['description'] = df_items.apply(make_description, axis=1)
df_items['image'] = df_items.apply(extract_large_image, axis=1)

In [None]:
df_items.head()


In [None]:
# Peek at the first two (index, description) pairs; slice before
# materialising so only the previewed rows are turned into a list.
list(df_items["description"].head(2).items())

### Take a 50-item sample

In [None]:
# Draw 50 rows; the fixed random_state makes the same rows come back on
# every run.
df_sample = df_items.sample(n=50, random_state=42)

In [None]:
len(df_sample)

In [None]:
# Keep only the columns that end up in the stored payload.
data_to_embed = df_sample.loc[
    :,
    ['description', 'image', 'rating_number', 'price', 'average_rating', 'parent_asin'],
]

In [None]:
data_to_embed.head()

In [None]:
# Convert the selected columns into a list of per-item dicts.
# This cell rebinds data_to_embed from a DataFrame to a list, so running it a
# second time used to fail with AttributeError (lists have no to_dict). The
# guard makes a re-run a no-op once the conversion has happened.
if isinstance(data_to_embed, pd.DataFrame):
    data_to_embed = data_to_embed.to_dict(orient='records')

In [None]:
# Show a few records only; printing all of them buries the rest of the
# notebook under output.
data_to_embed[:3]

### Embedding function

In [None]:
def get_embedding(text: str):
    """Embed `text` with OpenAI's 'text-embedding-3-small' model.

    Args:
        text: The string to embed.

    Returns:
        The `embedding` of the first entry in the API response's `data`.
    """
    request = {'input': text, 'model': 'text-embedding-3-small'}
    first_entry = openai.embeddings.create(**request).data[0]
    return first_entry.embedding

In [None]:
# Smoke-test the embedding call. Show the vector length rather than the full
# vector: the length is what has to match the `size` of the Qdrant collection.
len(get_embedding('Hi'))

### Create Qdrant collection

In [None]:
# Client for the Qdrant server expected at http://localhost:6333; every
# collection operation further down goes through this object.
qdrant_client = QdrantClient(url="http://localhost:6333")

In [None]:
# Create the collection only when it is missing, so re-running this cell (or
# the whole notebook) does not fail because the collection already exists.
# size=1536 is the vector length the collection stores; it has to equal the
# length of the vectors returned by get_embedding.
if not qdrant_client.collection_exists(collection_name="Amazon-items-collection-00"):
    qdrant_client.create_collection(
        collection_name="Amazon-items-collection-00",
        vectors_config=VectorParams(size=1536, distance=Distance.COSINE)
    )

### Embed data

Test: build a single point from a dummy text

In [None]:
# Hand-built throwaway point: checks the embedding call and the payload layout
# before the real items are processed.
pointstruct = PointStruct(
    payload={"text": "Test text", "model": "text-embedding-3-small"},
    vector=get_embedding("Test text"),
    id=0,
)

In [None]:
pointstruct.payload

### Amazon data

In [None]:
# One point per sampled item: the position in data_to_embed is the point id,
# the embedded description is the vector, and the full record is the payload.
pointstructs = [
    PointStruct(
        id=position,
        vector=get_embedding(record['description']),
        payload=record,
    )
    for position, record in enumerate(data_to_embed)
]

In [None]:
len(pointstructs)

### Write embedded data to Qdrant

In [None]:
# Send every prepared point to the collection in a single upsert call.
qdrant_client.upsert(
    collection_name="Amazon-items-collection-00",
    points=pointstructs,
    wait=True,
)

### Retrieve data from Qdrant

In [None]:
def retreive_data(query: str, k=5):
    """Query the Amazon items collection with the embedding of `query`.

    Args:
        query: Free-text search string; it is embedded with `get_embedding`.
        k: Value passed as `limit` to `query_points` (default 5).

    Returns:
        The object returned by `qdrant_client.query_points`; the matched
        points are read from its `points` attribute.
    """
    return qdrant_client.query_points(
        collection_name="Amazon-items-collection-00",
        query=get_embedding(query),
        limit=k,
    )

Test: run a sample query against the collection

In [None]:
retreive_data("what kind of charging cords do you offer").points