In [21]:
import pandas as pd
import re
import jax.numpy as jnp
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams, Batch
from sentence_transformers import SentenceTransformer

In [6]:
# Open a client connection to the Qdrant server at localhost:6333.
client = QdrantClient(
    host="localhost",
    port=6333,
)

In [7]:
# Create the "products" collection only when it is missing, so re-running this
# cell (or the whole notebook) does not fail on an already-existing collection.
# The configured vector size (1024) must equal the length of the vectors that
# are later upserted into this collection; similarity is cosine distance.
if not client.collection_exists(collection_name="products"):
    client.create_collection(
        collection_name="products",
        vectors_config=VectorParams(size=1024, distance=Distance.COSINE),
    )

True

In [22]:
# Load the sentence-embedding model that get_embeddings() uses further down.
embedding_model_name = "BAAI/bge-m3"
model = SentenceTransformer(embedding_model_name)

In [10]:
# Read the products table from the CSV file into a DataFrame.
products_csv_path = './Data/products.csv'
df_products = pd.read_csv(products_csv_path)

In [23]:
# clean the data
def clean_text(text):
    """Normalise a piece of text before it is embedded.

    Lower-cases the text, removes every character that is neither a word
    character nor whitespace, collapses each whitespace run into a single
    space and trims the ends.

    Args:
        text: The raw value. Missing values (``None`` / ``NaN``) yield an
            empty string; any other non-string value is converted with ``str``.

    Returns:
        The cleaned string.
    """
    if not isinstance(text, str):
        # Empty CSV cells are read as NaN, on which ``.lower()`` would raise.
        text = "" if pd.isna(text) else str(text)
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    # Collapse whitespace runs, then drop the leading/trailing blanks that
    # removed punctuation can leave behind.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Store the cleaned version of each 'name' value in a 'medicine_name' column.
df_products['medicine_name'] = df_products['name'].apply(clean_text)

# define the get embeddings function
def get_embeddings(text):
    """Return the embedding of a single piece of text.

    The text is wrapped in a one-element list, passed to ``model.encode``,
    and the first (only) entry of the result is returned.
    """
    encoded_batch = model.encode([text])
    return encoded_batch[0]

# get the embeddings for each product
# All names are encoded in a single model.encode() call so the model can batch
# them, instead of being invoked once per row. The result is re-wrapped as a
# Series aligned with df_products (one vector per row, same name as the source
# column), which is the shape the per-row .apply() produced.
embds = pd.Series(
    list(model.encode(df_products['medicine_name'].tolist())),
    index=df_products.index,
    name='medicine_name',
)

In [24]:
# Display the Series of embedding vectors (one entry per product row).
embds

0      [-0.02292479, 0.006841396, -0.042289488, 0.012...
1      [-0.033642367, 0.031647194, 0.007422479, -0.02...
2      [-0.060467493, 0.020791654, -0.014955107, -0.0...
3      [-0.037554678, 0.039451566, -0.007978853, -0.0...
4      [-0.0017654726, 0.013356354, -0.05792617, -0.0...
                             ...                        
595    [0.011444231, 0.006238225, -0.03068991, -0.044...
596    [-0.014574654, -0.020354418, -0.022994386, -0....
597    [-0.0105250385, 0.042243183, -0.027250353, -0....
598    [-0.017817512, 0.0072220247, -0.03571063, -0.0...
599    [0.011188283, 0.026707157, -0.06812924, 0.0271...
Name: medicine_name, Length: 600, dtype: object

In [25]:
# Convert df_products into a list with one dict per row, keyed by column name.
list_dicts = df_products.to_dict('records')

In [26]:
# Upload every product to the "products" collection in a single batch.
# Point ids are the row positions 0..n-1, vectors come from `embds`, and each
# payload is the corresponding row of df_products as a dict.
client.upsert(
    collection_name="products",
    points=Batch(
        ids=list(range(len(df_products))),
        # Pass plain Python lists of floats rather than array objects, so the
        # request does not depend on implicit conversion during validation
        # and serialisation.
        vectors=[emb.tolist() for emb in embds],
        payloads=df_products.to_dict(orient='records'),
    ),
)

UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

In [182]:
# NOTE(review): `collection` is not defined anywhere in the visible notebook
# and this cell's execution count is out of sequence — it looks like a leftover
# from an earlier vector-store experiment; confirm whether it is still needed.
# The original cell did not parse (the `embeddings=` comprehension was cut off
# at `df_pro`) and read a 'cleaned_name' column that this notebook never
# creates. Both now use what the notebook actually builds: the `embds` Series
# and the 'medicine_name' column — TODO confirm this was the intent.
collection.add(
    embeddings=[emb.tolist() for emb in embds],
    metadatas=[{"price": price} for price in df_products['price']],
    documents=list(df_products['medicine_name']),
    ids=[str(i) for i in range(len(df_products))],
)

In [163]:
# Write df_products to a CSV file, leaving out the index column.
cleaned_csv_path = "cleaned_products_data.csv"
df_products.to_csv(cleaned_csv_path, index=False)