In [1]:
#Import milvus
from pymilvus import FieldSchema, CollectionSchema, DataType, MilvusClient

# Name of the Milvus collection this notebook creates and inserts into.
collectionName = "ProductData"
# Field the vector index is built on; also passed as the index name when dropping the index.
indexFieldName = "embedding"
# Vector size declared for the FLOAT_VECTOR field in the schema.
DIMENSIONS = 384

In [2]:
# Get os and env
from dotenv import load_dotenv,  find_dotenv
# Load variables from the ".env.local" file located by find_dotenv, so the
# os.getenv lookups for MILVUS_ENDPOINT / MILVUS_APIKEY can see them.
load_dotenv(find_dotenv('.env.local'))
import os

In [3]:
# Connect to Milvus with the endpoint and API key taken from the environment.
# Either value is None when its variable is not set.
ENDPOINT = os.environ.get("MILVUS_ENDPOINT")
TOKEN = os.environ.get("MILVUS_APIKEY")

client = MilvusClient(
    uri=ENDPOINT,
    token=TOKEN,
)

In [4]:
# Build the collection schema: auto-generated ids and the dynamic field are both enabled.
schema = MilvusClient.create_schema(
    auto_id=True,
    enable_dynamic_field=True,
    description="Schema For Product Data",
)

# Primary key.
schema.add_field(
    field_name="id",
    datatype=DataType.INT64,
    is_primary=True,
    description="ID",
)
# Vector field; its size comes from DIMENSIONS.
schema.add_field(
    field_name="embedding",
    datatype=DataType.FLOAT_VECTOR,
    dim=DIMENSIONS,
    description="Embeddings",
)
# Text fields describing the product.
schema.add_field(
    field_name="category",
    datatype=DataType.VARCHAR,
    max_length=500,
    description="Product Category",
)
schema.add_field(
    field_name="name",
    datatype=DataType.VARCHAR,
    max_length=500,
    description="Product Name",
)
schema.add_field(
    field_name="description",
    datatype=DataType.VARCHAR,
    max_length=65535,
    description="Product Description",
)

{'auto_id': True, 'description': 'Schema For Product Data', 'fields': [{'name': 'id', 'description': 'ID', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': False}, {'name': 'embedding', 'description': 'Embeddings', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}, {'name': 'category', 'description': 'Product Category', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 500}}, {'name': 'name', 'description': 'Product Name', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 500}}, {'name': 'description', 'description': 'Product Description', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 65535}}], 'enable_dynamic_field': True}

In [5]:
# Describe the index to build on the vector field: IVF_FLAT with L2 distance and nlist = 40.
index_params = client.prepare_index_params()

index_params.add_index(
    field_name=indexFieldName,
    index_type="IVF_FLAT",
    metric_type="L2",
    params={"nlist": 40},
)

In [6]:
# Uncomment the lines below to drop the collection and clear all of its data.

# if client.has_collection(collectionName):
#     client.drop_collection(collection_name = collectionName)

In [7]:
#Drop Embedding Index from Collection
def dropClusterIndex():
    """Drop the index named ``indexFieldName`` from the collection, if present.

    Does nothing when the collection does not exist. Otherwise the
    collection's indexes are listed, the collection is released, and the
    index is dropped only if ``indexFieldName`` appears in that listing.
    """
    if not client.has_collection(collectionName):
        return

    # The listing is taken before the release, as in the original ordering.
    listedIndexes = client.list_indexes(collection_name = collectionName)
    client.release_collection(collection_name = collectionName)

    if indexFieldName not in listedIndexes:
        return

    client.drop_index(collection_name = collectionName, index_name = indexFieldName)

In [8]:
# # Drop the Index
# dropClusterIndex()

# Create the collection from the schema and index parameters prepared earlier.
client.create_collection(
    collection_name=collectionName,
    schema=schema,
    index_params=index_params,
)

In [9]:
#Init Embedding Model
from sentence_transformers import SentenceTransformer
# Sentence-embedding model used to turn product text into vectors.
# NOTE(review): its output size must equal DIMENSIONS (384) to fit the schema's vector field — confirm for this model.
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

  from tqdm.autonotebook import tqdm, trange


In [10]:
#Check if product already exists in the database
def productExists(category, name):
    """Report whether a product with this category and name is already stored.

    Loads the collection, then queries it with an equality filter on both
    fields.

    Args:
        category: Product category to match exactly.
        name: Product name to match exactly.

    Returns:
        True if the query returns at least one row, otherwise False.
    """
    def quoted(value):
        # Escape backslashes first, then double quotes, so a value such as
        # 24" Monitor cannot end the string literal inside the filter early.
        text = str(value).replace("\\", "\\\\").replace('"', '\\"')
        return f'"{text}"'

    client.load_collection(collection_name = collectionName)

    queryFilter = f'category == {quoted(category)} && name == {quoted(name)}'
    result = client.query(collection_name = collectionName, filter = queryFilter, output_fields = ["id"])

    return len(result) > 0

In [11]:
#Check if data is valid
def validateProdData(productData):
    """Check that a product record has every field needed for insertion.

    Args:
        productData: Mapping that must contain "category", "name" and
            "description".

    Raises:
        ValueError: If a required field is absent, or its value is None or
            NaN (blank spreadsheet cells are read by pandas as NaN).
    """
    requiredField = ("category", "name", "description")

    for field in requiredField:
        if field not in productData:
            raise ValueError(f'Missing Field in Prod Data "{field}"')

        value = productData[field]
        # NaN is the only float that is not equal to itself.
        if value is None or (isinstance(value, float) and value != value):
            raise ValueError(f'Empty Value in Prod Data "{field}"')

In [12]:
import pandas

file = pandas.read_excel("../products.xlsx", engine = "openpyxl")

# One dict per spreadsheet row, re-keyed from the sheet's capitalised column
# names to the lower-case field names used by the collection schema.
products = [
    {
        "category": row["Category"],
        "name": row["Name"],
        "description": row["Description"],
    }
    for _, row in file.iterrows()
]

In [13]:
insertProd = []
checkDuplication = set()

#Prepare Insert Array
for productData in products:
    validateProdData(productData)

    # A product is identified by its (category, name) pair.
    productKey = (productData["category"], productData["name"])

    # Skip products that are already stored in the collection.
    if productExists(*productKey):
        print("Product Already Exists in Vector Database")
        continue

    # Skip repeats of a pair already queued in this run.
    if productKey in checkDuplication:
        print("Duplicate Entry")
        continue
    checkDuplication.add(productKey)

    # Text that gets embedded; every section uses the same "<Label>: <value>" form.
    embeddingText = (
        "Description: " + productData["description"]
        + "\n\n Category: " + productData["category"]
        + "\n\n Name: " + productData["name"]
    )

    # Copy so the source record in `products` is not given an embedding key.
    entry = productData.copy()

    productEmbedding = embedder.encode(embeddingText)
    entry["embedding"] = productEmbedding

    insertProd.append(entry)

In [14]:
# Insert the prepared entries, or report that there is nothing new to insert.
if not insertProd:
    print("Nothing to Add")
else:
    output = client.insert(collection_name = collectionName, data = insertProd)
    print(output)

{'insert_count': 35, 'ids': [452810269555175944, 452810269555175945, 452810269555175946, 452810269555175947, 452810269555175948, 452810269555175949, 452810269555175950, 452810269555175951, 452810269555175952, 452810269555175953, 452810269555175954, 452810269555175955, 452810269555175956, 452810269555175957, 452810269555175958, 452810269555175959, 452810269555175960, 452810269555175961, 452810269555175962, 452810269555175963, 452810269555175964, 452810269555175965, 452810269555175966, 452810269555175967, 452810269555175968, 452810269555175969, 452810269555175970, 452810269555175971, 452810269555175972, 452810269555175973, 452810269555175974, 452810269555175975, 452810269555175976, 452810269555175977, 452810269555175978], 'cost': 18}


In [15]:
# Close the Milvus client now that the insert step has run.
client.close()