In [1]:
#Import milvus
from pymilvus import FieldSchema, CollectionSchema, DataType, MilvusClient

# Name of the Milvus collection that the cells below create, query, and insert into.
collectionName = "Accelerators"
# Name of the vector field that the index-definition cell builds its index on.
indexFieldName = "embedding"
# Dimension declared for the vector field in the schema. It has to equal the
# length of the vectors the embedding model produces (assumed to be 384 for
# all-MiniLM-L6-v2 — confirm against the model card).
DIMENSIONS = 384

In [2]:
# Load settings from '.env.local' into the process environment
import os

from dotenv import find_dotenv, load_dotenv

# find_dotenv resolves the path of '.env.local'; load_dotenv then reads that file.
load_dotenv(dotenv_path = find_dotenv(filename = '.env.local'))

In [3]:
# Init and Connect to Client
# Connection settings are read from the environment (see the dotenv cell above).
ENDPOINT = os.getenv("MILVUS_ENDPOINT")
TOKEN = os.getenv("MILVUS_APIKEY")

# os.getenv returns None for an unset variable. Fail here with a clear message
# instead of handing uri=None to MilvusClient and getting an obscure error (or
# an unintended connection target) later.
if not ENDPOINT:
    raise RuntimeError('MILVUS_ENDPOINT is not set; define it in ".env.local" or in the environment')

client = MilvusClient(uri = ENDPOINT, token = TOKEN)

In [4]:
# Create Schema
# auto_id lets Milvus assign the primary key; enable_dynamic_field allows extra
# keys that are not declared below to be stored with each row.
schema = MilvusClient.create_schema(
    auto_id = True,
    enable_dynamic_field = True,
    description = "Schema For Accelerators Data",
)

# Primary key (generated by Milvus because auto_id is enabled on the schema)
schema.add_field(field_name = "id", datatype = DataType.INT64, is_primary = True, description = "ID")
# Vector field: named via indexFieldName so the schema and the index definition
# always refer to the same field even if the constant is changed.
schema.add_field(field_name = indexFieldName, datatype = DataType.FLOAT_VECTOR, dim = DIMENSIONS, description = "Embeddings")
# Scalar metadata stored next to each vector
schema.add_field(field_name = "name", datatype = DataType.VARCHAR, max_length = 500, description = "Accelerators Name")
schema.add_field(field_name = "productName", datatype = DataType.VARCHAR, max_length = 500, description = "Product Name")
schema.add_field(field_name = "shortDescription", datatype = DataType.VARCHAR, max_length = 65535, description = "Short Description of Accelerator")
schema.add_field(field_name = "type", datatype = DataType.VARCHAR, max_length = 500, description = "Accelerator Type")

{'auto_id': True, 'description': 'Schema For Accelerators Data', 'fields': [{'name': 'id', 'description': 'ID', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': False}, {'name': 'embedding', 'description': 'Embeddings', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}, {'name': 'name', 'description': 'Accelerators Name', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 500}}, {'name': 'productName', 'description': 'Product Name', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 500}}, {'name': 'shortDescription', 'description': 'Short Description of Accelerator', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 65535}}, {'name': 'type', 'description': 'Accelerator Type', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 500}}], 'enable_dynamic_field': True}

In [5]:
# Create Index
index_params = client.prepare_index_params()

# Index definition for the vector field: IVF_FLAT with L2 distance.
# "nlist" is the IVF build parameter (number of cluster units per the Milvus docs).
index_params.add_index(
    field_name = indexFieldName,
    index_type = "IVF_FLAT",
    metric_type = "L2",
    params = {"nlist": 40},
)

In [6]:
# Optional reset: uncomment the lines below to drop the collection and clear all of its data.

# if client.has_collection(collectionName):
#     client.drop_collection(collection_name = collectionName)

In [7]:
#Drop Embedding Index from Collection
def dropClusterIndex():
    """Drop the index named ``indexFieldName`` from the collection, if present.

    Does nothing when the collection does not exist. Otherwise the collection's
    index names are listed, the collection is released, and the index is
    dropped only if its name appeared in that listing.

    Returns:
        None
    """
    if not client.has_collection(collectionName):
        return

    # Read the index list before releasing, in the same order as the calls below.
    indexNames = client.list_indexes(collection_name = collectionName)
    client.release_collection(collection_name = collectionName)

    if indexFieldName not in indexNames:
        return

    client.drop_index(collection_name = collectionName, index_name = indexFieldName)

In [8]:
# # Drop the Index
# dropClusterIndex()

# Create the collection from the schema and index definition built in the cells above
client.create_collection(
    collection_name = collectionName,
    schema = schema,
    index_params = index_params,
)

In [9]:
# Init Embedding Model
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used to turn accelerator text into vectors before insertion
embedder = SentenceTransformer(model_name_or_path = "sentence-transformers/all-MiniLM-L6-v2")

  from tqdm.autonotebook import tqdm, trange


In [10]:
#Check if Accelerators already exists in the database
def acceleratorExists(name, productName, type):
    """Report whether a matching accelerator is already stored in the collection.

    A row matches when its ``name``, ``productName`` and ``type`` fields all
    equal the given values.

    Args:
        name: Accelerator name to look for.
        productName: Product name to look for.
        type: Accelerator type to look for. (The parameter shadows the builtin
            ``type``; the name is kept so existing keyword callers still work.)

    Returns:
        bool: True if the query returns at least one row, otherwise False.
    """
    def _quote(value):
        # Render the value as a double-quoted filter string literal. Backslashes
        # are escaped first so the backslashes added for quotes are not doubled.
        escaped = str(value).replace("\\", "\\\\").replace('"', '\\"')
        return f'"{escaped}"'

    client.load_collection(collection_name = collectionName)

    # Values are escaped before interpolation so that a quote or backslash in
    # the spreadsheet data cannot break or change the filter expression.
    queryFilter = f'name == {_quote(name)} && productName == {_quote(productName)} && type == {_quote(type)}'
    result = client.query(collection_name = collectionName, filter = queryFilter, output_fields = ["id"])

    return len(result) > 0

In [11]:
#Check if data is valid
def validateAccelData(acceleratorData):
    """Check that an accelerator record has every required field filled in.

    Args:
        acceleratorData: Mapping expected to contain the keys ``name``,
            ``productName``, ``type`` and ``shortDescription``.

    Returns:
        None

    Raises:
        ValueError: If a required key is absent, or its value is None or NaN
            (for example, a blank spreadsheet cell read through pandas).
    """
    requiredFields = ("name", "productName", "type", "shortDescription")

    for field in requiredFields:
        if field not in acceleratorData:
            raise ValueError(f'Missing Field in Accelerator Data "{field}"')

        value = acceleratorData[field]
        # NaN is the only float that is not equal to itself; this detects it
        # without needing pandas or numpy inside the function.
        if value is None or (isinstance(value, float) and value != value):
            raise ValueError(f'Empty Value in Accelerator Data "{field}"')

In [12]:
import pandas

# Read the accelerator spreadsheet
file = pandas.read_excel("../accelerators.xlsx", engine = "openpyxl")

# One record per spreadsheet row, keyed by the collection's scalar field names
accelerators = [
    {
        "name": row["Name"],
        "productName": row["Product"],
        "shortDescription": row["Short description"],
        "type": row["Type"],
    }
    for _, row in file.iterrows()
]

  warn("Workbook contains no default style, apply openpyxl's default")


In [13]:
# Rows that will be inserted, and the identities already queued in this run
insertAccel = []
checkDuplication = set()

#Prepare Insert Array
for acceleratorData in accelerators:
    validateAccelData(acceleratorData)

    # An accelerator is identified by (name, productName, type); build the key
    # once and reuse it for both the database check and the in-run check.
    acceleratorKey = (acceleratorData["name"], acceleratorData["productName"], acceleratorData["type"])

    # Skip rows that are already stored in the collection
    if acceleratorExists(*acceleratorKey):
        print("Accelerator Already Exists in Vector Database")
        continue

    # Skip rows that repeat an earlier row of the same spreadsheet
    if acceleratorKey in checkDuplication:
        print("Duplicate Entry")
        continue
    checkDuplication.add(acceleratorKey)

    # Copy so the source record is not modified when the embedding is attached
    entry = acceleratorData.copy()

    # Text that gets embedded. The "Type" label now carries the same ": "
    # separator as "Product Name"; previously the label ran straight into the
    # value. NOTE(review): rows inserted before this change were embedded with
    # the old text — re-embed them if consistency matters.
    productEmbedding = embedder.encode("Product Name: " + acceleratorData["productName"] + "\n\n Type: " + acceleratorData["type"])
    entry["embedding"] = productEmbedding

    insertAccel.append(entry)

In [14]:
# Insert the prepared rows into the collection (if there are any)
if not insertAccel:
    print("Nothing to Add")
else:
    # Show the client's insert response as the cell output
    output = client.insert(collection_name = collectionName, data = insertAccel)
    print(output)

{'insert_count': 70, 'ids': [452810269558795916, 452810269558795917, 452810269558795918, 452810269558795919, 452810269558795920, 452810269558795921, 452810269558795922, 452810269558795923, 452810269558795924, 452810269558795925, 452810269558795926, 452810269558795927, 452810269558795928, 452810269558795929, 452810269558795930, 452810269558795931, 452810269558795932, 452810269558795933, 452810269558795934, 452810269558795935, 452810269558795936, 452810269558795937, 452810269558795938, 452810269558795939, 452810269558795940, 452810269558795941, 452810269558795942, 452810269558795943, 452810269558795944, 452810269558795945, 452810269558795946, 452810269558795947, 452810269558795948, 452810269558795949, 452810269558795950, 452810269558795951, 452810269558795952, 452810269558795953, 452810269558795954, 452810269558795955, 452810269558795956, 452810269558795957, 452810269558795958, 452810269558795959, 452810269558795960, 452810269558795961, 452810269558795962, 452810269558795963, 45281026955

In [15]:
# Close the Milvus client now that all inserts are done.
# Cells that use `client` cannot be re-run after this without re-running the connect cell.
client.close()