Initialization: load the configuration and connect to the vector DB

In [1]:
import json
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection, utility

# Read the shared project configuration and keep only its "vecdb" section.
with open('../../config.json') as config_file:
    full_config = json.load(config_file)
config = full_config["vecdb"]

# Index settings, taken verbatim from the config; used when the index
# on the embedding field is created.
index_param = {
    "index_type": config["index_type"],
    "params": config["index_params"],
    "metric_type": config["metric_type"],
}

# Connect to the vector DB service, retrying on failure.
# One initial attempt plus `connection_retries` retries are made, i.e.
# `connection_retries + 1` attempts in total. Raises `Exception` (chained to
# the last underlying error) once every attempt has failed.
print("Starting Vector DB service...")
connection_retries = config["connection_retries"]
for attempt in range(connection_retries + 1):
    try:
        connections.connect(host=config["host"], port=config["port"])
        print("- Connected to:", connections.list_connections())
        break
    except Exception as exc:
        # Catch `Exception` rather than using a bare `except:` so that
        # KeyboardInterrupt / SystemExit still stop the cell immediately
        # instead of being swallowed and triggering another retry.
        if attempt == connection_retries:
            # Out of attempts: chain the underlying error so the real cause
            # is shown as the direct cause in the traceback.
            raise Exception(f"! Error: cannot connect vector db service with {connection_retries} retries!") from exc
        print("- Warning: Failed to connect vector db service, retrying...")

Starting Vector DB service...
- Connected to: [('default', <pymilvus.client.grpc_handler.GrpcHandler object at 0x7f7f6c5bc9d0>)]


Create a new collection (or reuse it if it already exists)

In [3]:
new_collection_name = "all_data_with_copyright"

if not utility.has_collection(new_collection_name):
    # The collection does not exist yet: declare its schema and create it.
    # Primary key.
    id_field = FieldSchema(
        name='id',
        dtype=DataType.INT64,
        description="id_int64",
        is_primary=True,
    )
    # Vector field; its dimensionality comes from the config.
    embedding_field = FieldSchema(
        name='embedding',
        dtype=DataType.FLOAT_VECTOR,
        description="embedding_floatvector",
        dim=config["dimension"],
        is_primary=False,
    )
    # INT64 scalar fields stored alongside each vector.
    month_field = FieldSchema(
        name='month',
        dtype=DataType.INT64,
        description="month_int64",
        is_primary=False,
    )
    company_field = FieldSchema(
        name='company',
        dtype=DataType.INT64,
        description="company_int64",
        is_primary=False,
    )
    datatype_field = FieldSchema(
        name='datatype',
        dtype=DataType.INT64,
        description="datatype_int64",
        is_primary=False,
    )
    copyright_field = FieldSchema(
        name='copyright',
        dtype=DataType.INT64,
        description="copyright_int64",
        is_primary=False,
    )
    schema = CollectionSchema(
        fields=[
            id_field,
            embedding_field,
            month_field,
            company_field,
            datatype_field,
            copyright_field,
        ],
        description="storing all data",
    )
    # TTL is set to 2**31 - 1 seconds (roughly 68 years).
    # NOTE(review): presumably intended to effectively disable expiry - confirm.
    collection = Collection(
        name=new_collection_name,
        data=None,
        schema=schema,
        properties={"collection.ttl.seconds": 2**31-1},
    )
else:
    # The collection already exists: open a handle to it by name.
    collection = Collection(new_collection_name)

print("- Current collections:", utility.list_collections())

- Current collections: ['all_data', 'all_data_with_copyright']


In [4]:
# Build the index on the embedding field only if the collection has none yet,
# then load the collection and report how many entities it holds.
index_exists = collection.has_index()
if not index_exists:
    collection.create_index(
        field_name="embedding",
        index_params=index_param,
    )

collection.load()
vector_count = collection.num_entities
print("- Loaded collection, num of vector:", vector_count)

- Loaded collection, num of vector: 0
