In [107]:
from pymilvus import connections, DataType, CollectionSchema, FieldSchema, Collection
import json
import numpy as np
import gensim.downloader as api

# Connect to Milvus server
connections.connect(host='localhost', port='19530')

# Load the Google pretrained word2vec model
model = api.load("word2vec-google-news-300")

# Load the data from the JSON file
json_file_path = "data.json"
with open(json_file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

idNumbers = []
# for all the data create a unique idNumber
for i in range(len(data)):
    idNumbers.append(random.randint(100000, 999999))

    # also update the json data with the idNumber
    data[i]['idNumber'] = idNumbers[i]


print(len(idNumbers), "idNumbers created successfully")

    

# Function to calculate average vector for text
def calculate_average_vector(text):
    tokens = text.split()
    vectors = [model[token] for token in tokens if token in model]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

# Function to process data and store embeddings
def process_and_store_embeddings(data):
    embeddings = []
    for value in data:

        title_vector = calculate_average_vector(value.get('title', ''))
        description_vector = calculate_average_vector(value.get('description', ''))

        combined_vector = np.concatenate([title_vector, description_vector])
        embeddings.append(combined_vector)

    return embeddings

# Call the function to process data and store embeddings
embeddings = process_and_store_embeddings(data)

print(len(embeddings), "embeddings created successfully")

# create the fieldSchema for the collection
schema = CollectionSchema(fields=[
    FieldSchema(name="idNumber", dtype=DataType.INT64, is_primary=True),
    FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=600)  # Corrected dimension to match embeddings
])

# Create a collection in Milvus
collection_name = "tender_collection"
collection = Collection(
    name=collection_name,
    schema=schema,
    consistency_level="Strong"
)

# Specify the index type and parameters
index = {
    "index_type": "IVF_FLAT",
    "metric_type": "L2",
    "params": {"nlist": 128},
}
collection.create_index("embedding", index)



# Insert embeddings into the collection
insert_result = collection.insert([
    {"idNumber": idNumbers[i], "embedding": embedding.tolist()} 
    for i, embedding in enumerate(embeddings)
])

# After final entity is inserted, it is best to call flush to have no growing segments left in memory
collection.flush() 

# Disconnect from Milvus server
connections.disconnect(alias='default')
print("Data and embeddings stored successfully in Milvus")


286 idNumbers created successfully
286 embeddings created successfully
Data and embeddings stored successfully in Milvus


In [73]:
from pymilvus import connections, Collection
import json
import numpy as np
import gensim.downloader as api

# Connect to Milvus server
connections.connect(host='localhost', port='19530')


collection = Collection("tender_collection")      # Get an existing collection.
collection.load()
# Function to calculate average vector for text

def calculate_average_vector(text):
    tokens = text.split()
    vectors = [model[token] for token in tokens if token in model]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

# Function to process data and store embeddings
def process_and_store_embeddings(data):
    title_vector = calculate_average_vector(data.get('title', ''))
    description_vector = calculate_average_vector(data.get('description', ''))

    combined_vector = np.concatenate([title_vector, description_vector])
    
    return combined_vector


# Load the sample data from a JSON file
json_file_path = "sample.json"
with open(json_file_path, 'r', encoding='utf-8') as f:
    sample_data = json.load(f)

# Process and store the embeddings for the sample data
sample_embedding = process_and_store_embeddings(sample_data)


# Search for similar embeddings

collection_name = "tender_collection"
collection = Collection(collection_name)

collection.load()
vectors_to_search = embedding[-1][-2:]
search_params = {
    "metric_type": "L2",
    "params": {"nprobe": 10},
}
result = collection.search(vectors_to_search, "embedding", search_params, limit=3, output_fields=["random"])


# Disconnect from Milvus server
connections.disconnect(alias='default')


IndexError: invalid index to scalar variable.

In [108]:
from pymilvus import connections, Collection
import json
import numpy as np
import gensim.downloader as api

# Connect to Milvus server
connections.connect(host='localhost', port='19530')

def calculate_average_vector(text):
    tokens = text.split()
    vectors = [model[token] for token in tokens if token in model]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

# Function to process data and store embeddings
def process_and_store_embeddings(data):
    title_vector = calculate_average_vector(data.get('title', ''))
    description_vector = calculate_average_vector(data.get('description', ''))

    combined_vector = np.concatenate([title_vector, description_vector])
    
    return combined_vector

# Load the sample data from a JSON file
json_file_path = "sample.json"
with open(json_file_path, 'r', encoding='utf-8') as f:
    sample_data = json.load(f)

# Process and store the embeddings for the sample data
sample_embedding = process_and_store_embeddings(sample_data)
print("Embeddings created successfully")
# prinnt the sample embedding sepated by a comma
print(", ".join([str(x) for x in sample_embedding]))

# Create a collection object
collection_name = "tender_collection"
collection = Collection(collection_name)
collection.load()

# # Search for similar embeddings
# vectors_to_search = sample_embedding  # Take the last two vectors
# search_params = {
#     "metric_type": "L2",
#     "params": {"nprobe": 10},
# }
# result = collection.search(vectors_to_search, anns_field="embedding", param=search_params, limit=3, output_fields=["idNumber"] )

# # Disconnect from Milvus server
# connections.disconnect(alias='default')


Embeddings created successfully
-0.059636433, -0.051282246, -0.0051472983, -0.008107503, -0.043928783, 0.0063171387, 0.039237976, -0.10059611, -0.057319004, 0.06380717, -0.015401204, 0.10122681, 0.08313751, 0.08879852, -0.083358765, -0.0803833, 0.043182373, 0.12752278, -0.07509359, 0.019269308, 0.048797607, 0.021731058, 0.04751587, 0.09864807, 0.045684814, 0.053029377, -0.15948486, 0.07053121, 0.014996846, -0.082117714, -0.076408386, -0.047785442, -0.014322917, -0.07271322, 0.015187581, -0.066345215, -0.06229655, -0.030568441, -0.10974121, -0.05664571, -0.060236294, -0.05803426, -0.13721721, 0.0044161477, -0.101613365, -0.15885417, -0.12689464, 0.091430664, -0.043584187, 0.084742226, -0.12822978, 0.030708313, -0.00078582764, 0.16898601, 0.02035141, -0.012090047, -0.07896169, 0.049631756, -0.08862559, -0.027048746, -0.2780253, 0.021583557, -0.08848318, -0.12550862, 0.008382161, 0.06955465, -0.08341471, 0.039797466, -0.056641538, -0.009038289, 0.0049438477, -0.05189824, -0.030171713, 0.0

In [None]:
# loop all the data and store the idNumber frild in a list
idNumbers = []
for value in data:
    idNumbers.append(value.get('idNumber'))

print(len(idNumbers), "idNumbers created successfully")

286 idNumbers created successfully


In [113]:
# print the data with the idNumber field is equal to the 295930
for value in data:
    if value.get('idNumber') == 328836:
        print(value)


{'category': ['Construction', 'Transportation', 'Health', 'Services'], 'closingDate': '5 Mar 2024', 'description': 'Canberra Health Services are seeking the Services of a relocation/removalist Specialist for a number of moves across The Canberra Hospital Campus and offsite Sterilising Unit in Mitchell ACT, across to the new Building 5 (Critical Services Building Project) located at the Canberra Hospital Campus.', 'idNumber': 136627, 'link': 'https://www.tenders.act.gov.au/tender/view?id=263387', 'location': ['ACT'], 'publishedDate': '5 Feb 2024', 'region': ['Not Specified'], 'title': 'Canberra Health Services Clinical & Support Services Relocation into Building 5, Canberra Hospital', 'updatedDateTime': '27/02/2024, 2:42:58 pm'}
