In [7]:
from pymilvus import connections, DataType, CollectionSchema, FieldSchema, Collection
import json
import numpy as np
import gensim.downloader as api
import random




# Load the Google pretrained word2vec model
model = api.load("word2vec-google-news-300")

# Load the data from the JSON file
json_file_path = "data.json"
with open(json_file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)


data
idNumbers = []
# for all the data create a unique idNumber
for i in range(len(data)):
    idNumbers.append(random.randint(100000, 999999))

    # also update the json data with the idNumber
    data[i]['idNumber'] = idNumbers[i]


print(len(idNumbers), "idNumbers created successfully")

    

# Function to calculate average vector for text
def calculate_average_vector(text):
    tokens = text.split()
    vectors = [model[token] for token in tokens if token in model]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

# Function to process data and store embeddings
def process_and_store_embeddings(data):
    embeddings = []
    for value in data:

        title_vector = calculate_average_vector(value.get('title', ''))
        description_vector = calculate_average_vector(value.get('description', ''))

        combined_vector = np.concatenate([title_vector, description_vector])
        embeddings.append(combined_vector)

    return embeddings

# Call the function to process data and store embeddings
embeddings = process_and_store_embeddings(data)

print(len(embeddings), "embeddings created successfully")

# Connect to Milvus server
connections.connect(host='localhost', port='19530')

# create the fieldSchema for the collection
schema = CollectionSchema(fields=[
    FieldSchema(name="idNumber", dtype=DataType.INT64, is_primary=True),
    FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=600)  # Corrected dimension to match embeddings
])

# Create a collection in Milvus
collection_name = "tender_collection"
collection = Collection(
    name=collection_name,
    schema=schema,
    consistency_level="Strong"
)

# Specify the index type and parameters
index = {
    "index_type": "IVF_FLAT",
    "metric_type": "L2",
    "params": {"nlist": 128},
}
collection.create_index("embedding", index)



# Insert embeddings into the collection
insert_result = collection.insert([
    {"idNumber": idNumbers[i], "embedding": embedding.tolist()} 
    for i, embedding in enumerate(embeddings)
])

# ======================



# After final entity is inserted, it is best to call flush to have no growing segments left in memory
collection.flush() 

# Disconnect from Milvus server
connections.disconnect(alias='default')
print("Data and embeddings stored successfully in Milvus")


286 idNumbers created successfully
286 embeddings created successfully


AttributeError: 'list' object has no attribute 'tolist'

In [44]:
from pymilvus import MilvusClient, DataType

# Set your cluster endpoint and token
CLUSTER_ENDPOINT = "https://in03-14185828311d2cb.api.gcp-us-west1.zillizcloud.com"  # Replace with your cluster endpoint
TOKEN = "0cb73d7dd401b51429331cc3eb4ac13fce72aec9f6b04557ea2d01b4aae0d0d8e82a8ee66a79bc38e8a5110ce81c03ccda0b5379"  # Replace with your API key

# Initialize a MilvusClient instance
client = MilvusClient(
    uri=CLUSTER_ENDPOINT,
    token=TOKEN
)
# Create a collection
collection_name = "tender_collection_new"
dimension = 600
index_file_size = 1024  # Specify index file size
metric_type = "L2"  # Specify the metric type
params = {"nlist": 128}  # Specify index parameters

# Specify the primary key field
primary_field = "idNumber"  # Assuming "idNumber" is the primary key field
idNumbers = []
# for all the data create a unique idNumber
for i in range(len(data)):
    idNumbers.append(random.randint(100000, 999999))

    # also update the json data with the idNumber
    data[i]['idNumber'] = idNumbers[i]


print(len(idNumbers), "idNumbers created successfully")

client.create_collection(collection_name, dimension)

# Create index for the collection
# client.create_index(collection_name, index_file_size, metric_type, params)

# Insert data into the collection
entities = [
    {"name": "A", "embedding": embeddings[i].tolist(), "idNumber": idNumbers[i]} for i in range(len(embeddings))
]

# Insert entities into the collection
client.insert(collection_name, entities)



286 idNumbers created successfully


AttributeError: 'list' object has no attribute 'tolist'

In [34]:
from pymilvus import connections, Collection
import json
import numpy as np
import gensim.downloader as api

# Connect to Milvus server
connections.connect(host='localhost', port='19530')

def calculate_average_vector(text):
    tokens = text.split()
    vectors = [model[token] for token in tokens if token in model]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

# Function to process data and store embeddings
def process_and_store_embeddings(data):
    title_vector = calculate_average_vector(data.get('title', ''))
    description_vector = calculate_average_vector(data.get('description', ''))

    combined_vector = np.concatenate([title_vector, description_vector])
    
    return combined_vector

# Load the sample data from a JSON file
json_file_path = "sample.json"
with open(json_file_path, 'r', encoding='utf-8') as f:
    sample_data = json.load(f)


# Process and store the embeddings for the sample data
sample_embedding = process_and_store_embeddings(sample_data)
# # prinnt the sample embedding sepated by a comma
print(", ".join([str(x) for x in sample_embedding]))

# print the idNumber for the sample data
print("idNumber:", sample_data.get('idNumber'))

# # Create a collection object
# collection_name = "tender_collection"
# collection = Collection(collection_name)
# collection.load()



-0.031799316, -0.06274414, 0.00962321, 0.013621013, -0.06825765, -0.0898234, 0.011698405, -0.07120514, 0.007985433, 0.042770386, -0.009953816, 0.10848999, 0.06806692, 0.096725464, -0.062830605, -0.021911621, 0.018218994, 0.09464518, 0.02266407, 0.032129925, 0.053395588, 0.005518595, 0.0454305, -0.04179891, 0.04413859, -0.020121256, -0.13870747, 0.06092326, -0.030057272, -0.11420695, 0.0060933433, -0.039698284, -0.1277364, -0.12136459, 0.013010661, -0.039942425, -0.037195843, 0.027956963, -0.07756551, -0.074289955, -0.1586914, -0.07836405, -0.07344818, -0.023173014, -0.062071484, -0.06806437, -0.115875244, 0.018147787, 0.0037841797, 0.1198527, -0.12911987, 0.047251385, -0.021728516, 0.031107584, -0.12976074, -0.006398519, -0.13111115, -0.02735583, -0.04027303, -0.09677124, -0.23838298, -0.011820476, -0.06366602, 0.00487264, 0.013631185, 0.030812582, -0.014109294, 0.120788574, -0.09252421, -0.01159668, -0.03733317, -0.113332115, -0.021118164, 0.021118164, -0.1200765, -0.18511963, 0.03289

In [35]:
# loop all the data and store the idNumber frild in a list
idNumbers = []
for value in data:
    idNumbers.append(value.get('idNumber'))

print(len(idNumbers), "idNumbers created successfully")
print(idNumbers)

286 idNumbers created successfully
[817515, 801231, 986825, 721866, 220420, 293563, 567331, 209072, 491942, 433201, 366112, 923226, 624232, 574207, 395391, 186976, 278486, 577774, 330927, 159719, 261132, 125792, 936336, 101598, 915233, 170566, 391638, 462178, 768953, 763160, 298161, 840409, 958237, 316772, 758540, 368443, 581295, 860178, 240895, 744335, 794175, 988485, 330104, 902604, 445444, 154751, 446222, 113747, 946067, 875609, 102553, 325875, 564368, 506741, 855402, 858326, 800815, 178925, 666710, 276742, 898327, 115882, 980129, 403932, 150982, 742944, 404126, 996989, 700976, 650142, 849683, 483834, 729347, 962715, 935029, 490753, 731174, 573895, 789494, 875264, 234171, 472317, 277801, 171057, 923277, 510245, 823007, 682744, 708647, 899910, 846632, 721040, 594333, 208420, 755968, 722247, 254289, 725078, 780032, 257754, 884512, 965377, 205099, 843105, 525157, 982083, 524907, 525748, 310810, 604602, 472372, 781001, 283497, 230717, 494713, 759634, 840754, 385058, 185756, 725877, 4784

In [36]:
# print the data with the idNumber field is equal to the 295930
for value in data:
    if value.get('idNumber') == 242670:
        print(value)


In [55]:
import json
import numpy as np
import gensim.downloader as api
import random

# Load the Google pretrained word2vec model
model = api.load("word2vec-google-news-300")

# Load the data from the JSON file
json_file_path = "data.json"
with open(json_file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Function to calculate average vector for text
def calculate_average_vector(text):
    tokens = text.split()
    vectors = [model[token] for token in tokens if token in model]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

# Function to process data and store embeddings
def process_and_store_embeddings(data):
    embeddings = []
    id_numbers = []
    for i, value in enumerate(data):
        title_vector = calculate_average_vector(value.get('title', ''))
        description_vector = calculate_average_vector(value.get('description', ''))

        combined_vector = np.concatenate([title_vector, description_vector])
        embeddings.append(combined_vector)
        id_numbers.append(value.get('idNumber', i))  # Use existing idNumber if available

    return embeddings, id_numbers

# Call the function to process data and store embeddings
embeddings, id_numbers = process_and_store_embeddings(data)

# Convert embeddings to string format with elements separated by comma
embeddings_str = [','.join(map(str, vector)) for vector in embeddings]
# convert the embeddings_str to a list
embeddings_str = [list(map(float, vector.split(','))) for vector in embeddings_str]

# Create a list of dictionaries to store the data
data_list = [{"vector": embedding_str, "dbIdNumber": id_num, "primary_key": idx} for idx, (embedding_str, id_num) in enumerate(zip(embeddings_str, id_numbers), start=1)]


# Output the data to a JSON file called "data_list.json"
with open("data_list.json", "w") as f:
    json.dump(data_list, f, indent=4)
