In [1]:
from sklearn.model_selection import train_test_split

In [2]:
from tensorflow.keras.utils import to_categorical



In [1]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.datasets import fetch_openml
from fxpmath import Fxp
import numpy as np

In [17]:
def load_data(bits=None, int_bits=None):
    """Load the dataset, one-hot encode labels, split, standardise and optionally quantise.

    Reads ``data/X.npy`` and ``data/y.npy``, integer-encodes the labels with a
    ``LabelEncoder``, expands them to 5 one-hot columns, performs an 80/20
    train/test split (``random_state=42``) and standardises the features with
    a ``StandardScaler`` fitted on the training split only.

    Parameters
    ----------
    bits : int, optional
        Total word length, forwarded to ``Fxp`` as ``n_word``.
    int_bits : int, optional
        Number of integer bits, forwarded to ``Fxp`` as ``n_int``. ``0`` is a
        valid value (purely fractional format).

    Fixed-point quantisation is applied only when ``bits`` is non-zero and
    ``int_bits`` is provided; otherwise the scaled floats are returned as-is.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, classes)`` where ``classes`` is
        the ``classes_`` attribute of the fitted ``LabelEncoder``.
    """
    X = np.load("data/X.npy")
    # NOTE(review): allow_pickle=True unpickles arbitrary objects; only use
    # this with label files from a trusted source.
    y = np.load("data/y.npy", allow_pickle=True)

    le = LabelEncoder()
    y = le.fit_transform(y)
    y = to_categorical(y, 5)  # fixed one-hot width of 5 columns
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Fit on the training split only, then reuse its statistics for the test split.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # `int_bits` is compared against None rather than tested for truthiness so
    # that int_bits=0 still enables quantisation instead of silently skipping it.
    if bits and int_bits is not None:
        # Cast to fixed point. For example n_word=16, n_int=5 with signed=True,
        # overflow=wrap, rounding=floor is the closest approximation to
        # Vivado HLS ap_fixed<16,6>.
        fxp_format = dict(
            signed=True,
            n_word=bits,
            n_int=int_bits,
            overflow="wrap",
            rounding="floor",
        )
        # Quantise, then convert back to a float64 numpy array.
        X_train = np.array(Fxp(X_train, **fxp_format), np.float64)
        X_test = np.array(Fxp(X_test, **fxp_format), np.float64)

    return X_train, X_test, y_train, y_test, le.classes_

In [18]:
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection
import numpy as np

# Step 1: Load data
X_train, X_test, y_train, y_test, class_names = load_data()
num_features = X_train.shape[1]

# Step 2: Decode labels
# y_train is one-hot encoded, so one argmax along axis 1 recovers every class
# index at once instead of calling np.argmax once per row in Python.
# .tolist() yields a plain Python list, as the per-row comprehension did.
y_train_decoded = np.asarray(class_names)[np.argmax(y_train, axis=1)].tolist()

In [None]:
from pymilvus import utility

# Step 3: Connect to Milvus
import os

# Connection details are read from the environment so that no credential is
# stored in the notebook. Set MILVUS_URI, MILVUS_USER and MILVUS_PASSWORD
# before running this cell.
# NOTE(review): a password was previously committed in this cell; it should be
# rotated, since it remains visible in the file history.
_required_env = ("MILVUS_URI", "MILVUS_USER", "MILVUS_PASSWORD")
_missing_env = [name for name in _required_env if not os.environ.get(name)]
if _missing_env:
    raise RuntimeError(
        "Missing Milvus connection settings; set environment variable(s): "
        + ", ".join(_missing_env)
    )

connections.disconnect("default")  # drop any connection left over from an earlier run
connections.connect(
    alias="default",
    uri=os.environ["MILVUS_URI"],
    user=os.environ["MILVUS_USER"],
    password=os.environ["MILVUS_PASSWORD"],
    secure=True,
)

# Step 4: Define Milvus schema
# One spec per column: auto-generated INT64 primary key, the feature vector
# (one dimension per input feature) and the decoded class label.
field_specs = (
    dict(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
    dict(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=num_features),
    dict(name="label", dtype=DataType.VARCHAR, max_length=20),
)
fields = [FieldSchema(**spec) for spec in field_specs]

schema = CollectionSchema(fields, description="Jet classification vectors")
collection_name = "jet_vectors"

# Start from a clean slate: an existing collection with this name is dropped
# (and its data lost) before the new one is created.
collection_already_exists = utility.has_collection(collection_name)
if collection_already_exists:
    stale_collection = Collection(name=collection_name)
    stale_collection.drop()

collection = Collection(name=collection_name, schema=schema)

# Step 5: Insert data
# ndarray.tolist() converts the whole matrix to nested lists of plain Python
# floats in one call, instead of building one list of NumPy scalars per row.
embeddings = np.asarray(X_train).tolist()
labels = y_train_decoded
BATCH_SIZE = 500  # Adjust depending on your vector size

def insert_in_batches(collection, embeddings, labels, batch_size=BATCH_SIZE):
    """Insert vectors and their labels into ``collection`` in fixed-size batches.

    Parameters
    ----------
    collection
        Object exposing ``insert([vectors, labels])``.
    embeddings
        Sliceable sequence of vectors.
    labels
        Sliceable sequence of labels, aligned one-to-one with ``embeddings``.
    batch_size : int
        Maximum number of rows sent per ``insert`` call; must be positive.

    Raises
    ------
    ValueError
        If ``embeddings`` and ``labels`` differ in length, or ``batch_size``
        is not positive.
    """
    total = len(embeddings)
    # Validate up front so a mismatch cannot leave a partial upload behind.
    if len(labels) != total:
        raise ValueError(
            f"embeddings and labels must have the same length "
            f"(got {total} and {len(labels)})"
        )
    # A negative step would make range() empty and silently insert nothing.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    for start in range(0, total, batch_size):
        # Clamp so the final, possibly shorter, batch reports the true count.
        end = min(start + batch_size, total)
        collection.insert([embeddings[start:end], labels[start:end]])
        print(f"✅ Inserted {end} / {total}")

# Run the batch insert
insert_in_batches(collection, embeddings, labels)


# Step 6: Create index & load collection
# IVF_FLAT index over the embedding field using L2 distance and 128 clusters.
index_params = {
    "metric_type": "L2",
    "index_type": "IVF_FLAT",
    "params": {"nlist": 128},
}
collection.create_index(field_name="embedding", index_params=index_params)
collection.load()

inserted_count = len(embeddings)
print(f"✅ Inserted {inserted_count} vectors into Milvus.")

✅ Inserted 500 / 664000
✅ Inserted 1000 / 664000
✅ Inserted 1500 / 664000
✅ Inserted 2000 / 664000
✅ Inserted 2500 / 664000
✅ Inserted 3000 / 664000
✅ Inserted 3500 / 664000
✅ Inserted 4000 / 664000
✅ Inserted 4500 / 664000
✅ Inserted 5000 / 664000
✅ Inserted 5500 / 664000
✅ Inserted 6000 / 664000
✅ Inserted 6500 / 664000
✅ Inserted 7000 / 664000
✅ Inserted 7500 / 664000
✅ Inserted 8000 / 664000
✅ Inserted 8500 / 664000
✅ Inserted 9000 / 664000
✅ Inserted 9500 / 664000
✅ Inserted 10000 / 664000
✅ Inserted 10500 / 664000
✅ Inserted 11000 / 664000
✅ Inserted 11500 / 664000
✅ Inserted 12000 / 664000
✅ Inserted 12500 / 664000
✅ Inserted 13000 / 664000
✅ Inserted 13500 / 664000
✅ Inserted 14000 / 664000
✅ Inserted 14500 / 664000
✅ Inserted 15000 / 664000
✅ Inserted 15500 / 664000
✅ Inserted 16000 / 664000
✅ Inserted 16500 / 664000
✅ Inserted 17000 / 664000
✅ Inserted 17500 / 664000
✅ Inserted 18000 / 664000
✅ Inserted 18500 / 664000
✅ Inserted 19000 / 664000
✅ Inserted 19500 / 664000
✅ Inse