In [2]:
import pandas as pd

from langchain_community.vectorstores import Milvus
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document

from transformers import AutoTokenizer, AutoModel
import torch

from pymilvus import FieldSchema, DataType, CollectionSchema, Collection, connections

from milvus import default_server

In [None]:
def connectToMilvus():
    try:
        connections.connect("default", host="localhost", port="19530")
        print("Connected to Milvus.")
    except Exception as e:
        print(f"Failed to connect to Milvus: {e}")
        raise

def createCollection(name, fields, description):
    schema = CollectionSchema(fields, description)
    collection = Collection(name, schema, consistency_level="Strong")
    return collection

def generateEmbeddings(tokenizer, model, docs):
    encodings = []
    
    for doc in data:
        # Encode the text to get input ids & attention mask
        encoded_input = tokenizer(doc, padding=True, truncation=True, max_length=128, return_tensors='pt')

        # Get the embeddings
        with torch.no_grad():
            model_output = model(**encoded_input)

        embeddings.append(model_output.last_hidden_state.mean(dim=1).numpy()[0])

    return embeddings

def createLawsCollection():
    fields = [
        FieldSchema(name="law_id", dtype=DataType.INT64, is_primary=True, auto_id=True),
        FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=128),
        FieldSchema(name="source", dtype=DataType.VARCHAR, max_len=50),
        FieldSchema(name="article", dtype=DataType.INT64),
        FieldSchema(name="comma", dtype=DataType.INT64),
        FieldSchema(name="comma_content", dtype=DataType.VARCHAR, max_len=5000)
    ]    
    
    return createCollection("laws", fields, "Collection of laws")

def createQuizCollection():
    fields = [
        FieldSchema(name="quiz_id", dtype=DataType.INT64, is_primary=True, auto_id=True),
        FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=128),
        FieldSchema(name="question", dtype=DataType.VARCHAR, max_len=500),
        FieldSchema(name="answer_1", dtype=DataType.VARCHAR, max_len=1000),
        FieldSchema(name="answer_2", dtype=DataType.VARCHAR, max_len=1000),
        FieldSchema(name="answer_3", dtype=DataType.VARCHAR, max_len=1000),
        FieldSchema(name="source", dtype=DataType.VARCHAR, max_len=50)
    ]
    
    return createCollection("quizzes", fields, "Collection of quizzes")

def createReferencesCollection():
    fields = [
        FieldSchema(name="reference_id", dtype=DataType.INT64, is_primary=True, auto_id=True),
        FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=128),
        FieldSchema(name="source", dtype=DataType.VARCHAR, max_len=50),
        FieldSchema(name="article", dtype=DataType.INT64),
        FieldSchema(name="comma", dtype=DataType.INT64),
        FieldSchema(name="quiz_id", dtype=DataType.INT64),
    ]
    
    return createCollection("references", fields, "Collection of references")

In [2]:
# Connect to Milvus
try:
    connections.connect("default", host="0.0.0.0")
except:
    default_server.start()
    print("Milvus server started")

# Initialize the model and tokenizer
model_name = "BAAI/bge-m3"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Read laws, quizzes and references csv
laws_df = pd.read_csv("laws.csv")
quizzes_df = pd.read_csv("quizzes.csv")
references_df = pd.read_csv("references.csv")

# Create the collections
laws_collection = createLawsCollection()
quizzes_collection = createQuizCollection()
references_collection = createReferencesCollection()

# Generate embeddings
laws_embeddings = generateEmbeddings(tokenizer, model, laws_df["comma_content"])
quizzes_embeddings = generateEmbeddings(tokenizer, model, quizzes_df["question"])
references_embeddings = generateEmbeddings(tokenizer, model, references_df["comma_content"])

# Insert data into the collections
laws_data = []
for i in range(len(laws_df)):
    laws_data.append([i, laws_embeddings[i], laws_df["source"][i], laws_df["article"][i], laws_df["comma"][i], laws_df["comma_content"][i]])

quizzes_data = []
for i in range(len(quizzes_df)):
    quizzes_data.append([i, quizzes_embeddings[i], quizzes_df["question"][i], quizzes_df["answer_1"][i], quizzes_df["answer_2"][i], quizzes_df["answer_3"][i], quizzes_df["source"][i]])
    
references_data = []
for i in range(len(references_df)):
    references_data.append([i, references_embeddings[i], references_df["source"][i], references_df["article"][i], references_df["comma"][i], references_df["quiz_id"][i]])

laws_collection.insert(laws_data)
quizzes_collection.insert(quizzes_data)
references_collection.insert(references_data)

# Query the collections
query = "Come sono le banane?"
query_embedding = generateEmbeddings(tokenizer, model, [query])[0]

laws_collection.query(query_embedding)
quizzes_collection.query(query_embedding)
references_collection.query(query_embedding)

# Close the connection
connections.disconnect("default")

  warn_deprecated(
  from tqdm.autonotebook import tqdm, trange


[Document(metadata={'source': 'test', 'pk': 451780400722574584}, page_content='Le banane sono buonissime'), Document(metadata={'source': 'test', 'pk': 451780400722574592}, page_content='Le banane sono buonissime'), Document(metadata={'source': 'test', 'pk': 451780400722574596}, page_content='Le banane sono buonissime'), Document(metadata={'source': 'test', 'pk': 451780400722574588}, page_content='Le banane sono buonissime')]


In [7]:
client = Milvus(connection_args={"host": "127.0.0.1", "port": "19530"}, embedding_function=embedder)

# List all collections
collections = client.list_collections()

# Print all collections and their stats
for collection_name in collections:
    print(f"Collection name: {collection_name}")
    stats = client.get_collection_stats(collection_name)
    print(f"Stats: {stats}")

AttributeError: 'Milvus' object has no attribute 'list_collections'

: 

## Loading the Dataset into Milvus

In [None]:
import os
import pandas as pd
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Milvus
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
from pymilvus import connections

from milvus import default_server

DEFAULT_SAVE_DIR = os.getcwd() + "/work/documents/"
CODICE_PENALE_DIR = DEFAULT_SAVE_DIR + "Codice_Penale.csv"
REFERENCES_CSV_FILE = DEFAULT_SAVE_DIR + "quiz_references.csv"

default_server.start()
connections.connect("default", host="0.0.0.0")

df = pd.read_csv(CODICE_PENALE_DIR)
data = (df.iloc[:, 0] + " " + df.iloc[:, 1]).tolist()

# Process the list to create Document objects
documents = []
for document in data:
    documents.append(Document(page_content=document, metadata={"source": "csv"}))

embedder = HuggingFaceEmbeddings(model_name="BAAI/bge-m3",
                                     model_kwargs={"device": "cuda"})

# Add the documents to the Milvus database
vector_db = Milvus.from_documents(
        documents,
        embedder, 
        connection_args={"host":  "127.0.0.1", "port": "19530"},
        collection_name="csv_data"
    )

# Load the references
df_queries = pd.read_csv(REFERENCES_CSV_FILE)

queries = df_queries.iloc[:, 1].tolist()
df_results = pd.DataFrame(columns=['Question', 'Answer'])

# For each query in the list, query the database and store the question and answer in the DataFrame
for query in queries:
    docs = vector_db.similarity_search(query)
    answer = docs[0].page_content if docs else 'No match found'
    df_results = df_results.append({'Question': query, 'Answer': answer}, ignore_index=True)

