# Adapted from https://milvus.io/docs/full-text-search.md

In [1]:
from pymilvus import MilvusClient, DataType, Function, FunctionType

# Connect using a local database path (presumably a file-backed Milvus
# instance created on first use -- confirm against the pymilvus docs).
client = MilvusClient("./milvus_demo.db")

# Build the collection schema field by field.
schema = client.create_schema()

# Primary key: 64-bit integer, generated automatically on insert.
schema.add_field(
    field_name="id",
    datatype=DataType.INT64,
    is_primary=True,
    auto_id=True,
)
# Raw chunk text; the analyzer is enabled on this field so it can feed
# the BM25 function that is attached to the schema later on.
schema.add_field(
    field_name="text",
    datatype=DataType.VARCHAR,
    max_length=2048,
    enable_analyzer=True,
)
# Sparse vector field that holds the BM25 representation of `text`.
schema.add_field(
    field_name="sparse",
    datatype=DataType.SPARSE_FLOAT_VECTOR,
)

  from pkg_resources import DistributionNotFound, get_distribution


{'auto_id': False, 'description': '', 'fields': [{'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': True}, {'name': 'text', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 2048, 'enable_analyzer': True}}, {'name': 'sparse', 'description': '', 'type': <DataType.SPARSE_FLOAT_VECTOR: 104>}], 'enable_dynamic_field': False}

In [2]:
# BM25 function definition:
#   - reads raw text from the VARCHAR field `text`
#   - writes the generated embeddings to the SPARSE_FLOAT_VECTOR field `sparse`
# Because `sparse` is a function output, inserts only need to supply `text`.
bm25_function = Function(
    function_type=FunctionType.BM25,
    name="text_bm25_emb",
    input_field_names=["text"],
    output_field_names=["sparse"],
)

# Attach the function to the schema built earlier.
schema.add_function(bm25_function)

{'auto_id': False, 'description': '', 'fields': [{'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': True}, {'name': 'text', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 2048, 'enable_analyzer': True}}, {'name': 'sparse', 'description': '', 'type': <DataType.SPARSE_FLOAT_VECTOR: 104>, 'is_function_output': True}], 'enable_dynamic_field': False, 'functions': [{'name': 'text_bm25_emb', 'description': '', 'type': <FunctionType.BM25: 1>, 'input_field_names': ['text'], 'output_field_names': ['sparse'], 'params': {}}]}

In [3]:
# Index configuration for the BM25-generated `sparse` field.
index_params = client.prepare_index_params()

# Options forwarded to the sparse inverted index: the index algorithm plus
# the two BM25 tuning constants (k1 and b).
sparse_index_options = {
    "inverted_index_algo": "DAAT_MAXSCORE",
    "bm25_k1": 1.2,
    "bm25_b": 0.75,
}

index_params.add_index(
    field_name="sparse",
    index_type="SPARSE_INVERTED_INDEX",
    metric_type="BM25",
    params=sparse_index_options,
)

In [4]:
# Start from a clean slate: if a collection with this name is left over from
# an earlier run, remove it before it is created again.
collection_name = "nyu_libraries_guides"
if client.has_collection(collection_name):
    client.drop_collection(collection_name)

In [5]:
# Create the collection from the schema (fields + BM25 function) and the
# sparse-index configuration prepared above.
client.create_collection(
    collection_name="nyu_libraries_guides",
    schema=schema,
    index_params=index_params,
)

## Set up chunker for use

In [6]:
from docling.document_converter import DocumentConverter
from docling.chunking import HybridChunker

# Both helpers are built once with their default settings and reused for
# every page that is ingested below.
chunker = HybridChunker()  # splits a converted document into text chunks
converter = DocumentConverter()  # turns a source (here: a URL) into a document

In [7]:
import pickle

# Load the previously saved guide index. Judging by how it is used below,
# `guides` maps a guide's root URL to an iterable of its page URLs.
# NOTE(review): unpickling executes arbitrary code, so "guides.pickle" must
# come from a trusted source -- confirm it is only ever produced locally.
with open("guides.pickle", mode="rb") as pickle_file:
    guides = pickle.load(pickle_file)

In [8]:
# List every page URL recorded for the LaTeX guide, one per line.
latex_guide_pages = guides['https://guides.nyu.edu/LaTeX']
for page_url in latex_guide_pages:
    print(page_url)

https://guides.nyu.edu/LaTeX/getting-started
https://guides.nyu.edu/LaTeX/exercises
https://guides.nyu.edu/LaTeX/templates
https://guides.nyu.edu/LaTeX/bibtex
https://guides.nyu.edu/LaTeX/library-workshop-files
https://guides.nyu.edu/LaTeX/resources
https://guides.nyu.edu/LaTeX/creating-document
https://guides.nyu.edu/LaTeX
https://guides.nyu.edu/LaTeX/home
https://guides.nyu.edu/LaTeX/sample-document
https://guides.nyu.edu/LaTeX/installation


In [9]:
from tqdm import tqdm

# Ingest guide pages into the `nyu_libraries_guides` collection:
# convert each page, split it into chunks, and insert the chunk texts.
# Only `text` is supplied; `sparse` is filled in by the schema's BM25 function.
#
# To ingest every guide instead of only the LaTeX one, iterate over
# `tqdm(guides.keys())` in the outer loop.
for guide in tqdm(["https://guides.nyu.edu/LaTeX"]):
    for subject in guides[guide]:
        DOC_SOURCE = subject
        try:
            doc = converter.convert(source=DOC_SOURCE).document
            # Chunk the document exactly once (the result of an earlier,
            # duplicate chunk() call was never used).
            texts = [chunk.text for chunk in chunker.chunk(dl_doc=doc)]

            for text in texts:
                # Skip chunks containing the "Email Me" marker (presumably
                # librarian contact boxes rather than guide content).
                if "Email Me" not in text:
                    client.insert('nyu_libraries_guides', [{'text': text},])
        except Exception as exc:
            # Ingestion stays best-effort: a page that fails to download,
            # convert, or insert is reported and skipped instead of being
            # dropped silently. Catching `Exception` (not a bare `except`)
            # lets KeyboardInterrupt still stop the loop.
            # tqdm.write prints without corrupting the progress bar.
            tqdm.write(f"Skipping {DOC_SOURCE}: {exc!r}")

  0%|                                                                                                                                                     | 0/1 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1993 > 512). Running this sequence through the model will result in indexing errors
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.20s/it]


In [10]:
# Full-text query against the BM25 index: the raw query string is passed as
# `data` and matched against the `sparse` field.
num_retreive = 5
search_params = {'params': {'drop_ratio_search': 0.2}}

retreived_chunks = client.search(
    collection_name='nyu_libraries_guides',
    data=['How can I use Matlab?'],
    anns_field='sparse',
    # Fields to return in search results; sparse field cannot be output
    output_fields=['text'],
    limit=num_retreive,
    search_params=search_params,
)

# One result set per query string; print its scores first, then each
# matching chunk preceded by a separator line.
separator = "-----------------------------------------"
for query_hits in retreived_chunks:
    print(query_hits.distances)
    for match in query_hits:
        print(separator)
        print(match.entity.get('text'))

[6.13486909866333, 5.829623699188232, 4.918405055999756, 4.715087413787842, 4.628559589385986]
-----------------------------------------
\textbf{Hello World!} Today I am learning \LaTeX. %notice how the command will end at the first non-alphabet charecter such as the . after \LaTeX
\LaTeX{} is a great program for writing math. I can write in line math such as $a^2+b^2=c^2$ %$ tells LaTexX to compile as math
. I can also give equations their own space:
\begin{equation} % Creates an equation environment and is compiled as math
\gamma^2+\theta^2=\omega^2
\end{equation}
If I do not leave any blank lines \LaTeX{} will continue  this text without making it into a new paragraph.  Notice how there was no indentation in the text after equation (1).
Also notice how even though I hit enter after that sentence and here $\downarrow$
\LaTeX{} formats the sentence without any break.  Also   look  how      it   doesn't     matter          how    many  spaces     I put     between       my    words.
Fo

In [11]:
# Same search as before with a longer query, to compare how the extra
# terms change the BM25 scores and ranking.
search_params = {
    'params': {'drop_ratio_search': 0.2},
}
num_retreive = 5
query_text = 'How can I use Matlab for research?'

retreived_chunks = client.search(
    collection_name='nyu_libraries_guides',
    data=[query_text],
    anns_field='sparse',
    # Fields to return in search results; sparse field cannot be output
    output_fields=['text'],
    limit=num_retreive,
    search_params=search_params,
)

for result_set in retreived_chunks:
    # Scores of all returned hits for this query, in result order.
    print(result_set.distances)
    for result in result_set:
        chunk_text = result.entity.get('text')
        print("-----------------------------------------")
        print(chunk_text)

[7.651721954345703, 6.582129001617432, 6.455765724182129, 6.118903636932373, 6.002688407897949]
-----------------------------------------
\textbf{Hello World!} Today I am learning \LaTeX. %notice how the command will end at the first non-alphabet charecter such as the . after \LaTeX
\LaTeX{} is a great program for writing math. I can write in line math such as $a^2+b^2=c^2$ %$ tells LaTexX to compile as math
. I can also give equations their own space:
\begin{equation} % Creates an equation environment and is compiled as math
\gamma^2+\theta^2=\omega^2
\end{equation}
If I do not leave any blank lines \LaTeX{} will continue  this text without making it into a new paragraph.  Notice how there was no indentation in the text after equation (1).
Also notice how even though I hit enter after that sentence and here $\downarrow$
\LaTeX{} formats the sentence without any break.  Also   look  how      it   doesn't     matter          how    many  spaces     I put     between       my    words.
F