# Adapted from https://milvus.io/docs/full-text-search.md

In [1]:
%%capture
from pymilvus import MilvusClient, DataType, Function, FunctionType

# Open the Milvus client against the database file in the working directory.
client = MilvusClient(uri="./milvus_demo.db")

In [2]:
# Collection schema: an auto-generated integer primary key, the raw chunk text
# (stored with the analyzer enabled), and a sparse float vector field.
schema = client.create_schema()

schema.add_field(
    field_name="id",
    datatype=DataType.INT64,
    is_primary=True,
    auto_id=True,
)
schema.add_field(
    field_name="text",
    datatype=DataType.VARCHAR,
    max_length=2048,
    enable_analyzer=True,
)
# Kept as the final expression so the notebook still echoes the schema.
schema.add_field(
    field_name="sparse",
    datatype=DataType.SPARSE_FLOAT_VECTOR,
)

{'auto_id': False, 'description': '', 'fields': [{'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': True}, {'name': 'text', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 2048, 'enable_analyzer': True}}, {'name': 'sparse', 'description': '', 'type': <DataType.SPARSE_FLOAT_VECTOR: 104>}], 'enable_dynamic_field': False}

In [3]:
# BM25 function: takes the raw "text" VARCHAR field as input and writes its
# output into the "sparse" SPARSE_FLOAT_VECTOR field declared in the schema.
bm25_function = Function(
    function_type=FunctionType.BM25,
    name="text_bm25_emb",
    input_field_names=["text"],
    output_field_names=["sparse"],
)

# Kept as the final expression so the notebook still echoes the schema.
schema.add_function(bm25_function)

{'auto_id': False, 'description': '', 'fields': [{'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': True}, {'name': 'text', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 2048, 'enable_analyzer': True}}, {'name': 'sparse', 'description': '', 'type': <DataType.SPARSE_FLOAT_VECTOR: 104>, 'is_function_output': True}], 'enable_dynamic_field': False, 'functions': [{'name': 'text_bm25_emb', 'description': '', 'type': <FunctionType.BM25: 1>, 'input_field_names': ['text'], 'output_field_names': ['sparse'], 'params': {}}]}

In [4]:
# Settings for the sparse inverted index that backs BM25 scoring.
bm25_index_settings = {
    "inverted_index_algo": "DAAT_MAXSCORE",
    "bm25_k1": 1.2,   # BM25 k1 parameter
    "bm25_b": 0.75,   # BM25 b parameter
}

index_params = client.prepare_index_params()
index_params.add_index(
    field_name="sparse",
    index_type="SPARSE_INVERTED_INDEX",
    metric_type="BM25",
    params=bm25_index_settings,
)

In [5]:
# Start from a clean slate: remove any earlier copy of the collection so that
# re-running the notebook does not fail or duplicate data.
collection_exists = client.has_collection("nyu_libraries_guides")
if collection_exists:
    client.drop_collection("nyu_libraries_guides")

In [6]:
# Create the collection from the schema (fields + BM25 function) and the
# sparse index settings prepared in the earlier cells.
client.create_collection(
    "nyu_libraries_guides",
    schema=schema,
    index_params=index_params,
)

## Set up the document converter and chunker

In [7]:
from docling.document_converter import DocumentConverter
from docling.chunking import HybridChunker

# Both objects are created once with default settings and reused for every page.
chunker = HybridChunker()        # splits a converted document into text chunks
converter = DocumentConverter()  # turns a source (here: a URL) into a document

In [8]:
import pickle

# Load the previously saved guides mapping from disk.
# NOTE(review): unpickling can execute arbitrary code, so only load a
# guides.pickle file that you produced yourself or otherwise trust.
with open("guides.pickle", mode="rb") as pickle_file:
    guides = pickle.load(pickle_file)

In [9]:
# Sanity check: list every page URL stored under the LaTeX guide.
latex_guide_pages = guides["https://guides.nyu.edu/LaTeX"]
for page_url in latex_guide_pages:
    print(page_url)

https://guides.nyu.edu/LaTeX


In [10]:
from tqdm import tqdm

# Convert every page of every guide, split it into chunks, and insert the
# chunks into the collection.
#
# To smoke-test on a single guide, iterate over
# ["https://guides.nyu.edu/LaTeX"] instead of guides.keys().
#
# Failures are recorded and reported rather than silently discarded, and only
# `Exception` is caught so that Ctrl-C / kernel interrupt still stops the loop.
failed_sources = []  # (page URL, error repr) for each page or chunk that failed

for guide in tqdm(guides.keys()):
    for subject in guides[guide]:
        # Conversion and chunking fail or succeed together for a page.
        try:
            doc = converter.convert(source=subject).document
            texts = [chunk.text for chunk in chunker.chunk(dl_doc=doc)]
        except Exception as exc:
            failed_sources.append((subject, repr(exc)))
            # tqdm.write prints without breaking the progress bar.
            tqdm.write(f"Skipping {subject}: {exc!r}")
            continue

        for text in texts:
            # Chunks containing "Email Me" are left out of the index
            # (presumably contact-box boilerplate; confirm against the guides).
            if "Email Me" in text:
                continue
            # Insert per chunk so one rejected chunk does not discard the
            # remaining chunks of the same page.
            try:
                client.insert('nyu_libraries_guides', [{'text': text}])
            except Exception as exc:
                failed_sources.append((subject, repr(exc)))
                tqdm.write(f"Could not insert a chunk from {subject}: {exc!r}")

if failed_sources:
    print(f"{len(failed_sources)} page/chunk failure(s); see `failed_sources` for details.")

  6%|████████▏                                                                                                                                 | 19/319 [00:05<01:23,  3.61it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (571 > 512). Running this sequence through the model will result in indexing errors
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 319/319 [01:24<00:00,  3.80it/s]


In [11]:
# Full-text (BM25) search for a single keyword-style query.
query = "Matlab?"
num_retreive = 3  # number of top-scoring chunks to return
search_params = {"params": {"drop_ratio_search": 0.2}}

# The query is passed as raw text in `data`. Only the stored text is requested
# back, since the sparse field cannot be returned as an output field.
retreived_chunks = client.search(
    collection_name="nyu_libraries_guides",
    data=[query],
    anns_field="sparse",
    output_fields=["text"],
    limit=num_retreive,
    search_params=search_params,
)

# Print the score and text of every hit in every returned result list.
for hits in retreived_chunks:
    for hit in hits:
        entity = hit.entity
        print("-----------------------------------------")
        print(f"BM-25 score is {entity.get('distance')}")
        print(f"Retreived text:\n{entity.get('text')}")

-----------------------------------------
BM-25 score is 9.077797889709473
Retreived text:
MATLAB  integrates numerical analysis, matrix computation, signal processing, and graphics in an easy to use environment.
- MATLAB is available for students through the Virtual Computer Lab.
- Individual licenses for faculty, staff, and students available through the NYU Computer Store.
-----------------------------------------
BM-25 score is 7.517805099487305
Retreived text:
A guide to download/access scientific software packages on the New York campus.
- Home
- Cambridge Structural Database
- ChemDraw
- EndNote
- Gaussian and Gaussview
- Logger Pro
- Mathematica
- MATLAB
- OriginPro
- Signals ELN
- TopSpin
- Zotero
-----------------------------------------
BM-25 score is 5.754456043243408
Retreived text:
Resources and support for statistical and numerical data analysis
- Home
- SPSS
- Stata
- SAS
- R
- MATLAB
- JMP
- Python
- Excel
- SQL
- Merging Data Sets
- Reshaping Data Sets
- Choosing a St

In [12]:
# Same BM25 search as the previous cell, but with a natural-language question
# instead of a single keyword.
query = "How can I use Matlab for research?"
num_retreive = 3  # number of top-scoring chunks to return
search_params = {"params": {"drop_ratio_search": 0.2}}

# The query is passed as raw text in `data`. Only the stored text is requested
# back, since the sparse field cannot be returned as an output field.
retreived_chunks = client.search(
    collection_name="nyu_libraries_guides",
    data=[query],
    anns_field="sparse",
    output_fields=["text"],
    limit=num_retreive,
    search_params=search_params,
)

# Print the score and text of every hit in every returned result list.
for hits in retreived_chunks:
    for hit in hits:
        entity = hit.entity
        print("-----------------------------------------")
        print(f"BM-25 score is {entity.get('distance')}")
        print(f"Retreived text:\n{entity.get('text')}")

-----------------------------------------
BM-25 score is 16.000640869140625
Retreived text:
1. What is the most important/interesting/astounding finding from my research project?
2. How can I visually share my research with conference attendees? Should I use charts, graphs, photos, images?
3. What kind of information can I convey during my talk that will complement my poster?
-----------------------------------------
BM-25 score is 11.837261199951172
Retreived text:
- NYU Libraries for Beginners
- Surviving the Stacks: How to Navigate and Use the Physical Collections in Bobst Library
- Managing Your Research and Bibliographies with Citation Tools (EndNote, RefWorks, Zotero)
- U.S. Research Libraries for International Students
- Where in the World Can I Get This Book?: A Guide to Bobst Library Access Services
- Introduction to Data Services
- Finding Non-English Language Materials at NYU and Beyond
- Introduction to NYU Special Collections
-----------------------------------------
BM-25