# Adapted from https://milvus.io/docs/multi-vector-search.md

In [2]:
%%capture
from pymilvus import MilvusClient, DataType, Function, FunctionType, AnnSearchRequest, RRFRanker

# Client bound to the local file ./milvus_demo.db; every later cell talks to
# Milvus through it. NOTE(review): a local-file URI presumably means the
# embedded Milvus Lite engine rather than a server — confirm.
client = MilvusClient("./milvus_demo.db")

In [3]:
from sentence_transformers import SentenceTransformer

# Dense embedding model. Its vectors are stored in the `dense` field, which the
# schema declares with dim=768, so the model's output size has to match that.
model = SentenceTransformer('multi-qa-mpnet-base-cos-v1')

In [4]:
# Collection layout: auto-generated primary key, the raw chunk text (analyzer
# enabled), a sparse vector field and a 768-dim dense vector field.
schema = client.create_schema()

field_specs = (
    dict(field_name="id", datatype=DataType.INT64, is_primary=True, auto_id=True),
    dict(field_name="text", datatype=DataType.VARCHAR, max_length=2048, enable_analyzer=True),
    dict(field_name="sparse", datatype=DataType.SPARSE_FLOAT_VECTOR),
    dict(field_name="dense", datatype=DataType.FLOAT_VECTOR, dim=768),
)
for field_spec in field_specs:
    schema.add_field(**field_spec)

# Keep the schema as the cell's last expression so the notebook still shows it.
schema

{'auto_id': False, 'description': '', 'fields': [{'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': True}, {'name': 'text', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 2048, 'enable_analyzer': True}}, {'name': 'sparse', 'description': '', 'type': <DataType.SPARSE_FLOAT_VECTOR: 104>}, {'name': 'dense', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 768}}], 'enable_dynamic_field': False}

In [5]:
# BM25 function attached to the schema: it reads the raw text from the VARCHAR
# field `text` and writes the generated embedding into the
# SPARSE_FLOAT_VECTOR field `sparse`.
bm25_function = Function(
    function_type=FunctionType.BM25,
    name="text_bm25_emb",
    input_field_names=["text"],
    output_field_names=["sparse"],
)

schema.add_function(bm25_function)

{'auto_id': False, 'description': '', 'fields': [{'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': True}, {'name': 'text', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 2048, 'enable_analyzer': True}}, {'name': 'sparse', 'description': '', 'type': <DataType.SPARSE_FLOAT_VECTOR: 104>, 'is_function_output': True}, {'name': 'dense', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 768}}], 'enable_dynamic_field': False, 'functions': [{'name': 'text_bm25_emb', 'description': '', 'type': <FunctionType.BM25: 1>, 'input_field_names': ['text'], 'output_field_names': ['sparse'], 'params': {}}]}

In [6]:
# One index per vector field: an inverted index scored with BM25 for `sparse`,
# and an automatically chosen index with inner-product metric for `dense`.
index_params = client.prepare_index_params()

sparse_index_options = {
    "inverted_index_algo": "DAAT_MAXSCORE",
    "bm25_k1": 1.2,
    "bm25_b": 0.75,
}
index_params.add_index(
    field_name="sparse",
    index_type="SPARSE_INVERTED_INDEX",
    metric_type="BM25",
    params=sparse_index_options,
)

index_params.add_index(
    field_name="dense", index_name="text_dense_index", index_type="AUTOINDEX", metric_type="IP"
)

In [7]:
# Start from a clean slate: remove any collection left over from an earlier
# run before it is created again.
collection_exists = client.has_collection("nyu_libraries_guides")
if collection_exists:
    client.drop_collection("nyu_libraries_guides")

In [8]:
# Create the collection from the schema and index definitions built above.
client.create_collection(
    "nyu_libraries_guides",
    schema=schema,
    index_params=index_params,
)

## Set up chunker for use

In [9]:
from docling.document_converter import DocumentConverter
from docling.chunking import HybridChunker

# docling converter with default settings; the ingestion loop calls
# `converter.convert(source=<page URL>).document` on it.
converter = DocumentConverter()
# Chunker with default settings; each chunk's `.text` becomes one row.
# NOTE(review): the ingestion output warns about a 533-token sequence against
# a 512-token limit, so chunk size and the embedding model's maximum input
# length may not line up — confirm which tokenizer the chunker uses.
chunker = HybridChunker()

In [10]:
import pickle

# Load the crawled guide index. Later cells index `guides` by guide URL and
# iterate each value to get the page URLs handed to the converter.
# NOTE: unpickling can execute arbitrary code, so only load a guides.pickle
# that you produced yourself or otherwise trust.
with open("guides.pickle", "rb") as f:
    guides = pickle.load(f)

In [11]:
# Sanity check: list the page URLs recorded for the LaTeX guide.
latex_pages = guides["https://guides.nyu.edu/LaTeX"]
for page_url in latex_pages:
    print(page_url)

https://guides.nyu.edu/LaTeX


In [12]:
from tqdm import tqdm

# Ingest every page of every guide: convert it with docling, chunk it, embed
# each chunk with `model`, and insert it. Only `text` and `dense` are sent;
# the `sparse` field is the output of the schema's `text_bm25_emb` function.
# To smoke-test on a single guide, loop over
# [("https://guides.nyu.edu/LaTeX", guides["https://guides.nyu.edu/LaTeX"])].
failed = []  # (page URL, exception) for every page or chunk that was skipped
for guide, subjects in tqdm(guides.items()):
    for subject in subjects:
        # A page that cannot be fetched, converted or chunked is skipped as a
        # whole. `except Exception` (instead of a bare `except:`) lets
        # KeyboardInterrupt through, so the run can still be stopped.
        try:
            doc = converter.convert(source=subject).document
            texts = [chunk.text for chunk in chunker.chunk(dl_doc=doc)]
        except Exception as exc:
            failed.append((subject, exc))
            tqdm.write(f"Skipping page {subject} (guide {guide}): {exc!r}")
            continue

        for text in texts:
            # Presumably filters out librarian contact boxes — confirm.
            if "Email Me" in text:
                continue
            # Failures are handled per chunk so that one rejected row does not
            # silently drop the remaining chunks of the same page.
            try:
                client.insert('nyu_libraries_guides', [{'text': text, 'dense': model.encode(text)}])
            except Exception as exc:
                failed.append((subject, exc))
                tqdm.write(f"Skipping one chunk of {subject}: {exc!r}")

  6%|████▊                                                                            | 19/319 [00:21<04:35,  1.09it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (533 > 512). Running this sequence through the model will result in indexing errors
 10%|███████▊                                                                         | 31/319 [00:35<04:37,  1.04it/s]Clashing hyperlinks: 'https://www-degruyter-com.proxy.library.nyu.edu/document/doi/10.7312/orth14675/html' and 'https://search.library.nyu.edu/permalink/01NYU_INST/1d6v258/alma990045458320107876'! Chose 'https://search.library.nyu.edu/permalink/01NYU_INST/1d6v258/alma990045458320107876'
 47%|█████████████████████████████████████▎                                          | 149/319 [01:52<01:31,  1.85it/s]Clashing hyperlinks: 'https://persistent.library.nyu.edu/arch/NYU00819' and 'https://persistent.library.nyu.edu/arch/NYU02589'! Chose 'https://persistent.library.nyu.edu/arc

In [12]:
query = "How can I use Matlab for research?"

# Full-text leg: the raw query string is searched against the `sparse` field.
search_params_sparse = dict(
    data=[query],
    anns_field="sparse",
    param={"drop_ratio_search": 0.2},
    limit=5,
)
sparse_request = AnnSearchRequest(**search_params_sparse)

# Semantic leg: the query is embedded with the same model used at ingestion
# and searched against the `dense` field.
query_embedding = model.encode(query)
search_params_dense = dict(
    data=[query_embedding],
    anns_field="dense",
    param={"nprobe": 10},
    limit=5,
)
dense_request = AnnSearchRequest(**search_params_dense)

In [13]:
# Run both requests in one hybrid search and fuse their rankings with RRF.
reqs = [sparse_request, dense_request]
ranker = RRFRanker()

res = client.hybrid_search(
    collection_name="nyu_libraries_guides",
    reqs=reqs,
    ranker=ranker,
    limit=3,
    output_fields=["text"],  # return the chunk text with each hit
)

# Print the text of every hit, one separator line before each.
separator = "-----------------------------------------"
for hits in res:
    print("Hybrid Search results:")
    for hit in hits:
        print(separator)
        print(hit.entity.get("text"))

Hybrid Search results:
-----------------------------------------
MATLAB  integrates numerical analysis, matrix computation, signal processing, and graphics in an easy to use environment.
- MATLAB is available for students through the Virtual Computer Lab.
- Individual licenses for faculty, staff, and students available through the NYU Computer Store.
-----------------------------------------
1. What is the most important/interesting/astounding finding from my research project?
2. How can I visually share my research with conference attendees? Should I use charts, graphs, photos, images?
3. What kind of information can I convey during my talk that will complement my poster?
-----------------------------------------
Is NMR processing software for Windows, Mac, and Linux.
- TopSpin can be downloaded for free, for academic use, from the vendor website after registering an account with your NYU email address.
- For more NMR freeware/software options, please consult the University of Washing