## Dataset preparation

In [1]:
from datasets import load_dataset

# Step 1: Load the SQuAD dataset
dataset = load_dataset("squad")

# Step 2: Extract unique contexts from the dataset
data = [item["context"] for item in dataset["train"]]
texts = list(set(data))

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
texts[0]

"Much of Yale University's staff, including most maintenance staff, dining hall employees, and administrative staff, are unionized. Clerical and technical employees are represented by Local 34 of UNITE HERE and service and maintenance workers by Local 35 of the same international. Together with the Graduate Employees and Students Organization (GESO), an unrecognized union of graduate employees, Locals 34 and 35 make up the Federation of Hospital and University Employees. Also included in FHUE are the dietary workers at Yale-New Haven Hospital, who are members of 1199 SEIU. In addition to these unions, officers of the Yale University Police Department are members of the Yale Police Benevolent Association, which affiliated in 2005 with the Connecticut Organization for Public Safety Employees. Finally, Yale security officers voted to join the International Union of Security, Police and Fire Professionals of America in fall 2010 after the National Labor Relations Board ruled they could not

## Embed dataset

In [3]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

def batch_iterate(lst, batch_size):
    for i in range(0, len(lst), batch_size):
        yield lst[i : i + batch_size]

class EmbedData:
    def __init__(self, 
                 embed_model_name="nomic-ai/nomic-embed-text-v1.5",
                 batch_size=32):
        
        self.embed_model_name = embed_model_name
        self.embed_model = self._load_embed_model()
        self.batch_size = batch_size
        self.embeddings = []
        
    def _load_embed_model(self):
        embed_model = HuggingFaceEmbedding(model_name=self.embed_model_name,
                                           trust_remote_code=True,
                                           cache_folder='./hf_cache')
        return embed_model
    
    def generate_embedding(self, context):
        return self.embed_model.get_text_embedding_batch(context)
    
    def embed(self, contexts):
        self.contexts = contexts
        
        for batch_context in tqdm(batch_iterate(contexts, self.batch_size),
                                  total=len(contexts)//self.batch_size,
                                  desc="Embedding data in batches"):
                                  
            batch_embeddings = self.generate_embedding(batch_context)
            
            self.embeddings.extend(batch_embeddings)

In [4]:
from tqdm.auto import tqdm

batch_size = 32

embeddata = EmbedData(batch_size=batch_size)

<All keys matched successfully>


In [9]:
embeddata.embed(texts[:100]) # Embed the first two contexts

Embedding data in batches: 4it [00:48, 12.08s/it]                       


In [10]:
# import pickle

# with open("embeddata_full.pickle", "rb") as h:
#     embeddata = pickle.load(h)

In [12]:
embeddata.contexts[0]

"Much of Yale University's staff, including most maintenance staff, dining hall employees, and administrative staff, are unionized. Clerical and technical employees are represented by Local 34 of UNITE HERE and service and maintenance workers by Local 35 of the same international. Together with the Graduate Employees and Students Organization (GESO), an unrecognized union of graduate employees, Locals 34 and 35 make up the Federation of Hospital and University Employees. Also included in FHUE are the dietary workers at Yale-New Haven Hospital, who are members of 1199 SEIU. In addition to these unions, officers of the Yale University Police Department are members of the Yale Police Benevolent Association, which affiliated in 2005 with the Connecticut Organization for Public Safety Employees. Finally, Yale security officers voted to join the International Union of Security, Police and Fire Professionals of America in fall 2010 after the National Labor Relations Board ruled they could not

## Vector database

In [18]:
from qdrant_client import QdrantClient, models

class QdrantVDB:

    def __init__(self, collection_name, vector_dim=768, batch_size=512):
        self.collection_name = collection_name
        self.batch_size = batch_size
        self.vector_dim = vector_dim
    
    def define_client(self):
        self.client = QdrantClient(url="http://localhost:6333",
                                   prefer_grpc=True)
    
    def create_collection(self):
        
        if not self.client.collection_exists(collection_name=self.collection_name):

            self.client.create_collection(collection_name=self.collection_name,
                                          
                                          vectors_config=models.VectorParams(
                                                              size=self.vector_dim,
                                                              distance=models.Distance.DOT,
                                                              on_disk=True),
                                          
                                          optimizers_config=models.OptimizersConfigDiff(
                                                                            default_segment_number=5,
                                                                            indexing_threshold=0)
                                         )
    
    def ingest_data(self, embeddata):
    
        for batch_context, batch_embeddings in tqdm(zip(batch_iterate(embeddata.contexts, self.batch_size), 
                                                        batch_iterate(embeddata.embeddings, self.batch_size)), 
                                                    total=len(embeddata.contexts)//self.batch_size, 
                                                    desc="Ingesting in batches"):
        
            self.client.upload_collection(collection_name=self.collection_name,
                                        vectors=batch_embeddings,
                                        payload=[{"context": context} for context in batch_context])

        self.client.update_collection(collection_name=self.collection_name,
                                    optimizer_config=models.OptimizersConfigDiff(indexing_threshold=20000)
                                    )

In [19]:
database = QdrantVDB("squad_collection")
database.define_client()
database.create_collection()
database.ingest_data(embeddata)

Ingesting in batches: 1it [00:00,  5.51it/s]


## Retrievar class

In [21]:
import time

class Retriever:

    def __init__(self, vector_db, embeddata):
        
        self.vector_db = vector_db
        self.embeddata = embeddata
    
    def search(self, query):
        query_embedding = self.embeddata.embed_model.get_query_embedding(query)
            
        # Start the timer
        start_time = time.time()
        
        result = self.vector_db.client.search(
            collection_name=self.vector_db.collection_name,
            
            query_vector=query_embedding,
            
            search_params=models.SearchParams(
                quantization=models.QuantizationSearchParams(
                    ignore=True,
                    rescore=True,
                    oversampling=2.0,
                )
            ),
            
            timeout=1000,
        )
        
        # End the timer
        end_time = time.time()
        elapsed_time = end_time - start_time

        print(f"Execution time for the search: {elapsed_time:.4f} seconds")

        return result

In [22]:
Retriever(database, embeddata).search("Sample query")[0]

Execution time for the search: 0.0597 seconds


ScoredPoint(id='34de81d7-89ea-4435-a86b-8e755a9570fe', version=1, score=0.4788297414779663, payload={'context': 'Phonology also includes topics such as phonotactics (the phonological constraints on what sounds can appear in what positions in a given language) and phonological alternation (how the pronunciation of a sound changes through the application of phonological rules, sometimes in a given order which can be feeding or bleeding,) as well as prosody, the study of suprasegmentals and topics such as stress and intonation.'}, vector=None, shard_key=None, order_value=None)