In [20]:
from llama_index.readers.file import PyMuPDFReader
from llama_index.core.node_parser import SentenceSplitter
from qdrant_client import QdrantClient
from llama_index.core.schema import TextNode, NodeWithScore
from sentence_transformers import SentenceTransformer
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.core.vector_stores import VectorStoreQuery

# Sentence-transformers checkpoint used for every `model.encode(...)` call in this notebook.
EMBED_MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(EMBED_MODEL_NAME)

In [2]:
# Load the PDF; `documents` is iterated below for each entry's `.text` and `.metadata`.
pdf_path = "input_file/TFM_Memoria.pdf"
loader = PyMuPDFReader()
documents = loader.load(file_path=pdf_path)

In [3]:
# Split every loaded document into text chunks and wrap each chunk in a TextNode
# that carries (a copy of) its source document's metadata.
text_parser = SentenceSplitter(
    chunk_size=1024,
    # separator=" ",
)

text_chunks = []
# doc_idxs[i] is the index in `documents` of the document that text_chunks[i]
# came from; it is used below to attach the right metadata to each node.
doc_idxs = []
for doc_idx, doc in enumerate(documents):
    cur_text_chunks = text_parser.split_text(doc.text)
    text_chunks.extend(cur_text_chunks)
    doc_idxs.extend([doc_idx] * len(cur_text_chunks))

nodes = []
for text_chunk, doc_idx in zip(text_chunks, doc_idxs):
    node = TextNode(
        text=text_chunk,
    )
    # Copy the dict: assigning the document's metadata object directly would make
    # every node from the same document (and the document itself) share one
    # mutable dict, so editing one node's metadata would silently change the rest.
    node.metadata = dict(documents[doc_idx].metadata)
    nodes.append(node)

In [4]:
# Compute and attach an embedding to every node. The text that is embedded is the
# node content rendered with metadata_mode="all".
for node in nodes:
    node_embedding = model.encode(node.get_content(metadata_mode="all"))
    # SentenceTransformer.encode returns a numpy array by default, while the node's
    # `embedding` field is a list of floats; store a plain list so the node stays
    # consistent with its declared type (and remains serialisable).
    node.embedding = node_embedding.tolist()

In [None]:
# Create a Qdrant client at the ':memory:' location, wrap it in a LlamaIndex vector
# store, and insert the embedded nodes. The bare last expression displays whatever
# `add` returns as the cell output.
collection = 'RAG'
client = QdrantClient(location=':memory:')
vector_store = QdrantVectorStore(
    collection_name=collection,
    client=client,
)
vector_store.add(nodes)

In [11]:
# Embed the question with the same model that produced the node embeddings.
query_str = "Which algorithms were used?"
# encode() returns a numpy array by default; VectorStoreQuery.query_embedding is a
# list of floats, so convert before it is handed to the vector store.
query_embedding = model.encode(query_str).tolist()

In [19]:
# Run a top-5 similarity query against the vector store and collect the text of
# each returned node.
query_mode = "default"
# query_mode = "sparse"
# query_mode = "hybrid"

vector_store_query = VectorStoreQuery(
    query_embedding=query_embedding, similarity_top_k=5, mode=query_mode
)
query_result = vector_store.query(vector_store_query)

# NOTE(review): `nodes` appears to be optional on the result object (like
# `similarities`); treat a missing list as "no hits" - confirm against the API.
retrieved_nodes = list(query_result.nodes or [])
hits = [retrieved.get_content() for retrieved in retrieved_nodes]

# Show the last returned node. Taking it from the result (rather than from a loop
# variable left over after iteration) avoids displaying a stale `node` from an
# earlier cell when the query returns nothing.
retrieved_nodes[-1] if retrieved_nodes else None


TextNode(id_='1c638ed9-99b9-44c7-b57e-c3b4d5287346', embedding=None, metadata={'total_pages': 33, 'file_path': 'input_file/TFM_Memoria.pdf', 'source': '20'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text='in which given a location and 2 mutated sequences that come from the same sequence are crossed\nover. An iteration of this algorithm is a mutation for every position and all the possible crossovers.\nAfter an iteration, all the resulting sequences are evaluated and the 200 best are chosen. This value\nis a parameter and can be changed, the time of the optimization with the genetic algorithm will\nincrease exponentially as this parameter goes up. This high number was decided because of the\nthought that to get to the customer journey with the best score you probably have to go through\nnot-so-optimal sequences. This process is done 15 times. This value is a parameter and can be\nchange

In [21]:
from llama_index.core import QueryBundle
from llama_index.core.retrievers import BaseRetriever
from typing import Any, List, Optional


class VectorDBRetriever(BaseRetriever):
    """Retriever over a Qdrant vector store.

    Embeds the query string with ``embed_model``, runs a ``VectorStoreQuery``
    against the configured vector store, and wraps every returned node in a
    ``NodeWithScore``.
    """

    def __init__(
        self,
        vector_store: QdrantVectorStore,
        embed_model: Any,
        query_mode: str = "default",
        similarity_top_k: int = 2,
    ) -> None:
        """Store the retrieval parameters.

        Args:
            vector_store: Vector store queried by ``_retrieve``.
            embed_model: Object exposing ``encode(text)`` that returns the query
                embedding (e.g. the ``SentenceTransformer`` used in this notebook).
            query_mode: Value passed as ``mode`` to ``VectorStoreQuery``.
            similarity_top_k: Value passed as ``similarity_top_k`` to
                ``VectorStoreQuery``.
        """
        self._vector_store = vector_store
        self._embed_model = embed_model
        self._query_mode = query_mode
        self._similarity_top_k = similarity_top_k
        super().__init__()

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Return the nodes matching ``query_bundle.query_str``.

        Args:
            query_bundle: Query whose ``query_str`` is embedded and searched for.

        Returns:
            One ``NodeWithScore`` per node in the query result, in result order.
            ``score`` is the matching entry of the result's ``similarities``, or
            ``None`` when the store returned no similarities.
        """
        query_embedding = self._embed_model.encode(query_bundle.query_str)
        # Array-like outputs (e.g. numpy arrays) are converted to a plain list,
        # which is what VectorStoreQuery.query_embedding is declared to hold.
        if hasattr(query_embedding, "tolist"):
            query_embedding = query_embedding.tolist()

        vector_store_query = VectorStoreQuery(
            query_embedding=query_embedding,
            similarity_top_k=self._similarity_top_k,
            mode=self._query_mode,
        )
        # Query the store this retriever was constructed with, not whatever
        # `vector_store` global happens to exist in the notebook namespace.
        query_result = self._vector_store.query(vector_store_query)

        nodes_with_scores = []
        # NOTE(review): `nodes` is treated as possibly None, mirroring the
        # `similarities is not None` check below - confirm against the API.
        for index, node in enumerate(query_result.nodes or []):
            score: Optional[float] = None
            if query_result.similarities is not None:
                score = query_result.similarities[index]
            nodes_with_scores.append(NodeWithScore(node=node, score=score))

        return nodes_with_scores

In [22]:
# Build the retriever over the populated vector store, embedding queries with the
# same sentence-transformers model and returning the top 2 matches.
retriever = VectorDBRetriever(
    vector_store=vector_store,
    embed_model=model,
    query_mode="default",
    similarity_top_k=2,
)