In [52]:
import os, uuid
from tqdm import tqdm

from pinecone import Pinecone
from pinecone_text.sparse import BM25Encoder
from sentence_transformers import SentenceTransformer

from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

#### Loading Corpus

In [8]:
loader = PyMuPDFLoader("data/E1. ExngTextOnly.pdf")
documents = loader.load()
doc_texts = [document.page_content for document in documents]

In [53]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=256,
    chunk_overlap=0
)
docs = text_splitter.split_documents(documents)
docs

[Document(page_content='Who are the oldest people you know? What are the\noldest things you have (i) in your house, (ii) in your city,\ntown or village? How old are they?\nHave you ever wished that you were older? Have\nyou wished that you could grow up in a hurry?', metadata={'source': 'data/E1. ExngTextOnly.pdf', 'file_path': 'data/E1. ExngTextOnly.pdf', 'page': 0, 'total_pages': 2, 'format': 'PDF 1.7', 'title': '*New Text Document.txt - Notepad', 'author': 'Manoj Kumar M', 'subject': '', 'keywords': '', 'creator': '', 'producer': 'Microsoft: Print To PDF', 'creationDate': "D:20231225192348+04'00'", 'modDate': "D:20231225192348+04'00'", 'trapped': ''}),
 Document(page_content='Mr Willy Wonka begins by inventing Wonka-\nVite, which makes people younger. But Wonka-\nVite is too strong. So some people disappear,\nbecause their age becomes Minus! One person\nactually becomes minus eighty-seven, which', metadata={'source': 'data/E1. ExngTextOnly.pdf', 'file_path': 'data/E1. ExngTextOnly.p

### Sparse Vector Encoding
We rely on `pinecone-text` client to create sparse vector representations as described in [this](https://docs.pinecone.io/docs/encode-sparse-vectors) section of the documentation.

We use the `BM-25 Encoder`, providing our text as a corpus.

#### Initialization

If you want to use the default parameters for BM25Encoder, you can call the default method. The default parameters were fitted on the `MS MARCO passage ranking dataset`.

In [6]:
# bm25 = BM25Encoder.default()

Otherwise fit on your own corpus. BM-25 calculates word frequencies in the corpus.

In [9]:
bm25 = BM25Encoder()
bm25.fit(corpus=doc_texts)

100%|██████████| 2/2 [00:00<00:00, 68.97it/s]


<pinecone_text.sparse.bm25_encoder.BM25Encoder at 0x2128aa92890>

### Dense Vector Encoding
Here we may choose to use either an external API embedding service such as `OpenAI` or `PALM`. Or we could use hugging face models using the `SentenceTransformer` class.

In [20]:
model = SentenceTransformer(
    'multi-qa-MiniLM-L6-cos-v1',
    device='cpu'
)

### Upsert Sparse Dense Vectors
After we have gotten our sparse and dense embeddings, now it is time to upsert them to pinecone. We follow [this](https://docs.pinecone.io/docs/encode-sparse-vectors) section of the documentation.

#### Namespace
It is good practice to upsert to a specific namespace which we will define. This provides isolation over the different spaces we would want to search.

In [56]:
pc = Pinecone(
    api_key=os.getenv("PINECONE_KINYASH"),
    environment='gcp-starter'
    )

In [57]:
index_name = 'hybrid-test'
namespace = 'test-namespace'

index = pc.Index(index_name) 


In [58]:
# INCOMPLETE
for document in tqdm(docs):
    texts = document.page_content

    dense_vector = model.encode(texts).tolist()
    sparse_vector = bm25.encode_documents(texts)
    
    records = [{
        'id': str(uuid.uuid4().int),
        'values': dense_vector,
        'sparse_values': sparse_vector,
        'metadata': {
            'text': texts
        }
    }]

    index.upsert(records, namespace=namespace)

100%|██████████| 15/15 [00:08<00:00,  1.76it/s]


### Query Sparse Dense Vectors
Once our documents have been succesfully upserted to the vector database, we can then query. We borrow from [this](https://docs.pinecone.io/docs/query-sparse-dense-vectors) part of the documentation.

#### Query

In [59]:
question = """
Does Mr Willy Wonka invent WonkaVite?
"""

#### Encoding Query
In our application, we will encode the actual question/text query into an sparse vector encoding. After all, the very point of this exercise is to query a database.

In [60]:
sparse_vector = bm25.encode_documents(question)

In [61]:
dense_vector = model.encode(question).tolist()

#### Querying Pinecone

In [62]:
# Check for presence of records before querying
if index.describe_index_stats()['total_vector_count'] == 0:
    print("No Records Found. Query may not retrieve any matches.")

In [63]:
query_response = index.query(
    top_k=2,
    vector=dense_vector,
    sparse_vector=sparse_vector,
    include_metadata=True,
    namespace=namespace
)

In [64]:
query_response

{'matches': [{'id': '165638081014229358505548218128268843283',
              'metadata': {'text': 'means he’s got to wait eighty-seven years\n'
                                   'before he can come back.\n'
                                   'Mr Willy Wonka must invent a new thing...\n'
                                   'Mr Wonka said, “So once again I rolled\n'
                                   'up my sleeves and set to work. Once\n'
                                   'again I squeezed my brain, searching for '
                                   'the new'},
              'score': 2.89365506,
              'values': []},
             {'id': '202351025601420347215864336724260447892',
              'metadata': {'text': 'Mr Willy Wonka begins by inventing Wonka-\n'
                                   'Vite, which makes people younger. But '
                                   'Wonka-\n'
                                   'Vite is too strong. So some people '
                             

In [51]:
# # Delete the namespace once we are done using it 
# index.delete(delete_all=True, namespace=namespace)

{}