In [34]:
import os, uuid
from tqdm import tqdm
from pinecone import Pinecone
from pinecone_text.sparse import BM25Encoder
from langchain.document_loaders import PyMuPDFLoader
from sentence_transformers import SentenceTransformer

#### Loading Corpus

In [8]:
# Load the PDF and collect the raw text of every document the loader returns.
pdf_path = "data/E1. ExngTextOnly.pdf"
loader = PyMuPDFLoader(pdf_path)
documents = loader.load()

# Plain-text corpus, in the same order as `documents`.
doc_texts = [doc.page_content for doc in documents]

### Sparse Vector Encoding
We rely on the `pinecone-text` client to create sparse vector representations, as described in [this](https://docs.pinecone.io/docs/encode-sparse-vectors) section of the documentation.

We use the `BM25Encoder`, providing our own text as the corpus.

#### Initialization

If you want to use the default parameters for `BM25Encoder`, call `BM25Encoder.default()`. The default parameters were fitted on the `MS MARCO passage ranking dataset`.

In [6]:
# Alternative (disabled): use the pre-fitted default parameters instead of
# fitting on our own corpus in the next cell.
# bm25 = BM25Encoder.default()

Otherwise fit on your own corpus. BM-25 calculates word frequencies in the corpus.

In [9]:
# Start from an unfitted encoder and fit it on our own corpus.
bm25 = BM25Encoder()

# Kept as the last expression so the cell still echoes the fitted encoder.
bm25.fit(doc_texts)

100%|██████████| 2/2 [00:00<00:00, 68.97it/s]


<pinecone_text.sparse.bm25_encoder.BM25Encoder at 0x2128aa92890>

### Dense Vector Encoding
Here we may either use an external embedding API such as `OpenAI` or `PaLM`, or run a Hugging Face model locally through the `SentenceTransformer` class.

In [20]:
# Dense embedding model, loaded by checkpoint name and pinned to the CPU.
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1', device='cpu')

### Upsert Sparse Dense Vectors
Now that we have our sparse and dense embeddings, it is time to upsert them to Pinecone. We follow [this](https://docs.pinecone.io/docs/encode-sparse-vectors) section of the documentation.

In [17]:
# Name of the Pinecone index this notebook writes to and queries.
index_name = 'hybrid-test'

# The API key comes from the PINECONE_KINYASH environment variable, so it is
# never hard-coded in the notebook.
pc = Pinecone(api_key=os.environ.get("PINECONE_KINYASH"), environment='gcp-starter')

# Handle used by the upsert and query cells below.
index = pc.Index(index_name)


In [35]:
# INCOMPLETE
# Build one hybrid record (dense + sparse vector, plus the source text as
# metadata) per document, then upsert the records in batches rather than
# issuing one network request per record.
UPSERT_BATCH_SIZE = 100  # records per upsert request; tune to stay within Pinecone's request limits

records = []
for document in tqdm(documents):
    text = document.page_content

    # Blank pages (e.g. scanned images without a text layer) carry no tokens
    # to embed, so skip them instead of upserting empty vectors.
    if not text.strip():
        continue

    records.append({
        # Random, collision-resistant id; note that re-running this cell
        # therefore inserts new records rather than overwriting old ones.
        'id': str(uuid.uuid4().int),
        'values': model.encode(text).tolist(),
        'sparse_values': bm25.encode_documents(text),
        'metadata': {
            'text': text
        }
    })

for start in range(0, len(records), UPSERT_BATCH_SIZE):
    index.upsert(records[start:start + UPSERT_BATCH_SIZE])

100%|██████████| 2/2 [00:01<00:00,  1.30it/s]


### Query Sparse Dense Vectors
Once our documents have been successfully upserted to the vector database, we can then query. We borrow from [this](https://docs.pinecone.io/docs/query-sparse-dense-vectors) part of the documentation.

#### Query

In [37]:
# Natural-language query; the surrounding newlines are part of the string.
question = "\nDoes Mr Willy Wonka invent WonkaVite?\n"

#### Encoding Query
In our application, we will encode the actual question/text query into both a sparse and a dense vector. After all, the very point of this exercise is to query a database.

In [38]:
# Queries must go through the query-side encoder: `encode_documents` is meant
# for the records stored in the index, while `encode_queries` produces the
# matching query-side BM25 weights.
sparse_vector = bm25.encode_queries(question)

In [39]:
# Embed the question with the same model used for the documents, then convert
# the result to a plain list for the query call.
question_embedding = model.encode(question)
dense_vector = question_embedding.tolist()

#### Querying Pinecone

In [40]:
# Warn up front when the index reports zero vectors, since a query would then
# have nothing to match against.
index_stats = index.describe_index_stats()
if index_stats['total_vector_count'] == 0:
    print("No Records Found. Query may not retrieve any matches.")

In [45]:
# Hybrid query: the dense and sparse encodings of the question are sent
# together, asking for the two best matches along with their metadata.
query_params = {
    'vector': dense_vector,
    'sparse_vector': sparse_vector,
    'top_k': 2,
    'include_metadata': True,
}
query_response = index.query(**query_params)

In [46]:
# Bare expression: shows the query response as the cell's output.
query_response

{'matches': [{'id': '287940271558922032561645562711650873195',
              'metadata': {'text': 'Who are the oldest people you know? What '
                                   'are the\n'
                                   'oldest things you have (i) in your house, '
                                   '(ii) in your city,\n'
                                   'town or village? How old are they?\n'
                                   'Have you ever wished that you were older? '
                                   'Have\n'
                                   'you wished that you could grow up in a '
                                   'hurry?\n'
                                   'Mr Willy Wonka begins by inventing Wonka-\n'
                                   'Vite, which makes people younger. But '
                                   'Wonka-\n'
                                   'Vite is too strong. So some people '
                                   'disappear,\n'
                          