In [1]:
from pinecone import Pinecone, PodSpec 
from langchain.document_loaders import PyMuPDFLoader


from IPython.display import Markdown, display

  from tqdm.autonotebook import tqdm


In [2]:

from hybrid_pinecone import HybridPinecone

### Load Data
We use LangChain's `PyMuPDFLoader` to read the PDF.

In [10]:
# Location of the source PDF, given relative to the working directory.
filepath = "data/E1. ExngTextOnly.pdf"
# PyMuPDFLoader is the LangChain PDF loader imported at the top of the notebook.
loader = PyMuPDFLoader(filepath)
# load() returns the parsed documents; each exposes its text as `page_content`.
documents = loader.load()

In [11]:
# Keep only the text of each loaded document; these strings are what gets embedded and indexed.
contexts = [document.page_content for document in documents]

### Sparse and Dense Embeddings

In [8]:
from tqdm.auto import tqdm

from transformers import BertTokenizerFast              # Sparse Embeddings
from sentence_transformers import SentenceTransformer   # Dense Embeddings
from collections import Counter
# SPARSE EMBEDDINGS

# Fetch the pretrained BERT tokenizer from the Hugging Face hub.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Worked example: tokenize the first context, capped at 512 tokens.
inputs = tokenizer(
    contexts[0],
    padding=True,
    truncation=True,
    max_length=512,
)
inputs.keys()  # not the last expression of the cell, so nothing is displayed

# Token ids of the example context.
input_ids = inputs['input_ids']

# Sparse representation of the example: token id -> number of occurrences.
sparse_vec = dict(Counter(input_ids))
sparse_vec

def build_dict(input_batch):
    """Turn a batch of token-id sequences into sparse-vector dictionaries.

    Each sequence becomes ``{'indices': [...], 'values': [...]}`` where
    ``indices`` lists the distinct token ids in order of first appearance and
    ``values`` holds the matching occurrence counts as floats.

    Args:
        input_batch: iterable of token-id sequences, one per text.

    Returns:
        A list with one ``{'indices', 'values'}`` dict per input sequence.
    """
    sparse_emb = []
    for token_ids in input_batch:
        counts = Counter(token_ids)
        sparse_emb.append({
            'indices': list(counts.keys()),
            # Counts must be cast to float; per the original author's note,
            # integer values lead to "SparseValuesMissingKeysError: Missing
            # required keys in data in column `sparse_values`" on upsert.
            'values': [float(freq) for freq in counts.values()],
        })
    return sparse_emb

def generate_sparse_vectors(context_batch):
    """
    Create the batch of sparse vectors required by Pinecone for sparse vector storage.

    Args:
        context_batch: list of texts to tokenize.

    Returns:
        A list with one ``{'indices': [...], 'values': [...]}`` dict per text,
        as produced by ``build_dict`` from the BERT token ids.
    """
    # Padding is deliberately disabled: only the plain input_ids lists are used
    # (no tensors are built), and padding would append [PAD] ids to the shorter
    # texts. build_dict would then count those as term frequencies, making a
    # text's sparse vector depend on what else is in its batch.
    inputs = tokenizer(
        context_batch,
        padding=False,
        truncation=True,
        max_length=512
    )['input_ids']
    # create sparse dictionaries
    sparse_embeds = build_dict(inputs)
    return sparse_embeds

In [15]:
# DENSE EMBEDDINGS

# load a sentence transformer model from huggingface
# The repr printed by this cell reports a word embedding dimension of 384 and a
# final Normalize() stage; the index created later uses dimension = 384 to match.
model = SentenceTransformer(
    'multi-qa-MiniLM-L6-cos-v1',
    device='cpu'                    # or cuda, if available
)
# Last expression of the cell: displays the model architecture.
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

### Initialize Vector Database

In [24]:
import os

# The API key is read from the PINECONE_API_KEY environment variable so that it
# is never stored in the notebook (keys are issued at app.pinecone.io).
# A missing variable raises KeyError here rather than failing later with an
# authentication error.
# NOTE(review): a key was previously hardcoded in this cell; treat it as leaked
# and rotate it.
pc = HybridPinecone(
    api_key = os.environ["PINECONE_API_KEY"],
    environment = "gcp-starter"
)

In [3]:
# Name of the Pinecone index that the following cells create and then open.
index_name = "hybrid-test"

In [27]:


# create the index
# NOTE(review): re-running this cell once the index already exists will
# presumably fail; confirm how HybridPinecone.create_index handles that case.
# NOTE(review): "dotproduct" is presumably required for sparse-dense queries;
# confirm against the Pinecone documentation.
pc.create_index(
   index_name = index_name,
   dimension = 384,  # dimensionality of dense model
   pod_type='p1',
   metric = "dotproduct",
)

<Response [201]>

In [4]:
import os

# Pinecone client used to open the index for upserts and queries.
# The API key is read from the PINECONE_API_KEY environment variable so that it
# is never stored in the notebook; a missing variable raises KeyError.
# NOTE(review): a key was previously hardcoded in this cell; treat it as leaked
# and rotate it.
# Note that this rebinds `pc`, replacing the HybridPinecone client created earlier.
pc = Pinecone(
    api_key = os.environ["PINECONE_API_KEY"],
    environment = "gcp-starter"
)
# Handle to the index named by `index_name`.
index = pc.Index(name=index_name)

In [32]:
batch_size = 50

for i in tqdm(range(0, len(contexts), batch_size)):
    # Slice out the next batch; the end is clamped so the last batch may be short.
    i_end = min(i + batch_size, len(contexts))
    context_batch = contexts[i:i_end]

    # Ids are the passage positions as strings; the passage text travels as metadata.
    ids = [str(x) for x in range(i, i_end)]
    meta = [{'context': context} for context in context_batch]

    # Dense and sparse embeddings for the same batch of passages.
    dense_embeds = model.encode(context_batch).tolist()
    sparse_embeds = generate_sparse_vectors(context_batch)

    # One record per passage, combining id, both embeddings and metadata.
    vectors = [
        {
            'id': _id,
            'sparse_values': sparse,
            'values': dense,
            'metadata': metadata,
        }
        for _id, sparse, dense, metadata in zip(ids, sparse_embeds, dense_embeds, meta)
    ]

    # Send the batch to the hybrid index.
    index.upsert(vectors)

# Display the index statistics once every batch has been sent.
index.describe_index_stats()

  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:03<00:00,  3.61s/it]


{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [37]:
def hybrid_scale(dense, sparse, alpha: float):
    """Weight a dense/sparse vector pair for hybrid search.

    Dense values are multiplied by ``alpha`` and sparse values by
    ``1 - alpha``, so ``alpha = 1`` keeps only the dense contribution and
    ``alpha = 0`` keeps only the sparse one.

    Args:
        dense: sequence of dense vector values.
        sparse: dict with ``'indices'`` and ``'values'`` entries.
        alpha: weight of the dense vector, between 0 and 1 inclusive.

    Returns:
        ``(hdense, hsparse)``: the scaled dense list and a new sparse dict
        with the original indices and the scaled values.

    Raises:
        ValueError: if ``alpha`` is outside [0, 1], including NaN.
    """
    # A chained comparison is used so that NaN is rejected as well: every
    # comparison with NaN is False, so `alpha < 0 or alpha > 1` would let it through.
    if not 0 <= alpha <= 1:
        raise ValueError("Alpha must be between 0 and 1")
    # scale sparse and dense vectors to create hybrid search vecs
    sparse_weight = 1 - alpha
    hsparse = {
        'indices': sparse['indices'],
        'values': [v * sparse_weight for v in sparse['values']],
    }
    hdense = [v * alpha for v in dense]
    return hdense, hsparse


def hybrid_query(question, top_k, alpha):
    """Run a sparse-dense query against the index for a single question.

    Args:
        question: query text.
        top_k: number of matches to request.
        alpha: dense/sparse weighting passed to ``hybrid_scale``.

    Returns:
        The response of ``index.query``, with metadata included.
    """
    # Both encoders work on batches, so wrap the question in a list and unwrap
    # the single result.
    question_sparse = generate_sparse_vectors([question])[0]
    question_dense = model.encode([question]).tolist()[0]

    # Apply the alpha weighting to both representations.
    scaled_dense, scaled_sparse = hybrid_scale(question_dense, question_sparse, alpha)

    return index.query(
        top_k=top_k,
        vector=scaled_dense,
        sparse_vector=scaled_sparse,
        include_metadata=True,
    )

In [38]:
# Example question to run against the index.
question = "What did willy wonka do with vitawonk?"

# %% [markdown]
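# First, we run a hybrid search with alpha = 0.4, which scales the dense (semantic) vector by 0.4 and the sparse (keyword) vector by 0.6. Setting alpha to 1 would give a pure semantic search.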

# %%
# Request the top 3 matches; alpha=0.4 scales dense values by 0.4 and sparse values by 0.6.
hybrid_query(question, top_k=3, alpha=0.4)

{'matches': [{'id': '0',
              'metadata': {'context': 'Who are the oldest people you know? '
                                      'What are the\n'
                                      'oldest things you have (i) in your '
                                      'house, (ii) in your city,\n'
                                      'town or village? How old are they?\n'
                                      'Have you ever wished that you were '
                                      'older? Have\n'
                                      'you wished that you could grow up in a '
                                      'hurry?\n'
                                      'Mr Willy Wonka begins by inventing '
                                      'Wonka-\n'
                                      'Vite, which makes people younger. But '
                                      'Wonka-\n'
                                      'Vite is too strong. So some people '
                                  