In [1]:
from collections import Counter
from langchain_voyageai import VoyageAIEmbeddings
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore
import os 
from dotenv import load_dotenv

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
load_dotenv()

True

In [3]:
from datasets import load_dataset  # !pip install datasets
pubmed = load_dataset(
   'pubmed_qa',
   'pqa_labeled',
   split='train'
)
pubmed

Dataset({
    features: ['pubid', 'question', 'context', 'long_answer', 'final_decision'],
    num_rows: 1000
})

In [4]:
contexts = []
# loop through the context passages
for record in pubmed['context']:
   # join context passages for each question and append to contexts list
   contexts.append('\n'.join(record['contexts']))
# view some of the contexts
for context in contexts[:2]:
   print(f"{context[:300]}...")

Programmed cell death (PCD) is the regulated death of cells within an organism. The lace plant (Aponogeton madagascariensis) produces perforations in its leaves through PCD. The leaves of the plant consist of a latticework of longitudinal and transverse veins enclosing areoles. PCD occurs in the cel...
Assessment of visual acuity depends on the optotypes used for measurement. The ability to recognize different optotypes differs even if their critical details appear under the same visual angle. Since optotypes are evaluated on individuals with good visual acuity and without eye disorders, differenc...


In [46]:
from transformers import BertTokenizer

# load bert tokenizer from huggingface
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased'
)

In [47]:
model = VoyageAIEmbeddings(
    voyage_api_key=os.getenv("VOYAGE_API_KEY"), model="voyage-3"
)

In [48]:

def build_dict(input_batch):
  # store a batch of sparse embeddings
    sparse_emb = []
    # iterate through input batch
    for token_ids in input_batch:
        # convert the input_ids list to a dictionary of key to frequency values
        d = dict(Counter(token_ids))
        tokenids = list(set(token_ids))
        # remove special tokens and append sparse vectors to sparse_emb list
        # sparse_emb.append({key: d[key] for key in d if key not in [101, 102, 103, 0]})
        sparse_emb.append({"indices":tokenids, "values":[float(d[id]) for id in tokenids]})
    # return sparse_emb list
    return sparse_emb

In [49]:
def generate_sparse_vectors(context_batch):
    input_ids = tokenizer(
    context_batch, padding=True, truncation=True,
     max_length=512
)["input_ids"]
    sparse_embeds = build_dict(input_ids)
    return sparse_embeds
 

In [50]:
s = generate_sparse_vectors(contexts)
print( f""" {len(s[0]["values"])}
{len(s[0]["indices"])}""")



 199
199


In [51]:
from pinecone_text.sparse import BM25Encoder

# or from pinecone_text.sparse import SpladeEncoder if you wish to work with SPLADE

# use default tf-idf values
bm25_encoder = BM25Encoder().default()

In [52]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\abdullah.alzariqi\AppData\Roaming\nltk_data..
[nltk_data]     .
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [53]:

# fit tf-idf values on your corpus
bm25_encoder.fit(contexts)

# store the values to a json file
bm25_encoder.dump("bm25_values.json")

# load to your BM25Encoder object
bm25_encoder = BM25Encoder().load("bm25_values.json")

100%|██████████| 1000/1000 [00:06<00:00, 145.35it/s]


In [54]:
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
index = pc.Index("hybrid-search")

In [14]:
# from tqdm.auto import tqdm

# batch_size = 32

# for i in tqdm(range(0, len(contexts), batch_size)):
#     # find end of batch
#     i_end = min(i+batch_size, len(contexts))
#     # extract batch
#     context_batch = contexts[i:i_end]
#     # create unique IDs
#     ids = [str(x) for x in range(i, i_end)]
#     # add context passages as metadata
#     meta = [{'context': context} for context in context_batch]
#     # create dense vectors
#     dense_embeds = model.embed_documents(context_batch)
#     # create sparse vectors
#     sparse_embeds = generate_sparse_vectors(context_batch)
#     print(sparse_embeds)
#     vectors = []
#     # loop through the data and create dictionaries for uploading documents to pinecone index
#     for _id, sparse, dense, metadata in zip(ids, sparse_embeds, dense_embeds, meta):
#         vectors.append({
#             'id': _id,
#             'sparse_values': sparse[0],
#             'values': dense,
#             'metadata': metadata
#         })

#     # upload the documents to the new hybrid index
#     index.upsert(vectors=vectors)

In [15]:
from tqdm.auto import tqdm


for i in tqdm(range(0, len(contexts))):
    # find end of batch
    # create unique IDs
    ids = [str(x) for x in range(i, len(contexts))]
    # add context passages as metadata
    meta = [{'context': context} for context in contexts]
    # create dense vectors
    dense_embeds = model.embed_documents(contexts)
    # create sparse vectors
    sparse_embeds = generate_sparse_vectors(contexts)
    vectors = []
    # loop through the data and create dictionaries for uploading documents to pinecone index
    for _id, sparse, dense, metadata in zip(ids, sparse_embeds, dense_embeds, meta):
        vectors.append({
            'id': _id,
            'sparse_values': sparse,
            'values': dense,
            'metadata': metadata
        })

    # upload the documents to the new hybrid index
    print("about to updsert value")
    index.upsert(vectors=vectors)
    print("upserted")

  0%|          | 0/1000 [00:00<?, ?it/s]

sparse length: 2 sparse type: <class 'dict'> sparse: {'indices': [0, 2049, 1025, 2566, 4102, 3081, 2058, 1037, 2063, 4632, 1049, 1050, 2076, 7711, 4642, 2083, 2595, 5158, 23079, 29738, 13866, 4140, 2093, 2094, 2097, 15923, 10804, 6198, 17978, 15422, 2625, 18499, 7236, 4168, 2122, 15436, 5197, 3662, 5711, 4176, 3155, 20051, 13908, 2135, 16984, 2140, 29278, 3170, 4195, 101, 4710, 102, 2164, 14967, 22648, 12922, 18554, 2174, 2176, 1158, 10381, 3727, 3215, 28817, 2193, 12436, 19098, 2714, 27804, 11934, 27806, 2206, 9890, 2213, 20134, 4264, 2220, 18606, 7352, 3259, 2239, 2752, 2241, 2754, 7361, 3269, 10949, 3273, 24269, 8920, 2274, 18150, 28911, 26872, 23290, 4353, 2306, 6913, 3332, 2309, 4360, 2825, 3858, 4372, 5397, 17175, 2331, 13595, 4383, 27937, 7458, 6948, 18724, 14119, 7473, 24887, 2361, 22330, 6459, 8010, 8523, 8524, 2891, 2896, 8017, 10066, 19797, 4442, 2397, 4958, 12126, 5484, 3949, 2415, 2417, 14194, 17779, 9587, 22901, 3449, 10626, 3972, 6022, 9607, 9099, 2444, 7053, 11663, 1832

  0%|          | 0/1000 [01:35<?, ?it/s]


PineconeApiException: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Date': 'Tue, 10 Dec 2024 11:57:41 GMT', 'Content-Type': 'application/json', 'Content-Length': '118', 'Connection': 'keep-alive', 'x-pinecone-request-latency-ms': '3049', 'x-pinecone-request-id': '8800600207978072775', 'x-envoy-upstream-service-time': '0', 'server': 'envoy'})
HTTP response body: {"code":11,"message":"Error, message length too large: found 7685159 bytes, the limit is: 4194304 bytes","details":[]}


In [55]:
# find end of batch
# create unique IDs
ids = [str(x) for x in range(i, len(contexts))]
# add context passages as metadata
meta = [{'context': context} for context in contexts]
# create dense vectors
dense_embeds = model.embed_documents(contexts)
# create sparse vectors
sparse_embeds = generate_sparse_vectors(contexts)
vectors = []
# loop through the data and create dictionaries for uploading documents to pinecone index
for _id, sparse, dense, metadata in zip(ids, sparse_embeds, dense_embeds, meta):
    vectors.append({
        'id': _id,
        'sparse_values': sparse,
        'values': dense,
        'metadata': metadata
    })

In [17]:
print(type(vectors))
print(len(vectors))

<class 'list'>
1000


In [18]:
print (f"""{len(vectors[0]["sparse_values"]["indices"])}
{len(set(vectors[0]["sparse_values"]["indices"]))}""")

199
199


In [59]:
index.upsert(vectors=vectors[801:])


{'upserted_count': 199}

In [62]:
def hybrid_query(query, k=10):
    vector = model.embed_documents(query)[0]
    sparse =  generate_sparse_vectors([query])
    return index.query(
        vector=vector,
        top_k=k,
        sparse_vector=sparse[0],
        include_metadata=True
        )

In [64]:
hybrid_query("How many HPV-positive mothers were included in the study?")

{'matches': [{'id': '188',
              'metadata': {'context': 'To determine whether successful '
                                      'completion of the Perinatal Education '
                                      'Programme (PEP) improves obstetric '
                                      'practice.\n'
                                      'The three midwife obstetric units '
                                      '(MOUs) in a health district of '
                                      'Mpumalanga were included in the study. '
                                      'Two MOUs enrolled in the PEP and the '
                                      "third did not. A 'before-and-after' "
                                      'study design was used to assess any '
                                      'changes in practice, and to monitor '
                                      'whether any changes occurred in the '
                                      'district during the time of the study; '
