In [1]:
from langchain_community.document_loaders import PyMuPDFLoader

In [2]:
loader = PyMuPDFLoader(
    file_path="file.pdf"
)

docs = loader.load()



In [3]:
docs[0]

Document(metadata={'source': 'file.pdf', 'file_path': 'file.pdf', 'page': 0, 'total_pages': 51, 'format': 'PDF 1.2', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'creator': '', 'producer': 'Aladdin Ghostscript 5.10', 'creationDate': 'D:20020327125019', 'modDate': '', 'trapped': ''}, page_content='RFC:  791\n                                    \n                                    \n                                    \n                                    \n                                    \n                                    \n                                    \n                           INTERNET PROTOCOL\n                                    \n                                    \n                         DARPA INTERNET PROGRAM\n                                    \n                         PROTOCOL SPECIFICATION\n                                    \n                                    \n                                    \n                             September 1

In [4]:
context = ""
for doc in docs:
    context += doc.page_content + "\n"


In [5]:
context

'RFC:  791\n                                    \n                                    \n                                    \n                                    \n                                    \n                                    \n                                    \n                           INTERNET PROTOCOL\n                                    \n                                    \n                         DARPA INTERNET PROGRAM\n                                    \n                         PROTOCOL SPECIFICATION\n                                    \n                                    \n                                    \n                             September 1981\n                              prepared for\n               Defense Advanced Research Projects Agency\n                Information Processing Techniques Office\n                         1400 Wilson Boulevard\n                       Arlington, Virginia  22209\n                                   by\n       

In [6]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=528,
    chunk_overlap=128,
    length_function=len,
    is_separator_regex=False,
)

In [7]:
texts = text_splitter.create_documents([context])


In [8]:
len(texts)

239

In [9]:
from transformers import BertTokenizer

# load bert tokenizer from huggingface
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased'
)

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [10]:
import google.generativeai as genai
import os
from dotenv import load_dotenv
load_dotenv()
genai.configure(api_key=os.environ["GEMINI_API_KEY"])


In [11]:
from collections import Counter

def build_dict(input_batch):
  # store a batch of sparse embeddings
    sparse_emb = []
    # iterate through input batch
    for token_ids in input_batch:
        # convert the input_ids list to a dictionary of key to frequency values
        d = dict(Counter(token_ids))
        tokenids = list(set(token_ids))
        # remove special tokens and append sparse vectors to sparse_emb list
        # sparse_emb.append({key: d[key] for key in d if key not in [101, 102, 103, 0]})
        sparse_emb.append({"indices":tokenids, "values":[float(d[id]) for id in tokenids]})
    # return sparse_emb list
    return sparse_emb

In [12]:
def generate_sparse_vectors(context_batch):
    input_ids = tokenizer(
    context_batch, padding=True, truncation=True,
     max_length=512
)["input_ids"]
    sparse_embeds = build_dict(input_ids)
    return sparse_embeds

In [13]:
contexts = []
for text in texts:
    contexts.append(text.page_content)

s = generate_sparse_vectors(contexts)
print( f""" {len(s[0]["values"])}
{len(s[0]["indices"])}""")

 13
13


In [14]:
len(contexts)

239

In [15]:
print(len(s))

239


In [23]:
from pinecone import Pinecone
import os 
pc = Pinecone(api_key= os.getenv("PINECONE_API_KEY"))
index = pc.Index("contextual-retreival")

In [17]:
def generate_dense_embeddings(texts):
    content = [text.page_content for text in texts]
    return genai.embed_content(
            model="models/text-embedding-004",
            task_type="RETRIEVAL_DOCUMENT",
            content=content
        )["embedding"]

In [18]:

ids = [str(x) for x in range(len(texts))]
# add context passages as metadata
meta = [{'context': text.page_content} for text in texts]
# create dense vectors
dense_embeds = generate_dense_embeddings(texts)
print(f"length of dense: {len(dense_embeds)}")
# create sparse vectors
sparse_embeds = generate_sparse_vectors(contexts)
print(f"length of sparse: {len(sparse_embeds)}")
vectors = []
# loop through the data and create dictionaries for uploading documents to pinecone index
for _id, sparse, dense, metadata in zip(ids, sparse_embeds, dense_embeds, meta):
    vectors.append({
        'id': _id,
        'sparse_values': sparse,
        'values': dense,
        'metadata': metadata
    })

    # upload the documents to the new hybrid index



length of dense: 239
length of sparse: 239


In [19]:
vectors[1]

{'id': '1',
 'sparse_values': {'indices': [0,
   2436,
   2692,
   8459,
   12827,
   2592,
   2470,
   4267,
   20652,
   3639,
   3261,
   4034,
   2244,
   19015,
   8778,
   4810,
   2005,
   5461,
   6364,
   3934,
   3935,
   101,
   102,
   13929,
   1010,
   3448,
   2683],
  'values': [356.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0]},
 'values': [0.061814673,
  0.00047982708,
  -0.016774185,
  0.032145806,
  -0.002156558,
  -0.0092445165,
  0.008331414,
  0.041062824,
  0.0061981757,
  0.06547252,
  0.017077994,
  -0.0054705157,
  0.034971133,
  0.025871674,
  -0.033812642,
  -0.007926956,
  0.040229812,
  0.07103817,
  -0.07801864,
  0.01881616,
  -0.0065882397,
  -0.04600571,
  0.015062589,
  -0.016017502,
  0.029412627,
  0.0055884994,
  0.015266698,
  -0.034371935,
  -0.015238943,
  -0.070851564,
  0.00978480

In [24]:
print("about to updsert value")
index.upsert(vectors=vectors[3:3])
print("upserted")

about to updsert value
upserted


In [None]:
[{'context': text.page_content} for text in texts]