In [50]:
from langchain_community.document_loaders import PyMuPDFLoader
import os
from dotenv import load_dotenv
load_dotenv()

True

In [51]:
loader = PyMuPDFLoader(
    file_path="file.pdf"
)

docs = loader.load()



In [52]:
docs[32]

Document(metadata={'source': 'file.pdf', 'file_path': 'file.pdf', 'page': 32, 'total_pages': 51, 'format': 'PDF 1.2', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'creator': '', 'producer': 'Aladdin Ghostscript 5.10', 'creationDate': 'D:20020327125019', 'modDate': '', 'trapped': ''}, page_content='September 1981                                                          \n                                                       Internet Protocol\n                                                           Specification\n             TL <- OTL - NFB*8 - (OIHL-IHL)*4);\n             FO <- OFO + NFB;  MF <- OMF;  Recompute Checksum;\n        (10) Submit this fragment to the fragmentation test; DONE.\n      In the above procedure each fragment (except the last) was made\n      the maximum allowable size.  An alternative might produce less\n      than the maximum size datagrams.  For example, one could implement\n      a fragmentation procedure that repeatly divided large datagrams 

In [53]:
context = ""
for doc in docs:
    context += doc.page_content + "\n"


In [54]:
context

'RFC:  791\n                                    \n                                    \n                                    \n                                    \n                                    \n                                    \n                                    \n                           INTERNET PROTOCOL\n                                    \n                                    \n                         DARPA INTERNET PROGRAM\n                                    \n                         PROTOCOL SPECIFICATION\n                                    \n                                    \n                                    \n                             September 1981\n                              prepared for\n               Defense Advanced Research Projects Agency\n                Information Processing Techniques Office\n                         1400 Wilson Boulevard\n                       Arlington, Virginia  22209\n                                   by\n       

In [55]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=528,
    chunk_overlap=128,
    length_function=len,
    is_separator_regex=False,
)

In [56]:
texts = text_splitter.create_documents([context])


In [57]:
len(texts)

239

In [58]:
from openai import OpenAI
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


In [59]:
###INSERT CONTEXT
def insertContext(text, full_doc):
    content = text.page_content
    res  = client.chat.completions.create(
    model="gpt-4o",
    store=True,
    messages=[
        {"role": "system", "content": f"please generate appropriate context for the provided chunk. Please note that the added context should include information that is in the following document but not in the chunk. Documnet: \n {full_doc}."},
        {"role": "user", "content": f"chunk: {content}"}
    ])
    context = res.choices[0].message.content
    cached_tokens = res.usage.prompt_tokens_details.cached_tokens
    return content + context 

In [60]:
from transformers import BertTokenizer

# load bert tokenizer from huggingface
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased'
)

In [61]:
import google.generativeai as genai

genai.configure(api_key=os.environ["GEMINI_API_KEY"])


In [62]:
from collections import Counter

def build_dict(input_batch):
  # store a batch of sparse embeddings
    sparse_emb = []
    # iterate through input batch
    for token_ids in input_batch:
        # convert the input_ids list to a dictionary of key to frequency values
        d = dict(Counter(token_ids))
        tokenids = list(set(token_ids))
        # remove special tokens and append sparse vectors to sparse_emb list
        # sparse_emb.append({key: d[key] for key in d if key not in [101, 102, 103, 0]})
        sparse_emb.append({"indices":tokenids, "values":[float(d[id]) for id in tokenids]})
    # return sparse_emb list
    return sparse_emb

In [63]:
def generate_sparse_vectors(context_batch):
    input_ids = tokenizer(
    context_batch, padding=True, truncation=True,
     max_length=512
)["input_ids"]
    sparse_embeds = build_dict(input_ids)
    return sparse_embeds

In [64]:
contexts = []
for text in texts:
    contexts.append(text.page_content)

s = generate_sparse_vectors(contexts)
print( f""" {len(s[0]["values"])}
{len(s[0]["indices"])}""")

 13
13


In [65]:
len(contexts)

239

In [66]:
print(len(s))

239


In [68]:
for text in texts[1:11]:
    text.page_content = insertContext(text, context)

In [69]:
from pinecone import Pinecone
import os 
pc = Pinecone(api_key= os.getenv("PINECONE_API_KEY"))
index = pc.Index("contextual-retriever")

In [70]:
def generate_dense_embeddings(texts):
    content = [text.page_content for text in texts]
    return genai.embed_content(
            model="models/text-embedding-004",
            task_type="RETRIEVAL_DOCUMENT",
            content=content
        )["embedding"]

In [71]:

ids = [str(x) for x in range(len(texts[1:11]))]
# add context passages as metadata
meta = [{'context': text.page_content} for text in texts[1:11]]
# create dense vectors
dense_embeds = generate_dense_embeddings(texts[1:11])
print(f"length of dense: {len(dense_embeds)}")
# create sparse vectors
sparse_embeds = generate_sparse_vectors(contexts[1:11])
print(f"length of sparse: {len(sparse_embeds)}")
vectors = []
# loop through the data and create dictionaries for uploading documents to pinecone index
for _id, sparse, dense, metadata in zip(ids, sparse_embeds, dense_embeds, meta):
    vectors.append({
        'id': _id,
        'sparse_values': sparse,
        'values': dense,
        'metadata': metadata
    })

    # upload the documents to the new hybrid index



length of dense: 10
length of sparse: 10


In [72]:
vectors

[{'id': '0',
  'sparse_values': {'indices': [0,
    2436,
    2692,
    8459,
    12827,
    2592,
    2470,
    4267,
    20652,
    3639,
    3261,
    4034,
    2244,
    19015,
    8778,
    4810,
    2005,
    5461,
    6364,
    3934,
    3935,
    101,
    102,
    13929,
    1010,
    3448,
    2683],
   'values': [356.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0]},
  'values': [0.042605113,
   -0.0049933568,
   -0.02562604,
   0.01762642,
   -0.018615505,
   0.022974709,
   0.0015664456,
   0.052019157,
   0.04433173,
   0.05929568,
   -0.018253913,
   -0.030642688,
   0.054468166,
   0.031100253,
   -0.015829753,
   -0.029598905,
   0.0070221582,
   0.03330683,
   -0.074516244,
   -0.024150377,
   0.008464657,
   -0.0693156,
   -0.0052735275,
   -0.006533413,
   0.015452107,
   0.02595929

In [73]:
print("about to updsert value")
index.upsert(vectors=vectors)
print("upserted")

about to updsert value
upserted


In [None]:
[{'context': text.page_content} for text in texts]