In [2]:
import os

import tqdm
from dotenv import load_dotenv
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_nvidia_ai_endpoints import ChatNVIDIA
load_dotenv()

True

In [3]:

# Chat model used to write the per-chunk context.
# The key is read from the environment (populated from .env by the load_dotenv()
# call in the imports cell) so that no secret is stored in the notebook itself.
llm = ChatNVIDIA(
  model="meta/llama-3.3-70b-instruct",
  api_key=os.environ["NVIDIA_API_KEY"],  # a KeyError here means the variable is not set
  temperature=0.2,
)
  


In [4]:

# Text splitting function
def split_text(texts, chunk_size=1000, chunk_overlap=200):
    """Split raw texts into overlapping chunks and number them.

    Args:
        texts: List of strings handed to
            ``RecursiveCharacterTextSplitter.create_documents``.
        chunk_size: Forwarded to the splitter as ``chunk_size``. Defaults to
            1000, the value that used to be hard-coded.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.
            Defaults to 200, the value that used to be hard-coded.

    Returns:
        The list of chunk documents produced by the splitter. The metadata of
        every chunk is replaced by ``{"page_number": n}``, where ``n`` is the
        1-based position of the chunk in the returned list. Despite the key
        name this is a chunk ordinal, not a PDF page number.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    doc_chunks = text_splitter.create_documents(texts)
    for position, doc in enumerate(doc_chunks, start=1):
        # Overwrites (rather than updates) whatever metadata the splitter set.
        doc.metadata = {"page_number": position}
    return doc_chunks

# Context addition prompts
# Template that passes the full document text through unchanged; its output is
# the first part of the string sent to the LLM in create_contextual_chunks().
prompt_document = PromptTemplate(
    template="{WHOLE_DOCUMENT}",
    input_variables=["WHOLE_DOCUMENT"],
)

# Template that shows one chunk and asks for a short context situating it in
# the overall document; its output is appended to the document text.
prompt_chunk = PromptTemplate(
    template=(
        "Here is the chunk we want to situate within the whole document\n\n{CHUNK_CONTENT}\n\n"
        "Please give a short succinct context to situate this chunk within the overall document for "
        "the purposes of improving search retrieval of the chunk. Answer only with the succinct context and nothing else."
    ),
    input_variables=["CHUNK_CONTENT"],
)


In [5]:
# Create contextual chunks
def create_contextual_chunks(chunks_, llm, whole_document):
    """Ask the LLM for a short context per chunk and attach it to the chunk text.

    Args:
        chunks_: Iterable of chunk documents; each must expose ``page_content``
            and ``metadata``.
        llm: Chat model whose ``invoke(prompt)`` result exposes ``content``.
        whole_document: Full text of the document the chunks were cut from.

    Returns:
        A list of ``Document`` objects, one per input chunk and in the same
        order, whose ``page_content`` is
        ``"Text: <chunk text>\\n\\nContext: <LLM answer>"`` and whose metadata
        is a copy of the chunk's metadata.
    """
    # The document part of the prompt is identical for every chunk, so it is
    # formatted once instead of once per loop iteration.
    document_prompt = prompt_document.format(WHOLE_DOCUMENT=whole_document)

    contextual_documents = []
    for chunk in tqdm.tqdm(chunks_):
        chunk_prompt = prompt_chunk.format(CHUNK_CONTENT=chunk.page_content)
        # NOTE(review): the two parts are concatenated with no separator, so the
        # end of the document runs straight into the instruction text.
        llm_response = llm.invoke(document_prompt + chunk_prompt)
        page_content = f"Text: {chunk.page_content}\n\nContext: {llm_response.content}"
        # Defensive copy so the new document never shares a metadata dict with
        # the input chunk.
        doc = Document(page_content=page_content, metadata=dict(chunk.metadata))
        contextual_documents.append(doc)
    return contextual_documents

# Initialize Google Generative AI embeddings (model "models/text-embedding-004").
# This `embeddings` object is the embedding function handed to every Chroma
# store that the notebook opens or creates.
# NOTE(review): no API key is passed explicitly, so the client presumably takes
# its credentials from the environment loaded by load_dotenv() — confirm.
embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")



In [None]:
def create_vector_store(documents, embeddings, persist_dir="study_materials"):
    """Build a Chroma store from ``documents`` that is saved under ``persist_dir``.

    Args:
        documents: Documents to embed and store.
        embeddings: Embedding function passed to ``Chroma.from_documents``.
        persist_dir: Directory handed to Chroma as ``persist_directory``;
            created (including parents) when it does not exist yet.

    Returns:
        The store returned by ``Chroma.from_documents``.
    """
    # Create the persist directory if it doesn't exist.
    os.makedirs(persist_dir, exist_ok=True)

    return Chroma.from_documents(
        documents,
        embeddings,
        persist_directory=persist_dir,
    )


def process_pdfs_to_vectorstore(pdf_folder="SPARK", persist_dir="study_materials"):
    """Turn every PDF in ``pdf_folder`` into contextual chunks and store them.

    For each ``*.pdf`` file the pages are joined into one text, split with
    ``split_text``, tagged with the file name under the ``"source"`` metadata
    key and enriched by ``create_contextual_chunks``. All resulting chunks are
    then written to a single store via ``create_vector_store``.

    Args:
        pdf_folder: Folder that is scanned (non-recursively) for PDFs.
            Defaults to "SPARK".
        persist_dir: Directory the store is persisted to. Defaults to
            "study_materials".

    Returns:
        The vector store created from all chunks.

    Raises:
        ValueError: If no chunk could be produced from ``pdf_folder``.
    """
    # os.listdir order is arbitrary; sort so runs process files in a stable order.
    pdf_files = sorted(f for f in os.listdir(pdf_folder) if f.endswith('.pdf'))
    all_contextual_chunks = []

    for pdf_file in pdf_files:
        print(f"Processing {pdf_file}...")
        pdf_path = os.path.join(pdf_folder, pdf_file)

        # Load the PDF and flatten its pages into a single string.
        pdf_loader = PyPDFLoader(pdf_path)
        raw_documents = pdf_loader.load()
        raw_text = " ".join([doc.page_content for doc in raw_documents])

        # Split the text into chunks.
        chunks = split_text([raw_text])

        # Record which PDF each chunk came from.
        for chunk in chunks:
            chunk.metadata["source"] = pdf_file

        # One LLM call per chunk to generate its context.
        contextual_chunks = create_contextual_chunks(chunks, llm, raw_text)
        all_contextual_chunks.extend(contextual_chunks)
        print(f"Added {len(contextual_chunks)} chunks from {pdf_file}")

    if not all_contextual_chunks:
        # Fail with a clear message instead of handing Chroma an empty batch.
        raise ValueError(f"No PDF text chunks found in folder {pdf_folder!r}")

    # Create and persist the vector store with all documents.
    vector_store = create_vector_store(all_contextual_chunks, embeddings, persist_dir)
    # persist() is deprecated in recent langchain_community releases; call it
    # only while the method still exists so older setups keep flushing to disk.
    persist = getattr(vector_store, "persist", None)
    if callable(persist):
        persist()
    print(f"Created persistent vector store with {len(all_contextual_chunks)} total chunks")

    return vector_store



In [6]:
# Use this to create the vector store initially.
# The build makes one LLM call per chunk (minutes per PDF), and re-running it
# against an existing persist directory would presumably store the same chunks
# again, so it is skipped when "study_materials" already has content. The next
# cell opens the persisted store either way.
if os.path.isdir("study_materials") and os.listdir("study_materials"):
    print("Persisted vector store found in 'study_materials'; skipping rebuild.")
else:
    vector_store = process_pdfs_to_vectorstore()

Processing Spark_Distributed_Data_Processing_Unit1.pdf...


100%|██████████| 77/77 [02:07<00:00,  1.65s/it]


Added 77 chunks from Spark_Distributed_Data_Processing_Unit1.pdf
Processing Spark_Distributed_Data_Processing_Unit2.pdf...


100%|██████████| 73/73 [02:06<00:00,  1.73s/it]


Added 73 chunks from Spark_Distributed_Data_Processing_Unit2.pdf
Processing Spark_Distributed_Data_Processing_Unit3.pdf...


100%|██████████| 74/74 [02:15<00:00,  1.82s/it]


Added 74 chunks from Spark_Distributed_Data_Processing_Unit3.pdf
Processing Spark_Distributed_Data_Processing_Unit4.pdf...


100%|██████████| 102/102 [03:21<00:00,  1.98s/it]


Added 102 chunks from Spark_Distributed_Data_Processing_Unit4.pdf
Processing Spark_Distributed_Data_Processing_Unit5.pdf...


100%|██████████| 136/136 [05:23<00:00,  2.38s/it]


Added 136 chunks from Spark_Distributed_Data_Processing_Unit5.pdf
Created persistent vector store with 462 total chunks


  vector_store.persist()


In [6]:
# Re-open the store previously saved under "study_materials", using the same
# embedding function that was used to build it.
vector_store = Chroma(
    embedding_function=embeddings,
    persist_directory="study_materials",
)

  vector_store = Chroma(


In [7]:
# Example query across all PDFs: wrap the store in a retriever with its default
# settings (no arguments are passed to as_retriever()).
retriever = vector_store.as_retriever()
# The store holds chunks from every ingested PDF, so a single query searches
# across all of them; `results` receives the retrieved documents.
results = retriever.invoke("What is the definition of a data lake?")

In [8]:
# Display the documents retrieved for the data-lake query.
results

[Document(metadata={'page_number': 89, 'source': 'Spark_Distributed_Data_Processing_Unit4.pdf'}, page_content='Text: Spark Distributed Data Processing\nSpark Distributed Data Processing\nOutput Modes in Structured Streaming\nOutput Modes and Sink\nMemory Sink\n146\n Sensitivity: L&T EduTech and LTIMindtree Use only\nSpark Distributed Data Processing\nSpark Distributed Data Processing\nOutput Modes in Structured Streaming\nOutput Modes and Sink\nFile Sink\n• A transactional storage layer that combines the features of data lakes and data\nwarehouses, supporting ACID transactions on large datasets.\n• Supports append, update, and complete.\n• Best for applications needing reliable data storage with support for scalable and\natomic writes, such as slowly changing dimensions (SCD) or high-quality data\nlakes that are frequently queried.\n147 Sensitivity: L&T EduTech and LTIMindtree Use only\nSpark Distributed Data Processing\nSpark Distributed Data Processing\nOutput Modes in Structured Str

In [9]:
import os

In [10]:
def add_new_documents(new_pdf_paths, persist_dir="study_materials"):
    """Add the contextual chunks of further PDFs to the persisted store.

    Args:
        new_pdf_paths: Paths of the PDFs to ingest. A single path given as a
            string or ``os.PathLike`` is treated as a one-element list.
        persist_dir: Directory of the existing Chroma store. Defaults to
            "study_materials", the value that used to be hard-coded.

    Returns:
        The Chroma store opened on ``persist_dir`` after the new chunks have
        been added.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(new_pdf_paths, (str, os.PathLike)):
        new_pdf_paths = [new_pdf_paths]

    # Load the existing vector store.
    vector_store = Chroma(
        persist_directory=persist_dir,
        embedding_function=embeddings,
    )

    for pdf_path in new_pdf_paths:
        print(f"Processing {pdf_path}...")
        raw_documents = PyPDFLoader(pdf_path).load()
        raw_text = " ".join(doc.page_content for doc in raw_documents)

        chunks = split_text([raw_text])
        if not chunks:
            # Nothing to embed (e.g. no extractable text); do not send an empty
            # batch to the store.
            print(f"No text chunks extracted from {pdf_path}; skipping.")
            continue

        # Only the file name is stored as the source, not the full path.
        source_name = os.path.basename(pdf_path)
        for chunk in chunks:
            chunk.metadata["source"] = source_name

        contextual_chunks = create_contextual_chunks(chunks, llm, raw_text)

        # Add the new documents to the existing store.
        vector_store.add_documents(contextual_chunks)

    # persist() is deprecated in recent langchain_community releases; call it
    # only while the method still exists so older setups keep flushing to disk.
    persist = getattr(vector_store, "persist", None)
    if callable(persist):
        persist()
    return vector_store

In [None]:
# pdf_names

['Security Essentials in Applied AI_Unit1.pdf',
 'Security Essentials in Applied AI_Unit2.pdf',
 'Security Essentials in Applied AI_Unit3.pdf',
 'Security Essentials in Applied AI_Unit4.pdf',
 'Security Essentials in Applied AI_Unit5.pdf',
 'LLM_Large_Language_Models_Unit1.pdf',
 'LLM_Large_Language_Models_Unit2.pdf',
 'LLM_Large_Language_Models_Unit3.pdf',
 'LLM_Large_Language_Models_Unit4.pdf']

In [None]:
# # Add new PDFs to existing store
# new_pdf_paths = pdf_names
# updated_store = add_new_documents(new_pdf_paths)

Processing Security Essentials in Applied AI_Unit1.pdf...


100%|██████████| 41/41 [00:58<00:00,  1.43s/it]


Processing Security Essentials in Applied AI_Unit2.pdf...


100%|██████████| 19/19 [00:22<00:00,  1.21s/it]


Processing Security Essentials in Applied AI_Unit3.pdf...


100%|██████████| 22/22 [00:26<00:00,  1.20s/it]


Processing Security Essentials in Applied AI_Unit4.pdf...


100%|██████████| 25/25 [00:32<00:00,  1.28s/it]


Processing Security Essentials in Applied AI_Unit5.pdf...


100%|██████████| 36/36 [00:49<00:00,  1.36s/it]


Processing LLM_Large_Language_Models_Unit1.pdf...


100%|██████████| 30/30 [00:37<00:00,  1.26s/it]


Processing LLM_Large_Language_Models_Unit2.pdf...


100%|██████████| 63/63 [01:44<00:00,  1.66s/it]


Processing LLM_Large_Language_Models_Unit3.pdf...


100%|██████████| 45/45 [01:03<00:00,  1.42s/it]


Processing LLM_Large_Language_Models_Unit4.pdf...


100%|██████████| 44/44 [00:59<00:00,  1.36s/it]


In [11]:
# Report how many entries the store's underlying collection currently holds.
# NOTE: `_collection` is an underscore-prefixed (private) attribute of the Chroma
# wrapper, so this line may break when the library is upgraded.
print(f"Total documents in store: {vector_store._collection.count()}")

Total documents in store: 1010


In [27]:
# Fetch the ten chunks most similar to the query "GDPR".
response = vector_store.similarity_search(
    query="GDPR",
    k=10,
)

In [28]:
# Display the documents returned by the "GDPR" similarity search.
response

[Document(metadata={'page_number': 2, 'source': 'Security Essentials in Applied AI_Unit4.pdf'}, page_content="Text: Compliance in Data Privacy and Cybersecurity\n \nSecurity Essentials in Applied AI Sensitivity: LNT Construction Internal Use\nSecurity Essentials in Applied AI\nKey Provisions of Major Data Privacy Regulations\nImage Source: techaheadcorp\n➢ Data privacy regulations are essential to protect individuals' personal information from \nmisuse, unauthorized access, and breaches. \n➢ They ensure organizations handle data responsibly and transparently, fostering trust \nbetween consumers and businesses. \n➢ Such regulations also mitigate risks of identity theft, fraud, and privacy violations, while \npromoting accountability in data management.\nData Privacy Regulations Sensitivity: LNT Construction Internal Use\nSecurity Essentials in Applied AI\nKey Provisions of Major Data Privacy Regulations\nImage Source: techaheadcorp\nRegion: European Union (EU) and European Economic Area

In [12]:
# Folders (relative to the current working directory) to scan for new PDFs.
pdf_folders = ["DLGAI"]
pdf_files = []

for folder in pdf_folders:
    folder_path = os.path.join(os.getcwd(), folder)
    # Keep the folder in every entry: add_new_documents() opens these paths
    # directly, so bare file names would only resolve if the PDFs also sat in
    # the working directory. sorted() gives a stable processing order because
    # os.listdir returns entries in arbitrary order.
    folder_pdfs = sorted(
        os.path.join(folder_path, pdf)
        for pdf in os.listdir(folder_path)
        if pdf.lower().endswith('.pdf')
    )
    pdf_files.extend(folder_pdfs)

# Show just the file names to keep the output short.
print(f"Found PDFs: {[os.path.basename(path) for path in pdf_files]}")

Found PDFs: ['Artificial_Intelligence_DL_GenAI_Unit1.pdf', 'Artificial_Intelligence_DL_GenAI_Unit2.pdf', 'Artificial_Intelligence_DL_GenAI_Unit3.pdf', 'Artificial_Intelligence_DL_GenAI_Unit4.pdf', 'Artificial_Intelligence_DL_GenAI_Unit5.pdf']


In [14]:
# Add new PDFs to existing store: ingest every PDF collected in `pdf_files`.
# add_new_documents() calls create_contextual_chunks() for each PDF, which
# invokes the LLM once per chunk, so this cell is slow.
new_pdf_paths = pdf_files
updated_store = add_new_documents(new_pdf_paths)

Processing Artificial_Intelligence_DL_GenAI_Unit1.pdf...


100%|██████████| 121/121 [04:50<00:00,  2.40s/it]


Processing Artificial_Intelligence_DL_GenAI_Unit2.pdf...


100%|██████████| 102/102 [03:53<00:00,  2.29s/it]


Processing Artificial_Intelligence_DL_GenAI_Unit3.pdf...


100%|██████████| 108/108 [04:17<00:00,  2.38s/it]


Processing Artificial_Intelligence_DL_GenAI_Unit4.pdf...


100%|██████████| 27/27 [00:33<00:00,  1.23s/it]


Processing Artificial_Intelligence_DL_GenAI_Unit5.pdf...


100%|██████████| 59/59 [01:33<00:00,  1.58s/it]
  vector_store.persist()


In [15]:
# Report the collection size again after the additional PDFs were ingested.
# NOTE(review): the count is read through `vector_store` (opened earlier) and not
# through the `updated_store` returned by add_new_documents(); both were opened
# on the "study_materials" directory. `_collection` is a private attribute.
print(f"Total documents in store: {vector_store._collection.count()}")

Total documents in store: 1427
