Import the LangChain libraries.

In [2]:
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

Specify the paths for the PDF files and the Chroma database.

In [3]:
# Directory that is passed to PyPDFDirectoryLoader to find the input PDFs.
DATA_PATH = "data"
# Directory used as Chroma's persist_directory for the vector store.
CHROMA_PATH = "chroma"

Load the pdf(s) with the help of PyPDFDirectoryLoader

In [4]:
def load_documents():
    """Load every PDF found under ``DATA_PATH``.

    Returns:
        The result of ``PyPDFDirectoryLoader(DATA_PATH).load()`` — a list of
        LangChain ``Document`` objects (presumably one per PDF page, judging
        by the ``page`` metadata used later in this notebook).
    """
    return PyPDFDirectoryLoader(DATA_PATH).load()

# Load the PDFs and print the first loaded document as a quick sanity check.
documents = load_documents()
print(documents[0])


page_content='UNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549
FORM 10-K
(Mark One)
☒ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
For the fiscal year ended December 31, 2021
OR
☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
For the transition period from _________ to _________
Commission File Number: 001-34756
Tesla, Inc.
(Exact name of registrant as specified in its charter)
  
Delaware  91-2197729
(State or other jurisdiction ofincorporation or organization)  (I.R.S. EmployerIdentification No.)
  
13101 Tesla RoadAustin, Texas   78725
(Address of principal executive offices)  (Zip Code)
(512) 516-8177
(Registrant’s telephone number, including area code)
Securities registered pursuant to Section 12(b) of the Act:
  
Title of each class Trading Symbol(s) Name of each exchange on which registered
Common stock TSLA The Nasdaq Global Select Market
 
Securities registered pursuant to

Now that the documents are loaded, split their text into a number of smaller chunks.

In [5]:
def split_documents(documents: list[Document]):
    """Split loaded documents into overlapping text chunks.

    Args:
        documents: The ``Document`` objects to split.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter.split_documents``
        using 1000-character chunks with a 100-character overlap.
    """
    splitter_options = {
        "chunk_size": 1000,          # maximum chunk length, measured with len()
        "chunk_overlap": 100,        # characters shared by neighbouring chunks
        "length_function": len,
        "is_separator_regex": False,
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    return splitter.split_documents(documents)

# `documents` was already loaded by the previous cell, so reuse it here
# instead of re-parsing every PDF a second time.
chunks = split_documents(documents)
# Print the first chunk as a quick sanity check of the split.
print(chunks[0])

page_content='UNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549
FORM 10-K
(Mark One)
☒ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
For the fiscal year ended December 31, 2021
OR
☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
For the transition period from _________ to _________
Commission File Number: 001-34756
Tesla, Inc.
(Exact name of registrant as specified in its charter)
  
Delaware  91-2197729
(State or other jurisdiction ofincorporation or organization)  (I.R.S. EmployerIdentification No.)
  
13101 Tesla RoadAustin, Texas   78725
(Address of principal executive offices)  (Zip Code)
(512) 516-8177
(Registrant’s telephone number, including area code)
Securities registered pursuant to Section 12(b) of the Act:
  
Title of each class Trading Symbol(s) Name of each exchange on which registered
Common stock TSLA The Nasdaq Global Select Market' metadata={'producer': 'Skia/PDF m1

After the chunks are created, set up an embedding function so that they can be embedded and stored in the vector database.

In [6]:
from langchain_ollama import OllamaEmbeddings
from langchain_chroma import Chroma


class TextEmbedder:
    """Small adapter around ``OllamaEmbeddings``.

    Exposes the ``embed_query`` / ``embed_documents`` pair of methods and
    forwards both to the wrapped ``OllamaEmbeddings`` instance.
    """

    def __init__(self, model_name="nomic-embed-text"):
        """Create the underlying Ollama embedding client.

        Args:
            model_name: Name of the Ollama embedding model to use.
        """
        self.embeddings = OllamaEmbeddings(model=model_name)

    def embed_query(self, query):
        """Return the embedding for a single query string."""
        # OllamaEmbeddings exposes `embed_query`, not `embed`; the previous
        # `.embed(query)` call failed with AttributeError on the first query.
        return self.embeddings.embed_query(query)

    def embed_documents(self, documents):
        """Return one embedding per text in ``documents``."""
        return self.embeddings.embed_documents(documents)

# Instantiate the embedder and a Chroma store that uses CHROMA_PATH as its
# persist directory and the embedder as its embedding function.
embedding_function = TextEmbedder()  # Use the default model "nomic-embed-text"
db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)


Vector DB function for the case where no PDFs need to be added or updated.

In [7]:
from langchain_community.vectorstores import Chroma
def add_to_chroma(chunks: list[Document]):
    """Open the persisted Chroma database without adding anything to it.

    Args:
        chunks: Accepted for signature compatibility; this minimal version
            does not use it.
    """
    # Load the existing database.
    # `embedding_function` is already a TextEmbedder instance, so it is passed
    # as-is; calling it (`embedding_function()`) raised TypeError because the
    # class defines no __call__.
    db = Chroma(
        persist_directory=CHROMA_PATH, embedding_function=embedding_function
    )
    

Create an individual ID for each chunk in the format `source:page number:chunk index`. This avoids having to rebuild the whole database when we only want to add new content or update the PDF files.

In [8]:
def calculate_chunk_ids(chunks):
    """Assign each chunk an ID of the form ``source:page:chunk_index``.

    The ID is written in place to ``chunk.metadata["id"]``. ``chunk_index``
    counts the chunks already seen for the same source/page pair, starting
    at 0.

    Args:
        chunks: Iterable of objects with a dict-like ``metadata`` attribute.
            Missing ``"source"`` / ``"page"`` entries are rendered as ``None``
            in the ID.

    Returns:
        The same ``chunks`` object that was passed in, with metadata updated.
    """
    # Next free chunk index for every page ID. A per-page counter (rather than
    # only comparing against the previous chunk) keeps IDs unique even when
    # chunks from the same page are not adjacent in the input; for
    # page-contiguous input the generated IDs are unchanged.
    next_index_by_page = {}
    for chunk in chunks:
        source = chunk.metadata.get("source")
        page = chunk.metadata.get("page")
        current_page_id = f"{source}:{page}"

        current_chunk_index = next_index_by_page.get(current_page_id, 0)
        next_index_by_page[current_page_id] = current_chunk_index + 1

        # Store the ID in the chunk's metadata.
        chunk.metadata["id"] = f"{current_page_id}:{current_chunk_index}"
    return chunks
#This creates the desired chunks with ID

Process the chunks by calling the id creation function.

In [9]:

# Tag every chunk with its ID; the chunks' metadata dicts are updated in place.
chunks_with_ids = calculate_chunk_ids(chunks)



Checking the first chunk to see if it gave us the desired format

In [10]:

# Inspect the first chunk's generated ID alongside its full metadata.
chunk_to_check = chunks_with_ids[0]  
print(f"Chunk ID: {chunk_to_check.metadata['id']}, Metadata: {chunk_to_check.metadata}")


Chunk ID: data\10-K.pdf:0:0, Metadata: {'producer': 'Skia/PDF m138', 'creator': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36', 'creationdate': '2025-07-08T03:34:30+00:00', 'title': '10-K', 'moddate': '2025-07-08T03:34:30+00:00', 'source': 'data\\10-K.pdf', 'total_pages': 120, 'page': 0, 'page_label': '1', 'id': 'data\\10-K.pdf:0:0'}


Function that updates the database when new PDFs are added.

In [11]:
def add_to_chroma(chunks: list[Document]):
    """Add the chunks that are not yet stored to the persisted Chroma database.

    Every chunk is given an ID via ``calculate_chunk_ids``; chunks whose ID is
    already present in the database are skipped, so re-running this after
    adding PDFs only inserts the new content.

    Args:
        chunks: The document chunks to store.
    """
    # Load the existing database.
    db = Chroma(
        persist_directory=CHROMA_PATH, embedding_function=embedding_function
    )

    # Calculate Page IDs.
    chunks_with_ids = calculate_chunk_ids(chunks)

    # Fetch only the stored IDs so already-ingested chunks can be skipped.
    existing_items = db.get(include=[])  # IDs are always included by default
    existing_ids = set(existing_items["ids"])
    print(f"Number of existing documents in DB: {len(existing_ids)}") #length must be 0 if run first without existing DB

    # Only add documents that don't exist in the DB.
    new_chunks = [
        chunk for chunk in chunks_with_ids
        if chunk.metadata["id"] not in existing_ids
    ]
    if not new_chunks:
        print("✅ No new documents to add")
        return

    print(f"👉 Adding new documents: {len(new_chunks)}")
    # Pass the IDs explicitly so the store uses them instead of generating its own.
    new_chunk_ids = [chunk.metadata["id"] for chunk in new_chunks]
    db.add_documents(new_chunks, ids=new_chunk_ids)

    # Depending on which `Chroma` class is currently bound (this notebook
    # imports it from both langchain_community and langchain_chroma), the
    # store may not offer a manual `persist()`; newer stores persist
    # automatically. Only call it when it exists instead of raising
    # AttributeError after the documents have already been added.
    persist = getattr(db, "persist", None)
    if callable(persist):
        persist()


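Utilities for terminal-style use: a helper that clears the Chroma database and the `main()` entry point (argument parsing itself is not implemented in these cells).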

In [12]:
import os
import shutil 
def clear_database():
    """Delete the directory at ``CHROMA_PATH`` (and its contents) if it exists."""
    if not os.path.exists(CHROMA_PATH):
        # Nothing has been persisted yet, so there is nothing to remove.
        return
    shutil.rmtree(CHROMA_PATH)

In [13]:

def main():
    """Run the ingestion pipeline: load the PDFs, split them, store the chunks."""
    pdf_documents = load_documents()
    document_chunks = split_documents(pdf_documents)
    add_to_chroma(document_chunks)


    

In [14]:
# Run the ingestion pipeline when this code is executed as the main program.
# The output recorded below this cell shows that it also runs when the cell
# is executed in the notebook.
if __name__ == "__main__":
    main()

  db = Chroma(


Number of existing documents in DB: 4
👉 Adding new documents: 576


  db.persist()


Local RAG

In [16]:
import argparse
from langchain_chroma import Chroma
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_ollama import OllamaLLM
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever
from embed_function import text_embed

# Define paths and constants
# Persist directory of the Chroma store (same value as the CHROMA_PATH set near the top of the notebook).
CHROMA_PATH = "chroma"

# Prompt templates
# Asks the LLM to rephrase the user's question; `{question}` is the only input variable.
QUERY_PROMPT_TEMPLATE = """
You are an AI assistant tasked with improving the retrieval of relevant documents. 
Generate five different versions of the given user question to maximize the relevance of retrieved documents.
Original question: {question}
"""

# Final answering prompt; `{context}` and `{question}` are filled in by query_rag_with_multiquery.
RAG_PROMPT_TEMPLATE = """
Answer the question based ONLY on the following context:
{context}
Question: {question}
"""

def query_rag_with_multiquery(query_text: str):
    """Answer ``query_text`` using multi-query retrieval over the Chroma store.

    The question is handed to a ``MultiQueryRetriever`` (driven by
    ``QUERY_PROMPT_TEMPLATE``), the retrieved documents are joined into one
    context string, and the LLM is asked to answer from that context via
    ``RAG_PROMPT_TEMPLATE``.

    Args:
        query_text: The user's question.

    Returns:
        Whatever ``OllamaLLM.invoke`` returns for the final prompt.
    """
    # Open the persisted vector store with the embedding function supplied by
    # the external `embed_function` module.
    # NOTE(review): presumably this must match the embedder used at ingestion
    # time — confirm.
    store = Chroma(persist_directory=CHROMA_PATH, embedding_function=text_embed())

    # The same model rewrites the question and writes the final answer.
    model = OllamaLLM(model="mistral-openorca")

    rewrite_prompt = PromptTemplate(
        template=QUERY_PROMPT_TEMPLATE,
        input_variables=["question"],
    )
    multi_retriever = MultiQueryRetriever.from_llm(
        store.as_retriever(),
        model,
        prompt=rewrite_prompt,
    )

    # Retrieve the documents and merge their text into one context block.
    retrieved_docs = multi_retriever.invoke(query_text)
    context_text = "\n\n---\n\n".join(doc.page_content for doc in retrieved_docs)

    # Fill in the answering prompt and query the LLM.
    answer_prompt = ChatPromptTemplate.from_template(RAG_PROMPT_TEMPLATE).format(
        context=context_text, question=query_text
    )
    return model.invoke(answer_prompt)


# Smoke-test the multi-query RAG function directly in the notebook with a sample question.
query_text = "What does the risk factor indicate?"

# Run the full retrieve-then-answer pipeline and print the answer under a header.
response = query_rag_with_multiquery(query_text)
print("\n--- Final Response ---\n")
print(response)



--- Final Response ---

 The risk factor indicates that there are various risks and uncertainties associated with the company's business, financial condition, and future results. These risks include the impact of COVID-19 pandemic, foreign currency risk, supply risk, inventory valuation risk, maintaining and expanding international operations, and implementation of new systems. The risks described in this report are not exhaustive, as there might be other material risks not currently known to the company or deemed immaterial at the time.
