## Data Ingestion


A LangChain `Document` has two fields:
* `page_content` (str): the text itself
* `metadata` (dict): information about the text, such as its source

In [13]:
from langchain_core.documents import Document
import os

In [14]:
# Build a Document by hand to show its two fields: page_content (the text)
# and metadata (a free-form dict describing that text).
doc = Document(
    page_content="this is the main content im using in rag paper",
    metadata ={
        "source":"example.txt",
        "pages":1,
        "author": "Robert Frost",
        # NOTE(review): "data-created" looks like a typo for "date-created" — confirm before relying on this key.
        "data-created": "2025-09-16"
    }
)

In [15]:
# Create the folder for the sample text files; exist_ok=True keeps this cell safe to re-run.
os.makedirs("../data/text_files", exist_ok=True)

In [16]:
# Maps each output file path to the text that should be stored in that file.
sample_text = {
"../data/text_files/content.txt":""" Socompa is a large stratovolcano (composite volcano) on the border of Argentina and Chile.
    It has an elevation of 6,051 metres (19,852 ft) and is part of the Chilean and Argentine Andean Volcanic Belt (AVB). 
    Socompa is within the Central Volcanic Zone, one of the segments of the AVB, which contains about 44 active volcanoes. 
    It begins in Peru and runs first through Bolivia and Chile, and then Argentina and Chile. 
    Socompa lies close to the pass of the same name where the Salta-Antofagasta railway crosses the Chilean border.

    Most of the northwestern slope of Socompa collapsed catastrophically 7,200 years ago to form an extensive debris avalanche deposit. 
    The Socompa collapse is among the largest known on land with a volume of 19.2 cubic kilometres (4.6 cu mi) and a surface area of 490 square 
    kilometres (190 sq mi); its features are well-preserved by the arid climate. The deposit was at first considered to be either a moraine or a 
    pyroclastic flow deposit, until the 1980 eruption of Mount St. Helens prompted awareness of the instability of volcanic edifices and the 
    existence of large-scale collapses. There are large toreva blocks, which were left behind within the collapse crater. 
    After the landslide, the volcano was rebuilt by the effusion of lava flows and much of the scar is now filled in.
"""
}

In [17]:
# Write every sample text to its target path as UTF-8; 'w' mode replaces any existing file.
for path, content in sample_text.items():
    with open(path, 'w', encoding="utf-8") as f:
        f.write(content)

In [18]:
# Import TextLoader from langchain_community, the same package this notebook
# uses for DirectoryLoader; the legacy `langchain.document_loaders` path is deprecated.
from langchain_community.document_loaders import TextLoader

# Load the sample file; load() returns a list of Document objects whose
# metadata records the source path.
loader = TextLoader("../data/text_files/content.txt", encoding='utf-8')
document = loader.load()
print(document)

[Document(metadata={'source': '../data/text_files/content.txt'}, page_content=' Socompa is a large stratovolcano (composite volcano) on the border of Argentina and Chile.\n    It has an elevation of 6,051 metres (19,852 ft) and is part of the Chilean and Argentine Andean Volcanic Belt (AVB). \n    Socompa is within the Central Volcanic Zone, one of the segments of the AVB, which contains about 44 active volcanoes. \n    It begins in Peru and runs first through Bolivia and Chile, and then Argentina and Chile. \n    Socompa lies close to the pass of the same name where the Salta-Antofagasta railway crosses the Chilean border.\n\n    Most of the northwestern slope of Socompa collapsed catastrophically 7,200 years ago to form an extensive debris avalanche deposit. \n    The Socompa collapse is among the largest known on land with a volume of 19.2 cubic kilometres (4.6 cu mi) and a surface area of 490 square \n    kilometres (190 sq mi); its features are well-preserved by the arid climate. 

In [19]:
## Directory loader

from langchain_community.document_loaders import DirectoryLoader

# Load every file matching **/*.txt under ../data/text_files, handing each
# file to TextLoader with UTF-8 decoding.
dir_loader = DirectoryLoader(
    "../data/text_files",
    glob = "**/*.txt",
    loader_cls = TextLoader,
    loader_kwargs={'encoding': 'utf-8'},
    show_progress=True
)
# Last expression of the cell, so the loaded Document list is shown as the cell output.
dir_loader.load()

100%|██████████| 1/1 [00:00<00:00, 961.78it/s]


[Document(metadata={'source': '..\\data\\text_files\\content.txt'}, page_content=' Socompa is a large stratovolcano (composite volcano) on the border of Argentina and Chile.\n    It has an elevation of 6,051 metres (19,852 ft) and is part of the Chilean and Argentine Andean Volcanic Belt (AVB). \n    Socompa is within the Central Volcanic Zone, one of the segments of the AVB, which contains about 44 active volcanoes. \n    It begins in Peru and runs first through Bolivia and Chile, and then Argentina and Chile. \n    Socompa lies close to the pass of the same name where the Salta-Antofagasta railway crosses the Chilean border.\n\n    Most of the northwestern slope of Socompa collapsed catastrophically 7,200 years ago to form an extensive debris avalanche deposit. \n    The Socompa collapse is among the largest known on land with a volume of 19.2 cubic kilometres (4.6 cu mi) and a surface area of 490 square \n    kilometres (190 sq mi); its features are well-preserved by the arid climat

In [20]:
from langchain_community.document_loaders import PyMuPDFLoader, PyPDFLoader

# Rebinds dir_loader (previously the text-file loader) to a loader for the
# PDFs under ../data/pdf, parsed with PyMuPDFLoader.
# PyPDFLoader is imported but not used in this cell.
dir_loader = DirectoryLoader(
    "../data/pdf",
    glob="**/*.pdf",
    loader_cls= PyMuPDFLoader,
    show_progress=True
)

# Documents produced from the PDFs; later cells split and embed these.
all_pdf_documents = dir_loader.load()

  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 1/1 [00:01<00:00,  1.25s/it]


## Embedding and Vector DB

In [21]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Tuple, Any
from sklearn.metrics.pairwise import cosine_similarity

In [22]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Chunk LangChain documents with a RecursiveCharacterTextSplitter.

    Args:
        documents: Documents to split.
        chunk_size: Forwarded to the splitter as ``chunk_size`` (length is
            measured with ``len``).
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The list of chunk documents returned by the splitter.

    Side effects:
        Prints the document and chunk counts and, when at least one chunk
        exists, the first 200 characters and the metadata of the first chunk.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
        # Separator candidates: blank line, newline, space, empty string.
        "separators": ["\n\n", "\n", " ", ""],
    }
    chunked = RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunked)} chunks")

    # Nothing to preview when the splitter produced no chunks.
    if not chunked:
        return chunked

    first_chunk = chunked[0]
    print("\nExample chunk:")
    print(f"Content: {first_chunk.page_content[:200]}...")
    print(f"Metadata: {first_chunk.metadata}")
    return chunked

In [23]:
# Split the loaded PDF documents with the default chunk_size / chunk_overlap, then show the chunk count.
chunks = split_documents(all_pdf_documents)
print(len(chunks))

Split 11 documents into 43 chunks

Example chunk:
Content: Attention Is All You Need
Ashish Vaswani∗
Google Brain
avaswani@google.com
Noam Shazeer∗
Google Brain
noam@google.com
Niki Parmar∗
Google Research
nikip@google.com
Jakob Uszkoreit∗
Google Research
usz...
Metadata: {'producer': 'PyPDF2', 'creator': '', 'creationdate': '', 'source': '..\\data\\pdf\\attention-is-all-you-need-Paper.pdf', 'file_path': '..\\data\\pdf\\attention-is-all-you-need-Paper.pdf', 'total_pages': 11, 'format': 'PDF 1.3', 'title': 'Attention is All you Need', 'author': 'Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Łukasz Kaiser, Illia Polosukhin', 'subject': 'Neural Information Processing Systems http://nips.cc/', 'keywords': '', 'moddate': '2018-02-12T21:22:10-08:00', 'trapped': '', 'modDate': "D:20180212212210-08'00'", 'creationDate': '', 'page': 0}
43


In [24]:
class EmbeddingManager:
    """Owns a SentenceTransformer model and turns texts into embeddings."""

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """Store ``model_name`` and load the model right away.

        Raises:
            Exception: anything raised while loading the model is logged and
                re-raised by ``_load_model``.
        """
        self.model = None  # replaced by the loaded model in _load_model()
        self.model_name = model_name
        self._load_model()

    def _load_model(self):
        """Instantiate the SentenceTransformer named by ``self.model_name``.

        Progress and the embedding dimension are printed; on failure the error
        is printed and the original exception propagates.
        """
        try:
            print(f"Loadding Embedding Model: {self.model_name}")
            loaded = SentenceTransformer(self.model_name)
            self.model = loaded
            dimension = loaded.get_sentence_embedding_dimension()
            print(f"Model loaded successfully. Embedding dimension: {dimension}")
        except Exception as load_error:
            print(f"Error loading model {self.model_name}: {load_error}")
            raise

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """Encode ``texts`` with the loaded model and return the result.

        Args:
            texts: Strings to embed.

        Returns:
            Whatever ``self.model.encode`` returns for ``texts`` (its
            ``.shape`` is printed).
        """
        count = len(texts)
        print(f"Generate embeddings for {count} texts...")
        vectors = self.model.encode(texts, show_progress_bar=True)
        print(f"Generate embeddings with shape: {vectors.shape}")
        return vectors


# Initialise the embedding manager with its default model name;
# the constructor loads the model immediately.

embedding_manager = EmbeddingManager()

Loadding Embedding Model: all-MiniLM-L6-v2
Model loaded successfully. Embedding dimension: 384


In [25]:
# Bare expression: shows the object's repr as the cell output.
embedding_manager

<__main__.EmbeddingManager at 0x18afbb577d0>

## Vector Store

In [26]:
class VectorStore:
    """Persistent ChromaDB collection holding document chunks and their embeddings."""

    def __init__(self, collection_name: str = "pdf_documents", persist_directory:str = "../data/vector_store") :
        """Remember the configuration and open (or create) the collection.

        Args:
            collection_name: Name of the Chroma collection to use.
            persist_directory: Directory where the Chroma client stores its data.

        Raises:
            Exception: anything raised while initializing the store is logged
                and re-raised by ``initialize_store``.
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self.initialize_store()

    def initialize_store(self):
        """Create the persist directory, the Chroma client and the collection.

        Prints the collection name and its current document count. On failure
        the error is printed and the original exception propagates.
        """
        try:
            # Create a persistent Chroma client rooted at persist_directory
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)

            # Get or create collection
            self.collection = self.client.get_or_create_collection(
                name = self.collection_name,
                metadata={"description": "PDF document embedding for rag"}
            )
            print(f"Vectore store initialized. Collection: {self.collection_name}")
            print(f"Existent document in collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """Store documents together with their embeddings in the collection.

        Args:
            documents: Objects exposing ``page_content`` and ``metadata``.
            embeddings: One embedding per document, in the same order.

        Raises:
            ValueError: if ``documents`` and ``embeddings`` differ in length.
            Exception: any error raised by ``collection.add`` is logged and
                re-raised, matching ``initialize_store``, so a failed insert
                cannot go unnoticed.
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")

        # Nothing to insert: skip the round trip to the collection entirely.
        if len(documents) == 0:
            print("No documents to add to vector store")
            return

        print(f"Adding: {len(documents)} documents to vector store")

        # Prepare parallel lists in the shape collection.add() expects
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []

        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Random prefix plus position gives a fresh id for every call
            ids.append(f"doc_{uuid.uuid4().hex[:8]}_{i}")

            # Copy the metadata so the caller's document is not mutated
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)

            documents_text.append(doc.page_content)

            # asarray lets plain Python sequences through as well as ndarray rows
            embeddings_list.append(np.asarray(embedding).tolist())

        try:
            self.collection.add(
                ids = ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"Successfully added {len(documents)} documents to vector store")
            print(f"Total documents in collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            # Re-raise after logging; swallowing here would leave callers with
            # an unexpectedly empty or partial store.
            raise

# Build the store with the default collection name and persist directory; the constructor initializes the collection.
vector_store = VectorStore()

Vectore store initialized. Collection: pdf_documents
Existent document in collection: 0


In [27]:
# Bare expression: shows the object's repr as the cell output.
vector_store

<__main__.VectorStore at 0x18afbe434d0>

In [28]:
## Convert the text to embeddings
# Pull the raw text out of every chunk, preserving chunk order.
text = [doc.page_content for doc in chunks]


# Generate one embedding per chunk text
embeddings = embedding_manager.generate_embeddings(texts=text)

# Store chunks and embeddings in the vector database.
# add_documents builds ids from a fresh uuid on every call, so re-running
# this cell adds the same chunks again under new ids.
vector_store.add_documents(documents=chunks, embeddings=embeddings)
print("Collection count:", vector_store.collection.count())

Generate embeddings for 43 texts...


Batches: 100%|██████████| 2/2 [00:01<00:00,  1.53it/s]

Generate embeddings with shape: (43, 384)
Adding: 43 documents to vector store
Successfully added 43 documents to vector store
Total documents in collection: 43
Collection count: 43





## Retriever pipeline from Vector Store

In [29]:
class RAGRetriever:
    """Embeds a query and returns the closest chunks from the vector store."""

    def __init__(self, vector_store: "VectorStore", embedding_manager: "EmbeddingManager") :
        """Keep references to the store to search and the embedder for queries.

        Args:
            vector_store: Object whose ``collection`` attribute supports ``query``.
            embedding_manager: Object providing ``generate_embeddings``.
        """
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def _distance_to_similarity(self, distance: float) -> float:
        """Turn a Chroma distance into a similarity score (higher is closer).

        The conversion depends on the collection's distance space, read from
        its ``hnsw:space`` metadata. When that key is absent Chroma's default,
        squared L2, is assumed.
        """
        metadata = getattr(self.vector_store.collection, "metadata", None) or {}
        space = metadata.get("hnsw:space", "l2")
        if space == "l2":
            # For unit-length vectors ||a - b||^2 == 2 - 2*cos(a, b), so the
            # cosine similarity is 1 - d/2.
            # NOTE(review): assumes the embedding model emits normalized
            # vectors — confirm for models other than the default one.
            return 1 - distance / 2
        # "cosine" reports 1 - cos and "ip" reports 1 - dot product.
        return 1 - distance

    def retrieve(self, query: str, top_k:int = 5, score_threshold:float = -1) -> List[Dict[str, Any]]:
        """Return up to ``top_k`` stored chunks most similar to ``query``.

        Args:
            query: Natural-language query text.
            top_k: Number of nearest neighbours requested from the collection.
            score_threshold: Minimum similarity score a hit needs to be kept.

        Returns:
            A list of dicts with keys ``id``, ``content``, ``metadata``,
            ``similarity_score``, ``distance`` (raw value from the collection)
            and ``rank`` (1-based position before filtering). An empty list is
            returned when nothing matches or when the query raises.
        """
        print(f"Retrieving documents for query: {query}")
        print(f"Top k: {top_k}, Score Threshold: {score_threshold}")

        # Generate query embedding
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]

        # Search in vector store
        try:
            results = self.vector_store.collection.query(
                query_embeddings = [query_embedding.tolist()],
                n_results = top_k
            )

            # Process results
            retrieved_docs = []

            if results['documents'] and results['documents'][0]:
                # Results are nested per query; only one query was sent.
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]

                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    similarity_score = self._distance_to_similarity(distance)

                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            'id': doc_id,
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank': i+1
                        })

                print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
            else:
                print("No documents found")

            return retrieved_docs
        except Exception as e:
            # Best effort: a failed lookup is reported and treated as "no hits".
            print("Error during query:", e)
            return []



# Wire the retriever to the vector store and embedding manager created in earlier cells.
rag_retriever = RAGRetriever(vector_store=vector_store, embedding_manager=embedding_manager)

In [30]:
# Query with the default top_k and score_threshold; as the last expression, the list of hits is displayed.
rag_retriever.retrieve("what is attention?")

Retrieving documents for query: what is attention?
Top k: 5, Score Threshold: -1
Generate embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 83.31it/s]

Generate embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)





[{'id': 'doc_c6fe3612_25',
  'content': 'convolution is equal to the combination of a self-attention layer and a point-wise feed-forward layer,\nthe approach we take in our model.\nAs side beneﬁt, self-attention could yield more interpretable models. We inspect attention distributions\nfrom our models and present and discuss examples in the appendix. Not only do individual attention\nheads clearly learn to perform different tasks, many appear to exhibit behavior related to the syntactic\nand semantic structure of the sentences.\n5\nTraining\nThis section describes the training regime for our models.\n5.1\nTraining Data and Batching\nWe trained on the standard WMT 2014 English-German dataset consisting of about 4.5 million\nsentence pairs. Sentences were encoded using byte-pair encoding [3], which has a shared source-\ntarget vocabulary of about 37000 tokens. For English-French, we used the signiﬁcantly larger WMT\n2014 English-French dataset consisting of 36M sentences and split tokens

## Integrate the vector DB context pipeline with LLM output

In [31]:
## Simple rag pipeline with groq llm
from langchain_groq import ChatGroq
import os
from dotenv import load_dotenv
# Load environment variables via python-dotenv before reading the API key.
load_dotenv()

## initialize the groq llm
# Read the key from the GROQ_API_KEY1 environment variable; os.getenv returns None when it is unset.
groq_api_key1 = os.getenv("GROQ_API_KEY1")

# Chat model used by the RAG cells below; temperature and max_tokens are passed straight to ChatGroq.
llm = ChatGroq(groq_api_key = groq_api_key1, model="gemma2-9b-it", temperature=0.1, max_tokens=1024)

##simple rag function: retrieve context + generate response
def rag_simple(query, retriever, llm, top_k = 3):
    """Answer ``query`` from retrieved context using ``llm``.

    Args:
        query: The user's question.
        retriever: Object with ``retrieve(query, top_k=...)`` returning a list
            of dicts that each carry a ``'content'`` entry.
        llm: Object with ``invoke(prompt)`` returning something with ``.content``.
        top_k: Number of chunks to request from the retriever.

    Returns:
        The LLM's answer text, or ``"No relevant context found"`` when the
        retriever yields no usable context.
    """
    ## retrieve the context
    results = retriever.retrieve(query, top_k = top_k)
    context = "\n\n".join(doc['content'] for doc in results) if results else ""
    if not context:
        return "No relevant context found"

    # generate the answer with llm
    prompt = f"""Use the following context to answer the question concisely.

        Context: {context}

        Question: {query}

        Answer:"""

    # The f-string above is already fully interpolated. Running str.format on
    # it again would treat any "{" or "}" in the retrieved text or the query as
    # a placeholder and raise (or silently collapse "{{"), so the prompt is
    # sent as-is.
    response = llm.invoke(prompt)
    return response.content

In [32]:
# End-to-end check: retrieve context for the question (top_k defaults to 3) and print the LLM's answer.
answer = rag_simple("what is attention?", rag_retriever, llm)
print(answer)

Retrieving documents for query: what is attention?
Top k: 3, Score Threshold: -1
Generate embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 138.28it/s]

Generate embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)





Attention is a function that maps a query and a set of key-value pairs to an output, where the output is a weighted sum of the values, with the weights determined by the compatibility of the query with each key. 



In [33]:
# Smoke test: call the LLM directly (no retrieval) and print the first 100 characters of its reply.
test_prompt = "What is machine learning? Answer in one sentence."
response = llm.invoke(test_prompt)
print(f"   ✓ LLM working: {response.content[:100]}...")

   ✓ LLM working: Machine learning is the ability of computers to learn from data without being explicitly programmed....
