### 1. RAG pipeline - Data Ingestion to Vector DB Pipeline

In [6]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path

In [7]:
def process_all_pdfs(pdf_directory):
    """Process all PDF files in a directory"""
    all_documents = []
    pdf_dir = Path(pdf_directory)

    # Find all PDF files recursively
    pdf_files = list(pdf_dir.glob("**/*.pdf"))

    print(f"Found {len(pdf_files)} PDF files to process")

    for pdf_file in pdf_files:
        print(f"\n Processing: {pdf_file.name}")
        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()

            # Add source information to metadata
            for doc in documents:
                doc.metadata["source file"] = pdf_file.name
                doc.metadata["file_type"] = 'pdf'
            
            all_documents.extend(documents)
            print(f"Loaded {len(documents)} pages")
        except Exception as e:
            print(f"Error: {e}")

    print(f"\nTotal documents loaded: {len(all_documents)}")
    return all_documents


# Process all PDFs 
all_pdf_documents = process_all_pdfs("../data")

Found 5 PDF files to process

 Processing: AI researcher.pdf


Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 13 0 (offset 0)


Loaded 2 pages

 Processing: apple_jd.pdf
Loaded 1 pages

 Processing: Applied Data Scientist.pdf
Loaded 2 pages

 Processing: Data Engineer.pdf
Loaded 2 pages

 Processing: DNN Developer.pdf
Loaded 2 pages

Total documents loaded: 9


In [9]:
all_pdf_documents

[Document(metadata={'producer': 'Microsoft: Print To PDF', 'creator': 'PyPDF', 'creationdate': '2025-09-19T19:17:00+09:00', 'author': 'AtoJ-Ruchira', 'moddate': '2025-09-19T19:17:00+09:00', 'title': 'IITçŽ»é„²çﬂ¨Job Description(2025-2026)0903æ‘’å⁄º.xlsx', 'source': '..\\data\\pdf\\AI researcher.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1', 'source file': 'AI researcher.pdf', 'file_type': 'pdf'}, page_content='Honda Motor Co., \nLtd. AI Research Engineer\nHonda is a total mobility company dedicated to delivering the "joy of free movement." We innovate in various fields, including motorcycles, automobiles, power products, aircraft and aircraft \nengines, and robotics. Honda operates in markets across about 200 countries worldwide, continuously creating next-generation mobility solutions and products that improve people’s lives by \ncombining our technology and expertise.\nAt Honda, everyone, regardless of nationality, gender, educational background, or experience, can freely expr

#### 2. Chunking

**Chunking the documents is done soley because embedding models have a context size. We can only feed them data in smaller segments**

In [None]:
### Text splitting get into chunks 

def split_documents(documents, chunk_size = 1000, chunk_overlap = 200):
    """Split documents into smaller chunks for better RAG performance"""

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = chunk_size,
        chunk_overlap = chunk_overlap,
        length_function = len,
        separators = ["\n\n", "\n", " ", ""]
    )

    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")

    # Show example of a chunk
    if split_docs:
        print(f"\nExample chunk:")
        print(f"Content: {split_docs[0].page_content[:200]}...")
        print(f"Metadata: {split_docs[0].metadata}")
    
    return split_docs

In [11]:
chunks = split_documents(all_pdf_documents)
chunks

Split 9 documents into 49 chunks

Example chunk:
Content: Honda Motor Co., 
Ltd. AI Research Engineer
Honda is a total mobility company dedicated to delivering the "joy of free movement." We innovate in various fields, including motorcycles, automobiles, pow...
Metadata: {'producer': 'Microsoft: Print To PDF', 'creator': 'PyPDF', 'creationdate': '2025-09-19T19:17:00+09:00', 'author': 'AtoJ-Ruchira', 'moddate': '2025-09-19T19:17:00+09:00', 'title': 'IITçŽ»é„²çﬂ¨Job Description(2025-2026)0903æ‘’å⁄º.xlsx', 'source': '..\\data\\pdf\\AI researcher.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1', 'source file': 'AI researcher.pdf', 'file_type': 'pdf'}


[Document(metadata={'producer': 'Microsoft: Print To PDF', 'creator': 'PyPDF', 'creationdate': '2025-09-19T19:17:00+09:00', 'author': 'AtoJ-Ruchira', 'moddate': '2025-09-19T19:17:00+09:00', 'title': 'IITçŽ»é„²çﬂ¨Job Description(2025-2026)0903æ‘’å⁄º.xlsx', 'source': '..\\data\\pdf\\AI researcher.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1', 'source file': 'AI researcher.pdf', 'file_type': 'pdf'}, page_content='Honda Motor Co., \nLtd. AI Research Engineer\nHonda is a total mobility company dedicated to delivering the "joy of free movement." We innovate in various fields, including motorcycles, automobiles, power products, aircraft and aircraft \nengines, and robotics. Honda operates in markets across about 200 countries worldwide, continuously creating next-generation mobility solutions and products that improve people’s lives by \ncombining our technology and expertise.\nAt Honda, everyone, regardless of nationality, gender, educational background, or experience, can freely expr

#### 3. Embedding and VectorStore DB

In [12]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [13]:
class EmbeddingManager:
    """Handles document embedding generation using SentenceTransformer"""

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """
        Initialize the embedding manager

        Args:
            model_name: HuggingFace model name for sentence embeddings
        """
        self.model_name = model_name
        self.model = None
        self._load_model()
    
    def _load_model(self):
        """Load the SentenceTransformer model"""
        try:
            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loader successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise
    
    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """
        Generate embeddings for a list of texts
        
        Args:
            texts: List of text strings to embed
            
        Returns:
            numpy array of embeddings with shape (len(texts), embedding_dim)
        """
        if not self.model:
            raise ValueError("Model not loaded")
        
        print(f"Generating embeddings for {len(texts)} texts....")
        embeddings = self.model.encode(texts, show_progress_bar = True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings
    
## Initialize the embedding manager
embedding_manager = EmbeddingManager()
embedding_manager


Loading embedding model: all-MiniLM-L6-v2


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Model loader successfully. Embedding dimension: 384


<__main__.EmbeddingManager at 0x19c7fdb2f90>

#### 4. Vector Store

In [None]:
class VectorStore:
    """Manages document embeddings in a ChromaDB vectore store"""

    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/vectore_store"):
        """
        Initialize the vectore store

        Args:
            collection_name: Name of the ChromaDB collection
            persist_directory: Directory to persist the vectore store
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Initialize ChromaDB client and collection"""
        try:
            # Create persistent ChromaDB client
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path = self.persist_directory)

            # Get or create collection
            self.collection = self.client.get_or_create_collection(
                name = self.collection_name,
                metadata = {"description": "PDF document embeddings for RAG"}
            )
            print(f"Vector store initialized Collection: {self.collection_name}")
            print(f"Existing documents in collection: {self.collection.count()}")
        
        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """
        Add documents and their embeddings to the vector store

        Args:
            documents: List of LangChain documents
            embeddings: Corresponding embeddings for the documents
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")
        
        print(f"Adding {len(documents)} documents to vector store")

        # Prepare data for ChromaDB
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []

        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Generate unique ID
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)

            # Prepare metadata
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)

            # Document content
            documents_text.append(doc.page_content)

            # Embedding
            embeddings_list.append(embedding.tolist())
        
        # Add to collection
        try:
            self.collection.add(
                ids = ids,
                embeddings = embeddings_list,
                metadatas = metadatas,
                documents = documents_text
            )
            print(f"Successfully added {len(documents)} documents to vector store")
            print(f"Total documents in collection: {self.collection.count()}")
        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise

vectorstore = VectorStore()
vectorstore


Vector store initialized Collection: pdf_documents
Existing documents in collection: 0


<__main__.VectorStore at 0x19c001ada90>

In [21]:
### convert the text to embeddings
texts = [doc.page_content for doc in chunks]

### Generate the embeddings
embeddings = embedding_manager.generate_embeddings(texts)

### Store in vector db
vectorstore.add_documents(chunks, embeddings)

Generating embeddings for 49 texts....


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches: 100%|██████████| 2/2 [00:01<00:00,  1.45it/s]

Generated embeddings with shape: (49, 384)
Adding 49 documents to vector store
Successfully added 49 documents to vector store
Total documents in collection: 49





#### 5. RAG retreiver

In [24]:
class RAGRetriever:
    """Handles query-based retrieval from the vector store"""
    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        """
        Initialize the retriever

        Args:
            vector_store: Vector store containing document embeddings
            embedding_manager: Manager for generating query embeddings
        """
        self.vectore_store = vector_store
        self.embedding_manager = embedding_manager

    def retrieve(self, query: str, top_k: int=5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        """
        Retrieve relevant documents for a query

        Args:
            query: The search query
            top_k: Number of top results to return
            score_threshold: Minimum similarity score threshold

        Returns: 
            List of dictionaries containing retrieved documents and metadata
        """
        print(f"Retrieving documents for query: '{query}'")
        print(f"Top K: {top_k}, Score threshold: {score_threshold}")

        # Generate query embedding
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]

        # Search in vector store
        try:
            results = self.vectore_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )

            # Process results
            retrieved_docs = []

            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]

                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    # Convert distance to similarity score (ChromaDB uses cosine distance)
                    similarity_score = 1 - distance

                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            'id': doc_id,
                            'content':document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank': i+1
                        })
                    print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
                else:
                    print("No documents found")
                
                return retrieved_docs
            
        except Exception as e:
            print(f"Error during retrieval: {e}")
            return []

rag_retriever = RAGRetriever(vectorstore, embedding_manager)


In [25]:
rag_retriever

<__main__.RAGRetriever at 0x19c013157f0>

In [28]:
rag_retriever.retrieve("what does honda want?")

Retrieving documents for query: 'what does honda want?'
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts....


Batches: 100%|██████████| 1/1 [00:00<00:00, 92.88it/s]

Generated embeddings with shape: (1, 384)
Retrieved 1 documents (after filtering)
Retrieved 2 documents (after filtering)
Retrieved 3 documents (after filtering)
Retrieved 4 documents (after filtering)
Retrieved 5 documents (after filtering)
No documents found





[{'id': 'doc_870a1404_1',
  'content': 'We believe that having dreams and staying passionate about making them a reality is the reason Honda exists. "It\'s not about what happens; it\'s about what you do." You are the ones who will shape \nthe future of the world. As you move forward in life, we hope you will be the main character in the moments when the world changes.\nIf you have a desire to achieve something, to create something, or to make your dreams come true, why not realize those aspirations at Honda?\n〈Software Defined Vehicle Business Development Unit, Digital Engine Development Division〉\nAt Honda, The Power of Dreams inspires us to create intelligent, connected products that enhance mobility and bring joy to people’s lives. Our goal is to deliver affordable, AI-powered solutions that \nmake customers smile—by combining cutting-edge technology, ethical transparency, and collaborative innovation. The automotive industry is undergoing a profound transformation. Vehicles are',


**We can now get any context ranked for a query as you can see above**

#### 6. Integrating VectorDB context pipeline with LLM output

In [None]:
### Simple RAG pipeline with Groq LLM
from langchain_groq import ChatGroq
import os
from dotenv import load_dotenv
load_dotenv()

### Initialize the Groq LLM (set your GROQ_API_KEY in environment)
groq_api_key = os.getenv("GROQ_API_KEY")

llm = ChatGroq(groq_api_key = groq_api_key, model_name ="llama-3.1-8b-instant", temperature = 0.1, max_tokens = 1024)

## 2. Simple RAG function: retrieve context +  generate response

def rag_simple(query, retriever, llm, top_k = 3):
    ## retrieve the context 
    results = retriever.retrieve(query, top_k = top_k)
    context = "\n\n".join([doc['content'] for doc in results]) if results else ""
    if not context:
        return "No relevant context found to answer the question."
    ## generate the answer using groq LLM
    prompt = f"""Use the following context to answer the question concisely.
            context:
            {context}

            Question: {query}

            Answer:"""
    
    response = llm.invoke([prompt.format(context = context, query = query)])
    return response.content

In [39]:
answer = rag_simple("AI research engineer role", rag_retriever, llm)
print(answer)

Retrieving documents for query: 'AI research engineer role'
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts....


Batches: 100%|██████████| 1/1 [00:00<00:00, 124.13it/s]

Generated embeddings with shape: (1, 384)
Retrieved 1 documents (after filtering)
Retrieved 2 documents (after filtering)
Retrieved 3 documents (after filtering)
No documents found





AI Research Engineer Role: 

This role involves implementing advanced analytics and automation functionalities, including ML and Gen-AI techniques for intelligent data analysis, and collaborating with stakeholders to drive innovation in the field of mobility.


#### 7. Enhanced RAG pipeline features

In [42]:
def rag_advanced(query, retriever, llm, top_k=5, min_score=0.2, return_context=False):
    """
    RAG pipeline with extra features:
    - Returns answer, sources, confidence, score, and optionally full context.
    """
    results = retriever.retrieve(query, top_k = top_k, score_threshold = min_score)

    if not results:
        return {'answer': 'No relevant context found.', 'sources': [], 'confidence': 0.0, 'context': ''}
    
    # Prepare context and sources
    context = "\n\n".join([doc['content'] for doc in results])
    sources = [{
        'source': doc['metadata'].get('source_file', doc['metadata'].get('source', 'unknown')),
        'page': doc['metadata'].get('page', 'unknown'),
        'score': doc['similarity_score'],
        'preview': doc['content'][:120] + '...'
    } for doc in results]
    confidence = max(doc['similarity_score'] for doc in results)

    # Generate answer
    prompt = f"""Use the following context to answer the question concisely.\nContext:\n{context}:\n\nQuestion: {query}\n\nAnswer:{answer}"""
    response = llm.invoke([prompt.format(context = context, query = query)])

    output = {
        'answer': response.content,
        'sources': sources,
        'confidence': confidence
    }
    if return_context:
        output['context'] = context
    return output

# Example output
result = rag_advanced("How to prepare for AI engineer role?", rag_retriever, llm, top_k = 10, min_score = 0.0, return_context = True)
print("Answer:", result['answer'])
print("Source:", result['sources'])
print("Confidence:", result['confidence'])
print("Context Preview:", result['context'][:300])

Retrieving documents for query: 'How to prepare for AI engineer role?'
Top K: 10, Score threshold: 0.0
Generating embeddings for 1 texts....


Batches: 100%|██████████| 1/1 [00:00<00:00, 105.77it/s]

Generated embeddings with shape: (1, 384)
Retrieved 1 documents (after filtering)
Retrieved 2 documents (after filtering)
Retrieved 3 documents (after filtering)
Retrieved 4 documents (after filtering)
Retrieved 4 documents (after filtering)
Retrieved 4 documents (after filtering)
Retrieved 4 documents (after filtering)
Retrieved 4 documents (after filtering)
Retrieved 4 documents (after filtering)
Retrieved 4 documents (after filtering)
No documents found





Answer: To prepare for an AI Engineer role, focus on the following:

1. **Education**: Pursue a Master's or Dual Degree in Data Science and AI, Computer Science, or Information Science with a focus on AI/ML, GenAI, and Analytics.
2. **Technical Skills**:
	* Develop expertise in image processing, image recognition technology development, recognition technology using RADAR or LiDAR, Generative AI, and large language models (LLMs).
	* Familiarize yourself with foundational AI technologies like machine learning and deep learning, and AI-related tools such as Caffe, Chainer, TensorFlow, and PyTorch.
	* Learn data analysis techniques like statistical analysis and multivariate analysis.
3. **Programming Languages**: Master R, Python, and SQL, and familiarize yourself with data analysis/AI libraries like numpy, scipy, scikit-learn, Jupyter Notebook, and PyTorch.
4. **Simulation Tools**: Learn Matlab/Simulink, CarMaker, and Carla for simulation and modeling.
5. **Soft Skills**:
	* Develop stron