In [82]:
import copy
import hashlib
import os
import re

from langchain_community.document_loaders import PyMuPDFLoader, PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter


In [83]:
from langchain_community.document_loaders import PyPDFLoader

# NOTE(review): absolute, machine-specific Windows path — this cell only runs where that
# exact file exists; consider a path relative to a configurable data directory.
pdf_path = r"D:\New folder (2)\DS\RAG\ZuGraFix\data\text_files\aboutme.pdf"

# Load the single PDF. `documents` is whatever list the loader returns; the recorded
# run reported 2 entries for this file.
loader = PyPDFLoader(pdf_path)
documents = loader.load()

print(f"Loaded {len(documents)} document(s) from {pdf_path}")


Loaded 2 document(s) from D:\New folder (2)\DS\RAG\ZuGraFix\data\text_files\aboutme.pdf


In [84]:
def split_document(document, chunk_size=1000, chunk_overlap=200):
    """Split one loaded document into overlapping chunks.

    Args:
        document: A single document object; it is wrapped in a list before
            being handed to the splitter.
        chunk_size: Value forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Value forwarded to the splitter as ``chunk_overlap``.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_documents`` returns
        for the one-element list.
    """
    # Separators are tried in this order: paragraph, line, word, then character.
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "separators": ["\n\n", "\n", " ", ""],
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    pieces = splitter.split_documents([document])
    return pieces

In [85]:
# Split each loaded page on its own and gather every piece into one flat list.
all_chunks = []
for doc in documents:
    # `chunks` is rebound on every pass, so after the loop it holds the
    # pieces of the last page only.
    chunks = split_document(doc)
    all_chunks += chunks

print(f"Total chunks from all pages: {len(all_chunks)}")


Total chunks from all pages: 5


In [86]:
# Redact birth-date-like strings such as '10th April 1992' from every chunk.
# `\s+` (rather than a single `\s`) is used because text extracted from this PDF
# contains runs of several spaces between words.
DATE_PATTERN = re.compile(r'\d{1,2}(?:st|nd|rd|th)?\s+[A-Z][a-z]+\s+\d{4}')

cleaned_chunks = []
for chunk in all_chunks:
    text = DATE_PATTERN.sub('[REDACTED]', chunk.page_content)
    # Work on a real copy so the original chunk objects keep their
    # unredacted text and re-running earlier cells is not affected.
    new_chunk = copy.deepcopy(chunk)
    new_chunk.page_content = text
    cleaned_chunks.append(new_chunk)

# Downstream cells read `all_chunks`, so rebind it to the redacted copies.
all_chunks = cleaned_chunks


In [87]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any,Tuple
from sklearn.metrics.pairwise import cosine_similarity


In [88]:
class EmbeddingManager:
    """Thin wrapper around a SentenceTransformer model used to embed text.

    The model is loaded eagerly when the object is constructed.
    """

    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        """
        Args:
            model_name: Name handed to ``SentenceTransformer`` when loading.
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Load ``self.model_name`` into ``self.model``.

        Raises:
            Exception: Any error raised while loading is logged and re-raised.
        """
        try:
            self.model = SentenceTransformer(self.model_name)
            print(f"Loaded model: {self.model_name}")
            print(f"Model device: {self.model.device}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            # Bare raise keeps the original traceback intact.
            raise

    def generate_embedding(self, text: "str | List[str]") -> np.ndarray:
        """Encode a string or a list of strings with the loaded model.

        Args:
            text: One string or a list of strings; it is passed to
                ``model.encode`` unchanged.

        Returns:
            The array returned by ``model.encode``.

        Raises:
            ValueError: If no model is loaded.
        """
        # Compare against None explicitly: truthiness of a model object can
        # depend on its length and says nothing about whether it is loaded.
        if self.model is None:
            raise ValueError("Model not loaded.")
        embedding = self.model.encode(text, show_progress_bar=True)
        print(f"Generated embedding of shape: {embedding.shape}")
        return embedding

    def get_embeddings_dimension(self) -> int:
        """Return the embedding dimension reported by the loaded model.

        Raises:
            ValueError: If no model is loaded.
        """
        if self.model is None:
            raise ValueError("Model not loaded.")
        return self.model.get_sentence_embedding_dimension()
   

In [89]:
# Build the manager with its default model name; the model is loaded in the constructor.
embedding_manager = EmbeddingManager()
# A bare name in the middle of a cell is not echoed (the recorded output shows no repr for it).
embedding_manager
# Last expression of the cell, so its value (384 in the recorded run) becomes the cell output.
embedding_manager.get_embeddings_dimension()

Loaded model: all-MiniLM-L6-v2
Model device: cpu


384

In [90]:
class VectorStore:
    """Persistent Chroma collection that stores document chunks with their embeddings.

    The collection is opened (or created) as soon as the object is constructed.
    """

    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../vector_store"):
        """
        Args:
            collection_name: Name of the Chroma collection to open or create.
            persist_directory: Directory handed to the persistent client.
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self.initialize_store()

    def initialize_store(self):
        """Create the persistence directory, the client and the collection.

        Raises:
            Exception: Any error from the filesystem or Chroma is logged and re-raised.
        """
        try:
            # Make sure the persistence directory exists
            os.makedirs(self.persist_directory, exist_ok=True)

            # Initialize Chroma persistent client
            self.client = chromadb.PersistentClient(path=self.persist_directory)

            # Get or create the collection
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"Description": "PDF document embeddings for RAG"}
            )

            # Print confirmation
            print(f"Vector store initialized: {self.collection.name}")
            print(f"Existing documents in collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise

    def add_documents(self, documents: list[Any], embeddings: np.ndarray):
        """Insert or update chunks and their embeddings in the collection.

        Ids are derived from each chunk's source, page and text, and the batch
        is written with ``upsert``. Re-running the ingestion with the same
        chunks therefore overwrites the existing rows instead of appending
        duplicates. Chunks that repeat inside one call are stored once.

        Args:
            documents: Objects exposing ``page_content`` and ``metadata``.
            embeddings: One embedding per document, in the same order.

        Raises:
            ValueError: If the number of documents and embeddings differ.
            Exception: Any error from the collection is logged and re-raised.
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents and embeddings must match.")
        if len(documents) == 0:
            # Nothing to write; avoid sending an empty batch to the collection.
            print("No documents to add to vector store.")
            return
        print(f"Adding {len(documents)} documents to vector store...")

        # id -> (metadata, text, embedding); dict keys keep first-seen order
        # and guarantee unique ids within the batch.
        records = {}
        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Copy so the caller's metadata dict is never modified.
            metadata = dict(doc.metadata or {})
            fingerprint = hashlib.sha256(
                f"{metadata.get('source', '')}|{metadata.get('page', '')}|{doc.page_content}".encode("utf-8")
            ).hexdigest()[:32]
            doc_id = f"doc_{fingerprint}"
            if doc_id in records:
                continue
            metadata["doc_index"] = i
            metadata["content_length"] = len(doc.page_content)
            records[doc_id] = (metadata, doc.page_content, np.asarray(embedding).tolist())

        ids = list(records)
        metadatas = [records[doc_id][0] for doc_id in ids]
        documents_text = [records[doc_id][1] for doc_id in ids]
        embeddings_list = [records[doc_id][2] for doc_id in ids]
        try:
            self.collection.upsert(
                ids=ids,
                metadatas=metadatas,
                documents=documents_text,
                embeddings=embeddings_list
            )
            print(f"Successfully added {len(ids)} documents to vector store.")
            print(f"Total documents in collection now: {self.collection.count()}")
        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise
        
# Open (or create) the store with its defaults: collection "pdf_documents" under ../vector_store.
vectorstore=VectorStore()
# Echo the instance; the class defines no custom repr, so only the default object repr is shown.
vectorstore



Vector store initialized: pdf_documents
Existing documents in collection: 10


<__main__.VectorStore at 0x1e504a35160>

In [91]:
# `chunks` is the name last assigned inside the page-splitting loop, so it holds the
# pieces of the final page only; `all_chunks` is the list covering every page.
chunks

[Document(metadata={'producer': 'PyPDF', 'creator': 'PyPDF', 'creationdate': '2025-12-22T09:34:00+00:00', 'source': 'D:\\New folder (2)\\DS\\RAG\\ZuGraFix\\data\\text_files\\aboutme.pdf', 'total_pages': 2, 'page': 1, 'page_label': '2'}, page_content='- Campaign optimization and ROI-focused strategies.\nGraphics & Animation:\n-  2D  animation,  motion  graphics,  logo  animation,  video  editing  (After  Effects,\nPhotoshop, Illustrator, Moho).\n- Design expertise: PDFs, PPTs, banners, marketing collaterals.\nProjects & Achievements:\n- Developed multiple ML/DL solutions and enterprise automation systems.\n- Built full-stack web applications, Chrome extensions, and React/Next.js projects.\n- Created Android apps including Object Detection, Tetromino Game, Weather App, and\nATS app.\n- Developed 4 Unity-based games and multiple games using pure JavaScript animation\nwith custom sprites.\n- In high school, made amendments in De Morgan’s Laws and received a medal via Intel\n/ Government of

In [92]:
# Plain-text view of the chunks, in the same order as `all_chunks`, for the embedding step.
texts = [chunk.page_content for chunk in all_chunks]
# NOTE(review): this echoes every chunk's full text into the saved output — review for
# personal data before sharing the notebook.
texts

['About Me\nMy name is Muhammad Zubair, born on [REDACTED], from Islamabad, Pakistan. I am\na highly skilled professional with a diverse educational and technical background — CA,\nBS in Physics, and extensive expertise in Machine Learning, Artificial Intelligence, Web\n&  Mobile  Development,  Game  Development,  Digital  Marketing,  Creative  Design,  and\nAnimation software such as Moho.\nWith over 6 years of professional experience on platforms like Fiverr and Upwork, I have\nsuccessfully  completed  50+  projects  for  diverse  clients,  consistently  maintaining  a\n5-star  rating.  My  work  spans  ML/DL  model  development,  full-stack  web  development,\nmobile  apps,  games,  cloud  deployment,  digital  marketing,  SEO/SEM,  and  creative\ndesign,  demonstrating  versatility  and  a  commitment  to  delivering  high-quality\nsolutions. Additionally, I have over 7 years of teaching experience in various schools,',
 'design,  demonstrating  versatility  and  a  commitment  to 

In [93]:
# Embed all chunk texts in one call (the recorded run produced shape (5, 384)),
# then hand the chunks and their vectors to the vector store in matching order.
embeddings = embedding_manager.generate_embedding(texts)
vectorstore.add_documents(all_chunks, embeddings)

Batches: 100%|██████████| 1/1 [00:00<00:00,  2.88it/s]

Generated embedding of shape: (5, 384)
Adding 5 documents to vector store...
Successfully added 5 documents to vector store.
Total documents in collection now: 15





In [94]:
from typing import List, Dict, Any
import numpy as np

class RAGRetriever:
    """Embeds a query and looks up the closest stored chunks in the vector store."""

    def __init__(self, vector_store: "VectorStore", embedding_manager: "EmbeddingManager", top_k: int = 5):
        """
        Store the collaborators and the default result count.

        Args:
            vector_store: Object whose ``collection`` attribute supports ``query``.
            embedding_manager: Object providing ``generate_embedding``.
            top_k: Number of results requested when ``retrieve`` is given none.
        """
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager
        self.top_k = top_k

    def retrieve(self, query: str, score_threshold: float = 0.0, top_k: int = None) -> List[Dict[str, Any]]:
        """
        Return the stored chunks most similar to ``query``.

        Args:
            query: The query string.
            score_threshold: Hits whose similarity score is below this value are dropped.
            top_k: Result count for this call; a falsy value falls back to ``self.top_k``.

        Returns:
            A list of dicts with keys ``id``, ``content``, ``metadata``,
            ``similarity_score``, ``distance`` and ``rank``. An empty list is
            returned when nothing qualifies or when the lookup raises.
        """
        if not top_k:
            top_k = self.top_k
        print(f"Retrieving top {top_k} documents for query: '{query}'")

        # The query is embedded outside the try block, so embedding errors
        # propagate to the caller instead of being turned into an empty result.
        query_embedding = self.embedding_manager.generate_embedding([query])[0]

        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k,
                include=['documents', 'metadatas', 'distances']
            )

            retrieved_docs = []
            has_hits = bool(results['documents'] and results['documents'][0])

            if has_hits:
                hits = zip(
                    results['documents'][0],
                    results['metadatas'][0],
                    results['distances'][0],
                )
                for i, (document, metadata, distance) in enumerate(hits):
                    # Map the distance onto a score where smaller distance means higher score.
                    similarity_score = 1 / (1 + distance)

                    if not (similarity_score >= score_threshold):
                        continue

                    hit = {
                        "id": metadata.get("doc_index", i),
                        "content": document,
                        "metadata": metadata,
                        "similarity_score": similarity_score,
                        "distance": distance,
                        "rank": i + 1,
                    }
                    retrieved_docs.append(hit)

                    # Debug print: show snippet
                    print(f"Doc {i+1}, similarity={similarity_score:.3f}, snippet: {document[:75]}")

            if len(retrieved_docs) == 0:
                print("No documents above threshold.")

            return retrieved_docs

        except Exception as e:
            # Best-effort lookup: report the problem and return no hits.
            print(f"Error during retrieval: {e}")
            return []


In [95]:
# Create retriever
# Wire the retriever to the store and embedding manager built above; top_k keeps its default of 5.
rag_retriever = RAGRetriever(vectorstore, embedding_manager)

# Test query (replace with text from your PDF)
results = rag_retriever.retrieve("What is my name?")

# Summarise each hit: rank, score to three decimals, and the first 100 characters of its text.
print(f"\nRetrieved {len(results)} documents:")
for doc in results:
    print(f"Rank {doc['rank']}, Score: {doc['similarity_score']:.3f}, Content snippet: {doc['content'][:100]}")


Retrieving top 5 documents for query: 'What is my name?'


Batches: 100%|██████████| 1/1 [00:00<00:00, 45.26it/s]

Generated embedding of shape: (1, 384)
Doc 1, similarity=0.404, snippet: About Me
My name is Muhammad Zubair, born on 10th April 1992, from Islamaba
Doc 2, similarity=0.404, snippet: About Me
My name is Muhammad Zubair, born on 10th April 1992, from Islamaba
Doc 3, similarity=0.395, snippet: About Me
My name is Muhammad Zubair, born on [REDACTED], from Islamabad, Pa
Doc 4, similarity=0.365, snippet: - Analytical and critical thinker, solving problems efficiently.
-  Continu
Doc 5, similarity=0.365, snippet: - Analytical and critical thinker, solving problems efficiently.
-  Continu

Retrieved 5 documents:
Rank 1, Score: 0.404, Content snippet: About Me
My name is Muhammad Zubair, born on 10th April 1992, from Islamabad, Pakistan. I am
a highl
Rank 2, Score: 0.404, Content snippet: About Me
My name is Muhammad Zubair, born on 10th April 1992, from Islamabad, Pakistan. I am
a highl
Rank 3, Score: 0.395, Content snippet: About Me
My name is Muhammad Zubair, born on [REDACTED], from Islama




In [96]:
from langchain_groq import ChatGroq
import os
from dotenv import load_dotenv
from typing import List, Dict, Any

# Load environment variables from .env
load_dotenv()
groq_api_key = os.getenv("GROQ_API_KEY")
if not groq_api_key:
    # Stop here with an actionable message instead of handing a missing key
    # to the client and failing later with a less obvious error.
    raise RuntimeError(
        "GROQ_API_KEY is not set. Define it in the environment or in a .env file."
    )

# Initialize the LLM
llm = ChatGroq(
    api_key=groq_api_key,
    model="llama-3.1-8b-instant",   # Replace with your desired Groq model
    temperature=0.1,
    max_tokens=1024
)

# Improved RAG function
def rag_simple(query: str, retriever, llm, top_k: int = 3) -> str:
    """
    Answer ``query`` from retrieved context.

    Fetches up to ``top_k`` hits through ``retriever.retrieve``, lists their
    text as bullet points, and sends a system + human message pair to ``llm``.

    Returns:
        The ``content`` of the LLM response, or a fixed fallback sentence when
        the retriever returns nothing.
    """
    hits = retriever.retrieve(query, top_k=top_k)
    if not hits:
        return "No relevant context found to answer the question."

    # One "- ..." bullet per hit, separated by blank lines.
    bullets = []
    for doc in hits:
        bullets.append(f"- {doc['content']}")
    context = "\n\n".join(bullets)

    system_turn = (
        "system",
        "You are a precise and helpful assistant. Answer questions based on the provided context only. Do not make up information.",
    )
    human_turn = (
        "human",
        f"Use the following context to answer the question:\n\n{context}\n\nQuestion: {query}\nAnswer:",
    )

    reply = llm.invoke([system_turn, human_turn])
    return reply.content


In [106]:
# Single-keyword query; no top_k is passed, so rag_simple's default of 3 applies.
answer = rag_simple("github", rag_retriever, llm)
print(answer)


Retrieving top 3 documents for query: 'github'


Batches: 100%|██████████| 1/1 [00:00<00:00, 45.01it/s]

Generated embedding of shape: (1, 384)
Doc 1, similarity=0.396, snippet: practices.
Programming & Web/Mobile Development:
- Languages & frameworks: 
Doc 2, similarity=0.396, snippet: practices.
Programming & Web/Mobile Development:
- Languages & frameworks: 
Doc 3, similarity=0.396, snippet: practices.
Programming & Web/Mobile Development:
- Languages & frameworks: 





There is no mention of GitHub in the provided context.


In [113]:
def rag_advanced(query: str, retriever, llm, top_k: int = 5, min_score: float = 0.2, return_context: bool = False) -> dict:
    """
    Answer ``query`` from retrieved context and report where the answer came from.

    Args:
        query: The question to answer.
        retriever: Object with ``retrieve(query, top_k=...)`` returning dicts
            that carry ``content``, ``metadata`` and ``similarity_score``.
        llm: Object with ``invoke(...)`` whose result has a ``content`` string.
        top_k: Number of hits requested from the retriever.
        min_score: Hits with a lower ``similarity_score`` are discarded.
        return_context: When True, the assembled context string is included.

    Returns:
        A dict with ``answer``, ``sources`` and ``confidence``. On success the
        ``context`` key is present only when ``return_context`` is True; on the
        two "no context" paths it is always present ("" or None).
    """
    # Forward top_k so the caller's value is honoured rather than the
    # retriever's own default.
    results = retriever.retrieve(query, top_k=top_k)
    if not results:
        return {
            "answer": "No relevant context found to answer the question.",
            "sources": [],
            "confidence": 0.0,
            "context": "" if return_context else None
        }

    # Filter by min_score
    results = [doc for doc in results if doc["similarity_score"] >= min_score]
    if not results:
        return {
            "answer": "No relevant context found above the confidence threshold.",
            "sources": [],
            "confidence": 0.0,
            "context": "" if return_context else None
        }

    # Prepare context and sources
    context = "\n\n".join([f"- {doc['content']}" for doc in results])
    sources = []
    for doc in results:
        content = doc['content']
        # Only mark the preview with an ellipsis when text was actually cut off.
        preview = content if len(content) <= 1000 else content[:1000] + '...'
        sources.append({
            "source": doc['metadata'].get('source_file', doc['metadata'].get('source', 'unknown')),
            "page": doc['metadata'].get('page', 'unknown'),
            "score": doc['similarity_score'],
            "preview": preview
        })
    # Confidence is the best similarity score among the hits that were kept.
    confidence = max(doc['similarity_score'] for doc in results)

    # Build prompt
    prompt = (
        "You are a precise and helpful assistant. Answer strictly based on the context below. "
        "Do not make up information. Keep the answer concise and relevant.\n\n"
        f"Context:\n{context}\n\n"
        f"Question: {query}\nAnswer:"
    )

    # Invoke LLM
    response = llm.invoke([prompt])
    answer = response.content.strip()

    # Build output
    output = {
        "answer": answer,
        "sources": sources,
        "confidence": confidence
    }
    if return_context:
        output["context"] = context

    return output


In [119]:
# rag_advanced returns a dict, so this prints the whole structure
# (answer, sources with previews, confidence) rather than just the answer text.
answer = rag_advanced("does he plays cricket", rag_retriever, llm, top_k=5)
print(answer)


Retrieving top 5 documents for query: 'does he plays cricket'


Batches: 100%|██████████| 1/1 [00:00<00:00, 105.41it/s]

Generated embedding of shape: (1, 384)
Doc 1, similarity=0.412, snippet: - Analytical and critical thinker, solving problems efficiently.
-  Continu
Doc 2, similarity=0.412, snippet: - Analytical and critical thinker, solving problems efficiently.
-  Continu
Doc 3, similarity=0.412, snippet: - Analytical and critical thinker, solving problems efficiently.
-  Continu
Doc 4, similarity=0.370, snippet: - Campaign optimization and ROI-focused strategies.
Graphics & Animation:
-
Doc 5, similarity=0.370, snippet: - Campaign optimization and ROI-focused strategies.
Graphics & Animation:
-





{'answer': 'Yes, he plays cricket.', 'sources': [{'source': 'D:\\New folder (2)\\DS\\RAG\\ZuGraFix\\data\\text_files\\aboutme.pdf', 'page': 1, 'score': 0.41217047054880046, 'preview': '- Analytical and critical thinker, solving problems efficiently.\n-  Continuously  learning  emerging  technologies,  especially  in  AI  and  autonomous\nsystems.\n- Strategically skilled in digital marketing and campaign optimization.\n-  Experienced  educator  with  a  proven  track  record  of  7  years  teaching  in  schools,\ncolleges, and academies.\nHobbies & Interests:\n- Playing cricket, coding, and exploring AI advancements.\n- Reading books and solving complex problems.\n- Passionate about projects in advanced AI systems, driverless cars, and autonomous\ntechnologies.\n- LinkedIn: https://www.linkedin.com/in/muhammad-zubair-6230a1285/\n- GitHub: https://github.com/Muhammad-Zubair796\n- Active Phone: 03401071629...'}, {'source': 'D:\\New folder (2)\\DS\\RAG\\ZuGraFix\\data\\text_files\\aboutme