In [121]:
from langchain.document_loaders import DirectoryLoader
import os
import numpy as np
from dotenv import load_dotenv
load_dotenv()
from langchain_community.document_loaders import PyMuPDFLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Tuple, Any
from sklearn.metrics.pairwise import cosine_similarity
from langchain_groq import ChatGroq
from langchain_openai import ChatOpenAI

python-dotenv could not parse statement starting at line 4


In [122]:
# Make sure the "data" folder exists before anything reads from it
# (exist_ok=True turns this into a no-op when the folder is already there).
os.makedirs("data", exist_ok=True)

In [123]:
# Load every file matching *.pdf in ./data, using PyMuPDFLoader for each file.
# In the recorded run the 2 PDFs produced 15 Documents, i.e. the loader
# returns page-level Documents (each carries a 'page' entry in its metadata).
doc = DirectoryLoader(
    path='data',
    glob='*.pdf',
    loader_cls=PyMuPDFLoader,
    show_progress=True
)

# NOTE: despite its name, `doc_loader` is the list of loaded Documents, not a
# loader object. The name is kept because later cells refer to it.
doc_loader = doc.load()

# Show a count and a single example instead of dumping the full text of every
# page into the notebook output.
print(f"Loaded {len(doc_loader)} documents")
doc_loader[:1]

100%|██████████| 2/2 [00:00<00:00, 19.33it/s]


[Document(metadata={'producer': 'PyPDF2', 'creator': '', 'creationdate': '', 'source': 'data\\attention-is-all-you-need-Paper.pdf', 'file_path': 'data\\attention-is-all-you-need-Paper.pdf', 'total_pages': 11, 'format': 'PDF 1.3', 'title': 'Attention is All you Need', 'author': 'Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Łukasz Kaiser, Illia Polosukhin', 'subject': 'Neural Information Processing Systems http://nips.cc/', 'keywords': '', 'moddate': '2018-02-12T21:22:10-08:00', 'trapped': '', 'modDate': "D:20180212212210-08'00'", 'creationDate': '', 'page': 0}, page_content='Attention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗†\nUniversity of Toronto\naidan@cs.toronto.edu\nŁukasz Kaiser∗\nGoogle Brain\nlukaszka

# Chunking

In [124]:
def split_documents(documents, chunk_size = 1000, chunk_overlap = 500):
    """Split documents into chunks with a RecursiveCharacterTextSplitter.

    Args:
        documents: Documents handed to the splitter's ``split_documents``.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        Whatever the splitter returns for ``documents`` (the chunk list).

    A one-line summary and a preview of the first chunk are printed.
    """
    # Lengths are measured with len(); separators are listed from the
    # coarsest (blank line) down to the empty string.
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
        "separators": ["\n\n", "\n", " ", ""],
    }
    pieces = RecursiveCharacterTextSplitter(**splitter_options).split_documents(
        documents=documents
    )
    print(f"Splitting {len(documents)} documents into {len(pieces)} chunks")

    if not pieces:
        return pieces

    # Preview: the first 101 characters of the first chunk.
    preview = pieces[0].page_content[:101]
    print("\nExample chunk")
    print(f"Content: {preview}...")
    return pieces

In [125]:
# Chunk the loaded documents using split_documents' default chunk_size / chunk_overlap.
chunks = split_documents(doc_loader)

Splitting 15 documents into 71 chunks

Example chunk
Content: Attention Is All You Need
Ashish Vaswani∗
Google Brain
avaswani@google.com
Noam Shazeer∗
Google Brain...


In [126]:
# Peek at the first few chunks only; displaying the whole list floods the
# notebook output with the text of every chunk.
chunks[:3]

[Document(metadata={'producer': 'PyPDF2', 'creator': '', 'creationdate': '', 'source': 'data\\attention-is-all-you-need-Paper.pdf', 'file_path': 'data\\attention-is-all-you-need-Paper.pdf', 'total_pages': 11, 'format': 'PDF 1.3', 'title': 'Attention is All you Need', 'author': 'Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Łukasz Kaiser, Illia Polosukhin', 'subject': 'Neural Information Processing Systems http://nips.cc/', 'keywords': '', 'moddate': '2018-02-12T21:22:10-08:00', 'trapped': '', 'modDate': "D:20180212212210-08'00'", 'creationDate': '', 'page': 0}, page_content='Attention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗†\nUniversity of Toronto\naidan@cs.toronto.edu\nŁukasz Kaiser∗\nGoogle Brain\nlukaszka

# Embedding and VectorDB

In [127]:
class EmbeddingManager:
    """Owns a SentenceTransformer model and turns texts into embeddings.

    The model is loaded as part of construction, so a bad model name fails
    when the manager is created rather than on first use.
    """

    def __init__(self, model_name:str = 'all-MiniLM-L6-v2') :
        # `model` stays None until _load_model() has succeeded.
        self.model = None
        self.model_name = model_name
        self._load_model()

    def _load_model(self):
        """Create the SentenceTransformer named by ``self.model_name``.

        A loading failure is reported on stdout and then re-raised unchanged.
        """
        print(f"Loading embedding model {self.model_name}")
        try:
            loaded = SentenceTransformer(self.model_name)
        except Exception as err:
            print(f"Error loading model {self.model_name}: {err}")
            raise
        self.model = loaded
        print("Model loaded successfully")

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """Encode ``texts`` with the loaded model.

        Args:
            texts: Strings to embed.

        Returns:
            The value returned by the model's ``encode`` call; its ``shape``
            is printed for reference.
        """
        print(f"Generating embeddings for {len(texts)}")
        vectors = self.model.encode(texts, show_progress_bar=True)
        print(f"Embeddings generated: {vectors.shape}")
        return vectors

# Notebook-wide embedder built with the default model name ('all-MiniLM-L6-v2').
embedding_manager = EmbeddingManager()

Loading embedding model all-MiniLM-L6-v2
Model loaded successfully


In [128]:
# Sanity check that the manager object exists (this only shows its default repr).
embedding_manager

<__main__.EmbeddingManager at 0x24424dc00b0>

# Vector Store

In [129]:
class VectorStore:
    """Persistent ChromaDB collection holding chunk texts, metadata and embeddings.

    Construction opens (or creates) the collection immediately; a failure to
    do so is raised instead of leaving a half-initialised object behind.
    """

    def __init__(self, collection_name: str="pdf_docs", persist_directory:str = "data"):
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        # Both handles are filled in by _init_store().
        self.client = None
        self.collection = None
        self._init_store()

    def _init_store(self):
        """Open the on-disk Chroma client and get or create the collection.

        Raises:
            Exception: any error from the filesystem or Chroma. The error is
                printed and re-raised, because an instance whose
                ``collection`` is still None would only fail later with an
                unrelated-looking error.
        """
        try:
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)

            self.collection = self.client.get_or_create_collection(
                name = self.collection_name,
                metadata={'description' : 'PDF document embedding for rag'}
            )
            print(f"Vector store initialized. Collection: {self.collection_name}")
            print(f"Existent document in collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise

    @staticmethod
    def _content_key(metadata, page_content) -> str:
        """Return a deterministic hex key built from source, page and text."""
        raw = f"{metadata.get('source', '')}|{metadata.get('page', '')}|{page_content}"
        return uuid.uuid5(uuid.NAMESPACE_URL, raw).hex

    def add_documents(self, documents:List[Any], embedding: np.ndarray):
        """Store documents and their embeddings in the collection.

        Ids are derived from each document's source, page and text, and rows
        are written with ``upsert``. Ingesting the same chunks again therefore
        overwrites the existing rows instead of appending duplicates.

        Args:
            documents: Objects exposing ``page_content`` (str) and
                ``metadata`` (a mapping).
            embedding: One embedding row per document, in the same order.

        Raises:
            ValueError: If the number of documents and embedding rows differ.
            Exception: Any error raised while writing to Chroma; it is
                printed and re-raised so a failed ingest is not mistaken for
                a successful one.
        """
        vectors = np.asarray(embedding)
        if len(documents) != len(vectors):
            raise ValueError("Length of documents must match number of embedding")

        if len(documents) == 0:
            # Nothing to write; skip the call to the collection entirely.
            print("No documents to add")
            return

        print(f"Adding {len(documents)}")

        # Prepare the parallel lists expected by the collection.
        ids = []
        metadatas = []
        documents_text = []
        occurrences = {}

        for i, doc in enumerate(documents):
            # Copy so the caller's metadata dict is not modified.
            metadata = dict(doc.metadata)

            # Identical chunks inside one batch get a running suffix so the
            # ids stay unique within the batch while remaining deterministic.
            key = self._content_key(metadata, doc.page_content)
            seen = occurrences.get(key, 0)
            occurrences[key] = seen + 1
            ids.append(f"doc_{key}_{seen}")

            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)

            documents_text.append(doc.page_content)

        # Nested plain-Python lists, one per embedding row.
        embeddings_list = vectors.tolist()

        try:
            self.collection.upsert(
                ids = ids,
                embeddings = embeddings_list,
                metadatas = metadatas,
                documents = documents_text
            )
            print(f"Successfully added {len(documents)} documents to vector store")

        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise


# Open the default collection ("pdf_docs") persisted under "data".
# NOTE: "data" is also the folder the PDFs are loaded from.
vector_store = VectorStore()

python-dotenv could not parse statement starting at line 4


Vector store initialized. Collection: pdf_docs
Existent document in collection: 203


In [130]:
# Scratch cell: displays a freshly generated random UUID; the value is not stored.
uuid.uuid4()

UUID('d0b773c4-25fe-4bf7-9a09-4500fc99606f')

In [131]:
# Plain text of every chunk, in the same order as `chunks`.
text = [doc.page_content for doc in chunks]

# One embedding row per chunk (the recorded run produced shape (71, 384)).
embeddings = embedding_manager.generate_embeddings(texts=text)

# Persist the chunks with their embeddings, then report the collection size.
# NOTE(review): the recorded run grew the collection from 203 to 274 entries.
vector_store.add_documents(documents=chunks, embedding=embeddings)
print(f"Collection count: {vector_store.collection.count()}")

Generating embeddings for 71


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches: 100%|██████████| 3/3 [00:03<00:00,  1.01s/it]


Embeddings generated: (71, 384)
Adding 71
Successfully added 71 documents to vector store
Collection count: 274


# Retriever Pipeline from vector store

In [132]:
class RAGRetriever:
    """Embeds a query and fetches the closest stored chunks from a vector store."""

    def __init__(self, vector_store: "VectorStore", embedding_manager: "EmbeddingManager") -> None:
        # Annotations are forward references so this class can be defined
        # without the other two classes being in scope yet.
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def _distance_space(self) -> str:
        """Return the collection's distance space, defaulting to 'l2'.

        The space is read from the 'hnsw:space' entry of the collection
        metadata; Chroma uses squared L2 when nothing is configured.
        """
        metadata = getattr(self.vector_store.collection, 'metadata', None) or {}
        return metadata.get('hnsw:space', 'l2')

    @staticmethod
    def _to_similarity(distance: float, space: str) -> float:
        """Convert a Chroma distance into a similarity score.

        * 'cosine' / 'ip': the distance is ``1 - similarity``.
        * 'l2': the distance is the squared Euclidean distance, which equals
          ``2 - 2 * cos`` for unit-length vectors, so ``cos = 1 - d / 2``.
          NOTE(review): this assumes normalised embeddings (the default
          all-MiniLM-L6-v2 model normalises its output); confirm when a
          different embedding model is used.
        """
        if space == 'l2':
            return 1 - distance / 2
        return 1 - distance

    def retrieve(self, query:str, top_k:int = 5, score_threshold:float=-1.0) -> List[Dict[str, Any]]:
        """Return up to ``top_k`` stored chunks that best match ``query``.

        Args:
            query: Natural-language question to embed and search for.
            top_k: Number of nearest neighbours requested from the collection.
            score_threshold: Minimum ``similarity_score`` a hit needs to be kept.

        Returns:
            A list of dicts with keys ``id``, ``content``, ``metadata``,
            ``similarity_score``, ``distance`` and ``rank`` (1-based position
            in the unfiltered result list). An empty list is returned when
            nothing matches or when the query itself fails; the failure is
            printed, not raised.
        """
        print(f"Retrieving documents for query: {query}")
        print(f"Top_k: {top_k}, score_threshold: {score_threshold}")

        query_embedding = self.embedding_manager.generate_embeddings([query])[0]

        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )

            retrieved_docs = []
            if results['documents'] and results['documents'][0]:
                # Results are nested one level per query; only one query is sent.
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]

                space = self._distance_space()

                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents , metadatas, distances)):
                    similarity_score = self._to_similarity(distance, space)

                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            'id':doc_id,
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank': i+1
                        })

                print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
            else:
                print("No documents found")
            return retrieved_docs

        except Exception as e:
            # Best effort: report the failure and behave as if nothing matched.
            print(f"Error during query: {e}")
            return []

# Wire the retriever to the notebook-wide vector store and embedder.
rag_retriever = RAGRetriever(vector_store=vector_store, embedding_manager=embedding_manager)

In [133]:
# Smoke test: inspect the raw hits for a sample question, using the
# defaults top_k=5 and score_threshold=-1.0.
rag_retriever.retrieve("what is C-NMC dataset is about")

Retrieving documents for query: what is C-NMC dataset is about
Top_k: 5, score_threshold: -1.0
Generating embeddings for 1


Batches: 100%|██████████| 1/1 [00:00<00:00, 60.02it/s]

Embeddings generated: (1, 384)
Retrieved 5 documents (after filtering)





[{'id': 'doc_1ffac0f6_61',
  'content': 'C-NMC Dataset \nAim: Classification of leukemic B-lymphoblast cells (cancer cells) from normal B-lymphoid \nprecursors (normal cells) from blood smear microscopic images. \nA dataset of cells with labels (normal versus cancer) is provided to train machine learning-based \nclassifier to identify normal cells from leukemic blasts (malignant/cancer cells). These cells have \nbeen segmented from the microscopic images. These images are representative of images in \nthe real-world because these contain some staining noise and illumination errors, although \nthese errors have largely been fixed by us via our own in-house method of stain color \nnormalization. \nThe ground truth has been marked by an expert oncologist.  \nThis dataset was also used for our IEEE ISBI 2019 conference challenge: Classification of \nNormal vs Malignant Cells in B-ALL White Blood Cancer Microscopic Images. The challenge is \navailable here: \nhttps://biomedicalimaging.org/2

# Integrate LLM

In [138]:
# Read the Groq API key from the environment (load_dotenv() runs in the import cell).
# NOTE(review): os.getenv returns None when GROQ_API_KEY is unset. The import
# cell logged "python-dotenv could not parse statement starting at line 4",
# so confirm the .env file actually provides the key.
groq_api_key = os.getenv('GROQ_API_KEY')

# Chat model used for answer generation: low temperature, max_tokens=1024.
llm = ChatGroq(groq_api_key = groq_api_key, model="gemma2-9b-it", temperature=0.1, max_tokens=1024)

def rag(query, retriever, llm, top_k = 3):
    """Answer ``query`` with the LLM, grounded in retrieved context.

    Args:
        query: The user's question.
        retriever: Object whose ``retrieve(query, top_k=...)`` returns a list
            of dicts carrying a ``'content'`` entry.
        llm: Object whose ``invoke(prompt)`` returns a response exposing
            ``.content``.
        top_k: Number of chunks to request from the retriever.

    Returns:
        The LLM's answer text, or ``"no relevant context found"`` when the
        retriever yields no usable context.
    """
    results = retriever.retrieve(query, top_k=top_k)
    context = "\n\n".join([doc['content'] for doc in results]) if results else ""
    if not context:
        return "no relevant context found"

    # The f-string already interpolates context and query. It must not be
    # passed through str.format afterwards: retrieved text or questions that
    # contain '{' or '}' (formulas, JSON, code) would make format() raise.
    prompt = f"""Use the following context to answer the question concisely.

        Context: {context}

        Question: {query}

        Answer:"""
    response = llm.invoke(prompt)
    return response.content

In [139]:
# End-to-end check: retrieve context and let the LLM answer the sample question.
# The query matches the one used in the retriever smoke test; a stray trailing
# apostrophe was removed from the string.
ans = rag("what is C-NMC dataset is about", retriever=rag_retriever, llm=llm)
print(ans)

Retrieving documents for query: what is C-NMC dataset is about'
Top_k: 3, score_threshold: -1.0
Generating embeddings for 1


Batches: 100%|██████████| 1/1 [00:00<00:00, 31.39it/s]

Embeddings generated: (1, 384)
Retrieved 3 documents (after filtering)





The C-NMC dataset is about classifying leukemic B-lymphoblast cells (cancer cells) from normal B-lymphoid precursors (normal cells) in blood smear microscopic images.  

