# Testing Notebook
This notebook exercises the RAG pipeline end to end:
1. PDF → Markdown conversion
2. Text chunking
3. Embedding generation
4. Vector store indexing (ChromaDB)
5. RAG querying with Ollama

In [1]:
import sys
sys.path.append('..')

from src.pdf_processor import PDFProcessor
from src.image_summarizer import get_image_summaries
from src.chunker import TextChunker
from src.embedder import Embedder
from src.vector_store import ChromaVectorStore
from src.rag_engine import RAGEngine



## 1. PDF → Markdown conversion (Docling)

In [None]:
# Converter configuration: OCR is switched off for now, while table-structure
# detection and image generation are switched on.
processor = PDFProcessor(do_ocr=False, do_table_structure=True, generate_images=True)

# Source document and Markdown destination for this run.
# NOTE(review): the source is a .docx file stored under data/pdfs/ and bound to
# a variable called `pdf_path` — confirm that PDFProcessor accepts this format.
pdf_path = "../data/pdfs/12-Cancer-du-cavum-2025-QCMed.docx"
output_path = "../data/processed/12-Cancer-du-cavum-2025-QCMed.md"

# Image summaries handed to the converter together with the document.
image_summaries = get_image_summaries()

# Run the conversion; the returned Markdown text is what the chunking step consumes.
markdown_text = processor.convert_to_markdown(
    pdf_path=pdf_path, output_path=output_path, image_summaries=image_summaries
)

# TODO: this part still needs some touch-ups.


Resolved path: /Users/ahmedamdouni/sss/multimodal_rag_QCMed_demo/data/pdfs/cours_1.pdf
Exists: True
Number of pages: 44
Saved markdown to: ../data/processed/cours_1.md


## 2. Chunking

In [None]:
# Chunker configured with a chunk size of 1024 and an overlap of 100
# (units are defined by TextChunker — presumably characters or tokens; confirm).
chunker = TextChunker(chunk_size=1024, chunk_overlap=100)

# Split the converted Markdown into chunks.
chunks = chunker.chunk_text(markdown_text)

# Quick sanity check: chunk count plus the first 300 characters of chunk 0.
print(f"Number of chunks: {len(chunks)}")
print(f"\nFirst chunk preview:\n{chunks[0][:300]}...")

# Statistics reported by the chunker for the same text.
stats = chunker.get_stats(markdown_text)
print(f"\nChunking stats: {stats}")

## 3. Embeddings

In [None]:
# Embedding model wrapper; texts are processed with batch_size=32.
embedder = Embedder(model_name="nomic-ai/nomic-embed-text-v1.5", batch_size=32)

# One embedding per chunk, with a progress display while encoding.
embeddings = embedder.embed_texts(chunks, show_progress=True)

# Report how many vectors came back and the length of the first one.
print(f"\nNumber of embeddings: {len(embeddings)}")
print(f"Embedding dimension: {len(embeddings[0])}")

## 4. Index into ChromaDB

In [None]:
# Vector store persisted under ../chroma_db.
# NOTE(review): reset_collection=True presumably discards any existing
# "cours_residanat" collection on every run — confirm in ChromaVectorStore.
vector_store = ChromaVectorStore(
    collection_name="cours_residanat", persist_directory="../chroma_db", reset_collection=True
)

# Index the chunks alongside their embeddings.
vector_store.add_documents(texts=chunks, embeddings=embeddings)

# NOTE: this rebinds `stats`, which the chunking cell also assigns.
stats = vector_store.get_stats()
print(f"\nVector store stats: {stats}")

## 5. Test RAG

In [None]:
# Sample questions keyed by consecutive integers starting at 1.
# NOTE(review): these questions are about infective endocarditis, whereas the
# file converted earlier in the notebook is named "Cancer-du-cavum" — confirm
# which document they are meant to be asked against.
test_questions = dict(
    enumerate(
        [
            "Quels sont les critères majeurs de Duke pour l’endocardite infectieuse ?",
            "Quelles sont les complications neurologiques de l’endocardite infectieuse ?",
            "Qu’est-ce qu’une endocardite infectieuse en termes simples ?",
            "Pourquoi les toxicomanes IV sont-ils à risque d’endocardite ?",
            "Quelle est la posologie de la vancomycine dans ce document ?",
        ],
        start=1,
    )
)

In [None]:
# Build the RAG engine from the vector store and embedder created in the
# earlier cells.
# Fix: the comma separating `llm_model` from `top_k` used to sit inside the
# trailing comment, which made this call a SyntaxError.
rag = RAGEngine(
    vector_store=vector_store,
    embedder=embedder,
    llm_model="llama3:latest",  # local model for now
    top_k=3,  # presumably the number of chunks retrieved per query — confirm in RAGEngine
)

In [None]:
# Single-question smoke test: ask the first sample question and request the
# retrieved context along with the answer.
query1 = test_questions[1]
result1 = rag.query(query1, return_context=True)

print(f"Query: {query1}\n")
print(f"Answer:\n{result1['answer']}\n")

# List each retrieved document with its distance and a 300-character preview.
print("Retrieved Documents:\n")
for rank, hit in enumerate(result1["retrieved_docs"], start=1):
    print(f"Document {rank} (distance: {hit['distance']:.4f})")
    print(f"{hit['text'][:300]}...\n")


## 6. Streaming responses

In [None]:
# Ask the first sample question again, this time printing the answer
# piece by piece as the engine yields it.
query = test_questions[1]

print(f"Query: {query}\n")
print("Streaming answer:")

# end="" keeps the pieces on one line; flush=True shows each one immediately.
for piece in rag.stream_query(query):
    print(piece, end="", flush=True)

In [None]:
# Optional interactive session. It is disabled by default so that
# "Restart Kernel -> Run All" never blocks waiting on input(); set
# RUN_INTERACTIVE = True to query the engine by hand.
RUN_INTERACTIVE = False

while RUN_INTERACTIVE:
    try:
        query = input("\nEnter your query (or 'quit' to exit): ").strip()
    except (EOFError, KeyboardInterrupt):
        # stdin was closed or the cell was interrupted: leave the loop quietly.
        break

    if query.lower() in {"quit", "exit", "q"}:
        break
    if not query:
        # Nothing was typed; prompt again instead of sending an empty query.
        continue

    result = rag.query(query)
    print(f"\nAnswer:\n{result['answer']}")