# Working RAG Pipeline


In [1]:
import sys
from pathlib import Path

project_root = str(Path.cwd().parent.parent)
if project_root not in sys.path:
    sys.path.insert(0, project_root)

In [None]:
from abc import ABC, abstractmethod
from typing import List
import ollama
from src.utils.config import LLMConfig, settings
from src.shared.models import SearchResult
from src.utils.logger import logger

## Ollama Manager

In [None]:
class OllamaManager:
    @staticmethod
    def ensure_ready(model_name: str = "llama3.2"):
        try:
            models = ollama.list()
            if not any(model_name in m['name'] for m in models['models']):
                logger.info(f" Downloading {model_name}...")
                ollama.pull(model_name)
            logger.info(f" {model_name} ready")
            return True
        except Exception as e:
            logger.info(f"  Ollama not running!")
            logger.info(f"   Run: ollama serve")
            logger.info(f"   Then: ollama pull {model_name}")
            return False

## Generator

In [4]:
class BaseGenerator(ABC):
    @abstractmethod
    def generate(self, prompt: str) -> str:
        pass

class OllamaGenerator(BaseGenerator):
    def __init__(self, config: LLMConfig, auto_setup: bool = True):
        self.model = config.model_name
        self.temperature = config.temperature
        if auto_setup:
            OllamaManager.ensure_ready(self.model)
    
    def generate(self, prompt: str) -> str:
        try:
            response = ollama.chat(
                model=self.model,
                messages=[{"role": "user", "content": prompt}],
                options={"temperature": self.temperature}
            )
            return response['message']['content']
        except Exception as e:
            return f"Error: {e}"

## Query Constructor

In [5]:
class QueryConstructor(ABC):
    @abstractmethod
    def refine_query(self, query: str) -> list[str]:
        pass

class MultiQueryConstructor(QueryConstructor):
    def __init__(self, generator: BaseGenerator) -> None:
        self.generator = generator
        self.template = """You are a helpful assistant that generates multiple sub-questions related to an input question. \n
The goal is to break down the input into a set of sub-problems / sub-questions that can be answers in isolation. \n
Generate multiple search queries related to: {question} \n
Output (3 queries), one per line:"""

    def refine_query(self, query: str) -> list[str]:
        prompt = self.template.format(question=query)
        response = self.generator.generate(prompt)
        queries = [q.strip() for q in response.split('\n') if q.strip() and len(q.strip()) > 10]
        return [query] + queries[:3]

## Answer Generator

In [6]:
class BaseQueryAnswerer(ABC):
    @abstractmethod
    def answer(self, result_search: List[SearchResult], query: str) -> str:
        pass

class QueryAnswerer(BaseQueryAnswerer):
    def __init__(self, generator: BaseGenerator) -> None:
        self.generator = generator
        self.template = """Answer this question using only the context below.

Context:
{context}

Question: {question}

Answer:"""
    
    def answer(self, result_search: List[SearchResult], query: str) -> str:
        if not result_search:
            return "No relevant documents found."
        
        context_parts = [f"[{i}] {r.content}" for i, r in enumerate(result_search, 1)]
        context = "\n\n".join(context_parts)
        print(context)
        prompt = self.template.format(context=context, question=query)
        return self.generator.generate(prompt).strip()

## RAG Pipeline (SIMPLE VERSION)

In [7]:
from src.ingestion.vector_store.stores import ChromaStore

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
class SimpleRAGPipeline:
    def __init__(self, vector_store: ChromaStore, answerer: BaseQueryAnswerer):
        self.vector_store = vector_store
        self.answerer = answerer
    
    def query(self, user_query: str, top_k: int = 5) -> str:
        print(f"üîç Searching for: {user_query}")
        
        results = self.vector_store.query([user_query], n_result=top_k)
        print(results)
        print(type(results))
        print(type(results[0]))
        
        print(f" Found {len(results)} results")
        
        answer = self.answerer.answer(results, user_query)
        return answer


class MultiQueryRAGPipeline:
    def __init__(
        self,
        query_constructor: QueryConstructor,
        vector_store: ChromaStore, 
        answerer: BaseQueryAnswerer
    ):
        self.query_constructor = query_constructor
        self.vector_store = vector_store
        self.answerer = answerer
    
    def query(self, user_query: str, top_k: int = 10) -> str:
        queries = self.query_constructor.refine_query(user_query)
        print(f" Using {len(queries)} query variations")
        print(queries)
        
        results = self.vector_store.query(queries, n_result=top_k)
        
        print(f" Found {len(results)} total results")
        
        top_results = results[:top_k]
        
        answer = self.answerer.answer(top_results, user_query)
        return answer

## Test Simple Version First

In [9]:
from src.ingestion.parsers.get_parser import get_parser

In [10]:
parser = get_parser()

In [11]:
pdf_path = Path("../../data/Word2Vec.pdf")

import os

print(f"File exists: {os.path.exists(pdf_path)}")

File exists: True


In [12]:
from src.ingestion.chunking.get_chunker import get_chunker

In [13]:
chunker = get_chunker()

In [14]:
parsed_doc = parser.parse(pdf_path=pdf_path)
chunked_doc = chunker.chunk(parsed_doc)

[32m2026-01-31 14:13:36[0m | [1mINFO    [0m | [36msrc.ingestion.parsers.parsers[0m:[36mparse[0m:[36m22[0m - [1mStarting to parse PDF: ../../data/Word2Vec.pdf[0m
[32m2026-01-31 14:13:46[0m | [1mINFO    [0m | [36msrc.ingestion.parsers.parsers[0m:[36mparse[0m:[36m42[0m - [1mDocument converted successfully: 12 pages[0m
[32m2026-01-31 14:13:46[0m | [1mINFO    [0m | [36msrc.ingestion.parsers.parsers[0m:[36mparse[0m:[36m67[0m - [1mStructure extracted: 23 chapters[0m
[32m2026-01-31 14:13:46[0m | [32m[1mSUCCESS [0m | [36msrc.ingestion.parsers.parsers[0m:[36mparse[0m:[36m69[0m - [32m[1mSuccessfully parsed Word2Vec.pdf[0m


In [15]:
from src.ingestion.embedding.get_embbedder import get_embedder

In [16]:
embedder = get_embedder()

[32m2026-01-31 14:13:46[0m | [1mINFO    [0m | [36msrc.ingestion.embedding.embedder[0m:[36m__init__[0m:[36m19[0m - [1mLoading SentenceTransformer model: all-MiniLM-L6-v2[0m


[32m2026-01-31 14:13:50[0m | [1mINFO    [0m | [36msrc.ingestion.embedding.embedder[0m:[36m__init__[0m:[36m27[0m - [1mModel loaded: 384d on cpu[0m


In [17]:
embeddings = embedder.embed_chunk(chunks=chunked_doc)

[32m2026-01-31 14:13:50[0m | [1mINFO    [0m | [36msrc.ingestion.embedding.base_embed[0m:[36membed_chunk[0m:[36m62[0m - [1mSuccessfully embedded 33 chunks[0m


In [18]:
from src.utils.config import settings

print(" Initializing...")

# Create components
generator = OllamaGenerator(settings.llm, auto_setup=True)
answerer = QueryAnswerer(generator)
vector_store = ChromaStore(settings.vector_store)

# IMPORTANT: Clear old data that has duplicate references
print("üßπ Clearing old data from vector store...")
vector_store.clear()

# Now ingest with the new deduplicated content
vector_store.ingest(embch=embeddings)
# Simple pipeline (no query enhancement)
simple_rag = SimpleRAGPipeline(
    vector_store=vector_store,
    answerer=answerer
)

print(" Simple RAG ready!")

[32m2026-01-31 14:13:50[0m | [1mINFO    [0m | [36msrc.ingestion.vector_store.stores[0m:[36m__init__[0m:[36m24[0m - [1mcreating or getting the collection[0m
[32m2026-01-31 14:13:50[0m | [1mINFO    [0m | [36msrc.ingestion.vector_store.stores[0m:[36m__init__[0m:[36m30[0m - [1mgetting the embedder[0m
[32m2026-01-31 14:13:50[0m | [1mINFO    [0m | [36msrc.ingestion.embedding.embedder[0m:[36m__init__[0m:[36m19[0m - [1mLoading SentenceTransformer model: all-MiniLM-L6-v2[0m


 Initializing...
  Ollama not running!
   Run: ollama serve
   Then: ollama pull llama3.2


[32m2026-01-31 14:13:55[0m | [1mINFO    [0m | [36msrc.ingestion.embedding.embedder[0m:[36m__init__[0m:[36m27[0m - [1mModel loaded: 384d on cpu[0m
[32m2026-01-31 14:13:55[0m | [1mINFO    [0m | [36msrc.ingestion.vector_store.stores[0m:[36mclear[0m:[36m100[0m - [1mClearing collection 'technical_books'[0m


üßπ Clearing old data from vector store...


[32m2026-01-31 14:13:56[0m | [1mINFO    [0m | [36msrc.ingestion.vector_store.stores[0m:[36mclear[0m:[36m105[0m - [1mCleared 396 documents from collection[0m
[32m2026-01-31 14:13:56[0m | [1mINFO    [0m | [36msrc.ingestion.vector_store.stores[0m:[36mingest[0m:[36m42[0m - [1madding chunks to the collection[0m


 Simple RAG ready!


In [19]:
vector_store.count()

33

In [20]:
# Test simple version
print("\n" + "="*60)
print("SIMPLE RAG TEST")
print("="*60)

answer = simple_rag.query("What is Word2Vec?", top_k=3)

print("\nüìù Answer:")
print(answer)

[32m2026-01-31 14:13:56[0m | [1mINFO    [0m | [36msrc.ingestion.vector_store.stores[0m:[36mquery[0m:[36m58[0m - [1mquerying the results[0m


[32m2026-01-31 14:13:56[0m | [1mINFO    [0m | [36msrc.ingestion.vector_store.stores[0m:[36mquery[0m:[36m94[0m - [1mfinished the querying - found 3 unique results[0m



SIMPLE RAG TEST
üîç Searching for: What is Word2Vec?
[SearchResult(content="## 1.1 Goals of the Paper\n\nThe main goal of this paper is to introduce techniques that can be used for learning high-quality word vectors from huge data sets with billions of words, and with millions of words in the vocabulary. As far as we know, none of the previously proposed architectures has been successfully trained on more\n\n\n\nthan a few hundred of millions of words, with a modest dimensionality of the word vectors between 50 - 100.\n\nWe use recently proposed techniques for measuring the quality of the resulting vector representations, with the expectation that not only will similar words tend to be close to each other, but that words can have multiple degrees of similarity [20]. This has been observed earlier in the context of inflectional languages - for example, nouns can have multiple word endings, and if we search for similar words in a subspace of the original vector space, it is possible to

## Test Multi-Query Version

In [21]:
# Multi-query pipeline (with query enhancement)
query_constructor = MultiQueryConstructor(generator)

multi_rag = MultiQueryRAGPipeline(
    query_constructor=query_constructor,
    vector_store=vector_store,
    answerer=answerer
)

print("‚úÖ Multi-query RAG ready!")

‚úÖ Multi-query RAG ready!


In [22]:
# Test multi-query version
print("\n" + "="*60)
print("MULTI-QUERY RAG TEST")
print("="*60)

answer = multi_rag.query("how are the embeddings are constructed ?", top_k=5)

print("\nüìù Answer:")
print(answer)


MULTI-QUERY RAG TEST


[32m2026-01-31 14:14:00[0m | [1mINFO    [0m | [36msrc.ingestion.vector_store.stores[0m:[36mquery[0m:[36m58[0m - [1mquerying the results[0m
[32m2026-01-31 14:14:00[0m | [1mINFO    [0m | [36msrc.ingestion.vector_store.stores[0m:[36mquery[0m:[36m94[0m - [1mfinished the querying - found 12 unique results[0m


 Using 4 query variations
['how are the embeddings are constructed ?', 'Here are three potential search queries related to how embeddings are constructed:', '1. "How are word embeddings like Word2Vec and GloVe created?"', '2. "What is the process for constructing sentence embeddings using techniques like BERT and Sentence-BERT?"']
 Found 12 total results
[1] ## 7 Follow-Up Work

After the initial version of this paper was written, we published single-machine multi-threaded C++ code for computing the word vectors, using both the continuous bag-of-words and skip-gram architectures 4 . The training speed is significantly higher than reported earlier in this paper, i.e. it is in the order of billions of words per hour for typical hyperparameter choices. We also published more than 1.4 million vectors that represent named entities, trained on more than 100 billion words. Some of our follow-up work will be published in an upcoming NIPS 2013 paper [21].



[2] ## 3.1 Continuous Bag-of-Words M

## Debug: Check what's in the vector store

In [23]:
print(f"Vector store has {vector_store.count()} documents")

# Try a simple search
test_results = vector_store.query(["word embedding"], n_result=2)
print(f"\nFound {len(test_results)} results for 'word embedding'")

if test_results:
    print("\nFirst result preview:")
    print(test_results[0].content[:200] + "...")

[32m2026-01-31 14:14:01[0m | [1mINFO    [0m | [36msrc.ingestion.vector_store.stores[0m:[36mquery[0m:[36m58[0m - [1mquerying the results[0m


[32m2026-01-31 14:14:01[0m | [1mINFO    [0m | [36msrc.ingestion.vector_store.stores[0m:[36mquery[0m:[36m94[0m - [1mfinished the querying - found 2 unique results[0m


Vector store has 33 documents

Found 2 results for 'word embedding'

First result preview:
## Efficient Estimation of Word Representations in Vector Space

...
