# Complete RAG Pipeline with Ollama
## Fixed and Ready to Use!

In [1]:
import sys
from pathlib import Path

# Setup project path
project_root = str(Path.cwd().parent.parent)
if project_root not in sys.path:
    sys.path.insert(0, project_root)

In [2]:
from abc import ABC, abstractmethod
from typing import List
import ollama
from src.utils.config import LLMConfig, get_config
from src.shared.models import SearchResult

## 1. Setup Ollama Manager

In [3]:
class OllamaManager:
    @staticmethod
    def ensure_ready(model_name: str = "llama3.2"):
        """Ensure Ollama server is running and model is available"""
        try:
            # Check if server is running
            models = ollama.list()
            
            # Check if model exists
            if not any(model_name in m['name'] for m in models['models']):
                print(f"üì• Downloading {model_name} (this may take a few minutes)...")
                ollama.pull(model_name)
                print(f"‚úÖ Model {model_name} downloaded successfully")
            else:
                print(f"‚úÖ Model {model_name} is ready")
            
            return True
            
        except Exception as e:
            print(f"‚ö†Ô∏è  Ollama not running!")
            print(f"   Please run in a terminal: ollama serve")
            print(f"   Then pull the model: ollama pull {model_name}")
            return False

## 2. Generator Classes

In [4]:
class BaseGenerator(ABC):
    @abstractmethod
    def generate(self, prompt: str) -> str:
        """Generate text from prompt"""
        pass


class OllamaGenerator(BaseGenerator):
    def __init__(self, config: LLMConfig, auto_setup: bool = True):
        self.model = config.model_name
        self.temperature = config.temperature
        
        if auto_setup:
            OllamaManager.ensure_ready(self.model)
    
    def generate(self, prompt: str) -> str:
        try:
            response = ollama.chat(
                model=self.model,
                messages=[{"role": "user", "content": prompt}],
                options={"temperature": self.temperature}
            )
            return response['message']['content']
        except Exception as e:
            return f"‚ùå Error: {e}\nIs Ollama running? Run 'ollama serve'"

## 3. Query Constructor

In [5]:
class QueryConstructor(ABC):
    @abstractmethod
    def refine_query(self, query: str) -> list[str]:
        pass


class MultiQueryConstructor(QueryConstructor):
    def __init__(self, generator: BaseGenerator) -> None:
        super().__init__()
        self.generator = generator
        self.template = """You are an AI language model assistant. Your task is to generate five 
different versions of the given user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines,And give just the questions nothing else. Original question: {question}"""

    def refine_query(self, query: str) -> list[str]:
        prompt = self.template.format(question=query)
        response = self.generator.generate(prompt)
        
        # Parse response into individual queries
        queries = [q.strip() for q in response.split('\n') if q.strip()]
        
        # Include original query
        return [query] + queries

## 4. Answer Generator

In [6]:
class BaseQueryAnswerer(ABC):
    @abstractmethod
    def answer(self, result_search: List[SearchResult], query: str) -> str:
        pass


class QueryAnswerer(BaseQueryAnswerer):
    def __init__(self, generator: BaseGenerator) -> None:
        super().__init__()
        self.generator = generator
        self.template = """You are a helpful assistant answering questions based on provided context.

Context:
{context}

Question: {question}

Instructions:
- Answer based only on the provided context
- If the context doesn't contain enough information, say so
- Be concise but complete

Answer:"""
    
    def answer(self, result_search: List[SearchResult], query: str) -> str:
        if not result_search:
            return "No relevant documents found to answer this question."
        
        # Build context from search results
        context_parts = []
        for i, result in enumerate(result_search, 1):
            context_parts.append(f"[Document {i}]\n{result.content}")
        
        context = "\n\n".join(context_parts)
        
        # Generate answer
        prompt = self.template.format(context=context, question=query)
        answer = self.generator.generate(prompt)
        
        return answer.strip()

## 5. RAG Pipeline (FIXED)

In [7]:
from src.ingestion.vector_store.stores import ChromaStore

class RAGPipeline:
    def __init__(
        self,
        query_constructor: QueryConstructor,
        vector_store: ChromaStore, 
        answerer: BaseQueryAnswerer
    ):
        self.query_constructor = query_constructor
        self.vector_store = vector_store
        self.answerer = answerer
    
    def query(self, user_query: str, top_k: int = 5) -> str:
        # Get enhanced queries
        enhanced_queries = self.query_constructor.refine_query(user_query)
        print(f"üîç Generated {len(enhanced_queries)} query variations")
        print(f"the queries are {enhanced_queries}")
        
        # Query returns List[List[SearchResult]] - one list per query
        results_nested = self.vector_store.query_flattened(enhanced_queries, n_result=top_k)
        
        # Flatten the nested list (THIS WAS THE BUG!)
        all_results = []
        for query_results in results_nested:
            all_results.extend(query_results)
        
        print(f"üìÑ Retrieved {len(all_results)} total results")
        print(results_nested)
        
        # Take top_k results
        top_results = all_results[:top_k]
        
        # Generate answer
        print(f"üí≠ Generating answer...")
        answer = self.answerer.answer(top_results, user_query)
        
        return answer

  from .autonotebook import tqdm as notebook_tqdm


## 6. Test the Complete System

In [8]:
# First, update your config to use Ollama
# You need to manually edit config/config.yaml and change:
# llm:
#   provider: ollama
#   model_name: llama3.2
#   base_url: http://localhost:11434
#   temperature: 0.7

print("‚ö†Ô∏è  IMPORTANT: Make sure you've updated config/config.yaml to use Ollama!")
print("    See the cell above for the required changes.")
print("\nüìã Loading configuration...")

‚ö†Ô∏è  IMPORTANT: Make sure you've updated config/config.yaml to use Ollama!
    See the cell above for the required changes.

üìã Loading configuration...


In [9]:
from src.utils.config import settings
print(settings.llm)
# Create generator
print("ü§ñ Initializing Ollama generator...")
generator = OllamaGenerator(settings.llm, auto_setup=True)

# Create components
print("üîß Setting up RAG components...")
query_constructor = MultiQueryConstructor(generator)
answerer = QueryAnswerer(generator)
vector_store = ChromaStore(settings.vector_store)

# Create pipeline
rag = RAGPipeline(
    query_constructor=query_constructor,
    vector_store=vector_store,
    answerer=answerer
)

print("‚úÖ RAG pipeline ready!")

[32m2026-01-29 22:59:20[0m | [1mINFO    [0m | [36msrc.ingestion.vector_store.stores[0m:[36m__init__[0m:[36m19[0m - [1mcreating or getting the collection[0m
[32m2026-01-29 22:59:20[0m | [1mINFO    [0m | [36msrc.ingestion.vector_store.stores[0m:[36m__init__[0m:[36m25[0m - [1mgetting the embedder[0m
[32m2026-01-29 22:59:20[0m | [1mINFO    [0m | [36msrc.ingestion.embedding.embedder[0m:[36m__init__[0m:[36m19[0m - [1mLoading SentenceTransformer model: all-MiniLM-L6-v2[0m


provider='ollama' model_name='llama3.2' api_key=None base_url=None temperature=0.1
ü§ñ Initializing Ollama generator...
‚ö†Ô∏è  Ollama not running!
   Please run in a terminal: ollama serve
   Then pull the model: ollama pull llama3.2
üîß Setting up RAG components...


[32m2026-01-29 22:59:27[0m | [1mINFO    [0m | [36msrc.ingestion.embedding.embedder[0m:[36m__init__[0m:[36m27[0m - [1mModel loaded: 384d on cpu[0m


‚úÖ RAG pipeline ready!


In [10]:
# Test it!
print("\n" + "="*60)
print("Testing RAG Pipeline")
print("="*60 + "\n")

answer = rag.query("Talk about Word to vector implementation ?", top_k=5)

print("\n" + "="*60)
print("ANSWER:")
print("="*60)
print(answer)


Testing RAG Pipeline



[32m2026-01-29 22:59:29[0m | [1mINFO    [0m | [36msrc.ingestion.vector_store.stores[0m:[36mquery_flattened[0m:[36m87[0m - [1mquerying the results[0m
[32m2026-01-29 22:59:29[0m | [1mINFO    [0m | [36msrc.ingestion.vector_store.stores[0m:[36mquery_flattened[0m:[36m111[0m - [1mfinished the querying[0m


üîç Generated 6 query variations
the queries are ['Talk about Word to vector implementation ?', 'How does Word2Vec work in a vector database?', 'What are the key concepts and techniques behind Word2Vec?', 'Can you explain the Word2Vec algorithm and its applications in natural language processing?', 'How does Word2Vec handle out-of-vocabulary words in a vector database?', 'What are some common use cases for Word2Vec in text analysis and information retrieval?']
üìÑ Retrieved 0 total results
[]
üí≠ Generating answer...

ANSWER:
No relevant documents found to answer this question.


## 7. Test Simple Query (No RAG)

In [11]:
# Test generator directly
simple_response = generator.generate("Explain what RAG is in one sentence.")
print("Simple query test:")
print(simple_response)

Simple query test:
RAG (Rescue and Adoption Group) typically refers to a non-profit organization that specializes in rescuing and rehoming animals, often with a focus on specific breeds or types of pets.
