# 🚀 Start Project From Here

Welcome! If you have downloaded this project from GitHub, please follow these steps to get started:

1. **Read the `README.md` file** for setup instructions and environment requirements.  
2. **Download all required models and data** as described in the README.  
3. **Run this Jupyter Notebook** starting from this cell to initialize and use the project.  
4. All code and pipeline steps are organized below for easy execution.

> **Note:** Make sure you have installed all dependencies and placed the necessary files in the correct directories as per the instructions.

Happy coding!


In [None]:
# Run this first to download all the required libraries

%pip install numpy transformers huggingface_hub numpy sentence-transformers

%pip install faiss-cpu

%pip install llama-cpp-python

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip




### Downloading the Model: "intfloat__e5-base"

In [None]:
from transformers import AutoTokenizer, AutoModel
import os
import shutil

from huggingface_hub import snapshot_download

# Where the local copy of the embedding model should end up
local_model_dir = os.path.join("models", "intfloat__e5-base")

# Fetch the repository snapshot from Hugging Face; the returned path is used as the copy source
print("📥 Downloading model...")
downloaded_path = snapshot_download(repo_id="intfloat/e5-base")

# Copy only when the target path does not exist yet; an existing path is left untouched
if not os.path.exists(local_model_dir):
    print(f"📦 Copying model to {local_model_dir} ...")
    shutil.copytree(src=downloaded_path, dst=local_model_dir)

print(f"✅ Model is saved in {local_model_dir}")


### Go to this link and download the LLM model: Llama-3.2-1B-Instruct-Q4_K_M.gguf

### LINK: https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/blob/main/Llama-3.2-1B-Instruct-Q4_K_M.gguf

### Click on download and store the file in the models directory (the location must match `llm_model_path` in `RAGConfig`)

### --------------------------------------------------------------------------------------------------------------
### --------------------------------------------------------------------------------------------------------------


# Load the Models Once In Memory. 
## This is done so you don't have to load them every time.

### --------------------------------------------------------------------------------------------------------------
### --------------------------------------------------------------------------------------------------------------



In [None]:
import json
import numpy as np
import faiss
import time
from typing import List, Dict, Any, Tuple
from sentence_transformers import SentenceTransformer
import pickle
import os
from dataclasses import dataclass
from pathlib import Path

# Try to import llama-cpp-python
try:
    from llama_cpp import Llama
    LLAMA_CPP_AVAILABLE = True
except ImportError:
    LLAMA_CPP_AVAILABLE = False
    print("Warning: llama-cpp-python not installed. LLM generation will be simulated.")
    print("         Install with: pip install llama-cpp-python (or llama-cpp-python[cuda] for GPU)")

# Configuration for local setup
@dataclass
class RAGConfig:
    """Configuration values consumed by the retriever, the LLM generator and the pipeline.

    Every field has a default, so ``RAGConfig()`` can be built without arguments.
    """
    # Paths (relative to where you run the script, assuming standard project structure)
    # These should point to the output of your Step 4 (FAISS Vector Store Creation)
    faiss_index_path: str = "faiss_vector_store/faiss_index.index"
    chunk_metadata_path: str = "faiss_vector_store/chunk_metadata.json"
    # Optional file: the retriever only prints a warning when it is missing
    vector_store_metadata_path: str = "faiss_vector_store/vector_store_metadata.json"
    
    # Model paths
    # SentenceTransformer will automatically download 'intfloat/e5-base' to your local cache
    embedding_model_name: str = "intfloat/e5-base"
    # LLaMA 3.2-1B GGUF model path: You need to download this model manually
    # (e.g., from Hugging Face Hub) and place it in your local 'models/' folder.
    # NOTE(review): the download instructions in this notebook name the file
    # 'Llama-3.2-1B-Instruct-Q4_K_M.gguf'; on a case-sensitive filesystem this lowercase
    # default would not match that name - confirm and adjust.
    llm_model_path: str = "models/llama-3.2-1b-instruct-q4_k_m.gguf"  # IMPORTANT: Update this path!
    
    # Retrieval parameters
    top_k_dense: int = 3  # Number of chunks requested from the FAISS search
    similarity_threshold: float = 0.4  # Minimum similarity score (adjust if too many irrelevant chunks or too few relevant)
    
    # LLM parameters
    max_tokens: int = 1200  # Max tokens for LLM response
    temperature: float = 0.6
    top_p: float = 0.85       # Focused but not too restrictive
    context_length: int = 2800  # Passed as n_ctx when the LLM is loaded (ensure it matches the model's actual capacity)

class DenseRetriever:
    """Dense retrieval over a FAISS index using a sentence-transformer encoder.

    The object is cheap to construct; all heavy resources (embedding model,
    FAISS index, chunk metadata) are loaded by :meth:`initialize`, which must
    be called before :meth:`retrieve`.
    """

    def __init__(self, config: "RAGConfig"):
        """Store the configuration; nothing is loaded until ``initialize()``."""
        self.config = config
        self.embedding_model = None
        self.faiss_index = None
        self.chunk_metadata = None
        self.vector_store_metadata = None

    def initialize(self):
        """Load the embedding model, the FAISS index and the metadata files.

        Raises:
            FileNotFoundError: if the FAISS index or the chunk metadata file
                does not exist. A missing vector-store metadata file is only
                reported and replaced by an empty dict.
        """
        print("\n" + "="*80)
        print("🔄 INITIALIZING DENSE RETRIEVER")
        print("="*80)

        # Load embedding model
        print("📥 Loading embedding model (this may take a moment, downloads if not cached)...")
        start_time = time.perf_counter()
        # SentenceTransformer manages model download and loading
        self.embedding_model = SentenceTransformer(self.config.embedding_model_name)
        print(f"✅ Embedding model loaded in {time.perf_counter() - start_time:.2f} seconds")

        # Load FAISS index
        print("📥 Loading FAISS index...")
        start_time = time.perf_counter()
        if not os.path.exists(self.config.faiss_index_path):
            raise FileNotFoundError(f"FAISS index not found at: {self.config.faiss_index_path}\n"
                                    "Please check your RAGConfig.faiss_index_path and ensure "
                                    "you've downloaded 'faiss_vector_store' folder from Drive.")

        self.faiss_index = faiss.read_index(self.config.faiss_index_path)
        print(f"✅ FAISS index loaded in {time.perf_counter() - start_time:.2f} seconds")
        print(f"📊 Index contains {self.faiss_index.ntotal} vectors")

        # Load chunk metadata
        print("📥 Loading chunk metadata...")
        if not os.path.exists(self.config.chunk_metadata_path):
            raise FileNotFoundError(f"Chunk metadata not found at: {self.config.chunk_metadata_path}\n"
                                    "Please check your RAGConfig.chunk_metadata_path and ensure "
                                    "you've downloaded 'faiss_vector_store' folder from Drive.")
        with open(self.config.chunk_metadata_path, 'r', encoding='utf-8') as f:
            self.chunk_metadata = json.load(f)

        # Load vector store metadata (optional, but good for tracking)
        print("📥 Loading vector store metadata...")
        if not os.path.exists(self.config.vector_store_metadata_path):
            print(f"⚠️ Vector store metadata not found at: {self.config.vector_store_metadata_path}\n"
                  "Proceeding without it, but consider checking the path.")
            self.vector_store_metadata = {}
        else:
            with open(self.config.vector_store_metadata_path, 'r', encoding='utf-8') as f:
                self.vector_store_metadata = json.load(f)

        print(f"✅ Loaded metadata for {len(self.chunk_metadata)} chunks")
        print("🚀 Dense Retriever initialized successfully!")

    def embed_query(self, query: str) -> np.ndarray:
        """Encode ``query`` and return a single float32 embedding vector.

        The text is prefixed with ``"query: "`` before encoding, and the
        encoder is asked for normalized embeddings.
        """
        # Add query prefix for optimal e5 performance as per model's guidelines
        prefixed_query = f"query: {query}"

        start_time = time.perf_counter()
        # Ensure embedding is float32 as FAISS expects this
        embedding = self.embedding_model.encode([prefixed_query], normalize_embeddings=True).astype(np.float32)
        embed_time_ms = (time.perf_counter() - start_time) * 1000  # Convert to milliseconds

        print(f"⚡ Query embedded in {embed_time_ms:.1f}ms")
        return embedding[0]  # Only one text was encoded, so return its vector

    def search_similar_chunks(self, query_embedding: np.ndarray) -> List[Dict[str, Any]]:
        """Search the FAISS index and return metadata of the matching chunks.

        Args:
            query_embedding: 1-D query vector; it is reshaped to ``(1, dim)``
                for the search call.

        Returns:
            Copies of the chunk-metadata entries whose score is at least
            ``config.similarity_threshold``, each extended with
            ``similarity_score`` and ``retrieval_rank`` (1-based position in
            the search output). Iteration stops at the first score below the
            threshold.
        """
        start_time = time.perf_counter()

        # query_embedding must be 2D array: (1, embedding_dim)
        scores, indices = self.faiss_index.search(
            query_embedding.reshape(1, -1),
            self.config.top_k_dense,  # Retrieve top_k_dense chunks
        )

        search_time_ms = (time.perf_counter() - start_time) * 1000  # Convert to milliseconds
        print(f"🔍 FAISS search completed in {search_time_ms:.1f}ms")

        threshold = self.config.similarity_threshold
        chunk_count = len(self.chunk_metadata)

        results = []
        for rank, (score, idx) in enumerate(zip(scores[0], indices[0]), start=1):
            # Results are expected in descending score order, so the first
            # score below the threshold ends the scan.
            if score < threshold:
                break
            # FAISS pads missing neighbours with the label -1. A negative
            # label must never be used as a list index, because Python would
            # silently return an entry counted from the end of the metadata.
            # The explicit score comparison also keeps NaN scores out.
            if score >= threshold and 0 <= idx < chunk_count:
                chunk_data = self.chunk_metadata[int(idx)].copy()
                chunk_data['similarity_score'] = float(score)
                chunk_data['retrieval_rank'] = rank  # Rank from FAISS search
                results.append(chunk_data)

        print(f"📋 Retrieved {len(results)} relevant chunks (min threshold: {self.config.similarity_threshold:.2f})")
        return results

    def retrieve(self, query: str) -> List[Dict[str, Any]]:
        """Embed ``query``, search the index and print a summary of the hits.

        Returns:
            The list produced by :meth:`search_similar_chunks` (possibly empty).
        """
        print("\n" + "-"*80)
        print(f"🔍 RETRIEVAL PHASE FOR QUERY: '{query[:100]}{'...' if len(query) > 100 else ''}'")
        print("-"*80)

        # Embed query
        query_embedding = self.embed_query(query)

        # Search for similar chunks
        results = self.search_similar_chunks(query_embedding)

        # Log results summary (optional, but useful for debugging retrieval)
        if results:
            print(f"\n📊 TOP {len(results)} RETRIEVED CHUNKS (Summary):")
            for result_chunk in results:
                file_name = Path(result_chunk.get('source_file', 'Unknown')).name
                print(f"  {result_chunk['retrieval_rank']}. File: {file_name} (Score: {result_chunk['similarity_score']:.4f})")
        else:
            print("\n⚠️ No chunks retrieved for this query based on set parameters.")

        return results

class LLMGenerator:
    """Answer generation with a local LLaMA 3.2-1B GGUF model via llama-cpp-python.

    When llama-cpp-python is not importable, the model file is missing, or
    loading fails, ``self.llm`` stays ``None`` and :meth:`generate_response`
    returns a clearly marked simulated answer instead.
    """

    def __init__(self, config: "RAGConfig"):
        """Store the configuration; the model is loaded by ``initialize()``."""
        self.config = config
        self.llm = None

    def initialize(self):
        """Try to load the GGUF model; fall back to simulation mode on any failure."""
        print("\n" + "="*80)
        print("🤖 INITIALIZING LLM GENERATOR")
        print("="*80)

        if not LLAMA_CPP_AVAILABLE:
            print("⚠️  llama-cpp-python not available. LLM generation will be simulated.")
            return

        if not os.path.exists(self.config.llm_model_path):
            print(f"⚠️  LLM model not found at: {self.config.llm_model_path}")
            print("📝 Please download a LLaMA 3.2-1B GGUF model and update the RAGConfig.llm_model_path.")
            print("   Using simulation mode for LLM generation.")
            return

        print("📥 Loading LLaMA 3.2-1B GGUF model (this can take several seconds)...")
        start_time = time.perf_counter()

        try:
            # n_gpu_layers=-1 attempts to offload all layers to GPU if available and llama-cpp-python[cuda] is installed
            self.llm = Llama(
                model_path=self.config.llm_model_path,
                n_ctx=self.config.context_length,
                n_threads=os.cpu_count(),  # Use all available CPU cores
                verbose=False,  # Set to True for verbose LLM loading output
                n_gpu_layers=-1,  # Try to use all GPU layers if available
            )
            print(f"✅ LLM loaded in {time.perf_counter() - start_time:.2f} seconds")
            print("🚀 LLM Generator initialized successfully!")
        except Exception as e:
            # Deliberate best-effort: a broken model must not abort the pipeline.
            print(f"❌ Error loading LLM from {self.config.llm_model_path}: {e}")
            print("   Ensure the GGUF model is valid and the path is correct.")
            print("   If using GPU, ensure llama-cpp-python[cuda] is installed and compatible drivers are present.")
            print("   Using simulation mode for LLM generation.")
            self.llm = None

    @staticmethod
    def _unique_sources(retrieved_chunks: List[Dict[str, Any]]) -> List[str]:
        """Return the sorted, de-duplicated file names of the chunks' ``source_file`` values."""
        return sorted({Path(chunk.get('source_file', 'Unknown')).name for chunk in retrieved_chunks})

    def format_prompt(self, query: str, retrieved_chunks: List[Dict[str, Any]]) -> str:
        """Build the LLaMA 3.2 Instruct prompt containing the retrieved context.

        Each chunk becomes a ``<document ...>`` element carrying its 1-based
        position, source file name and similarity score, followed by the
        user's question.
        """
        # Build context from retrieved chunks
        context_parts = []
        for position, chunk in enumerate(retrieved_chunks, start=1):
            source_file_name = Path(chunk.get('source_file', 'Unknown Document')).name
            text = chunk.get('text', '')
            score = chunk.get('similarity_score', 0.0)

            context_parts.append(
                f"<document id={position} source={source_file_name} relevance={score:.3f}>\n{text}\n</document>"
            )

        context_string = "\n".join(context_parts)

        # NOTE(review): llama-cpp-python may prepend a BOS token itself when it
        # tokenizes a completion prompt, which would duplicate the literal
        # <|begin_of_text|> below - confirm against the installed version
        # before changing the template.
        prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are an intelligent AI assistant that provides comprehensive and helpful answers. Your primary task is to answer questions using the provided context documents, but you should also apply your knowledge and reasoning to give complete, useful responses.

INSTRUCTIONS:
1. **PRIMARY SOURCE**: Use information from the provided context documents as your main source
2. **CITE SOURCES**: Always cite document sources using format [Document X: filename.ext] when referencing specific information
3. **BE COMPREHENSIVE**: If the documents provide partial information, supplement with your own knowledge while clearly distinguishing between document-based and general knowledge
4. **BE DIRECT**: Give clear, actionable answers rather than overly cautious responses
5. **NO RELEVANT DOCUMENTS**: If NO documents contain relevant information, respond with: "NO RELEVANT INFORMATION FOUND in the provided documents for this query."
6. **PARTIAL INFORMATION**: If documents contain some relevant info but not a complete answer, use what's available and supplement thoughtfully
7. **SYNTHESIZE**: Combine information from multiple documents when applicable
8. **BE HELPFUL**: Your goal is to be maximally helpful to the user while being accurate

<|eot_id|><|start_header_id|>user<|end_header_id|>

Context Documents:
{context_string}

Question: {query}

Please provide a comprehensive and helpful answer.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""
        return prompt

    def generate_response(self, query: str, retrieved_chunks: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Generate an answer for ``query`` grounded in ``retrieved_chunks``.

        Returns:
            A dict with at least ``response``, ``sources``, ``generation_time``,
            ``token_count`` and ``simulated``. ``no_context`` is set when no
            chunks were supplied, and ``error`` when the model call failed.
            Exceptions from the model call are caught and reported in the
            returned dict rather than raised.
        """
        print("\n" + "-"*80)
        print("🤖 GENERATION PHASE")
        print("-"*80)

        if not retrieved_chunks:
            print("⚠️ No relevant chunks were retrieved. Generating rigid 'no information found' response.")
            return {
                'response': "NO RELEVANT INFORMATION FOUND in the provided documents for this query.",
                'sources': [],
                'generation_time': 0,
                'token_count': 0,
                'simulated': True,
                'no_context': True
            }

        # Format prompt
        prompt = self.format_prompt(query, retrieved_chunks)

        if self.llm is None:
            # Simulation mode: no model is available, so return a placeholder answer
            print("🔄 Simulating LLM response (llama-cpp-python not loaded or model not found)...")
            time.sleep(1.5)  # Simulate processing time

            sources_list = self._unique_sources(retrieved_chunks)
            chunk_scores = [chunk.get('similarity_score', 0) for chunk in retrieved_chunks]
            lowest_score = min(chunk_scores)
            highest_score = max(chunk_scores)

            simulated_response = f"""Based on the retrieved document context, I can provide information related to your query: "{query}".

**Sources Found**: {len(retrieved_chunks)} relevant document chunks from: {', '.join(sources_list)}

**Key Information**: The documents contain relevant information with similarity scores ranging from {lowest_score:.3f} to {highest_score:.3f}.

[SIMULATED RESPONSE - Install llama-cpp-python and download the LLaMA 3.2-1B GGUF model for actual AI-generated answers]

**Answer**: [The actual LLM would provide a comprehensive answer here synthesizing the document content with additional context and reasoning]
"""
            print("✅ Simulated response generated in 1.5 seconds.")
            return {
                'response': simulated_response,
                'sources': sources_list,
                'generation_time': 1.5,
                'token_count': len(simulated_response.split()),  # Rough estimate (whitespace-separated words)
                'simulated': True
            }

        # Real LLM generation
        print("🧠 Calling LLaMA 3.2-1B for generation...")
        start_time = time.perf_counter()

        try:
            # Use 'create_completion' for simpler text generation if 'chat.completion' is not desired
            output = self.llm.create_completion(
                prompt,
                max_tokens=self.config.max_tokens,
                temperature=self.config.temperature,
                top_p=self.config.top_p,
                stop=["<|eot_id|>"],  # Stop sequence for LLaMA 3.2 instruct format
                echo=False  # Do not echo the prompt in the response
            )

            generation_time = time.perf_counter() - start_time
            response_text = output['choices'][0]['text'].strip()

            # Unique source filenames from the retrieved chunks, returned as metadata
            sources_list = self._unique_sources(retrieved_chunks)

            print(f"✅ LLM Response generated in {generation_time:.2f} seconds")
            print(f"📊 Tokens generated: {output['usage']['completion_tokens']}")

            return {
                'response': response_text,
                'sources': sources_list,
                'generation_time': generation_time,
                'token_count': output['usage']['completion_tokens'],
                'simulated': False
            }

        except Exception as e:
            # Keep the session alive: report the failure in the result dict
            print(f"❌ Error during LLM generation: {e}")
            print("   Ensure the prompt fits within context length and model is loaded correctly.")
            return {
                'response': f"An error occurred during response generation. Please check the logs. Error: {str(e)}",
                'sources': [],
                'generation_time': 0,
                'token_count': 0,
                'error': True,
                'simulated': False
            }

class RAGPipeline:
    """Complete RAG pipeline: dense retrieval followed by LLM generation."""

    def __init__(self, config: "RAGConfig | None" = None):
        """Create the retriever and generator from ``config`` (defaults to ``RAGConfig()``)."""
        self.config = config or RAGConfig()
        self.retriever = DenseRetriever(self.config)
        self.generator = LLMGenerator(self.config)

    def initialize(self):
        """Initialize the retriever first, then the generator."""
        print("\n" + "="*80)
        print("🚀 INITIALIZING COMPLETE RAG PIPELINE")
        print("="*80)
        self.retriever.initialize()
        self.generator.initialize()
        print("\n" + "="*80)
        print("✅ RAG PIPELINE READY!")
        print("="*80)

    def save_pipeline(self, filepath: str = "rag_pipeline.pkl"):
        """Pickle this pipeline object to ``filepath``.

        Failures are reported on stdout and not raised.
        """
        print(f"\n💾 Saving RAG pipeline to {filepath}...")
        try:
            with open(filepath, 'wb') as f:
                pickle.dump(self, f)
            print("✅ RAG pipeline saved successfully!")
        except Exception as e:
            print(f"❌ Error saving pipeline: {e}")

    @classmethod
    def load_pipeline(cls, filepath: str = "rag_pipeline.pkl"):
        """Unpickle a pipeline from ``filepath``.

        Returns:
            The unpickled object, or ``None`` when loading fails (the error is
            printed).
        """
        print(f"\n📥 Loading RAG pipeline from {filepath}...")
        try:
            # SECURITY: pickle.load can execute arbitrary code embedded in the
            # file. Only load pipeline files that you created yourself.
            with open(filepath, 'rb') as f:
                pipeline = pickle.load(f)
            print("✅ RAG pipeline loaded successfully!")
            return pipeline
        except Exception as e:
            print(f"❌ Error loading pipeline: {e}")
            return None

    def query(self, question: str) -> Dict[str, Any]:
        """Run retrieval and generation for ``question``.

        Returns:
            A dict with the query, the generated ``response`` and ``sources``,
            the retrieved chunks (``chunk_details``) and their count, the
            ``no_context`` flag, and wall-clock timings in seconds
            (``retrieval_time``, ``generation_time``, ``total_time``).
        """
        # perf_counter is monotonic, so the measured intervals cannot go negative
        start_time = time.perf_counter()

        # Step 5: Dense Retrieval
        retrieved_chunks = self.retriever.retrieve(question)
        retrieval_time = time.perf_counter() - start_time

        # Step 6: LLM Generation
        generation_start = time.perf_counter()
        result = self.generator.generate_response(question, retrieved_chunks)
        generation_time = time.perf_counter() - generation_start

        total_time = time.perf_counter() - start_time

        # Compile final result
        return {
            'query': question,
            'retrieved_chunks_count': len(retrieved_chunks),
            'retrieval_time': retrieval_time,
            'generation_time': generation_time,
            'total_time': total_time,
            'response': result['response'],
            'sources': result['sources'],  # Unique source filenames
            'chunk_details': retrieved_chunks,  # Full data for each retrieved chunk
            'no_context': result.get('no_context', False)  # Flag for when no relevant docs found
        }

def setup_rag_pipeline():
    """Build a pipeline with the default config, initialize it and save it.

    Returns the initialized pipeline, or None when initialization or saving
    raised (the traceback is printed in that case).
    """
    banner = "="*80
    print(banner)
    print("🚀 RAG PIPELINE SETUP AND INITIALIZATION")
    print(banner)

    # --- IMPORTANT LOCAL SETUP STEPS ---
    # 1. Install the required libraries:
    #      pip install numpy faiss-cpu sentence-transformers llama-cpp-python
    #    (llama-cpp-python[cuda] if a compatible GPU should run the LLM)
    # 2. Place the 'faiss_vector_store' folder (downloaded from your Google Drive)
    #    in the same directory as this script.
    # 3. Download a LLaMA 3.2-1B GGUF model such as
    #    'llama-3.2-1b-instruct-q4_k_m.gguf' from Hugging Face and put it in a
    #    'models/' folder next to this script:
    #      models/llama-3.2-1b-instruct-q4_k_m.gguf
    #    'llm_model_path' in RAGConfig must match this path and filename exactly.
    # ---

    # Pipeline built from the default configuration
    pipeline = RAGPipeline(RAGConfig())

    try:
        # Load all models and data, then persist the result for later sessions
        pipeline.initialize()
        pipeline.save_pipeline()

        print("\n" + banner)
        print("✅ SETUP COMPLETE!")
        print("The RAG pipeline has been initialized and saved.")
        print("You can now run the query session script.")
        print(banner)

        return pipeline

    except Exception as exc:
        print(f"\n❌ An error occurred during setup: {exc}")
        import traceback
        traceback.print_exc()
        return None

if __name__ == "__main__":
    setup_rag_pipeline()

## Now write your query; a response generally takes about 50 seconds

### Ask as many queries as you want. The output will display all the information; make sure to click the scrollable element in the output so that you are able to see every part of it,

### or copy the output of the cell and paste it into Word if you do not know what the scrollable element is.

### Write exit when you want to stop

In [None]:
import pickle
import time
from pathlib import Path
from typing import Dict, Any

def load_rag_pipeline(filepath: str = "rag_pipeline.pkl"):
    """Unpickle a previously saved RAG pipeline.

    Returns the unpickled object, or None when the file is missing or
    loading fails for any other reason (a message is printed either way).
    """
    print(f"\n📥 Loading RAG pipeline from {filepath}...")
    loaded = None
    try:
        with open(filepath, 'rb') as handle:
            loaded = pickle.load(handle)
        print("✅ RAG pipeline loaded successfully!")
    except FileNotFoundError:
        # Missing file: point the user at the setup step
        print(f"❌ Pipeline file not found at {filepath}")
        print("Please run the setup script first to initialize and save the pipeline.")
        return None
    except Exception as exc:
        print(f"❌ Error loading pipeline: {exc}")
        return None
    return loaded

def display_query_result(result: Dict[str, Any]):
    """Print a formatted report for one RAG query result.

    The report has three parts: the generated response, one entry per
    retrieved chunk (file name, category, relevance score), and the
    pipeline timings.
    """
    heavy_rule = "=" * 80
    light_rule = "-" * 70

    # Header
    print("\n" + heavy_rule)
    print(f"FINAL RAG RESPONSE FOR: '{result['query']}'")
    print(heavy_rule)

    # Generated answer
    print("\n📝 Generated Response:")
    print(light_rule)
    print(result['response'])
    print(light_rule)

    # Retrieved documents
    chunks = result['chunk_details']
    if not chunks:
        print("\n⚠️ No relevant documents were retrieved for this query.")
    else:
        print(f"\n📚 Retrieved Documents (Top {result['retrieved_chunks_count']}):")
        print(light_rule)
        for position, chunk in enumerate(chunks, start=1):
            # Show only the file name, not the full path
            source_path = Path(chunk.get('source_file', 'Unknown'))
            print(f"  {position}. Document: {source_path.name}")
            print(f"     Category: {chunk.get('category', 'Unknown')}")
            print(f"     Relevance Score: {chunk.get('similarity_score', 0):.4f}")
            print(light_rule)  # Separator after each document

    # Timings
    print("\n📊 Pipeline Metrics:")
    print(f"  • Retrieval Time: {result['retrieval_time']:.3f} seconds")
    print(f"  • Generation Time: {result['generation_time']:.3f} seconds")
    print(f"  • Total Time: {result['total_time']:.3f} seconds")
    print("\n" + heavy_rule + "\n")  # Trailing blank line for spacing

def run_interactive_session(rag_pipeline):
    """Run a read-query-print loop against ``rag_pipeline`` until the user quits.

    The loop ends when the user types ``exit`` or ``quit`` (case-insensitive),
    when stdin is closed, or when the prompt is interrupted. Empty input is
    rejected with a hint. Errors raised while answering a question are
    printed and the loop continues.

    Args:
        rag_pipeline: object exposing ``query(str)`` that returns a dict with
            ``response``, ``sources`` and ``total_time``.
    """
    print("\n" + "="*80)
    print("🚀 RAG PIPELINE INTERACTIVE SESSION")
    print("Type 'exit' or 'quit' to end the session.")
    print("="*80)

    while True:
        # Simple prompt for user query
        print("\n" + "-"*50)
        try:
            user_query = input("🤔 Your question: ").strip()
        except (EOFError, KeyboardInterrupt):
            # stdin was closed or the prompt was interrupted (e.g. a kernel
            # interrupt): end the session instead of surfacing a traceback.
            print("\nGoodbye! 👋")
            break

        # Check for exit commands
        if user_query.lower() in ('exit', 'quit'):
            print("\nGoodbye! 👋")
            break

        # Skip empty queries
        if not user_query:
            print("⚠️ Please enter a question.")
            continue

        # Process the query
        try:
            print("\n🔄 Processing your question...")
            result = rag_pipeline.query(user_query)

            # Display just the response cleanly
            print("\n💡 **Answer:**")
            print("-" * 50)
            print(result['response'])
            print("-" * 50)

            # Show sources if available
            if result['sources']:
                print(f"\n📚 **Sources:** {', '.join(result['sources'])}")

            # Show timing
            print(f"⏱️ **Response time:** {result['total_time']:.2f} seconds")

        except Exception as e:
            # One failed question must not end the whole session
            print(f"❌ Error: {e}")
            print("Please try asking your question differently.")

def run_single_query(rag_pipeline, query: str):
    """Answer one query, print the formatted report and return the result dict.

    Returns None when querying or displaying raised; the error is printed.
    """
    print(f"\n🔍 Processing single query: '{query}'")

    outcome = None
    try:
        outcome = rag_pipeline.query(query)
        display_query_result(outcome)
    except Exception as exc:
        print(f"❌ Error processing query: {exc}")
        return None
    return outcome

def main():
    """Entry point of the query session: load the saved pipeline, then start the interactive loop."""
    rule = "="*80
    print(rule)
    print("🚀 RAG PIPELINE QUERY SESSION")
    print(rule)

    # Load the pre-initialized pipeline
    rag_pipeline = load_rag_pipeline()

    if rag_pipeline is not None:
        print("\n✅ RAG pipeline loaded and ready!")
        # Hand over to the interactive loop
        run_interactive_session(rag_pipeline)
        return

    # Loading failed: tell the user how to create the pipeline file
    print("\n❌ Could not load RAG pipeline.")
    print("Please run the setup script first:")
    print("  python rag_setup.py")

if __name__ == "__main__":
    main()