# RAG System Debugging Mode

This notebook demonstrates how to use the debugging capabilities of the RAG system in Google Colab.

## 1. Setup

First, let's install the required packages:

In [None]:
!pip install faiss-cpu sentence-transformers transformers torch tqdm requests python-dotenv

Now, let's clone the repository (replace `yourusername` in the URL with the account that hosts it) and install it in editable mode:

In [None]:
!git clone https://github.com/yourusername/new_rag_colab.git
%cd new_rag_colab
!pip install -e .

## 2. Configure Logging

Let's set up the debug logger with a more verbose logging level:

In [None]:
import logging
from new_rag_colab.utils.debug_utils import debug_logger, DebugInspector

# Lower the debug logger's threshold to DEBUG, then read the level back from
# the logger itself so the printed name reflects what is actually configured.
debug_logger.logger.setLevel(logging.DEBUG)
current_level = logging.getLevelName(debug_logger.logger.level)
print(f"Debug logger configured with level: {current_level}")

## 3. Mount Google Drive

In [None]:
from new_rag_colab.utils.drive_utils import DriveHandler

# Create a Drive handler and mount Google Drive.
# The same `drive_handler` instance is passed to the vector store and to the
# RAG pipeline in later cells, so this cell must run before them.
drive_handler = DriveHandler()
drive_handler.mount_drive()

## 4. Create the RAG Pipeline with Debug Mode

In [None]:
from new_rag_colab.processors.pdf_processor import PDFProcessor
from new_rag_colab.processors.text_processor import TextProcessor
from new_rag_colab.chunkers.base_chunker import FixedSizeChunker
from new_rag_colab.utils.embeddings import HuggingFaceEmbeddingProvider
from new_rag_colab.vector_stores.drive_vector_store import DriveVectorStore
from new_rag_colab.retrievers.base_retriever import SimpleRetriever
from new_rag_colab.utils.colab_rag_pipeline import ColabRAGPipeline

# Embedding backend: a HuggingFace sentence-transformers model with caching on.
embedding_provider = HuggingFaceEmbeddingProvider(
    model_name="sentence-transformers/all-MiniLM-L6-v2", use_cache=True
)

# Drive-backed vector store. Debug mode is switched on here (it is the only
# component in this cell that receives a debug flag).
# `dimension` has to agree with the embedding model's output size
# (384 for all-MiniLM-L6-v2).
vector_store = DriveVectorStore(
    embedding_function=embedding_provider.get_embedding,
    dimension=384,
    drive_handler=drive_handler,
    debug=True,
)

# Chunking and retrieval components.
chunker = FixedSizeChunker(chunk_size=1000, chunk_overlap=200)
retriever = SimpleRetriever(vector_store)

# File processors, keyed by the names "pdf" and "text".
processors = dict(pdf=PDFProcessor(), text=TextProcessor())

# Wire everything into the Colab RAG pipeline.
rag_pipeline = ColabRAGPipeline(
    chunker=chunker,
    vector_store=vector_store,
    retriever=retriever,
    processors=processors,
    drive_handler=drive_handler,
)

## 5. Upload and Process Files with Debug Output

In [None]:
from google.colab import files

import traceback

# Ask the user for files; `uploaded` is a mapping keyed by uploaded file name.
# NOTE(review): the prompt advertises JSON and CSV, but the pipeline cell only
# registers "pdf" and "text" processors - confirm process_file handles those types.
print("Upload your files (PDF, TXT, JSON, CSV):")
uploaded = files.upload()

# Make an empty/cancelled upload visible instead of silently doing nothing.
if not uploaded:
    print("No files were uploaded; nothing to process.")

# Process each uploaded file through the pipeline. A failure on one file is
# reported and the loop continues with the remaining files.
for filename in uploaded.keys():
    print(f"\nProcessing {filename}...")
    try:
        doc_ids = rag_pipeline.process_file(filename)
        print(f"Added {len(doc_ids)} chunks from {filename}")
    except Exception as e:
        print(f"Error processing {filename}: {e}")
        # This is a debugging notebook: show the full stack trace, not just the message.
        traceback.print_exc()

## 6. Inspect Vector Store Contents

In [None]:
# Fetch the vector store's debug summary once and print it field by field.
debug_info = vector_store.get_debug_info()

print("Vector Store Debug Information:")

# Single-value fields, printed as a bullet list.
for label, key in (
    ("Document count", "document_count"),
    ("Embedding dimension", "dimension"),
    ("FAISS index size", "index_size"),
):
    print(f"- {label}: {debug_info[key]}")

# Collection fields, each printed as a comma-separated line preceded by a blank line.
for label, key in (
    ("Metadata fields", "metadata_fields"),
    ("Source files", "source_files"),
    ("Source types", "source_types"),
):
    print(f"\n{label}: {', '.join(debug_info[key])}")

## 7. Inspect Document Contents

In [None]:
# Report the total document count, then preview up to three stored documents.
print(f"Vector store contains {len(vector_store.documents)} documents")
print("\nSample documents:")

for position, doc in enumerate(vector_store.documents[:3], start=1):
    print(f"\nDocument {position}:")
    print(f"- ID: {doc.get('id', 'unknown')}")
    print(f"- Metadata: {doc.get('metadata', {})}")
    content = doc.get('content', '')
    # Long content is cut to its first 100 characters and marked with "...".
    if len(content) > 100:
        print(f"- Content: {content[:100]}...")
    else:
        print(f"- Content: {content}")

## 8. Save Vector Store with Debug Output

In [None]:
import tempfile
from pathlib import Path

# Local staging location: a fresh temporary directory holding the store path.
temp_dir = tempfile.mkdtemp()
vector_store_path = Path(temp_dir, "debug_vector_store")

# Save the store; `drive_subfolder` names the target subfolder passed to save().
print("Saving vector store to Google Drive...")
vector_store.save(vector_store_path, drive_subfolder="debug_vector_store")
print("Vector store saved to Google Drive.")

## 9. Load Vector Store with Debug Output

In [None]:
# Create a second vector store, configured like the original one, to load into.
new_vector_store = DriveVectorStore(
    embedding_function=embedding_provider.get_embedding,
    dimension=384,
    drive_handler=drive_handler,
    debug=True
)

# Use a fresh temporary directory as the local path handed to load().
temp_dir = tempfile.mkdtemp()
load_path = Path(temp_dir) / "loaded_vector_store"

# Load the vector store with debug output; `drive_path` matches the
# `drive_subfolder` used when saving.
print("Loading vector store from Google Drive...")
new_vector_store.load(load_path, from_drive=True, drive_path="debug_vector_store")
print("Vector store loaded from Google Drive.")

# Verify the round trip: the loaded store should hold as many documents as the
# store that was saved. A mismatch is reported but does not stop the notebook.
loaded_count = len(new_vector_store.documents)
original_count = len(vector_store.documents)
print(f"Loaded {loaded_count} documents")
if loaded_count != original_count:
    print(
        "WARNING: document count mismatch after save/load round trip "
        f"(original store: {original_count}, loaded store: {loaded_count})"
    )

## 10. Query with Debug Output

In [None]:
# Build a retriever and a pipeline on top of the store that was just loaded,
# reusing the chunker, processors and Drive handler from the earlier cells.
new_retriever = SimpleRetriever(new_vector_store)
new_rag_pipeline = ColabRAGPipeline(
    chunker=chunker,
    vector_store=new_vector_store,
    retriever=new_retriever,
    processors=processors,
    drive_handler=drive_handler,
)

# Run one sample query through the new pipeline.
query = "What information can you find in my documents?"
print(f"Query: {query}")
results = new_rag_pipeline.query(query)

# Print each hit: score, source file, and a preview of the content.
print(f"\nFound {len(results)} results:")
for rank, result in enumerate(results, start=1):
    score = result.get('score', 0)
    print(f"\nResult {rank} (score: {score:.4f}):")
    source_file = result.get('metadata', {}).get('source_file', 'unknown')
    print(f"Source: {source_file}")
    content = result.get('content', '')
    # Content longer than 100 characters is truncated and suffixed with "...".
    if len(content) > 100:
        print(f"Content: {content[:100]}...")
    else:
        print(f"Content: {content}")

## 11. Debug Utilities

In [None]:
# The vector store has its own dedicated inspection helper.
print("Vector Store Inspection:")
DebugInspector.print_vector_store_info(vector_store)

# The remaining components go through the generic object inspector,
# each under its own heading.
for label, component in (("Retriever", retriever), ("Chunker", chunker)):
    print(f"\n{label} Inspection:")
    DebugInspector.print_object_info(component, label)

## 12. Performance Monitoring

In [None]:
# Sample queries used to time the pipeline.
queries = [
    "What is machine learning?",
    "How does neural network work?",
    "What are the benefits of deep learning?",
    "Explain natural language processing"
]

print("Running performance test...\n")

for query in queries:
    # One timer per query, named after the first 20 characters of the query text;
    # the same name is used to start and to stop the timer.
    timer_name = f"query_{query[:20]}"
    debug_logger.start_timer(timer_name)
    results = new_rag_pipeline.query(query)
    duration = debug_logger.end_timer(timer_name)
    print(f"Query: '{query}' - {len(results)} results in {duration:.4f} seconds")