# RAG System for Google Colab - Quick Start

This notebook provides a quick start guide to using the RAG system in Google Colab with Google Drive integration.

## 1. Setup

First, let's install the required packages:

In [None]:
!pip install faiss-cpu sentence-transformers transformers torch tqdm requests python-dotenv

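Now, let's clone the repository and install it in editable mode (replace `yourusername` in the URL with the GitHub account that actually hosts the project):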

In [None]:
!git clone https://github.com/yourusername/new_rag_colab.git
%cd new_rag_colab
!pip install -e .

## 2. Mount Google Drive

In [None]:
from new_rag_colab.utils.drive_utils import DriveHandler

# Create a Drive handler and mount Google Drive.
# `drive_handler` stays at notebook scope: the pipeline-creation cell passes
# it to ColabRAGPipeline via the `drive_handler=` argument.
drive_handler = DriveHandler()
drive_handler.mount_drive()

## 3. Create the RAG Pipeline

In [None]:
from new_rag_colab.processors.pdf_processor import PDFProcessor
from new_rag_colab.processors.text_processor import TextProcessor
from new_rag_colab.chunkers.base_chunker import FixedSizeChunker
from new_rag_colab.utils.embeddings import HuggingFaceEmbeddingProvider
from new_rag_colab.vector_stores.drive_vector_store import DriveVectorStore
from new_rag_colab.retrievers.base_retriever import SimpleRetriever
from new_rag_colab.utils.colab_rag_pipeline import ColabRAGPipeline

# --- Tunable settings --------------------------------------------------------
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
# Presumably the vector size produced by the model above — confirm if the
# model is changed.
EMBEDDING_DIMENSION = 384
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

# --- Components --------------------------------------------------------------
# The vector store embeds through the provider's `get_embedding` callable.
embedding_provider = HuggingFaceEmbeddingProvider(model_name=EMBEDDING_MODEL_NAME)
vector_store = DriveVectorStore(
    embedding_function=embedding_provider.get_embedding,
    dimension=EMBEDDING_DIMENSION,
)
chunker = FixedSizeChunker(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
retriever = SimpleRetriever(vector_store)

# --- Processors --------------------------------------------------------------
# One processor instance per input kind, keyed by a short type name.
processors = dict(pdf=PDFProcessor(), text=TextProcessor())

# --- Pipeline ----------------------------------------------------------------
# Wire everything together, including the Drive handler created earlier.
rag_pipeline = ColabRAGPipeline(
    chunker=chunker,
    vector_store=vector_store,
    retriever=retriever,
    processors=processors,
    drive_handler=drive_handler,
)

## 4. Upload and Process Files

In [None]:
from google.colab import files

# Ask the user for files; the returned mapping is iterated by file name.
uploaded = files.upload()

# Feed every uploaded file through the pipeline and report the chunk count.
for filename in uploaded:
    print(f"Processing {filename}...")
    doc_ids = rag_pipeline.process_file(filename)
    print(f"Added {len(doc_ids)} chunks from {filename}")

## 5. Save to Google Drive

In [None]:
import tempfile
from pathlib import Path

# Fresh scratch directory for the save path. It is not removed by this cell,
# so it lives until the runtime is recycled or it is deleted manually.
temp_dir = tempfile.mkdtemp()
vector_store_path = Path(temp_dir, "vector_store")

# Save the vector store via the pipeline, passing the Drive subfolder name.
rag_pipeline.save_vector_store(
    vector_store_path,
    drive_subfolder="quick_start_vector_store",
)
print("Vector store saved to Google Drive.")

## 6. Query the RAG System

In [None]:
import re

# A fenced block tagged ```markdown: the opening fence starts a line and the
# closing fence sits alone on its own line. Fences with any other tag
# (```python, ```json, ...) do not match and are therefore left intact.
_MARKDOWN_FENCE = re.compile(
    r"^```markdown[ \t]*\n(.*?)\n```[ \t]*$",
    re.DOTALL | re.MULTILINE,
)


def _unwrap_markdown_fences(text):
    """Return *text* with ```markdown fences removed and their content kept."""
    return _MARKDOWN_FENCE.sub(r"\1", text)


# Function to query and display results
def query_rag(query_text: str):
    """Run a query through the RAG pipeline and return renderable Markdown.

    Prints the query, asks ``rag_pipeline.query_with_markdown`` for the
    response, and removes any ```markdown fence wrapping so the notebook
    renders the answer as formatted text rather than as a literal code block.

    Only ```markdown fences are unwrapped. Blindly deleting every closing
    fence would strip the terminator of ordinary code blocks in the answer
    and break their rendering. Fences nested inside a ```markdown block are
    not treated specially.

    Args:
        query_text: The question to send to the pipeline.

    Returns:
        An ``IPython.display.Markdown`` object holding the cleaned response.
    """
    # Imported lazily so the function can be defined without IPython loaded.
    from IPython.display import Markdown

    print(f"Query: {query_text}")
    markdown_response = rag_pipeline.query_with_markdown(query_text)
    return Markdown(_unwrap_markdown_fences(markdown_response))

In [None]:
# Try a query. The call is the last expression of the cell, so the notebook
# renders the Markdown object that query_rag returns.
query_rag("What information can you find in my documents?")

## 7. Interactive Query Interface

In [None]:
from ipywidgets import widgets
from IPython.display import display, clear_output

# Build the three pieces of the query UI: a text box, a button, and an
# output area that the callback writes into.
input_layout = widgets.Layout(width='80%')
query_input = widgets.Text(
    placeholder='Enter your query here',
    description='Query:',
    layout=input_layout,
)
submit_button = widgets.Button(description='Submit', button_style='primary')
output = widgets.Output()

# Click handler for the submit button
def on_submit(b):
    """Run the typed query and show the answer in the output area.

    Clears the previous result first. A blank (whitespace-only) query
    produces a prompt message instead of a pipeline call.

    Args:
        b: The object passed by the click event; not used.
    """
    with output:
        clear_output()
        query = query_input.value
        if not query.strip():
            print("Please enter a query.")
            return
        # The text is forwarded exactly as typed (not stripped).
        display(query_rag(query))

# Hook the handler up to button clicks.
submit_button.on_click(on_submit)

# Show the input and button side by side, followed by the output area.
controls_row = widgets.HBox([query_input, submit_button])
display(controls_row, output)