# RAG System with Google Drive Integration

This notebook demonstrates how to use the RAG system with Google Drive integration for persistent storage of vector stores.

## 1. Setup and Installation

First, let's install the required dependencies:

In [None]:
!pip install faiss-cpu sentence-transformers transformers torch tqdm requests python-dotenv

Now, let's clone the repository and add it to the Python path so its modules can be imported:

In [None]:
# Clone the repository (replace with your actual repository URL)
!git clone https://github.com/yourusername/new_rag_colab.git

# Add the repository to the Python path
import sys
sys.path.append('/content/new_rag_colab')

## 2. Import Required Modules

In [None]:
import os
from pathlib import Path
import tempfile

# Import RAG modules
from new_rag_colab.processors.pdf_processor import PDFProcessor
from new_rag_colab.processors.text_processor import TextProcessor
from new_rag_colab.processors.json_processor import JSONProcessor
from new_rag_colab.processors.csv_processor import CSVProcessor
from new_rag_colab.chunkers.base_chunker import FixedSizeChunker
from new_rag_colab.utils.embeddings import HuggingFaceEmbeddingProvider
from new_rag_colab.vector_stores.drive_vector_store import DriveVectorStore
from new_rag_colab.retrievers.base_retriever import SimpleRetriever
from new_rag_colab.utils.drive_utils import DriveHandler
from new_rag_colab.utils.colab_rag_pipeline import ColabRAGPipeline

## 3. Mount Google Drive

Let's mount Google Drive so the vector store can be saved there:

In [None]:
# Drive handler configured with "RAG_vector_stores" as its base folder
drive_handler = DriveHandler(
    base_folder="RAG_vector_stores",
)

# Mount Google Drive through the handler
drive_handler.mount_drive()

## 4. Create the RAG Pipeline

Now, let's create the RAG pipeline with Google Drive integration:

In [None]:
# Fixed-size chunker: 1000-unit chunks with a 200-unit overlap
chunker = FixedSizeChunker(chunk_size=1000, chunk_overlap=200)

# One processor per supported file type, keyed by type name
processors = dict(
    pdf=PDFProcessor(extraction_method="pypdf"),
    text=TextProcessor(),
    json=JSONProcessor(),
    csv=CSVProcessor(),
)

# Embedding provider backed by the all-MiniLM-L6-v2 sentence-transformers model
embedding_provider = HuggingFaceEmbeddingProvider(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    use_cache=True,
)

# Drive-backed vector store; 384 is the embedding dimension of all-MiniLM-L6-v2
vector_store = DriveVectorStore(
    embedding_function=embedding_provider.get_embedding,
    dimension=384,
    drive_handler=drive_handler,
)

# Retriever over the vector store, configured with top_k=4
retriever = SimpleRetriever(vector_store, top_k=4)

# Assemble the components into the Colab RAG pipeline
rag_pipeline = ColabRAGPipeline(
    chunker=chunker,
    vector_store=vector_store,
    retriever=retriever,
    processors=processors,
    drive_handler=drive_handler,
    use_query_cache=True,
)

## 5. Upload and Process Files

Let's upload and process some files:

In [None]:
# Colab's file-upload helper
from google.colab import files

print("Upload your files (PDF, TXT, JSON, CSV):")
uploaded = files.upload()

# Feed each uploaded file to the pipeline and report how many chunks it added
for filename in uploaded:
    print(f"Processing {filename}...")
    file_path = Path(filename)
    doc_ids = rag_pipeline.process_file(file_path)
    print(f"Added {len(doc_ids)} chunks from {filename}")

## 6. Save Vector Store to Google Drive

Now, let's save our vector store to Google Drive:

In [None]:
# Fresh temporary directory; the vector store is written to a path inside it
temp_dir = tempfile.mkdtemp()
vector_store_path = Path(temp_dir, "vector_store")

# Save through the pipeline, targeting the "my_vector_store" Drive subfolder
print("Saving vector store to Google Drive...")
rag_pipeline.save_vector_store(
    vector_store_path,
    drive_subfolder="my_vector_store",
)
print("Vector store saved to Google Drive.")

## 7. List Available Vector Stores in Google Drive

In [None]:
# Ask the pipeline for the vector stores on Drive and print one per line
vector_stores = rag_pipeline.list_drive_vector_stores()
print("Available vector stores in Google Drive:")
for store_name in vector_stores:
    print(f"- {store_name}")

## 8. Load Vector Store from Google Drive

Let's load a vector store from Google Drive:

In [None]:
# Separate temporary directory to use as the local path for the load
temp_dir = tempfile.mkdtemp()
vector_store_path = Path(temp_dir, "loaded_vector_store")

# Load the "my_vector_store" store from Drive, then report the document count
print("Loading vector store from Google Drive...")
rag_pipeline.load_vector_store(
    vector_store_path,
    from_drive=True,
    drive_path="my_vector_store",
)
print("Vector store loaded from Google Drive.")
print(f"Loaded {len(rag_pipeline.vector_store.documents)} documents.")

## 9. Query the RAG System

Now, let's query our RAG system:

In [None]:
def _strip_markdown_fence(text):
    """Remove a single outer ```markdown ... ``` wrapper from ``text``, if present."""
    opening = "```markdown\n"
    closing = "\n```"

    body = text.strip()
    if not body.startswith(opening):
        # Not wrapped: hand the response back untouched.
        return text

    body = body[len(opening):]
    # Only the final fence belongs to the wrapper; fences inside the response
    # (e.g. code blocks) must survive so the Markdown still renders correctly.
    if body.endswith(closing):
        body = body[:-len(closing)]
    return body


def query_rag(query_text):
    """Query the RAG pipeline and return the answer as a renderable Markdown object.

    Prints the query, fetches the response from
    ``rag_pipeline.query_with_markdown`` and strips the outer
    ```markdown ... ``` wrapper (when present) before wrapping the text in
    ``IPython.display.Markdown``. Fenced blocks inside the response are preserved.

    Args:
        query_text: The question to send to the pipeline.

    Returns:
        An ``IPython.display.Markdown`` object holding the unwrapped response.
    """
    from IPython.display import Markdown

    print(f"Query: {query_text}")

    # Get markdown response
    markdown_response = rag_pipeline.query_with_markdown(query_text)

    return Markdown(_strip_markdown_fence(markdown_response))

In [None]:
# Try a sample query. query_rag returns a Markdown object; as the last
# expression in the cell it is rendered as the cell's output.
query_rag("What information can you find about machine learning?")

## 10. Interactive Query Interface

Let's create an interactive query interface:

In [None]:
# Widget toolkit and display helpers used by the interactive query form
from ipywidgets import widgets
from IPython.display import display, clear_output

# Text box for the query, laid out at 80% width
query_input = widgets.Text(
    description="Query:",
    placeholder="Enter your query here",
    value="",
    layout=widgets.Layout(width="80%"),
)

# Button used to submit the query
submit_button = widgets.Button(
    description="Submit",
    tooltip="Submit query",
    button_style="primary",
)

# Output widget that will hold the response
output = widgets.Output()

def on_submit_button_clicked(b):
    """Handle a click on the submit button.

    Clears the output widget, then either displays the result of ``query_rag``
    for the current text of ``query_input`` or, when that text is empty or
    whitespace only, prints a prompt asking for a query.

    Args:
        b: The value passed by the button's click callback (unused).
    """
    with output:
        clear_output()
        query_text = query_input.value
        if not query_text.strip():
            print("Please enter a query.")
            return
        display(query_rag(query_text))

# Hook the click handler up to the submit button
submit_button.on_click(on_submit_button_clicked)

# Show the input row (text box + button) followed by the output widget
display(widgets.HBox([query_input, submit_button]), output)