<a href="https://colab.research.google.com/github/Kethanvr/Reag-VecotrDB/blob/main/Final_Rag.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# @title 1. Install Required Packages
!pip install -q pymongo sentence-transformers google-generativeai langchain langchain-google-genai langchain-community pypdf python-docx openpyxl pandas unstructured pillow langchain-text-splitters
!pip install -q duckduckgo-search beautifulsoup4 requests
!pip install -q rich

print("✅ Dependencies installed successfully.")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m54.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.5/66.5 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m69.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m329.0/329.0 kB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m56.1 MB/s[0m eta [36m0

In [5]:
# @title 2. Load API Keys & Imports
import os
import hashlib
import numpy as np
import uuid
import json
import requests
from typing import List, Dict, Any, Optional
from io import BytesIO

# ML & AI Imports
from sentence_transformers import SentenceTransformer
import google.generativeai as genai
from pymongo import MongoClient
from google.colab import userdata, files

# LangChain Imports
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import (
    PyPDFLoader, Docx2txtLoader, TextLoader,
    UnstructuredExcelLoader, CSVLoader,
    UnstructuredMarkdownLoader, UnstructuredHTMLLoader
)
from langchain_core.documents import Document

# Web & Search Imports
from bs4 import BeautifulSoup
from duckduckgo_search import DDGS
from PIL import Image

# UI Imports
from rich.console import Console
from rich.panel import Panel
from rich.markdown import Markdown
from rich.table import Table
from rich import box

# Configuration
# Pull both secrets from Colab's secret store and hand the Gemini key to the SDK.
# Any failure (missing secret, access not granted, SDK error) is reported rather
# than raised, so the cell itself never aborts.
try:
    GEMINI_API_KEY = userdata.get('GEMINI_API_KEY')
    MONGODB_URI = userdata.get('MONGODB_URI')
    genai.configure(api_key=GEMINI_API_KEY)
except Exception as e:
    print(
        f"❌ Error loading keys: {e}\n"
        "Please set GEMINI_API_KEY and MONGODB_URI in Colab Secrets."
    )
else:
    # Only reached when both lookups and the configure call succeeded.
    print("✅ API Keys loaded successfully!")

✅ API Keys loaded successfully!


In [6]:
# @title 3. Define Advanced RAG Class (Expanded File Support)
class AdvancedRAGWithMemoryVisionWeb:
    """RAG helper that combines MongoDB vector search, Gemini, image Q&A and web scraping.

    The instance keeps an in-process conversation history and, once a session
    has been started, mirrors every message into the ``conversation_memory``
    collection of the configured database.
    """

    def __init__(self, mongodb_uri: str, db_name: str = "rag", collection_name: str = "rag-collection"):
        """Initialize Advanced RAG with Memory, Vision, and Web capabilities.

        Args:
            mongodb_uri: Connection string passed to ``MongoClient``.
            db_name: Database holding the chunk, memory and info collections.
            collection_name: Collection that stores embedded document chunks.
        """

        # 1. Load Embedding Model
        print("🔄 Loading Embedding model (BAAI/bge-large-en-v1.5)...")
        self.embedding_model = SentenceTransformer('BAAI/bge-large-en-v1.5')
        # Kept for reference; nothing in this class reads it.
        self.embedding_dim = 1024

        # 2. Connect to MongoDB
        print("🔄 Connecting to MongoDB Atlas...")
        self.client = MongoClient(mongodb_uri)
        self.db = self.client[db_name]
        self.collection = self.db[collection_name]
        self.memory_collection = self.db["conversation_memory"]
        self.important_info_collection = self.db["important_info"]

        # 3. Initialize Gemini
        self.model_name = 'gemini-2.5-flash-lite'
        print(f"🔄 Initializing {self.model_name}...")
        self.llm = genai.GenerativeModel(self.model_name)

        # 4. Text Splitter
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000, chunk_overlap=200, length_function=len
        )

        # 5. Session State
        self.session_id = None
        self.conversation_history = []

        print("✅ System Initialized!")

    # --- MEMORY ---
    def start_new_session(self):
        """Assign a fresh random session id, clear the in-memory history, and return the id."""
        self.session_id = str(uuid.uuid4())
        self.conversation_history = []
        return self.session_id

    def add_to_memory(self, role: str, content: str):
        """Append a message to the history and persist it when a session is active.

        Args:
            role: Speaker label stored with the message (e.g. "user", "assistant").
            content: Message text.
        """
        # Local import keeps this method self-contained regardless of which
        # notebook cells have been executed.
        from datetime import datetime, timezone

        # A real UTC ISO-8601 timestamp (the field previously held a random
        # UUID, which carried no time information).
        msg = {
            "role": role,
            "content": content,
            "timestamp": datetime.now(timezone.utc).isoformat(),
        }
        self.conversation_history.append(msg)
        if self.session_id:
            # Insert a copy so the driver's added `_id` never leaks into the
            # in-memory history entry.
            self.memory_collection.insert_one({**msg, "session_id": self.session_id})

    def get_context_str(self, last_n=5):
        """Return the last ``last_n`` history messages as ``role: content`` lines."""
        return "\n".join([f"{m['role']}: {m['content']}" for m in self.conversation_history[-last_n:]])

    # --- VISION ---
    def analyze_uploaded_image(self, uploaded_file_dict, question="Describe this image"):
        """Ask the LLM a question about the first image in an upload mapping.

        Args:
            uploaded_file_dict: Mapping of filename -> raw image bytes (the chat
                cell passes the result of ``files.upload()``).
            question: Prompt sent together with the image.

        Returns:
            The model's text answer, or a string starting with
            ``"Error processing image:"`` when anything goes wrong.
        """
        if not uploaded_file_dict:
            # Explicit message instead of the opaque "list index out of range".
            return "Error processing image: no file was uploaded"
        try:
            filename = next(iter(uploaded_file_dict))
            image = Image.open(BytesIO(uploaded_file_dict[filename]))
            response = self.llm.generate_content([question, image])
            return response.text
        except Exception as e:
            return f"Error processing image: {str(e)}"

    # --- WEB ---
    def _search_and_scrape(self, query: str, max_results: int = 3) -> Optional[List[str]]:
        """Search DuckDuckGo and scrape each hit.

        Returns:
            ``None`` when the search produced no hits; otherwise a list with one
            formatted excerpt per page that could be fetched and parsed (the
            list may be empty if every fetch failed).

        Raises:
            Whatever the search client raises; per-page failures are skipped.
        """
        hits = list(DDGS().text(query, max_results=max_results))
        if not hits:
            return None

        excerpts = []
        for hit in hits:
            try:
                resp = requests.get(hit['href'], timeout=5, headers={'User-Agent': 'Mozilla/5.0'})
                soup = BeautifulSoup(resp.content, 'html.parser')
                text = soup.get_text(strip=True)[:1000]
                excerpts.append(f"Source: {hit['title']}\nURL: {hit['href']}\nContent: {text}")
            except Exception:
                # Best effort: one unreachable/malformed page must not sink the
                # rest. `Exception` (not a bare except) lets Ctrl-C propagate.
                continue
        return excerpts

    def web_search_and_scrape(self, query: str, max_results=3):
        """Return scraped web context for ``query`` as a single string.

        Returns:
            Excerpts joined by blank lines; ``"No web results found."`` when the
            search is empty; ``"Web search error: ..."`` when the search fails.
        """
        try:
            excerpts = self._search_and_scrape(query, max_results)
        except Exception as e:
            return f"Web search error: {str(e)}"
        if excerpts is None:
            return "No web results found."
        return "\n\n".join(excerpts)

    # --- INGESTION (UPDATED FOR MORE FILE TYPES) ---
    def ingest_file(self, file_path: str, metadata: Optional[Dict] = None):
        """Load, chunk, embed and store a file, skipping chunks already stored.

        Args:
            file_path: Path of the file to ingest; the extension picks the loader.
            metadata: Extra fields copied into every stored chunk's ``metadata``.

        Returns:
            Number of newly inserted chunks (0 for unsupported files, empty
            documents, or when loading/ingestion fails).
        """
        ext = os.path.splitext(file_path)[1].lower()

        # Define loaders for different types
        if ext == '.pdf':
            loader = PyPDFLoader(file_path)
        elif ext in ['.docx', '.doc']:
            loader = Docx2txtLoader(file_path)
        elif ext == '.csv':
            loader = CSVLoader(file_path)
        elif ext == '.md':
            # Try UnstructuredMarkdown if available, else fallback to Text
            try:
                loader = UnstructuredMarkdownLoader(file_path)
            except Exception:
                loader = TextLoader(file_path)
        elif ext in ['.html', '.htm']:
            loader = UnstructuredHTMLLoader(file_path)
        elif ext in ['.txt', '.json', '.xml', '.py', '.js', '.java', '.c', '.cpp', '.yaml', '.yml', '.ini', '.log']:
            # Generic Text Fallback for code and data files
            loader = TextLoader(file_path)
        else:
            print(f"⚠️ Skipping unsupported file: {os.path.basename(file_path)}")
            return 0

        try:
            docs = loader.load()
            if not docs:
                return 0

            chunks = self.text_splitter.split_documents(docs)

            new_chunks = 0
            for i, chunk in enumerate(chunks):
                # Content hash is the deduplication key.
                chunk_hash = hashlib.sha256(chunk.page_content.encode()).hexdigest()

                # Only existence matters, so project `_id` alone instead of
                # pulling the whole stored document (embedding included).
                if self.collection.find_one({"hash": chunk_hash}, projection={"_id": 1}):
                    continue

                doc = {
                    "text": chunk.page_content,
                    "embedding": self.embedding_model.encode(chunk.page_content).tolist(),
                    "hash": chunk_hash,
                    "metadata": {**(metadata or {}), "chunk_index": i}
                }
                self.collection.insert_one(doc)
                new_chunks += 1
            return new_chunks
        except Exception as e:
            print(f"❌ Error ingesting {file_path}: {e}")
            return 0

    # --- RETRIEVAL & GENERATION ---
    def generate_answer(self, query: str, use_web=False):
        """Answer ``query`` using vector-search hits, chat history and optional web context.

        Web search runs when ``use_web`` is true or the query mentions
        "latest" or "news". Both the query and the answer are added to memory.

        Returns:
            Dict with ``answer`` (str), ``sources`` (list of vector-search hits)
            and ``web_used`` (True only when web content was actually obtained).
        """
        self.add_to_memory("user", query)

        ctx_sources = []
        web_content = ""

        q_emb = self.embedding_model.encode(query).tolist()
        try:
            ctx_sources = list(self.collection.aggregate([
                {"$vectorSearch": {
                    "index": "vector_index",
                    "path": "embedding",
                    "queryVector": q_emb,
                    "numCandidates": 50,
                    "limit": 3
                }},
                {"$project": {"_id": 0, "text": 1, "metadata": 1, "score": {"$meta": "vectorSearchScore"}}}
            ]))
        except Exception as e:
            # Degrade to answering without document context.
            print(f"⚠️ Vector Search failed: {e}")

        lowered = query.lower()
        if use_web or "latest" in lowered or "news" in lowered:
            # Use the raising helper so that a failed or empty search leaves
            # `web_content` empty: error text must not be fed to the model as
            # "web context", nor reported as `web_used`.
            try:
                excerpts = self._search_and_scrape(query) or []
            except Exception as e:
                print(f"⚠️ Web search failed: {e}")
                excerpts = []
            web_content = "\n\n".join(excerpts)

        # Tolerate stored documents that lack a `metadata` field.
        doc_text = "\n\n".join([
            f"[Doc Source: {(r.get('metadata') or {}).get('source')}]\n{r['text']}"
            for r in ctx_sources
        ])
        history = self.get_context_str()

        prompt = f"""
        You are an advanced AI assistant. Use the following context to answer the user's question.

        Conversation History:
        {history}

        Document Context:
        {doc_text}

        Web Context:
        {web_content}

        User Question: {query}
        """

        response = self.llm.generate_content(prompt)
        answer = response.text
        self.add_to_memory("assistant", answer)
        return {
            "answer": answer,
            "sources": ctx_sources,
            "web_used": bool(web_content)
        }

# Re-initialize
# Build the module-level `rag` instance from the class defined above, pointing
# it at the "rag" database and its "rag-collection" chunk collection.
rag = AdvancedRAGWithMemoryVisionWeb(MONGODB_URI, "rag", "rag-collection")

🔄 Loading Embedding model (BAAI/bge-large-en-v1.5)...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/779 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

🔄 Connecting to MongoDB Atlas...
🔄 Initializing gemini-2.5-flash-lite...
✅ System Initialized!


In [7]:
# @title 4. Initialize & Create Indexes
# Initialize
rag = AdvancedRAGWithMemoryVisionWeb(MONGODB_URI, db_name="rag", collection_name="rag-collection")

# A unique index on `hash` backs the content-hash deduplication done at
# ingestion time; failures are reported as a note instead of aborting the cell.
print("🔧 Ensuring hash index exists...")
try:
    rag.collection.create_index("hash", unique=True)
except Exception as e:
    print(f"Note: {e}")
else:
    print("✅ Hash index verified!")

# Kept as the final bare expression so the notebook echoes the new session id.
rag.start_new_session()

🔄 Loading Embedding model (BAAI/bge-large-en-v1.5)...
🔄 Connecting to MongoDB Atlas...
🔄 Initializing gemini-2.5-flash-lite...
✅ System Initialized!
🔧 Ensuring hash index exists...
✅ Hash index verified!


'8bdd68d4-c212-4543-a413-488a873dff99'

In [8]:
# @title 5. Upload & Ingest Documents (FIXED)
import tempfile

print("📤 Upload documents (PDF, DOCX, TXT, CSV)")
uploaded = files.upload()

total_chunks = 0

# Stage every upload inside one throw-away directory. The context manager
# removes it (and all staged files) even if ingestion raises, so no temp files
# are left behind.
with tempfile.TemporaryDirectory() as temp_dir:
    for filename, content in uploaded.items():
        print(f"Processing {filename}...")

        # basename() keeps the original extension (which selects the loader)
        # while preventing any directory component in the upload name from
        # escaping the staging directory.
        temp_path = os.path.join(temp_dir, os.path.basename(filename))
        with open(temp_path, 'wb') as f:
            f.write(content)

        # Ingest; a failure on one file must not abort the remaining uploads.
        try:
            chunks_added = rag.ingest_file(temp_path, metadata={"source": filename})
        except Exception as e:
            print(f"❌ Error ingesting {filename}: {e}")
            continue

        # Safety Check: Ensure chunks_added is a number before adding
        if chunks_added is not None:
            total_chunks += chunks_added
        else:
            print(f"⚠️ Warning: No chunks returned for {filename}")

print(f"\n✅ Ingestion Complete! Added {total_chunks} new chunks.")

📤 Upload documents (PDF, DOCX, TXT, CSV)


Saving tB8Y50o5sGg6lKsa Kethanvr.txt to tB8Y50o5sGg6lKsa Kethanvr.txt
Processing tB8Y50o5sGg6lKsa Kethanvr.txt...

✅ Ingestion Complete! Added 0 new chunks.


In [None]:
# @title 6. Interactive Chat Interface
console = Console()

console.print(Panel.fit(
    "[bold cyan]Advanced RAG System[/bold cyan]\n"
    "[dim]Commands:[/dim]\n"
    "• [green]Any question[/green]: Search docs + memory\n"
    "• [green]web: <query>[/green]: Force web search\n"
    "• [green]image[/green]: Upload and analyze image\n"
    "• [green]exit[/green]: Quit",
    title="🚀 Ready", border_style="cyan"
))

# Prefix that forces a web search for the rest of the line.
WEB_PREFIX = "web:"

# Read-eval-print loop: one user command per iteration until exit/interrupt.
while True:
    try:
        query = console.input("\n[bold yellow]User 👤:[/bold yellow] ").strip()

        if query.lower() in ['exit', 'quit', 'q']:
            console.print("[bold red]👋 Goodbye![/bold red]")
            break

        if not query:
            continue

        # --- IMAGE MODE ---
        if query.lower() == 'image':
            console.print("[yellow]📤 Upload an image...[/yellow]")
            img_upload = files.upload()
            if img_upload:
                q_img = console.input("[bold magenta]Question about image:[/bold magenta] ")
                with console.status("[bold green]Analyzing Image...[/bold green]"):
                    ans = rag.analyze_uploaded_image(img_upload, q_img)
                console.print(Panel(Markdown(ans), title="🖼️ Image Analysis", border_style="magenta"))
            continue

        # --- TEXT/WEB MODE ---
        # Match the prefix case-insensitively and strip ONLY the leading
        # occurrence; str.replace would also delete any later "web:" that is
        # part of the actual question.
        use_web = query.lower().startswith(WEB_PREFIX)
        if use_web:
            query = query[len(WEB_PREFIX):].strip()
            if not query:
                console.print("[yellow]Please type a question after 'web:'.[/yellow]")
                continue

        with console.status("[bold green]Thinking...[/bold green]"):
            result = rag.generate_answer(query, use_web=use_web)

        # Display Answer
        console.print(Panel(
            Markdown(result['answer']),
            title="🤖 AI Response",
            border_style="green",
            box=box.ROUNDED
        ))

        # Display Sources
        if result['sources']:
            table = Table(title="📚 Sources Used", box=box.SIMPLE)
            table.add_column("Score", style="cyan")
            table.add_column("Source File", style="magenta")
            table.add_column("Snippet", style="dim")

            for s in result['sources']:
                # Stored documents may lack `metadata`/`text`; fall back
                # instead of raising KeyError mid-render.
                source_name = (s.get('metadata') or {}).get('source', 'unknown')
                snippet = s.get('text', '')[:60].replace("\n", " ") + "..."
                table.add_row(f"{s.get('score', 0):.2f}", source_name, snippet)
            console.print(table)

        if result['web_used']:
            console.print("[dim]🌐 Web content was used to answer this.[/dim]")

    except (KeyboardInterrupt, EOFError):
        # EOF means the input stream is gone; without this the generic handler
        # below would report the error and loop forever.
        break
    except Exception as e:
        # markup=False: exception text may contain square brackets that Rich
        # would otherwise try to parse as markup tags.
        console.print(f"Error: {e}", style="red", markup=False)

who is modi


Output()

who is kethan


Output()

kethan


Output()

what is mediscan


Output()