In [1]:
# RAG Chatbot with PDF Support - Complete Implementation for Google Colab
# This notebook creates a chatbot that can answer questions based on uploaded PDF documents

# =============================================================================
# 1. INSTALLATION AND SETUP
# =============================================================================

# Install required packages
!pip install -q transformers torch accelerate bitsandbytes
!pip install -q sentence-transformers
!pip install -q chromadb
!pip install -q PyPDF2 pypdf
!pip install -q gradio
!pip install -q langchain langchain-community

import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import PyPDF2
import gradio as gr
import re
from typing import List, Dict, Tuple
import warnings
warnings.filterwarnings('ignore')

# =============================================================================
# 2. PDF PROCESSING CLASS
# =============================================================================

class PDFProcessor:
    """Extract text from PDF files and split it into overlapping word chunks."""

    def __init__(self):
        # Initialised for callers; nothing in this class writes to it.
        self.documents = []
        # Every chunk produced by `process_pdf`, accumulated across all PDFs.
        self.chunks = []

    def extract_text_from_pdf(self, pdf_path: str) -> str:
        """Return the text of every page in the PDF, one page per line break.

        Args:
            pdf_path: Filesystem path of the PDF to read.

        Returns:
            The page texts concatenated, each followed by a newline.

        Raises:
            RuntimeError: If the file cannot be opened or parsed. The error is
                raised (instead of being returned as text) so that the failure
                message can never be chunked and indexed as document content.
        """
        try:
            with open(pdf_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                # Guard against a falsy result for pages without a text layer.
                page_texts = [(page.extract_text() or "") for page in pdf_reader.pages]
        except Exception as e:
            raise RuntimeError(f"Error reading PDF: {str(e)}") from e

        return "".join(f"{page_text}\n" for page_text in page_texts)

    def chunk_text(self, text: str, chunk_size: int = 1000, overlap: int = 200) -> List[str]:
        """Split text into overlapping chunks measured in words.

        Args:
            text: Raw text; all whitespace runs are collapsed to single spaces.
            chunk_size: Maximum number of words per chunk (must be > 0).
            overlap: Number of words shared by consecutive chunks
                (must satisfy ``0 <= overlap < chunk_size``).

        Returns:
            The chunks in document order; an empty list for blank text.

        Raises:
            ValueError: If ``chunk_size`` / ``overlap`` would produce a
                non-positive stride or skip words between chunks.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        if not 0 <= overlap < chunk_size:
            raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

        # Clean text
        words = re.sub(r'\s+', ' ', text).strip().split()
        stride = chunk_size - overlap

        chunks = []
        for start in range(0, len(words), stride):
            chunks.append(' '.join(words[start:start + chunk_size]))
            # Once a window reaches the last word, any later window would be
            # a pure suffix of this one, so stop instead of emitting duplicates.
            if start + chunk_size >= len(words):
                break

        return chunks

    def process_pdf(self, pdf_path: str) -> List[str]:
        """Extract, chunk and remember the text of one PDF.

        Args:
            pdf_path: Filesystem path of the PDF to process.

        Returns:
            The chunks produced for this PDF (also appended to ``self.chunks``).

        Raises:
            RuntimeError: If the PDF cannot be read.
            ValueError: If the PDF contains no extractable text.
        """
        text = self.extract_text_from_pdf(pdf_path)
        chunks = self.chunk_text(text)
        if not chunks:
            raise ValueError("No extractable text found in the PDF (it may be scanned or image-only).")
        self.chunks.extend(chunks)
        return chunks

# =============================================================================
# 3. VECTOR DATABASE CLASS
# =============================================================================

class VectorDatabase:
    """Embed text chunks and run similarity search over them with ChromaDB."""

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """Load the embedding model and create the Chroma client.

        Args:
            model_name: SentenceTransformer model used for all embeddings.
        """
        self.embedding_model = SentenceTransformer(model_name)
        self.client = chromadb.Client(Settings(allow_reset=True))
        # Created lazily by `create_collection` / `add_documents`.
        self.collection = None

    def create_collection(self, collection_name: str = "pdf_documents"):
        """Create the collection, or reuse it if it already exists.

        Args:
            collection_name: Name of the Chroma collection to bind to.
        """
        # Single call instead of get + bare `except:` + create, so unrelated
        # errors (and KeyboardInterrupt) are no longer swallowed.
        self.collection = self.client.get_or_create_collection(collection_name)

    def add_documents(self, chunks: List[str]):
        """Embed the chunks and add them to the collection.

        Args:
            chunks: Text chunks to index.

        Returns:
            A human-readable status string with the number of chunks added.
        """
        if self.collection is None:
            self.create_collection()

        if not chunks:
            # Nothing to embed; avoid handing empty lists to the store.
            return "Added 0 chunks to vector database"

        # Generate embeddings
        embeddings = self.embedding_model.encode(chunks).tolist()

        # Continue numbering after what is already stored so a second upload
        # does not reuse the ids ("chunk_0", ...) of the first one.
        first_index = self.collection.count()
        ids = [f"chunk_{first_index + offset}" for offset in range(len(chunks))]

        self.collection.add(
            embeddings=embeddings,
            documents=chunks,
            ids=ids
        )

        return f"Added {len(chunks)} chunks to vector database"

    def similarity_search(self, query: str, n_results: int = 3) -> List[str]:
        """Return the stored chunks most similar to the query.

        Args:
            query: Free-text query to embed and search for.
            n_results: Maximum number of chunks to return.

        Returns:
            Up to ``n_results`` chunk texts as returned by the collection
            query; an empty list when nothing has been indexed yet.
        """
        if self.collection is None:
            return []

        stored = self.collection.count()
        if stored == 0 or n_results <= 0:
            return []

        query_embedding = self.embedding_model.encode([query]).tolist()

        results = self.collection.query(
            query_embeddings=query_embedding,
            # Never request more neighbours than there are stored chunks.
            n_results=min(n_results, stored)
        )

        return results['documents'][0] if results['documents'] else []

# =============================================================================
# 4. LLM HANDLER CLASS
# =============================================================================

class LLMHandler:
    """Load an open-source causal language model and generate answers with it."""

    def __init__(self, model_name: str = "microsoft/DialoGPT-small"):
        """Remember the model name and load the model immediately.

        Args:
            model_name: Hugging Face model id to try first; ``load_model``
                falls back to ``distilgpt2`` if it cannot be loaded.
        """
        self.model_name = model_name
        self.tokenizer = None
        self.model = None
        self.load_model()

    def load_model(self):
        """Load tokenizer and model, falling back to ``distilgpt2`` on failure.

        Raises:
            Exception: Whatever the fallback load raised, when both the
                requested model and the fallback fail to load.
        """
        print(f"Loading model: {self.model_name}")

        # Try to load the specified model first
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, padding_side="left")

            # Add pad token if not present
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token

            # Half precision + automatic placement on GPU, full precision on CPU
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
                device_map="auto" if torch.cuda.is_available() else None,
                low_cpu_mem_usage=True
            )

            print("Model loaded successfully!")
            return

        except Exception as e:
            print(f"Failed to load {self.model_name}, falling back to distilgpt2: {e}")

        # Fallback to a more reliable model
        try:
            self.model_name = "distilgpt2"
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            self.model = AutoModelForCausalLM.from_pretrained(self.model_name)
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token
            print("Fallback model loaded successfully!")
        except Exception as e:
            print(f"Error loading any model: {e}")
            raise

    def generate_response(self, context: str, query: str, max_length: int = 200) -> str:
        """Generate an answer to ``query`` grounded in ``context``.

        Args:
            context: Retrieved document text; only the first 800 characters
                are placed in the prompt.
            query: The user's question.
            max_length: Maximum number of *new* tokens to generate.

        Returns:
            The cleaned answer text, a fixed apology when the model produced
            nothing usable, or an ``"Error generating response: ..."`` string
            if anything raised.
        """
        try:
            # Create a more structured prompt
            prompt = f"""Based on the following context, please answer the question concisely.

Context: {context[:800]}

Question: {query}

Answer:"""

            # Prompt tokens + new tokens must fit in the model's context
            # window; a fixed 900-token prompt plus 200 new tokens would
            # overflow a 1024-position model.
            model_config = getattr(self.model, "config", None)
            context_window = getattr(model_config, "max_position_embeddings", None) or 1024
            prompt_budget = max(1, min(900, context_window - max_length))

            # Tokenize with proper truncation
            input_ids = self.tokenizer.encode(
                prompt,
                return_tensors="pt",
                max_length=prompt_budget,
                truncation=True
            )
            # The model may live on the GPU (device_map="auto"); the inputs
            # have to be on the same device.
            input_ids = input_ids.to(self.model.device)

            # Generate with parameters chosen to avoid repetition
            with torch.no_grad():
                outputs = self.model.generate(
                    input_ids,
                    max_new_tokens=max_length,  # counts generated tokens only
                    num_return_sequences=1,
                    temperature=0.3,  # Lower temperature for more focused responses
                    do_sample=True,
                    top_p=0.9,  # Nucleus sampling
                    repetition_penalty=1.2,  # Penalize repetition
                    pad_token_id=self.tokenizer.eos_token_id,
                    eos_token_id=self.tokenizer.eos_token_id,
                    attention_mask=torch.ones_like(input_ids),
                    no_repeat_ngram_size=3  # Prevent 3-gram repetition
                )

            # Decode only the tokens generated after the prompt. Slicing by
            # token count is exact, unlike splitting the decoded text on
            # "Answer:" or on the prompt's character length.
            generated_ids = outputs[0][input_ids.shape[-1]:]
            response = self.tokenizer.decode(generated_ids, skip_special_tokens=True)

            # Clean up the response
            response = self.clean_response(response)

            return response if response else "I'm sorry, I couldn't find relevant information to answer your question."

        except Exception as e:
            return f"Error generating response: {str(e)}"

    def clean_response(self, response: str) -> str:
        """Trim repetition from a generated answer and cap its length.

        Args:
            response: Raw generated text.

        Returns:
            The text with blank/recently repeated lines dropped, limited to
            the first three ``'. '``-separated sentences, and with runs of a
            repeated word or repeated "Answer:"/"Question:" labels collapsed.
        """
        # Drop blank lines and lines identical to one of the last three kept
        cleaned_lines = []
        for line in response.split('\n'):
            line = line.strip()
            if line and line not in cleaned_lines[-3:]:
                cleaned_lines.append(line)

        response = '\n'.join(cleaned_lines)

        # Keep at most the first three sentences
        sentences = response.split('. ')
        if len(sentences) > 1:
            response = '. '.join(sentences[:3])
            # Only add a period when the text is not already terminated
            # (avoids results such as "Is it?.").
            if not response.endswith(('.', '!', '?')):
                response += '.'

        # Remove common repetitive patterns
        repetitive_patterns = [
            r'(\b\w+\b)(\s+\1){2,}',  # Same word three or more times in a row
            r'(Answer:\s*){2,}',       # Repeated "Answer:"
            r'(Question:\s*){2,}',     # Repeated "Question:"
        ]

        for pattern in repetitive_patterns:
            response = re.sub(pattern, r'\1', response, flags=re.IGNORECASE)

        return response.strip()

# =============================================================================
# 5. RAG CHATBOT CLASS
# =============================================================================

class RAGChatbot:
    """Tie PDF processing, vector search and the LLM together into a chatbot."""

    def __init__(self):
        self.pdf_processor = PDFProcessor()
        self.vector_db = VectorDatabase()
        self.llm_handler = LLMHandler()
        # Initialised for callers; the methods below do not write to it.
        self.conversation_history = []

    def upload_pdf(self, pdf_file) -> str:
        """Process an uploaded PDF and index its chunks.

        Args:
            pdf_file: Either a filesystem path (``str`` / ``os.PathLike``, which
                is what ``gr.File(type="filepath")`` passes) or an object that
                exposes the path as ``.name``. ``None`` means nothing uploaded.

        Returns:
            A status message for display; errors are reported in the message
            rather than raised.
        """
        try:
            if pdf_file is None:
                return "Please upload a PDF file."

            # Plain paths have no `.name` (str) or a misleading one (Path.name
            # is only the basename), so handle them before the attribute case.
            if isinstance(pdf_file, (str, os.PathLike)):
                pdf_path = os.fspath(pdf_file)
            else:
                pdf_path = pdf_file.name

            # Process PDF
            chunks = self.pdf_processor.process_pdf(pdf_path)
            if not chunks:
                return "❌ Error processing PDF: no extractable text was found in the document."

            # Add to vector database
            result = self.vector_db.add_documents(chunks)

            return f"✅ PDF processed successfully! {result}"

        except Exception as e:
            return f"❌ Error processing PDF: {str(e)}"

    def chat(self, message: str, history: List[Tuple[str, str]]) -> Tuple[str, List[Tuple[str, str]]]:
        """Answer one user message and append the exchange to the history.

        Args:
            message: The user's message; blank messages are ignored.
            history: Prior ``(user, bot)`` pairs; ``None`` is treated as empty.
                A provided list is extended in place.

        Returns:
            ``("", history)`` — the empty string clears the input box and the
            history carries the new ``(message, response)`` pair.
        """
        # A fresh chat widget may hand over None instead of an empty list.
        if history is None:
            history = []

        try:
            if not message or not message.strip():
                return "", history

            # Check if asking about the file itself
            if any(phrase in message.lower() for phrase in ['what is the file', 'what file', 'which file']):
                if not self.pdf_processor.chunks:
                    response = "No PDF file has been uploaded yet. Please upload a PDF document first."
                else:
                    # Show the start of the first chunk to identify the document
                    first_chunk = self.pdf_processor.chunks[0][:300]
                    response = f"The uploaded document appears to contain information about: {first_chunk}... \n\nPlease ask specific questions about the content to get more detailed answers."
            else:
                # Search for relevant context
                relevant_docs = self.vector_db.similarity_search(message, n_results=3)

                if not relevant_docs:
                    response = "I don't have any relevant information from the uploaded documents to answer your question. Please make sure you've uploaded a PDF file first, or try rephrasing your question."
                else:
                    # Combine relevant documents as context
                    context = "\n\n".join(relevant_docs[:2])  # Use top 2 results

                    # Generate response using RAG
                    response = self.llm_handler.generate_response(context, message)

                    # Add source information
                    response += "\n\n📄 *Based on information from the uploaded document.*"

            # Update history
            history.append((message, response))

            return "", history

        except Exception as e:
            error_response = f"Sorry, I encountered an error: {str(e)}"
            history.append((message, error_response))
            return "", history

# =============================================================================
# 6. GRADIO INTERFACE
# =============================================================================

def create_interface():
    """Build the Gradio Blocks UI for the chatbot and return it without launching."""

    # A single chatbot instance backs every event handler wired up below
    rag_bot = RAGChatbot()

    with gr.Blocks(title="RAG PDF Chatbot", theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # 🤖 RAG PDF Chatbot

        Upload a PDF document and ask questions about its content!
        This chatbot uses Retrieval-Augmented Generation (RAG) to provide accurate answers based on your documents.

        **Instructions:**
        1. Upload a PDF file using the file uploader
        2. Wait for the "✅ PDF processed successfully!" message
        3. Start asking questions about the document content
        """)

        with gr.Row():
            # Left column: upload widget and its status box
            with gr.Column(scale=1):
                pdf_upload = gr.File(label="📄 Upload PDF Document", file_types=[".pdf"], type="filepath")
                upload_status = gr.Textbox(label="Upload Status", interactive=False, lines=2)

                # Index the PDF as soon as the upload widget's value changes
                pdf_upload.change(fn=rag_bot.upload_pdf, inputs=[pdf_upload], outputs=[upload_status])

            # Right column: conversation view, message box and buttons
            with gr.Column(scale=2):
                chat_window = gr.Chatbot(label="💬 Chat with your PDF", height=500, show_copy_button=True)

                msg_input = gr.Textbox(
                    label="Your Message",
                    placeholder="Ask a question about your PDF document...",
                    lines=2,
                )

                with gr.Row():
                    send_btn = gr.Button("📤 Send", variant="primary")
                    clear_btn = gr.Button("🗑️ Clear Chat", variant="secondary")

        def respond(message, history):
            # Thin adapter so both triggers share one handler
            return rag_bot.chat(message, history)

        # The Send button and pressing Enter in the textbox do the same thing:
        # run the handler, then clear the textbox and refresh the conversation.
        for register_trigger in (send_btn.click, msg_input.submit):
            register_trigger(
                fn=respond,
                inputs=[msg_input, chat_window],
                outputs=[msg_input, chat_window],
            )

        # Clearing replaces the conversation with an empty list
        clear_btn.click(fn=lambda: [], outputs=[chat_window])

        gr.Markdown("""
        ---
        ### 🔧 Technical Details:
        - **LLM**: Open-source language model (DialoGPT/DistilGPT2)
        - **Embeddings**: SentenceTransformers (all-MiniLM-L6-v2)
        - **Vector DB**: ChromaDB for similarity search
        - **PDF Processing**: PyPDF2 for text extraction

        ### 💡 Tips:
        - Ask specific questions about the document content
        - The chatbot works best with well-formatted PDF documents
        - Try different phrasings if you don't get the expected answer
        """)

    return demo

# =============================================================================
# 7. MAIN EXECUTION
# =============================================================================

if __name__ == "__main__":
    for startup_line in (
        "🚀 Initializing RAG PDF Chatbot...",
        "📦 This may take a few minutes to load the models...",
    ):
        print(startup_line)

    # Settings for running inside Colab
    launch_options = {
        "share": True,  # Creates public link for Colab
        "debug": True,
        "server_name": "0.0.0.0",
        "server_port": 7860,
    }

    # Build the UI, then start serving it
    interface = create_interface()
    interface.launch(**launch_options)

# =============================================================================
# 8. ALTERNATIVE: COMMAND LINE VERSION
# =============================================================================

def run_cli_version():
    """Interactive terminal front-end: load one PDF, then answer questions until the user quits."""
    rule = "=" * 50
    print("\n" + rule)
    print("RAG PDF Chatbot - Command Line Version")
    print(rule)

    bot = RAGChatbot()

    # Ask for the PDF and bail out early if the path does not exist
    pdf_path = input("\nEnter the path to your PDF file: ").strip()
    if not os.path.exists(pdf_path):
        print("PDF file not found!")
        return

    # upload_pdf is handed a throwaway object that exposes the path as `.name`
    upload_handle = type('', (), {'name': pdf_path})()
    print(f"\n{bot.upload_pdf(upload_handle)}")

    print("\n💬 You can now ask questions about your PDF. Type 'quit' to exit.\n")

    exit_words = ('quit', 'exit', 'bye')
    history = []
    while True:
        question = input("You: ").strip()

        if question.lower() in exit_words:
            print("Goodbye! 👋")
            break

        # Blank input: just prompt again
        if not question:
            continue

        _, history = bot.chat(question, history)
        if history:
            # The newest exchange is last; index 1 is the bot's reply
            print(f"Bot: {history[-1][1]}\n")

# Uncomment the line below to run CLI version instead of Gradio
# run_cli_version()

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Loading model: microsoft/DialoGPT-small


tokenizer_config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/641 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/351M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Model loaded successfully!
Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://80a21f2ee3313731d6.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 0.0.0.0:7860 <> https://80a21f2ee3313731d6.gradio.live
