In [1]:
# RAG Chatbot with PDF Support for Google Colab
# Run each cell in sequence

# Cell 1: Install required packages (including PDF support)
!pip install sentence-transformers faiss-cpu groq PyPDF2 python-docx

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting groq
  Downloading groq-0.31.0-py3-none-any.whl.metadata (16 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m38.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading groq-0.31.0-py3-none-any.whl (131 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m131.4/131.4 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading python_docx-1.2.0-py3-none-any.whl

In [3]:
# Cell 2: Import libraries
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import pickle
import os
from groq import Groq
from IPython.display import display, HTML, clear_output
import PyPDF2
import docx
import io
import re

# TODO: Enter your Groq API key here
GROQ_API_KEY = ""  # Get free API key from https://console.groq.com/

# Initialize models
print("Loading models...")
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
groq_client = Groq(api_key=GROQ_API_KEY) if GROQ_API_KEY else None
print("✅ Models loaded!")


Loading models...
✅ Models loaded!


In [4]:

# Cell 3: Enhanced Vector Store with Document Processing
class DocumentVectorStore:
    def __init__(self):
        self.texts = []
        self.embeddings = []
        self.metadata = []  # Store document info

    def add_texts(self, texts, source="manual"):
        """Add texts to the store"""
        if isinstance(texts, str):
            texts = [texts]

        for text in texts:
            # Split long texts into chunks
            chunks = self._split_text(text)
            for chunk in chunks:
                if len(chunk.strip()) > 10:  # Only add meaningful chunks
                    embedding = sentence_model.encode([chunk])[0]
                    self.texts.append(chunk)
                    self.embeddings.append(embedding)
                    self.metadata.append({"source": source, "length": len(chunk)})

        print(f"✅ Added {len([t for t in texts if len(t.strip()) > 10])} documents from {source}")
        print(f"📊 Total chunks in knowledge base: {len(self.texts)}")

    def _split_text(self, text, chunk_size=500):
        """Split text into smaller chunks"""
        # Split by sentences first
        sentences = re.split(r'[.!?]+', text)
        chunks = []
        current_chunk = ""

        for sentence in sentences:
            sentence = sentence.strip()
            if len(current_chunk) + len(sentence) < chunk_size:
                current_chunk += sentence + ". "
            else:
                if current_chunk:
                    chunks.append(current_chunk.strip())
                current_chunk = sentence + ". "

        if current_chunk:
            chunks.append(current_chunk.strip())

        return chunks if chunks else [text]

    def search(self, query, k=3):
        """Search for similar texts"""
        if not self.embeddings:
            return []

        query_embedding = sentence_model.encode([query])[0]

        # Calculate cosine similarities
        similarities = []
        for i, doc_embedding in enumerate(self.embeddings):
            similarity = np.dot(query_embedding, doc_embedding) / (
                np.linalg.norm(query_embedding) * np.linalg.norm(doc_embedding)
            )
            similarities.append((similarity, i))

        # Get top k results
        similarities.sort(reverse=True)
        results = []
        for sim, idx in similarities[:k]:
            results.append({
                'text': self.texts[idx],
                'similarity': sim,
                'metadata': self.metadata[idx]
            })

        return results

In [5]:

# Cell 4: Document Processing Functions
def extract_text_from_pdf(pdf_content):
    """Extract text from PDF bytes"""
    try:
        pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_content))
        text = ""
        for page in pdf_reader.pages:
            text += page.extract_text() + "\n"
        return text.strip()
    except Exception as e:
        return f"Error reading PDF: {str(e)}"

def extract_text_from_docx(docx_content):
    """Extract text from DOCX bytes"""
    try:
        doc = docx.Document(io.BytesIO(docx_content))
        text = ""
        for paragraph in doc.paragraphs:
            text += paragraph.text + "\n"
        return text.strip()
    except Exception as e:
        return f"Error reading DOCX: {str(e)}"

def process_uploaded_file(file_content, filename):
    """Process uploaded file based on extension"""
    file_ext = filename.lower().split('.')[-1]

    if file_ext == 'pdf':
        return extract_text_from_pdf(file_content)
    elif file_ext in ['docx', 'doc']:
        return extract_text_from_docx(file_content)
    elif file_ext == 'txt':
        return file_content.decode('utf-8')
    else:
        return f"Unsupported file type: {file_ext}"


In [6]:


# Cell 5: Initialize Knowledge Base
vector_store = DocumentVectorStore()

# Sample knowledge base
sample_knowledge = [
    "Python is a high-level programming language known for its simplicity and readability. It was created by Guido van Rossum and first released in 1991.",
    "Machine learning is a subset of artificial intelligence that enables computers to learn from data without explicit programming for every task.",
    "Streamlit is an open-source Python library for creating web applications for data science and machine learning projects.",
    "RAG (Retrieval Augmented Generation) combines information retrieval with text generation for better AI responses.",
    "Vector databases store high-dimensional vectors and enable similarity search for AI applications like semantic search.",
    "Natural Language Processing helps computers understand and work with human language through various algorithms.",
    "Deep learning uses neural networks with multiple layers to learn complex patterns from large amounts of data.",
    "APIs allow different software applications to communicate and share data with each other seamlessly.",
    "Data science combines statistics, programming, and domain knowledge to extract insights from structured and unstructured data.",
    "Cloud computing provides on-demand access to computing resources over the internet without local infrastructure."
]

vector_store.add_texts(sample_knowledge, source="initial_knowledge")


✅ Added 10 documents from initial_knowledge
📊 Total chunks in knowledge base: 10


In [7]:



# Cell 6: Enhanced RAG Functions
def search_knowledge_base(query):
    """Search the knowledge base for relevant information"""
    results = vector_store.search(query, k=3)
    if results:
        context_parts = []
        for result in results:
            source = result['metadata']['source']
            context_parts.append(f"[From {source}] {result['text']}")
        return "\n\n".join(context_parts)
    return "No relevant information found."

def generate_response(query, context):
    """Generate response using Groq API"""
    if not groq_client:
        return "❌ Please add your Groq API key!"

    prompt = f"""You are a helpful assistant. Answer the user's question based on the provided context. If the answer is not in the context, say "I don't have enough information to answer that accurately."

Context from knowledge base:
{context}

Question: {query}

Answer:"""

    try:
        response = groq_client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model="llama3-8b-8192",  # Updated working model
            temperature=0.7,
            max_tokens=500
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"Error: {str(e)}"

def chat_with_rag(query):
    """Main RAG chat function"""
    print(f"🔍 Searching knowledge base for: {query}")

    # Search for relevant context
    context = search_knowledge_base(query)
    print(f"📚 Found relevant information from knowledge base...")

    # Generate response
    print("🤖 Generating response...")
    response = generate_response(query, context)

    return response

In [8]:

# Cell 7: Document Addition Functions
def add_text(text, source_name="manual_input"):
    """Add plain text to knowledge base"""
    vector_store.add_texts(text, source=source_name)

def add_pdf_from_upload():
    """Instructions for adding PDF in Colab"""
    print("📄 To add a PDF file:")
    print("1. Upload your PDF to Colab using the file browser (left sidebar)")
    print("2. Use: add_pdf_file('your_file.pdf')")
    print("3. Or drag & drop and copy the file path")

def add_pdf_file(file_path):
    """Add PDF file to knowledge base"""
    try:
        with open(file_path, 'rb') as file:
            content = file.read()

        text = extract_text_from_pdf(content)
        if "Error" not in text:
            vector_store.add_texts(text, source=f"PDF: {file_path}")
            return f"✅ Successfully added PDF: {file_path}"
        else:
            return text
    except Exception as e:
        return f"❌ Error reading file: {str(e)}"

def add_text_file(file_path):
    """Add text file to knowledge base"""
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        vector_store.add_texts(content, source=f"TXT: {file_path}")
        return f"✅ Successfully added text file: {file_path}"
    except Exception as e:
        return f"❌ Error reading file: {str(e)}"


In [9]:


# Cell 8: Interactive Chat Function
def start_chat():
    """Start interactive chat session"""
    if not GROQ_API_KEY:
        print("❌ Please add your Groq API key first!")
        return

    print("💬 RAG Chat started! (type 'quit' to exit)")
    print("📚 Ask questions about the knowledge base or upload documents first")
    print("-" * 50)

    while True:
        query = input("\n🧑 You: ").strip()

        if query.lower() in ['quit', 'exit', 'bye']:
            print("👋 Goodbye!")
            break

        if not query:
            continue

        print("🤖 Bot: ", end="")
        response = chat_with_rag(query)
        print(response)
        print("-" * 50)


In [10]:

# Cell 9: Usage Examples and Instructions
print("🚀 Enhanced RAG Chatbot with PDF Support Ready!")
print("=" * 60)

if not GROQ_API_KEY:
    print("❌ Please add your Groq API key in GROQ_API_KEY variable!")
else:
    print("✅ API key loaded!")

print("\n📋 HOW TO ADD DOCUMENTS:")
print("-" * 30)
print("1. TEXT: add_text('Your text here', 'source_name')")
print("2. PDF:  add_pdf_file('path/to/your/file.pdf')")
print("3. TXT:  add_text_file('path/to/your/file.txt')")

print("\n💡 EXAMPLES:")
print("-" * 15)
print("# Add custom text:")
print("add_text('Artificial intelligence is transforming healthcare.', 'healthcare_doc')")
print("\n# Add PDF (after uploading to Colab):")
print("add_pdf_file('/content/my_document.pdf')")

print("\n🤔 TRY ASKING:")
print("- What is Python?")
print("- Explain machine learning")
print("- Tell me about [your uploaded document topic]")
print("\n" + "=" * 60)


🚀 Enhanced RAG Chatbot with PDF Support Ready!
✅ API key loaded!

📋 HOW TO ADD DOCUMENTS:
------------------------------
1. TEXT: add_text('Your text here', 'source_name')
2. PDF:  add_pdf_file('path/to/your/file.pdf')
3. TXT:  add_text_file('path/to/your/file.txt')

💡 EXAMPLES:
---------------
# Add custom text:
add_text('Artificial intelligence is transforming healthcare.', 'healthcare_doc')

# Add PDF (after uploading to Colab):
add_pdf_file('/content/my_document.pdf')

🤔 TRY ASKING:
- What is Python?
- Explain machine learning
- Tell me about [your uploaded document topic]



In [11]:

# Cell 10: Quick Test
print("🧪 Quick Test:")
response = chat_with_rag("What is Python?")
print(f"Response: {response}")



🧪 Quick Test:
🔍 Searching knowledge base for: What is Python?
📚 Found relevant information from knowledge base...
🤖 Generating response...
Response: According to the context, Python is a high-level programming language known for its simplicity and readability, created by Guido van Rossum and first released in 1991.


In [14]:
#adding the pdf that we want, can ask questions if loaded properly
add_pdf_file("/content/Placement_Manual.pdf")

✅ Added 1 documents from PDF: /content/Placement_Manual.pdf
📊 Total chunks in knowledge base: 28


'✅ Successfully added PDF: /content/Placement_Manual.pdf'

In [15]:
# Cell 11: Start chatting (run this to begin)

start_chat()


💬 RAG Chat started! (type 'quit' to exit)
📚 Ask questions about the knowledge base or upload documents first
--------------------------------------------------

🧑 You: what the pdf says about linear regression?
🤖 Bot: 🔍 Searching knowledge base for: what the pdf says about linear regression?
📚 Found relevant information from knowledge base...
🤖 Generating response...
According to the PDF, it is mentioned that one project is required on Linear Regression, where you need to "do every statistical thing you know about linear regression".
--------------------------------------------------


KeyboardInterrupt: Interrupted by user