# PersonalRAG 

A streamlined RAG system for your personal documents.


In [1]:
# Imports
import os
import glob
from dotenv import load_dotenv
import gradio as gr
import numpy as np



# LangChain imports
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain.document_loaders import TextLoader, DirectoryLoader
from langchain_openai import AzureOpenAIEmbeddings, AzureChatOpenAI
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain



# PDF processing
import pdfplumber


  from .autonotebook import tqdm as notebook_tqdm
python-dotenv could not parse statement starting at line 1
python-dotenv could not parse statement starting at line 6
python-dotenv could not parse statement starting at line 1
python-dotenv could not parse statement starting at line 6
python-dotenv could not parse statement starting at line 1
python-dotenv could not parse statement starting at line 6
python-dotenv could not parse statement starting at line 1
python-dotenv could not parse statement starting at line 6
python-dotenv could not parse statement starting at line 1
python-dotenv could not parse statement starting at line 6
python-dotenv could not parse statement starting at line 1
python-dotenv could not parse statement starting at line 6
python-dotenv could not parse statement starting at line 1
python-dotenv could not parse statement starting at line 6


In [2]:
# Complete RAG System Setup
# =========================

# Configuration
MODEL = 'gpt-5-nano'
db_name = 'vector_db'

# Read the .env file into the process environment
load_dotenv()

# Settings that must be non-empty before the Azure clients are constructed:
# the first group feeds the embeddings client, the second the chat client.
required_vars = [
    'AZURE_OPENAI_ENDPOINT',
    'AZURE_OPENAI_API_KEY',
    'AZURE_OPENAI_API_VERSION',
    'AZURE_OPENAI_EMBEDDING_DEPLOYMENT',
    'AZURE_CHATOPENAI_DEPLOYMENT',
    'AZURE_CHATOPENAI_ENDPOINT',
    'AZURE_CHATOPENAI_API_KEY',
    'AZURE_CHATOPENAI_API_VERSION',
]
missing = [name for name in required_vars if not os.getenv(name)]
env_ok = not missing
if env_ok:
    print("✅ Environment OK")
else:
    print(f"❌ Missing: {missing}")

# Build the embeddings client and the chat model, or leave both as None so
# later cells can detect that setup did not complete.
if not env_ok:
    print("❌ Cannot initialize - check environment variables")
    embeddings = None
    llm = None
else:
    embeddings = AzureOpenAIEmbeddings(
        model=os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT"),
        azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
        api_key=os.getenv("AZURE_OPENAI_API_KEY"),
        openai_api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
    )

    llm = AzureChatOpenAI(
        azure_deployment=os.getenv('AZURE_CHATOPENAI_DEPLOYMENT'),
        azure_endpoint=os.getenv('AZURE_CHATOPENAI_ENDPOINT'),
        api_key=os.getenv('AZURE_CHATOPENAI_API_KEY'),
        api_version=os.getenv('AZURE_CHATOPENAI_API_VERSION'),
    )
    print("✅ Embeddings and LLM initialized")

python-dotenv could not parse statement starting at line 1
python-dotenv could not parse statement starting at line 6


✅ Environment OK
✅ Embeddings and LLM initialized


In [3]:
# Load and Process Documents
# ==========================

# Check PDF processing availability.
# PDF_BACKEND records which library was actually imported, so the conversion
# step below only ever calls a module that is present.
try:
    import pdfplumber
    PDF_AVAILABLE = True
    PDF_BACKEND = "pdfplumber"
    print("✅ PDF processing available (pdfplumber)")
except ImportError:
    try:
        import PyPDF2
        PDF_AVAILABLE = True
        PDF_BACKEND = "PyPDF2"
        print("✅ PDF processing available (PyPDF2)")
    except ImportError:
        PDF_AVAILABLE = False
        PDF_BACKEND = None
        print("⚠️ PDF processing not available - install pdfplumber or PyPDF2")


def _extract_pdf_text(pdf_path):
    """Return the text of all pages of *pdf_path*, joined by blank lines.

    Uses whichever backend was imported above. Pages for which the backend
    returns no text contribute an empty string.
    """
    if PDF_BACKEND == "pdfplumber":
        with pdfplumber.open(pdf_path) as pdf:
            return "\n\n".join(page.extract_text() or "" for page in pdf.pages)
    reader = PyPDF2.PdfReader(pdf_path)
    return "\n\n".join(page.extract_text() or "" for page in reader.pages)


# Auto-convert PDFs to markdown.
# Skipped entirely when no PDF backend could be imported (the warning above
# has already been printed in that case).
if PDF_AVAILABLE:
    for root, _dirs, files in os.walk("my-knowledge-worker-data"):
        for file in files:
            if not file.lower().endswith('.pdf'):
                continue
            pdf_path = os.path.join(root, file)
            md_path = pdf_path.rsplit('.', 1)[0] + '.md'

            # An existing .md next to the PDF means it was converted before.
            if os.path.exists(md_path):
                continue
            try:
                text = _extract_pdf_text(pdf_path)
                # Only write a markdown file when some text was extracted.
                if text.strip():
                    with open(md_path, 'w', encoding='utf-8') as f:
                        f.write(f"# {os.path.splitext(file)[0]}\n\n{text}")
                    print(f"✅ Converted: {file}")
            except Exception as e:
                # Best effort: one unreadable PDF must not stop the others.
                print(f"❌ Error: {file} - {e}")

def load_all_documents():
    """Load all markdown files from my-knowledge-worker-data.

    Each immediate sub-directory of ``my-knowledge-worker-data`` is scanned
    recursively for ``*.md`` files. The sub-directory's name is stored in
    ``metadata["doc_type"]`` of every document loaded from it.

    Returns:
        list: The loaded documents; empty when the data directory is missing
        or contains no sub-directories.
    """
    documents = []

    for folder in glob.glob("my-knowledge-worker-data/*"):
        # The glob also matches plain files sitting directly in the data
        # directory; DirectoryLoader only accepts directories.
        if not os.path.isdir(folder):
            continue

        doc_type = os.path.basename(folder)
        loader = DirectoryLoader(
            folder,
            glob="**/*.md",
            loader_cls=TextLoader,
            # Read as UTF-8 explicitly rather than relying on the platform
            # default encoding.
            loader_kwargs={"encoding": "utf-8"},
        )
        folder_docs = loader.load()

        for doc in folder_docs:
            doc.metadata["doc_type"] = doc_type
        documents.extend(folder_docs)

    return documents

# Load documents
# Collect every markdown document via load_all_documents() and report the count.
documents = load_all_documents()
print(f"📄 Loaded {len(documents)} documents")

# Split into chunks
# The splitter is configured with chunk_size=1000 and chunk_overlap=300;
# `chunks` is what gets embedded when a new vector database is created.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=300)
chunks = text_splitter.split_documents(documents)
print(f"📝 Created {len(chunks)} chunks")


✅ PDF processing available (pdfplumber)
📄 Loaded 31 documents
📝 Created 96 chunks


In [4]:
import shutil

# Smart vector database initialization
# Reuse the persisted Chroma store when its directory exists; otherwise (or
# when it cannot be opened) leave `vectorstore` as None so that a fresh
# database is built from `chunks` afterwards.
vectorstore = None
if os.path.exists(db_name):
    try:
        vectorstore = Chroma(persist_directory=db_name, embedding_function=embeddings)
        print(f"✅ Loaded existing database ({vectorstore._collection.count()} documents)")
    except Exception as e:
        # Catch ordinary errors only (not KeyboardInterrupt/SystemExit) and say
        # why the store is being discarded before deleting it.
        print(f"⚠️ Could not load existing database ({e}) - removing it for rebuild")
        shutil.rmtree(db_name)
        vectorstore = None

# To force rebuild: shutil.rmtree(db_name) if os.path.exists(db_name) else None

python-dotenv could not parse statement starting at line 1
python-dotenv could not parse statement starting at line 6


✅ Loaded existing database (83 documents)


In [5]:
# Create vector database if needed
# `vectorstore` is None when no existing database was loaded; in that case
# embed `chunks` with `embeddings` and persist the result under `db_name`.
if vectorstore is None:
    vectorstore = Chroma.from_documents(
        documents=chunks, 
        embedding=embeddings, 
        persist_directory=db_name
    )
    print(f"✅ Created new database ({vectorstore._collection.count()} documents)")




In [6]:
# Configuration Settings
# NOTE: MODEL and db_name are re-assigned here with the same values they are
# given in the setup cell earlier in this notebook.
MODEL = 'gpt-5-nano'

# Vector Database Configuration
import os
from datetime import datetime  # only used by the commented-out Option 2 below

# Option 1: Simple name (current approach)
db_name = 'vector_db'

# Option 2: Timestamped database (uncomment to use)
# timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
# db_name = f'vector_db_{timestamp}'

# Option 3: Environment variable with fallback (uncomment to use)
# db_name = os.getenv('VECTOR_DB_NAME', 'vector_db')

# Option 4: Full path configuration (uncomment to use)
# db_name = os.path.join('data', 'vector_databases', 'personal_knowledge_db')

# Report the directory name the vector database uses
print(f"Vector database will be stored as: {db_name}")

Vector database will be stored as: vector_db


In [7]:
# Load environment variables
load_dotenv()


python-dotenv could not parse statement starting at line 1
python-dotenv could not parse statement starting at line 6


True

In [8]:
folders = glob.glob("my-knowledge-worker-data/*")



def add_metadata(doc, doc_type):
    """Set ``doc.metadata["doc_type"]`` to *doc_type* and return the same document."""
    doc.metadata.update(doc_type=doc_type)
    return doc


# Passed through to TextLoader so every markdown file is read as UTF-8
text_loader_kwargs = {"encoding": "utf-8"}

# Load the markdown files of each folder once and tag every document with the
# folder name as its doc_type.
documents = []
for folder in folders:
    # The glob that produced `folders` can also match plain files;
    # DirectoryLoader only accepts directories.
    if not os.path.isdir(folder):
        continue
    doc_type = os.path.basename(folder)
    loader = DirectoryLoader(
        folder,
        glob="**/*.md",
        loader_cls=TextLoader,
        loader_kwargs=text_loader_kwargs,
    )
    # A single load() per folder - reading the files a second time only
    # doubles the I/O.
    folder_docs = loader.load()
    documents.extend(add_metadata(doc, doc_type) for doc in folder_docs)

# Split with chunk_size=1000 and chunk_overlap=200
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(documents)





In [9]:
print(f"Total number of chunks: {len(chunks)}")

Total number of chunks: 88


In [10]:
# Distinct doc_type values across all loaded documents
doc_types = {doc.metadata['doc_type'] for doc in documents}
print(f"Document types found: {doc_types}")

Document types found: {'projects', 'documents'}


In [11]:

# Alternative Embeddings Options
# ================================

# Only Option 3 below is active; Options 1 and 2 are commented-out
# alternatives that can be enabled instead.

# Option 1: HuggingFace Embeddings (Free, Local)
# Uncomment the lines below to use HuggingFace embeddings instead:
# from langchain.embeddings import HuggingFaceEmbeddings
# embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")




# Option 2: OpenAI Embeddings (if you have OpenAI API key)
# from langchain_openai import OpenAIEmbeddings
# os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')
# embeddings = OpenAIEmbeddings(openai_api_key="your-openai-api-key")

# Option 3: Azure OpenAI Embeddings
# Rebinds `embeddings` using the same four AZURE_OPENAI_* environment
# variables as the setup cell.
embeddings = AzureOpenAIEmbeddings(
    model=os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    openai_api_version=os.getenv("AZURE_OPENAI_API_VERSION")
)
# Usage Instructions:
# 1. If Azure OpenAI embeddings failed, uncomment the HuggingFace lines above
# 2. Or call: embeddings = get_alternative_embeddings()
#    NOTE(review): no get_alternative_embeddings() is defined in this
#    notebook - define it or drop this step.
# 3. Make sure to install: pip install sentence-transformers

In [12]:
# Get one vector and find how many dimensions it has

# Fetch a single stored record (including its embedding) from the underlying
# collection and measure the length of that embedding.
# NOTE(review): indexing [0] presumably fails on an empty collection - confirm
# the database has been populated before running this cell.
collection = vectorstore._collection
sample_embedding = collection.get(limit=1, include=["embeddings"])["embeddings"][0]
dimensions = len(sample_embedding)
print(f"The vectors have {dimensions:,} dimensions")

The vectors have 1,536 dimensions


In [13]:
sample_embedding

array([-0.00539062, -0.00995838,  0.01023466, ..., -0.00820455,
       -0.00697327, -0.00131612], shape=(1536,))

In [14]:

# Conversation memory, exposed to the chain under the "chat_history" key
memory = ConversationBufferMemory(
    memory_key="chat_history", return_messages=True)


# Retriever over the vector store; search_kwargs sets k=25
# (presumably the number of chunks fetched per query - confirm).
retriever = vectorstore.as_retriever(search_kwargs={"k": 25})

# Chain wiring the LLM, the retriever and the memory together
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm, retriever=retriever, memory=memory)

  memory = ConversationBufferMemory(


In [15]:
# Custom Prompt Template for Better Responses
# ===========================================

from langchain.prompts import PromptTemplate

# Create a custom prompt template to prevent generic "Option 1/Option 2" responses
# The template declares three placeholders: {context}, {chat_history} and
# {question}; they must match `input_variables` below.
qa_prompt = PromptTemplate(
    template="""You are my personal AI assistant. Your task is to answer questions or summarize information using only my personal documents, code projects, notes, and emails provided as context. 
Never invent information beyond what is retrieved. Always specify source filenames in your answers. 
Be brief and focused; use markdown for readability and code blocks where appropriate.
Reply in the style and language I use.
If you cannot find an answer, admit it clearly.

Context: {context}
Chat History: {chat_history}
Question: {question}

Answer:""",
    input_variables=["context", "chat_history", "question"]
)

# Update the conversation chain with the custom prompt
# This rebinds `conversation_chain`; the existing `llm`, `retriever` and
# `memory` objects are reused, and the prompt is handed to the
# document-combining step via combine_docs_chain_kwargs.
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    combine_docs_chain_kwargs={"prompt": qa_prompt}
)

print("✅ Custom prompt template applied to conversation chain")

✅ Custom prompt template applied to conversation chain


In [16]:
query = "what is my work experience"
result = conversation_chain.invoke({"question": query})
answer = result["answer"]
print("\nAnswer:", answer)


Answer: Here’s a concise summary of your work experience based on your documents.

- Full Stack Engineer (MERN stack)
  - Built production-grade web apps with secure RESTful APIs (Node.js/Express), JWT authentication, and CRUD operations.
  - Frontend/UI with React, EJS templating, and Tailwind CSS; deployed on Render, Vercel, and Railway.
  - Emphasizes clean, modular code and robust API testing/documentation (Postman).

  Source: Portfolio-2025

- AI, ML, and RAG specialist
  - Developed ML models (classification/regression) and NLP tasks (SER, speech processing) using TensorFlow, Keras, Scikit-learn, PyTorch.
  - Built Retrieval-Augmented Generation (RAG) pipelines with ChromaDB and LangChain; integrated OpenAI and Gemini APIs; local embeddings for personal knowledge assistants.
  - Focus on multimodal AI and dataset generation workflows.

  Sources: Portfolio-2025; llm_engineering

- Speech & NLP
  - Projects leveraging Web Speech API for voice recognition and speech synthesis; bu

In [17]:
def is_greeting(message):
    """Check if the message is a greeting.

    A message counts as a greeting when, after lower-casing and stripping
    surrounding whitespace, it either equals one of the known greeting phrases
    or starts with one that is followed by a non-alphanumeric character
    (space, punctuation, ...). The boundary check keeps ordinary questions
    such as "your work experience" or "history of my projects" from being
    mistaken for "yo" / "hi".

    Args:
        message: The raw user message.

    Returns:
        bool: True when the message is (or opens with) a greeting.
    """
    greetings = [
        'hi', 'hello', 'hey', 'good morning', 'good afternoon', 'good evening',
        'greetings', 'howdy', 'sup', 'what\'s up', 'yo', 'good day',
        'hi there', 'hello there', 'hey there', 'good to see you'
    ]

    message_lower = message.lower().strip()

    for greeting in greetings:
        if not message_lower.startswith(greeting):
            continue
        remainder = message_lower[len(greeting):]
        # Exact match, or the greeting ends at a word boundary.
        if not remainder or not remainder[0].isalnum():
            return True

    return False

def get_greeting_response():
    """Return one of the canned welcome messages, picked at random."""
    import random

    responses = (
        "Hello! 👋 I'm PersonalRAG, your AI knowledge assistant. I'm here to help you find information from your documents, projects, and work experience. What would you like to know?",
        "Hi there! 🤖 Welcome to PersonalRAG. I can help you search through your knowledge base and answer questions about your work, projects, and documents. How can I assist you today?",
        "Hey! 😊 Great to see you! I'm PersonalRAG, ready to help you explore your personal knowledge base. Feel free to ask me anything about your documents or projects!",
        "Hello! 🌟 I'm PersonalRAG, your intelligent knowledge assistant. I'm here to help you discover insights from your documents and answer questions about your work experience. What can I help you with?",
    )
    return random.choice(responses)

def chat(question, history):
    """Answer *question*, replying with a canned welcome when it is a greeting.

    Non-greeting questions are sent to the module-level ``conversation_chain``
    and its "answer" field is returned. ``history`` is accepted but not read
    here. Any exception is converted into an error message string, so this
    function always returns text.
    """
    try:
        if not is_greeting(question):
            # Regular question: run it through the retrieval chain
            chain_output = conversation_chain.invoke({"question": question})
            return chain_output["answer"]
        # Greeting: skip the chain entirely
        return get_greeting_response()
    except Exception as err:
        return (
            f"❌ Sorry, I encountered an error: {str(err)}\n\n"
            "Please try again or rephrase your question."
        )

def chat_messages(message, history):
    """Chat function that returns messages in the correct format with greeting support.

    Delegates to ``chat`` for the reply text (greeting detection, chain call
    and error-to-text conversion all happen there) and returns the history
    extended with one assistant message.

    Args:
        message: The user's message.
        history: Existing list of ``{"role": ..., "content": ...}`` messages;
            ``None`` is treated as an empty history.

    Returns:
        list: A new list - ``history`` plus
        ``{"role": "assistant", "content": <reply>}``. The input list is not
        modified. Only the assistant message is appended, not the user's
        message (NOTE(review): presumably the caller has already added it -
        confirm against the UI wiring).
    """
    reply = chat(message, history)
    # Copy so the caller's list is untouched and None/tuple inputs also work.
    return list(history or []) + [{"role": "assistant", "content": reply}]

In [18]:
view = gr.ChatInterface(chat, type="messages").launch(inbrowser=True)

* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.
