# AI Document Search Assistant
## Features
- Parses documents from `.txt`, `.pdf`, and `.docx`
- Converts them into embeddings
- Stores them in ChromaDB
- Queries with LangChain integration

In [20]:
# 📦 Install necessary packages
#!pip install -q langchain chromadb python-docx PyMuPDF openai tiktoken python-dotenv
#!pip install -U langchain-community

In [21]:
# 📁 Document Parsing
import os
import fitz  # PyMuPDF
import docx

def read_txt(file_path):
    """Return the full contents of a UTF-8 encoded text file.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The file's contents as a single string.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents

def read_pdf(file_path):
    """Extract the text of every page of a PDF, joined by newlines.

    Args:
        file_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        The text of each page (``page.get_text()``) joined with ``"\\n"``.
    """
    # Open through a context manager so the document is closed even when
    # text extraction raises part-way through; otherwise every parsed PDF
    # would keep its file handle open.
    with fitz.open(file_path) as doc:
        return "\n".join(page.get_text() for page in doc)

def read_docx(file_path):
    """Return the paragraph text of a .docx file, one paragraph per line.

    Args:
        file_path: Path of the .docx file to open with python-docx.

    Returns:
        The ``text`` of each entry in the document's ``paragraphs``,
        joined with ``"\\n"``.
    """
    document = docx.Document(file_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)

def parse_documents(folder):
    """Recursively read every supported document found under ``folder``.

    Files are selected by extension, compared case-insensitively: ``txt``,
    ``pdf`` and ``docx`` are read with the matching reader; anything else is
    skipped. A file that fails to read is reported on stdout and skipped so
    that one bad file does not abort the whole scan.

    Args:
        folder: Root directory to walk.

    Returns:
        A list of dicts, one per successfully read file, each with the keys
        ``'text'`` (the extracted content) and ``'path'`` (the file's path).
    """
    docs = []
    for root, _, files in os.walk(folder):
        for file in files:
            # rpartition distinguishes "no dot at all" from a real extension.
            # Taking split('.')[-1] would treat an extension-less file that
            # is literally named "txt" or "pdf" as a document of that type.
            _stem, dot, suffix = file.rpartition('.')
            if not dot:
                continue
            ext = suffix.lower()
            path = os.path.join(root, file)
            try:
                if ext == 'txt':
                    text = read_txt(path)
                elif ext == 'pdf':
                    text = read_pdf(path)
                elif ext == 'docx':
                    text = read_docx(path)
                else:
                    continue
                docs.append({'text': text, 'path': path})
            except Exception as e:
                # Deliberate best-effort: report the failure and keep going.
                print(f"Failed to read {file}: {e}")
    return docs



In [22]:
# 🧠 Embedding Generation and Storage with ChromaDB
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.docstore.document import Document
# Load environment variables
import os
from dotenv import load_dotenv

# Load the .env file, then read the OpenAI key from the environment
# (None when the variable is not set).
load_dotenv()

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

def store_embeddings(docs, persist_directory="VectorDB"):
    """Embed parsed documents and store them in a persistent Chroma database.

    Args:
        docs: Re-iterable collection of dicts, each with a ``'text'`` key
            (the content to embed) and a ``'path'`` key (saved as the
            ``"source"`` metadata of the entry).
        persist_directory: Directory handed to Chroma as its storage
            location.

    Returns:
        The Chroma vector store built from ``docs``.
    """
    # One text per document.
    texts = [doc['text'] for doc in docs]

    # Keep each document's path in its metadata so query results can report
    # where a match came from.
    paths = [doc['path'] for doc in docs]
    metadatas = [{"source": path} for path in paths]

    # Key every entry by its source path so that re-running the ingestion on
    # the same files replaces the existing entries instead of adding a new
    # copy under a fresh random id each time. If the paths are not unique,
    # fall back to auto-generated ids (the original behaviour), since
    # duplicate ids within one batch are presumably rejected by Chroma.
    ids = paths if len(set(paths)) == len(paths) else None

    # Embeddings could come from OpenAI or from a Hugging Face model; this
    # uses OpenAI.
    embeddings = OpenAIEmbeddings(api_key=OPENAI_API_KEY)

    vectordb = Chroma.from_texts(
        texts,
        embeddings,
        metadatas=metadatas,
        ids=ids,
        persist_directory=persist_directory,
    )
    vectordb.persist()
    return vectordb

In [23]:
# 🔍 Querying with LangChain
def query_db(query, persist_directory="VectorDB", k=4):
    """Run a similarity search against the persisted Chroma database.

    Args:
        query: Free-text query to search for.
        persist_directory: Directory the Chroma database was stored in.
        k: Maximum number of matching documents to return. The default of 4
            matches the value ``similarity_search`` uses when no ``k`` is
            given, so existing callers see the same results.

    Returns:
        The list of documents returned by ``similarity_search``.
    """
    # Pass the key explicitly, the same way store_embeddings does, so both
    # the indexing and the query side are configured identically.
    embeddings = OpenAIEmbeddings(api_key=OPENAI_API_KEY)
    vectordb = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
    return vectordb.similarity_search(query, k=k)

## ✅ Example Usage
```python
# Parse documents from a folder
docs = parse_documents("docs")
# Store them in vector DB
store_embeddings(docs)
# Query the assistant
results = query_db("project report on AI governance")
for r in results:
    print(r.metadata["source"], "\n", r.page_content[:200], "...\n")
```

# Main Program

In [24]:
# Step 1: read every supported document found under the "Docs" folder.
docs = parse_documents("Docs")

# Step 2: embed the documents and persist them in the vector DB.
store_embeddings(docs)

# Step 3: query the assistant and show, for each match, its source path
# followed by the first 200 characters of its content.
results = query_db("Maitenance in general")
for r in results:
    source = r.metadata["source"]
    preview = r.page_content[:200]
    print(source, "\n", preview, "...\n")

APIConnectionError: Connection error.