# AI Document Search Assistant
## Features
- Parses documents from `.txt`, `.pdf`, and `.docx`
- Converts them into embeddings
- Stores them in ChromaDB
- Queries with LangChain integration

In [None]:
import pkg_resources

# 📦 Install necessary packages
#!pip install -q langchain chromadb python-docx PyMuPDF
#!pip install -U langchain-community
#!pip install langchain-openai
#!pip install sentence-transformers
#!pip install ipywidgets tf-keras
#!pip install chromadb langchain-chroma
#!pip install langchain-huggingface
#!pip install langchain_experimental
#!pip install -U langchain-google-genai
!pip uninstall protobuf
!pip install protobuf==6.31



installed_packages = {pkg.key: pkg.version for pkg in pkg_resources.working_set}

packages_to_check = [
    "langchain",
    "chromadb",
    "python-docx",
    "PyMuPDF",
    "langchain-community",
    "langchain-openai",
    "sentence-transformers",
    "ipywidgets",
    "tf-keras",
    "langchain-chroma",
    "langchain-huggingface",
    "protobuf"
]

for package in packages_to_check:
    version = installed_packages.get(package, "Not Installed")
    print(f"{package}: {version}")

^C
langchain: 0.3.24
chromadb: 0.6.3
python-docx: 1.1.2
PyMuPDF: Not Installed
langchain-community: 0.3.22
langchain-openai: 0.3.14
sentence-transformers: 4.1.0
ipywidgets: 8.1.6
tf-keras: 2.19.0
langchain-chroma: 0.2.3
langchain-huggingface: 0.1.2
protobuf: 5.29.4


### Document Parsing

In [71]:
# 📁 Document Parsing
import os
import fitz  # PyMuPDF
import docx

def read_txt(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        return f.read()

def read_pdf(file_path):
    doc = fitz.open(file_path)
    return "\n".join([page.get_text() for page in doc])

def read_docx(file_path):
    doc = docx.Document(file_path)
    return "\n".join([para.text for para in doc.paragraphs])

def parse_documents(folder):
    docs = []
    for root, _, files in os.walk(folder):
        for file in files:
            path = os.path.join(root, file)
            ext = file.lower().split('.')[-1]
            try:
                if ext == 'txt':
                    text = read_txt(path)
                elif ext == 'pdf':
                    text = read_pdf(path)
                elif ext == 'docx':
                    text = read_docx(path)
                else:
                    continue
                docs.append({'text': text, 'path': path})
            except Exception as e:
                print(f"Failed to read {file}: {e}")
    return docs




### Main Functions

In [72]:
# 🧠 Embedding Generation and Storage with ChromaDB
from langchain_community.vectorstores import Chroma

from langchain_openai import OpenAIEmbeddings
from sentence_transformers import SentenceTransformer
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.documents import Document


# Load environment variables
import os
from dotenv import load_dotenv

# Load the .env file
# Set the API key globally
load_dotenv()
OPENAI_API_KEY=os.getenv("OPENAI_API_KEY")
OpenAIEmbeddings.api_key = OPENAI_API_KEY


def store_embeddings(docs, persist_directory="VectorDB"):

    vector_store = Chroma(
    collection_name="my_collection",
    embedding_function=SentenceTransformer("all-MiniLM-L6-v2"),
    #persist_directory=persist_directory,
    )

    #create an list of texts 
    texts = [doc['text'] for doc in docs]
    
    #add path to the metadata
    metadatas = [{"source": doc['path']} for doc in docs]

    # Generate unique IDs for each document
    ids = [f"doc_{i}" for i in range(len(texts))]

    # can create the embeddings using openAI or Chromedb hugginFace
    #embeddings = OpenAIEmbeddings().embed_documents(texts)
    embeddings = SentenceTransformer("all-MiniLM-L6-v2").encode(texts)

    # creating embeddings using huggingface and chunking them 


    print(type(embeddings))
    
    # Add documents with precomputed embeddings
    
    vector_store.add_documents(
        documents=[Document(page_content=text, metadata=metadata) for text, metadata in zip(texts, metadatas)],
        embeddings=embeddings,
        ids=ids
    )
    return vector_store

In [73]:
# 🔍 Querying with LangChain
def query_db(query, db):

    results = db.similarity_search(query)
    return results

# Main Program

In [74]:
# Parse documents from a folder
docs = parse_documents("Docs")
# Store them in vector DB
db = store_embeddings(docs)
# Query the assistant
results = query_db("What fruits and vegies give most energy", db)
for r in results:
    print(r.metadata["source"], "\n", r.page_content[:200], "...\n")

<class 'numpy.ndarray'>


AttributeError: 'SentenceTransformer' object has no attribute 'embed_documents'