# AI Document Search Assistant
## Features
- Parses documents from `.txt`, `.pdf`, and `.docx`
- Converts them into embeddings
- Stores them in ChromaDB
- Queries with LangChain integration

In [84]:
# 📦 Install necessary packages
#!pip install -q langchain chromadb python-docx PyMuPDF
#!pip install -U langchain-community
#!pip install langchain-openai
#!pip install sentence-transformers
#!pip install ipywidgets tf-keras
#!pip install chromadb langchain-chroma


In [85]:
from langchain_openai import OpenAIEmbeddings

# Smoke test: make one embedding request to confirm the OpenAI endpoint is
# reachable before building the index.
# NOTE(review): no API key is passed here and load_dotenv() only runs in a
# later cell, so this presumably relies on OPENAI_API_KEY already being
# exported in the environment — confirm.
try:
    embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
    result = embeddings.embed_query("Test query")
    # Report the vector size and a short preview instead of dumping every
    # float into the cell output.
    print(f"Embedding length: {len(result)}; first values: {result[:5]}")
except Exception as e:
    # Best-effort check: report the failure and let the notebook continue.
    print(f"Error: {e}")

Error: Connection error.


In [86]:
# 📁 Document Parsing
import os
import fitz  # PyMuPDF
import docx

def read_txt(file_path):
    """Return the entire contents of a UTF-8 encoded text file as one string."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents

def read_pdf(file_path):
    """Extract the plain text of every page of a PDF.

    Args:
        file_path: Path to the PDF file.

    Returns:
        The text of all pages, in page order, joined with newlines.
    """
    # Open the document as a context manager so it (and its underlying file
    # handle) is closed even if text extraction fails part-way through.
    with fitz.open(file_path) as doc:
        return "\n".join(page.get_text() for page in doc)

def read_docx(file_path):
    """Return the text of a .docx file with one body paragraph per line."""
    paragraphs = docx.Document(file_path).paragraphs
    lines = []
    for paragraph in paragraphs:
        lines.append(paragraph.text)
    return "\n".join(lines)

def parse_documents(folder):
    """Recursively collect the text of every supported document under a folder.

    Supported types are chosen by file extension (case-insensitive):
    ``.txt``, ``.pdf`` and ``.docx``. Anything else is skipped, including
    files that have no extension at all.

    Args:
        folder: Root directory to walk.

    Returns:
        A list of ``{'text': <str>, 'path': <str>}`` dicts, one per file that
        was read successfully. Files that raise while being read are reported
        on stdout and left out of the result.
    """
    docs = []
    for root, dirs, files in os.walk(folder):
        # os.walk yields entries in arbitrary order; sorting both the
        # sub-directories (in place, which steers the walk) and the files
        # makes the result order reproducible between runs.
        dirs.sort()
        for file in sorted(files):
            # Require a real "." separator: a file literally named "pdf"
            # has no extension and must not be treated as a PDF.
            _, dot, ext = file.rpartition('.')
            if not dot:
                continue
            ext = ext.lower()
            path = os.path.join(root, file)
            try:
                if ext == 'txt':
                    text = read_txt(path)
                elif ext == 'pdf':
                    text = read_pdf(path)
                elif ext == 'docx':
                    text = read_docx(path)
                else:
                    continue
                docs.append({'text': text, 'path': path})
            except Exception as e:
                # Best effort: one unreadable file should not abort the scan.
                print(f"Failed to read {file}: {e}")
    return docs



In [87]:
# 🧠 Embedding Generation and Storage with ChromaDB
from langchain_community.vectorstores import Chroma

from langchain_openai import OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.documents import Document


# Load environment variables
import os
from dotenv import load_dotenv

# Read variables from a local .env file into the process environment, then
# expose the OpenAI key as a module-level constant (None when it is not set).
load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
# NOTE(review): this assigns a class attribute on OpenAIEmbeddings; whether
# the library actually reads it when creating clients is not visible here —
# confirm, or pass the key explicitly where the embeddings object is built.
OpenAIEmbeddings.api_key = OPENAI_API_KEY


def store_embeddings(docs, persist_directory="VectorDB",
                     collection_name="my_collection", model_name="aria"):
    """Embed parsed documents and upsert them into a persistent Chroma collection.

    Args:
        docs: List of dicts with a ``'text'`` key (document body) and a
            ``'path'`` key (stored as the ``source`` metadata field).
        persist_directory: Directory where Chroma keeps its data.
        collection_name: Name of the Chroma collection to write to. Queries
            must open the same collection to see these documents.
        model_name: HuggingFace embedding model name. The same model has to
            be used at query time so the vectors are comparable.

    Returns:
        The ``Chroma`` vector store the documents were written to. When
        ``docs`` is empty the store is returned without writing anything.
    """
    # Build the embedding model once and share it between the store and the
    # explicit embed_documents() call below; constructing it twice would
    # load the model twice for no benefit.
    # An OpenAIEmbeddings() instance could be used here instead.
    embedding_model = HuggingFaceEmbeddings(model_name=model_name)

    vector_store = Chroma(
        collection_name=collection_name,
        embedding_function=embedding_model,
        persist_directory=persist_directory,
    )

    # Nothing to index: skip the embedding pass and the upsert entirely
    # (an upsert with empty id/document lists has nothing to write).
    if not docs:
        return vector_store

    # Document bodies to embed
    texts = [doc['text'] for doc in docs]

    # Keep the originating file path as metadata
    metadatas = [{"source": doc['path']} for doc in docs]

    # IDs are positional, so re-running with the documents in a different
    # order overwrites entries under the same "doc_<i>" keys.
    ids = [f"doc_{i}" for i in range(len(texts))]

    embeddings = embedding_model.embed_documents(texts)

    # Add documents with precomputed embeddings
    vector_store._collection.upsert(
        documents=texts,
        embeddings=embeddings,
        metadatas=metadatas,
        ids=ids,
    )

    return vector_store

In [88]:
# 🔍 Querying with LangChain
def query_db(query, persist_directory="VectorDB",
             collection_name="my_collection", model_name="aria"):
    """Run a similarity search against the persisted Chroma collection.

    Args:
        query: Natural-language query text.
        persist_directory: Directory where Chroma keeps its data.
        collection_name: Collection to search. Defaults to the collection
            that store_embeddings() writes to; without it the store would
            open the wrapper's default collection and miss those documents.
        model_name: HuggingFace embedding model name; must match the model
            used when the documents were indexed.

    Returns:
        The documents returned by ``similarity_search`` for the query.
    """
    # OpenAIEmbeddings() is an alternative, but only if the collection was
    # indexed with the same model.
    embeddings = HuggingFaceEmbeddings(model_name=model_name)

    vectordb = Chroma(
        collection_name=collection_name,
        persist_directory=persist_directory,
        embedding_function=embeddings,
    )
    return vectordb.similarity_search(query)

# Main Program

In [89]:
# Parse documents from a folder
docs = parse_documents("Docs")
# Store them in vector DB
store_embeddings(docs)
# Query the assistant (the query text is embedded as-is, so spelling matters)
results = query_db("Maintenance in general")
for r in results:
    # Show the source file of each hit and a 200-character preview of its text.
    print(r.metadata["source"], "\n", r.page_content[:200], "...\n")

No sentence-transformers model found with name sentence-transformers/aria. Creating a new one with mean pooling.


SSLError: (MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /sentence-transformers/aria/resolve/main/tf_model.h5 (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self signed certificate in certificate chain (_ssl.c:992)')))"), '(Request ID: ea7734a5-e5fc-4802-9a93-931c0fc77db7)')