# AI Document Search Assistant
## Features
- Parses documents from `.txt`, `.pdf`, and `.docx`
- Converts them into embeddings
- Stores them in ChromaDB
- Queries with LangChain integration

In [97]:
import pkg_resources

# 📦 Install necessary packages
#!pip install -q langchain chromadb python-docx PyMuPDF
#!pip install -U langchain-community
#!pip install langchain-openai
#!pip install sentence-transformers
#!pip install ipywidgets tf-keras
#!pip install chromadb langchain-chroma
#!pip install langchain-huggingface


installed_packages = {pkg.key: pkg.version for pkg in pkg_resources.working_set}

packages_to_check = [
    "langchain",
    "chromadb",
    "python-docx",
    "PyMuPDF",
    "langchain-community",
    "langchain-openai",
    "sentence-transformers",
    "ipywidgets",
    "tf-keras",
    "langchain-chroma",
    "langchain-huggingface"
]

for package in packages_to_check:
    version = installed_packages.get(package, "Not Installed")
    print(f"{package}: {version}")

langchain: 0.3.24
chromadb: 0.6.3
python-docx: 1.1.2
PyMuPDF: Not Installed
langchain-community: 0.3.22
langchain-openai: 0.3.14
sentence-transformers: 4.1.0
ipywidgets: 8.1.6
tf-keras: 2.19.0
langchain-chroma: 0.2.3
langchain-huggingface: 0.1.2


In [92]:
from langchain_openai import OpenAIEmbeddings

try:
        embeddings = OpenAIEmbeddings( model="text-embedding-3-large")
        result = embeddings.embed_query("Test query")
        print(result)
except Exception as e:
        print(f"Error: {e}")

Error: Connection error.


In [93]:
# 📁 Document Parsing
import os
import fitz  # PyMuPDF
import docx

def read_txt(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        return f.read()

def read_pdf(file_path):
    doc = fitz.open(file_path)
    return "\n".join([page.get_text() for page in doc])

def read_docx(file_path):
    doc = docx.Document(file_path)
    return "\n".join([para.text for para in doc.paragraphs])

def parse_documents(folder):
    docs = []
    for root, _, files in os.walk(folder):
        for file in files:
            path = os.path.join(root, file)
            ext = file.lower().split('.')[-1]
            try:
                if ext == 'txt':
                    text = read_txt(path)
                elif ext == 'pdf':
                    text = read_pdf(path)
                elif ext == 'docx':
                    text = read_docx(path)
                else:
                    continue
                docs.append({'text': text, 'path': path})
            except Exception as e:
                print(f"Failed to read {file}: {e}")
    return docs




In [94]:
# 🧠 Embedding Generation and Storage with ChromaDB
from langchain_community.vectorstores import Chroma

from langchain_openai import OpenAIEmbeddings
from sentence_transformers import SentenceTransformer
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.documents import Document


# Load environment variables
import os
from dotenv import load_dotenv

# Load the .env file
# Set the API key globally
load_dotenv()
OPENAI_API_KEY=os.getenv("OPENAI_API_KEY")
OpenAIEmbeddings.api_key = OPENAI_API_KEY


def store_embeddings(docs, persist_directory="VectorDB"):

    vector_store = Chroma(
    collection_name="my_collection",
    embedding_function=SentenceTransformer("all-MiniLM-L6-v2"),
    persist_directory=persist_directory,
    )

    #create an list of texts 
    texts = [doc['text'] for doc in docs]
    
    #add path to the metadata
    metadatas = [{"source": doc['path']} for doc in docs]

    # Generate unique IDs for each document
    ids = [f"doc_{i}" for i in range(len(texts))]

    # can create the embeddings using openAI or Chromedb hugginFace
    #embeddings = OpenAIEmbeddings().embed_documents(texts)
    embeddings = SentenceTransformer("all-MiniLM-L6-v2").encode(texts)

    print(type(embeddings))
    
    # Add documents with precomputed embeddings
    vector_store._collection.upsert(
        documents=texts,
        embeddings=embeddings,
        metadatas=metadatas,
        ids=ids,
    )
    
    return vector_store

In [95]:
# 🔍 Querying with LangChain
def query_db(query, db):

    results = db.similarity_search(query)
    return results

# Main Program

In [96]:
# Parse documents from a folder
docs = parse_documents("Docs")
# Store them in vector DB
db = store_embeddings(docs)
# Query the assistant
results = query_db("Maitenance in general", db)
for r in results:
    print(r.metadata["source"], "\n", r.page_content[:200], "...\n")

No sentence-transformers model found with name sentence-transformers/all-MiniLM-L6-v2. Creating a new one with mean pooling.


ValueError: Unrecognized model in sentence-transformers/all-MiniLM-L6-v2. Should have a `model_type` key in its config.json, or contain one of the following strings in its name: albert, align, altclip, aria, aria_text, audio-spectrogram-transformer, autoformer, aya_vision, bamba, bark, bart, beit, bert, bert-generation, big_bird, bigbird_pegasus, biogpt, bit, blenderbot, blenderbot-small, blip, blip-2, bloom, bridgetower, bros, camembert, canine, chameleon, chinese_clip, chinese_clip_vision_model, clap, clip, clip_text_model, clip_vision_model, clipseg, clvp, code_llama, codegen, cohere, cohere2, colpali, conditional_detr, convbert, convnext, convnextv2, cpmant, ctrl, cvt, dab-detr, dac, data2vec-audio, data2vec-text, data2vec-vision, dbrx, deberta, deberta-v2, decision_transformer, deepseek_v3, deformable_detr, deit, depth_anything, depth_pro, deta, detr, diffllama, dinat, dinov2, dinov2_with_registers, distilbert, donut-swin, dpr, dpt, efficientformer, efficientnet, electra, emu3, encodec, encoder-decoder, ernie, ernie_m, esm, falcon, falcon_mamba, fastspeech2_conformer, flaubert, flava, fnet, focalnet, fsmt, funnel, fuyu, gemma, gemma2, gemma3, gemma3_text, git, glm, glm4, glpn, got_ocr2, gpt-sw3, gpt2, gpt_bigcode, gpt_neo, gpt_neox, gpt_neox_japanese, gptj, gptsan-japanese, granite, granitemoe, granitemoeshared, granitevision, graphormer, grounding-dino, groupvit, helium, hiera, hubert, ibert, idefics, idefics2, idefics3, idefics3_vision, ijepa, imagegpt, informer, instructblip, instructblipvideo, jamba, jetmoe, jukebox, kosmos-2, layoutlm, layoutlmv2, layoutlmv3, led, levit, lilt, llama, llama4, llama4_text, llava, llava_next, llava_next_video, llava_onevision, longformer, longt5, luke, lxmert, m2m_100, mamba, mamba2, marian, markuplm, mask2former, maskformer, maskformer-swin, mbart, mctct, mega, megatron-bert, mgp-str, mimi, mistral, mistral3, mixtral, mllama, mobilebert, mobilenet_v1, mobilenet_v2, mobilevit, mobilevitv2, modernbert, moonshine, moshi, mpnet, mpt, mra, mt5, musicgen, musicgen_melody, mvp, nat, nemotron, nezha, nllb-moe, nougat, nystromformer, olmo, olmo2, olmoe, omdet-turbo, oneformer, open-llama, openai-gpt, opt, owlv2, owlvit, paligemma, patchtsmixer, patchtst, pegasus, pegasus_x, perceiver, persimmon, phi, phi3, phi4_multimodal, phimoe, pix2struct, pixtral, plbart, poolformer, pop2piano, prompt_depth_anything, prophetnet, pvt, pvt_v2, qdqbert, qwen2, qwen2_5_vl, qwen2_audio, qwen2_audio_encoder, qwen2_moe, qwen2_vl, qwen3, qwen3_moe, rag, realm, recurrent_gemma, reformer, regnet, rembert, resnet, retribert, roberta, roberta-prelayernorm, roc_bert, roformer, rt_detr, rt_detr_resnet, rt_detr_v2, rwkv, sam, sam_vision_model, seamless_m4t, seamless_m4t_v2, segformer, seggpt, sew, sew-d, shieldgemma2, siglip, siglip2, siglip_vision_model, smolvlm, smolvlm_vision, speech-encoder-decoder, speech_to_text, speech_to_text_2, speecht5, splinter, squeezebert, stablelm, starcoder2, superglue, superpoint, swiftformer, swin, swin2sr, swinv2, switch_transformers, t5, table-transformer, tapas, textnet, time_series_transformer, timesformer, timm_backbone, timm_wrapper, trajectory_transformer, transfo-xl, trocr, tvlt, tvp, udop, umt5, unispeech, unispeech-sat, univnet, upernet, van, video_llava, videomae, vilt, vipllava, vision-encoder-decoder, vision-text-dual-encoder, visual_bert, vit, vit_hybrid, vit_mae, vit_msn, vitdet, vitmatte, vitpose, vitpose_backbone, vits, vivit, wav2vec2, wav2vec2-bert, wav2vec2-conformer, wavlm, whisper, xclip, xglm, xlm, xlm-prophetnet, xlm-roberta, xlm-roberta-xl, xlnet, xmod, yolos, yoso, zamba, zamba2, zoedepth