### Reset DB

Deletes the `VectorDB` folder if it exists, then recreates it empty.

In [27]:
import shutil
import os

db_path = "VectorDB"

# Remove any previous store, then always end with a fresh, empty folder.
# NOTE(review): the "attempt to write a readonly database" error shown further
# down in this notebook may come from deleting this folder while a Chroma client
# opened earlier in the same kernel is still alive — confirm; restarting the
# kernel before re-ingesting is the safe path.
already_present = os.path.exists(db_path)
if already_present:
    shutil.rmtree(db_path)
    print("✅ VectorDB reset.")
else:
    print("ℹ️ No VectorDB directory found.")

# Both outcomes finish the same way: create the directory and report it.
os.mkdir(db_path)
print("✅ VectorDB folder created.")

ℹ️ No VectorDB directory found.
✅ VectorDB folder created.


In [38]:
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from sentence_transformers import SentenceTransformer
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document
# Load environment variables
import os
from dotenv import load_dotenv
import os
import fitz  # PyMuPDF
import docx

def read_txt(file_path):
    """Return the entire contents of a UTF-8 encoded text file as one string."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents

def read_pdf(file_path):
    """Extract the text of every page of a PDF and join the pages with newlines.

    Args:
        file_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        str: The ``get_text()`` output of each page, in page order, separated
        by a newline character.
    """
    pdf = fitz.open(file_path)
    try:
        return "\n".join(page.get_text() for page in pdf)
    finally:
        # Always release the underlying file, even if extraction fails midway;
        # the previous version never closed the document.
        pdf.close()

def read_docx(file_path):
    """Return the text of a .docx file with its paragraphs joined by newlines."""
    document = docx.Document(file_path)
    lines = []
    for paragraph in document.paragraphs:
        lines.append(paragraph.text)
    return "\n".join(lines)


def load_documents(root_dir="Docs"):
    """Recursively read every supported file (.txt, .pdf, .docx) under ``root_dir``.

    Args:
        root_dir: Directory to walk. A missing directory yields an empty list
            because ``os.walk`` produces nothing for it.

    Returns:
        list[dict]: One ``{'text': ..., 'path': ...}`` entry per file that was
        read successfully and contained non-whitespace text, ordered by sorted
        directory and file name.

    Files with other extensions are ignored. A file that raises while being
    read is reported on stdout and skipped, so one bad file does not abort
    the whole load.
    """
    loaded = []
    for root, dirs, files in os.walk(root_dir):
        # Sorting in place makes os.walk descend in a stable order; together
        # with sorted(files) this keeps positional document ids reproducible.
        dirs.sort()
        for file in sorted(files):
            path = os.path.join(root, file)
            # splitext only reports a real extension, so an extension-less file
            # literally named "txt" or "pdf" is no longer mistaken for that type.
            ext = os.path.splitext(file)[1].lower()
            try:
                if ext == '.txt':
                    text = read_txt(path)
                elif ext == '.pdf':
                    text = read_pdf(path)
                elif ext == '.docx':
                    text = read_docx(path)
                else:
                    continue
            except Exception as e:
                # Deliberate best-effort: report and keep going.
                print(f"Failed to read {file}: {e}")
                continue
            if not text.strip():
                # Nothing to embed (e.g. an empty file or a PDF with no text layer).
                print(f"Skipped {file}: no extractable text")
                continue
            loaded.append({'text': text, 'path': path})
    return loaded


docs = load_documents("Docs")

# Choose the embedding function used for both indexing and querying.
embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Persist into "VectorDB" — the folder the reset cell recreates and that the
# later cells in this notebook open and inspect. The previous value
# ("VectorDataBase") meant the reset cell never touched the store written here.
# NOTE(review): if "VectorDB" was deleted while an older Chroma client is still
# alive in this kernel, restart the kernel before running this cell — confirm.
vector_store = Chroma(
    collection_name="my_collection",
    embedding_function=embedding_function,
    persist_directory="VectorDB",
)

# Build three parallel lists, one entry per loaded document.
texts = [doc['text'] for doc in docs]
# Keep the originating file path in the metadata so search hits can be traced back.
metadatas = [{"source": doc['path']} for doc in docs]
# Positional ids: "doc_0", "doc_1", ... in the order of `docs`.
ids = [f"doc_{i}" for i in range(len(texts))]


print(f"Loaded {len(docs)} documents")  # 👈 confirm docs

# Add texts to the vector store with metadata and IDs.
# Guarded because an empty batch (no readable files were found) is expected to
# be rejected by the store instead of being treated as a no-op.
if texts:
    vector_store.add_texts(texts=texts, metadatas=metadatas, ids=ids)
else:
    print("No documents to index; skipping add_texts.")

# `_collection` is a private attribute of the wrapper; used here only to report the size.
print(f"Collection count: {vector_store._collection.count()}")


# Ask the store for the 3 most similar documents and show a short preview of each.
results = vector_store.similarity_search("maintenance procedures", k=3)
print(f"Found {len(results)} results")

for hit in results:
    source = hit.metadata["source"]
    preview = hit.page_content[:200]  # first 200 characters only
    print(source, "\n", preview, "...\n")



#results = vectordb.similarity_search("Maitenance in general")
#for r in results:
#    print(r.metadata["source"], "\n", r.page_content[:200], "...\n")


Loaded 5 documents
Collection count: 5
Found 3 results
Docs/maintenance_guide.txt 
 This document provides detailed maintenance procedures for industrial machines.
Always perform regular maintenance checks every 30 days.
Replace any damaged parts immediately and report issues to the  ...

Docs/equipment_checks.txt 
 All equipment must undergo a pre-use check to ensure safety and reliability.
Checklist items include: power status, loose parts, and noise levels during operation.
Maintenance logs must be updated aft ...

Docs/safety_protocols.txt 
 Operators must follow strict safety protocols during machine operation and maintenance.
Wear safety gloves, goggles, and secure all clothing.
Emergency shutdown procedures are listed in section 4 of t ...



# Test

In [35]:
from langchain_huggingface import HuggingFaceEmbeddings

embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

vector_store = Chroma(
    collection_name="my_collection",
    embedding_function=embedding_function,
    persist_directory="VectorDB"
)

# `docs` comes from the ingestion cell; that cell must have run first.
texts = [doc['text'] for doc in docs]
metadatas = [{"source": doc['path']} for doc in docs]
# Explicit positional ids so that re-running this cell does not append a
# second copy of every document under freshly generated ids.
ids = [f"doc_{i}" for i in range(len(texts))]

# No need to precompute embeddings — let Chroma do it
if texts:
    vector_store.add_texts(texts=texts, metadatas=metadatas, ids=ids)

# Re-initialize for search. The collection name must be repeated here:
# without it the wrapper opens its default collection, not "my_collection",
# and the search would not see the documents added above.
vectordb = Chroma(
    collection_name="my_collection",
    persist_directory="VectorDB",
    embedding_function=embedding_function,
)

# NOTE(review): the query text is spelled "Maitenance" — confirm whether intentional.
results = vectordb.similarity_search("Maitenance in general")
for r in results:
    print(r.metadata["source"], "\n", r.page_content[:200], "...\n")


OperationalError: attempt to write a readonly database

In [34]:
import os

db_file = "VectorDB/chroma.sqlite3"

# Report whether the SQLite file exists and, if so, whether this process may write to it.
if not os.path.exists(db_file):
    print("❌ DB file not found.")
else:
    print("✔ DB file found.")
    writable_label = 'Yes' if os.access(db_file, os.W_OK) else '❌ No'
    print(f"Writable? {writable_label}")


❌ DB file not found.
