# Overview

Build a vector database from the `bm3dornl` source tree (Python and Markdown files) for demo purposes.

In [1]:
import os

from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers import LanguageParser, BS4HTMLParser
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings
from langchain.vectorstores import Chroma

In [2]:
# Root of the local bm3dornl checkout to index.
# Set the BM3DORNL_DIR environment variable to point at your own clone; when it
# is unset (or empty) the original author's checkout location is used.
bm3d_dir = os.environ.get("BM3DORNL_DIR") or "/Users/8cz/Github/neutronimaging_org/bm3dornl"

In [3]:
# Loader over every `.py` and `.md` file found recursively under `bm3d_dir`,
# with a progress bar shown while loading.
# NOTE(review): the Python language parser is applied to the Markdown files as
# well — confirm that this is intended.
loader_pymd = GenericLoader.from_filesystem(
    bm3d_dir,
    parser=LanguageParser(Language.PYTHON),
    suffixes=[".py", ".md"],
    glob="**/*",
    show_progress=True,
)

In [4]:
# Read and parse every matching file, collecting the results into a list.
documents = list(loader_pymd.lazy_load())

  0%|          | 0/28 [00:00<?, ?it/s]

In [5]:
# Break the loaded documents into chunks of at most 4096 characters, with 128
# characters of overlap between neighbouring chunks, ready for embedding.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=128,
    chunk_size=4096,
)
split_documents = text_splitter.split_documents(documents)

In [6]:
# Embedding client backed by Ollama, using the `nomic-embed-text` model.
# NOTE(review): presumably requires a running Ollama server with this model
# already pulled — confirm before running.
ollama_emd = OllamaEmbeddings(
    model="nomic-embed-text",
)

In [7]:
# Embed the chunks with `ollama_emd` and store them in a Chroma collection that
# is persisted under ../vectorDB/bm3dornl (relative to the working directory).
# NOTE(review): re-running this cell against an existing directory presumably
# appends duplicate entries — verify, and clear the directory first if so.
chroma_db = Chroma.from_documents(
    persist_directory="../vectorDB/bm3dornl",
    embedding=ollama_emd,
    documents=split_documents,
)