# Overview

This notebook builds a persistent Chroma vector database from the iMars3D Python source code and its rendered HTML documentation, for use when working with the iMars3D library.

In [1]:
import os

from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers import LanguageParser, BS4HTMLParser
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings
from langchain.vectorstores import Chroma

In [2]:
# Root of the local iMars3D checkout. Set the IMARS3D_REPO environment
# variable to point at your own clone; the fallback is the original author's
# location, so existing setups keep working unchanged.
imars3d_repo = os.environ.get(
    "IMARS3D_REPO",
    "/Users/8cz/Github/ornlneutronimaging_org/iMars3D",
)

# Python package sources to index.
imars3d_dir = os.path.join(imars3d_repo, "src", "imars3d")
# Rendered HTML documentation to index (the docs/_build/html directory).
imars3ddoc_dir = os.path.join(imars3d_repo, "docs", "_build", "html")

In [3]:
# Loader for the library sources: walks everything under imars3d_dir, keeps
# only files with a .py suffix, and parses them with the Python language parser.
loader_py = GenericLoader.from_filesystem(
    path=imars3d_dir,
    glob="**/*",
    suffixes=[".py"],
    show_progress=True,
    parser=LanguageParser(language=Language.PYTHON),
)

# Loader for the rendered documentation: walks everything under
# imars3ddoc_dir, keeps only files with a .html suffix, and parses them with
# the BeautifulSoup-based HTML parser.
loader_doc = GenericLoader.from_filesystem(
    path=imars3ddoc_dir,
    glob="**/*",
    suffixes=[".html"],
    show_progress=True,
    parser=BS4HTMLParser(),
)

In [4]:
# Materialise both corpora (sources first, then documentation); each loader
# shows its own progress bar while it runs.
docs_py, docs_doc = (
    corpus_loader.load() for corpus_loader in (loader_py, loader_doc)
)

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

In [5]:
# Splitter for source code: uses the Python-specific separators, with chunks
# of at most 4096 characters and 128 characters of overlap between neighbours.
splitter_py = RecursiveCharacterTextSplitter.from_language(
    Language.PYTHON,
    chunk_overlap=128,
    chunk_size=4096,
)

# Splitter for the documentation pages.
#
# The pages were loaded through BS4HTMLParser, which keeps only the extracted
# text of each page, so no HTML tags are left in the content. The tag-based
# separators of Language.HTML ("<body", "<div", "<p", ...) would therefore not
# be expected to match, and the splitter would fall through to its last-resort
# empty separator and cut chunks at arbitrary character positions. The default
# separators (blank line, newline, space) split plain text on paragraph, line
# and word boundaries instead.
#
# Same limits as the source-code splitter: 4096-character chunks with a
# 128-character overlap.
splitter_doc = RecursiveCharacterTextSplitter(
    chunk_size=4096,
    chunk_overlap=128,
)

In [6]:
# Chunk each corpus with its matching splitter.
splits_py = splitter_py.split_documents(documents=docs_py)
splits_doc = splitter_doc.split_documents(documents=docs_doc)

In [7]:
# Embedding backend: the "nomic-embed-text" model served through Ollama.
# NOTE(review): presumably requires a running Ollama server with this model
# already pulled - confirm before a fresh run.
ollama_emd = OllamaEmbeddings(
    model="nomic-embed-text",
)

In [8]:
# Build (or refresh) the persisted vector store.
#
# Without explicit ids, every run of this cell stores each chunk under a
# freshly generated random id, so re-running the notebook against the same
# persist_directory adds every chunk a second time. Giving each chunk a
# deterministic id of the form "<source>::<n>" (n = position of the chunk
# among the chunks from the same source) lets a re-run overwrite the existing
# entries instead.
# NOTE(review): entries written by earlier runs under random ids are not
# removed; delete ../vectorDB/imars3d once to start from a clean store.
all_splits = splits_py + splits_doc

chunk_counts = {}
split_ids = []
for split in all_splits:
    source = str(split.metadata.get("source", "unknown"))
    position = chunk_counts.get(source, 0)
    chunk_counts[source] = position + 1
    split_ids.append(f"{source}::{position}")

chroma_db = Chroma.from_documents(
    documents=all_splits,
    embedding=ollama_emd,
    ids=split_ids,
    persist_directory="../vectorDB/imars3d",
)