# Overview

This notebook builds a Chroma vector database from the Python source code of the iMars3D library, so that the library's code can be searched and retrieved by embedding similarity.

In [1]:
import os
from pprint import pprint

from langchain.vectorstores import Chroma
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers import LanguageParser
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter

In [2]:
# Root of the iMars3D package source tree to index.
# Set the IMARS3D_SRC_DIR environment variable to point at your own checkout;
# the fallback is the original author's local path and will not exist on
# other machines.
imars3d_dir = os.environ.get(
    "IMARS3D_SRC_DIR",
    "/Users/8cz/Github/ornlneutronimaging_org/iMars3D/src/imars3d",
)

In [3]:
# Walk the whole source tree under imars3d_dir (glob "**/*") but keep only
# files whose name ends in ".py"; each file is handed to LangChain's
# LanguageParser configured for Python.
loader_py = GenericLoader.from_filesystem(
    path=imars3d_dir,
    glob="**/*",
    suffixes=[".py"],
    parser=LanguageParser(language=Language.PYTHON),
)

In [4]:
# Run the loader: reads and parses every matched file, returning a list of
# Document objects (text in page_content, origin path in metadata["source"]).
docs_py = loader_py.load()

In [5]:
# Sanity check: show the first five loaded documents to confirm the loader
# picked up the expected source files.
docs_py_head = docs_py[:5]
pprint(docs_py_head)

[Document(page_content='__version__ = "1.1.0.dev37+d202403081945"', metadata={'source': '/Users/8cz/Github/ornlneutronimaging_org/iMars3D/src/imars3d/_version.py', 'content_type': 'simplified_code', 'language': <Language.PYTHON: 'python'>}),
 Document(page_content='"""iMars3D: a Python package for neutron imaging and tomography reconstruction."""\n\nimport logging\nfrom .backend import corrections, diagnostics, dataio, morph, preparation, reconstruction  # noqa: F401\n\nlogging.getLogger("imars3d").setLevel(logging.INFO)\ntry:\n    from ._version import __version__  # noqa: F401\nexcept ImportError:\n    __version__ = "unknown"', metadata={'source': '/Users/8cz/Github/ornlneutronimaging_org/iMars3D/src/imars3d/__init__.py', 'content_type': 'simplified_code', 'language': <Language.PYTHON: 'python'>}),
 Document(page_content='class BaseWindow(pn.viewable.Viewer):\n    """Base class for all viewer."""\n\n    # configuration\n    config_dict = param.Dict(\n        default={\n            "f

In [6]:
# Python-aware splitter: chunks of at most 1024 characters, with 128
# characters of overlap between neighbouring chunks.
splitter_py = RecursiveCharacterTextSplitter.from_language(
    Language.PYTHON, chunk_size=1024, chunk_overlap=128
)

In [7]:
# Break the loaded documents into chunks using the Python-aware splitter
# configured above; these chunks are what gets embedded and stored.
splits_py = splitter_py.split_documents(docs_py)

In [8]:
# Sanity check: show the first five chunks produced by the splitter.
splits_py_head = splits_py[:5]
pprint(splits_py_head)

[Document(page_content='__version__ = "1.1.0.dev37+d202403081945"', metadata={'source': '/Users/8cz/Github/ornlneutronimaging_org/iMars3D/src/imars3d/_version.py', 'content_type': 'simplified_code', 'language': <Language.PYTHON: 'python'>}),
 Document(page_content='"""iMars3D: a Python package for neutron imaging and tomography reconstruction."""\n\nimport logging\nfrom .backend import corrections, diagnostics, dataio, morph, preparation, reconstruction  # noqa: F401\n\nlogging.getLogger("imars3d").setLevel(logging.INFO)\ntry:\n    from ._version import __version__  # noqa: F401\nexcept ImportError:\n    __version__ = "unknown"', metadata={'source': '/Users/8cz/Github/ornlneutronimaging_org/iMars3D/src/imars3d/__init__.py', 'content_type': 'simplified_code', 'language': <Language.PYTHON: 'python'>}),
 Document(page_content='class BaseWindow(pn.viewable.Viewer):\n    """Base class for all viewer."""\n\n    # configuration\n    config_dict = param.Dict(\n        default={\n            "f

In [9]:
# Embedding function backed by a local Ollama server.
# The model is named explicitly because any code that later queries the
# persisted store must embed with the same model.
# NOTE(review): "llama2" is the documented default of OllamaEmbeddings, so this
# should match what the bare constructor used — confirm against the installed
# langchain_community version before rebuilding the store.
ollama_emd = OllamaEmbeddings(model="llama2")

In [10]:
# Build (or refresh) the persisted Chroma vector store.
#
# Every chunk gets a deterministic id made of its source file path and its
# position in splits_py. Without explicit ids a fresh random id is generated
# per chunk, so re-running this cell against the same persist_directory would
# store every chunk again and fill the store with duplicates.
# NOTE(review): this relies on the LangChain Chroma wrapper upserting by id —
# confirm for the installed version. A store built earlier without explicit
# ids should be deleted before re-running, since its old random-id entries
# will not be overwritten.
split_ids = [
    f"{doc.metadata.get('source', 'unknown')}::{idx}"
    for idx, doc in enumerate(splits_py)
]
chroma_db = Chroma.from_documents(
    documents=splits_py,
    embedding=ollama_emd,
    ids=split_ids,
    persist_directory="../vectorDB/imars3d",
)