# Overview

This notebook builds a Chroma vector database from the Mantid source code (C++ and Python) and its generated HTML documentation, for use when working on the Mantid project.

In [1]:
import os
from pprint import pprint

from langchain.vectorstores import Chroma
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers import LanguageParser, BS4HTMLParser
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter

In [2]:
# Root of the local Mantid checkout. Set the MANTID_ROOT environment variable to
# point the notebook at a different clone; the fallback is the checkout this
# notebook was originally written against, so existing runs are unchanged.
mantid_root = os.environ.get("MANTID_ROOT", "/Users/8cz/Github/mantid")

# Source tree scanned by the C++ and Python loaders.
mantid_dir = os.path.join(mantid_root, "Framework")
# HTML documentation output (requires the docs to have been built in `build/`).
mantiddoc_dir = os.path.join(mantid_root, "build", "docs")
# HTML developer-documentation output, also produced under `build/`.
mantiddevdoc_dir = os.path.join(mantid_root, "build", "dev-docs")

In [3]:
# Both source-code loaders walk the whole tree under `mantid_dir` and report
# progress; they differ only in the file suffixes they accept and the language
# handed to LanguageParser.
_source_walk_opts = {"glob": "**/*", "show_progress": True}

# C/C++ sources and headers.
_cxx_suffixes = [".cpp", ".h", ".c", ".hpp", ".cxx", ".hxx", ".cc", ".hh"]
loader_cxx = GenericLoader.from_filesystem(
    mantid_dir,
    suffixes=_cxx_suffixes,
    parser=LanguageParser(Language.CPP),
    **_source_walk_opts,
)

# Python sources.
loader_py = GenericLoader.from_filesystem(
    mantid_dir,
    suffixes=[".py"],
    parser=LanguageParser(Language.PYTHON),
    **_source_walk_opts,
)

In [4]:
# Loader for every `.html` page found anywhere under `mantiddoc_dir`; each page
# is handed to BS4HTMLParser.
_doc_html_parser = BS4HTMLParser()
loader_doc = GenericLoader.from_filesystem(
    mantiddoc_dir,
    parser=_doc_html_parser,
    suffixes=[".html"],
    glob="**/*",
    show_progress=True,
)

In [5]:
# Same setup as the documentation loader, pointed at `mantiddevdoc_dir`:
# recurse through the tree, keep only `.html` files, parse with BS4HTMLParser.
_devdoc_html_parser = BS4HTMLParser()
loader_devdoc = GenericLoader.from_filesystem(
    mantiddevdoc_dir,
    parser=_devdoc_html_parser,
    suffixes=[".html"],
    glob="**/*",
    show_progress=True,
)

In [6]:
# Run the C/C++ loader and keep the resulting documents in `docs_cxx`.
# This walks the whole source tree, so it is one of the slower cells.
docs_cxx = loader_cxx.load()

  0%|          | 0/5477 [00:00<?, ?it/s]



In [7]:
# Run the Python loader and keep the resulting documents in `docs_py`.
docs_py = loader_py.load()

  0%|          | 0/967 [00:00<?, ?it/s]

In [8]:
# Run the HTML documentation loader and keep the resulting documents in `docs_doc`.
docs_doc = loader_doc.load()

  0%|          | 0/3246 [00:00<?, ?it/s]

In [9]:
# Run the developer-documentation loader and keep the resulting documents in `docs_devdoc`.
docs_devdoc = loader_devdoc.load()

  0%|          | 0/160 [00:00<?, ?it/s]

In [10]:
# Splitter for the C++ documents: separators come from the CPP language preset,
# with chunk_size=4096 and chunk_overlap=128.
splitter_cxx = RecursiveCharacterTextSplitter.from_language(
    Language.CPP, chunk_overlap=128, chunk_size=4096
)

In [11]:
# Splitter for the Python documents: separators come from the PYTHON language
# preset, with chunk_size=4096 and chunk_overlap=128.
splitter_py = RecursiveCharacterTextSplitter.from_language(
    Language.PYTHON, chunk_overlap=128, chunk_size=4096
)

In [12]:
# Splitter for the documentation pages.
#
# The pages are loaded through BS4HTMLParser, which extracts the page text, so
# the documents contain no markup. The Language.HTML preset splits on tag
# openers ("<body", "<div", "<p", ...) that never occur in that text; the
# splitter would then fall through to its last-resort "" separator and cut
# chunks at arbitrary character positions, often mid-word. The default
# separators (paragraph break, newline, space) suit extracted prose, so use the
# plain constructor with the same chunk_size/chunk_overlap as before.
splitter_doc = RecursiveCharacterTextSplitter(
    chunk_size=4096,
    chunk_overlap=128,
)

In [13]:
# Splitter for the developer-documentation pages.
#
# These pages are loaded through BS4HTMLParser, which extracts the page text,
# so the documents contain no markup. The Language.HTML preset splits on tag
# openers ("<body", "<div", "<p", ...) that never occur in that text; the
# splitter would then fall through to its last-resort "" separator and cut
# chunks at arbitrary character positions, often mid-word. The default
# separators (paragraph break, newline, space) suit extracted prose, so use the
# plain constructor with the same chunk_size/chunk_overlap as before.
splitter_devdoc = RecursiveCharacterTextSplitter(
    chunk_size=4096,
    chunk_overlap=128,
)

In [14]:
# Chunk each corpus with its own splitter. The pairs are processed in the order
# C++, Python, docs, dev-docs, and the results are unpacked in that same order.
splits_cxx, splits_py, splits_doc, splits_devdoc = (
    corpus_splitter.split_documents(corpus_docs)
    for corpus_splitter, corpus_docs in (
        (splitter_cxx, docs_cxx),
        (splitter_py, docs_py),
        (splitter_doc, docs_doc),
        (splitter_devdoc, docs_devdoc),
    )
)

In [15]:
# Sanity check: show the first five chunks of every corpus under its heading.
for heading, chunks in (
    ("C++ splits:", splits_cxx),
    ("Python splits:", splits_py),
    ("HTML doc splits:", splits_doc),
    ("HTML devdoc splits:", splits_devdoc),
):
    print(heading)
    pprint(chunks[:5])

C++ splits:
[Document(page_content='// Mantid Repository : https://github.com/mantidproject/mantid\n//\n// Copyright &copy; 2011 ISIS Rutherford Appleton Laboratory UKRI,\n//   NScD Oak Ridge National Laboratory, European Spallation Source,\n//   Institut Laue - Langevin & CSNS, Institute of High Energy Physics, CAS\n// SPDX - License - Identifier: GPL - 3.0 +\n#pragma once\n\n//----------------------------------------------------------------------\n// Includes\n//----------------------------------------------------------------------\n#include "MantidAPI/Algorithm.h"\n\nnamespace Mantid {\nnamespace WorkflowAlgorithms {\n/**\n    Calculate the detector sensitivity and patch the pixels that are masked in a\n   second workspace.\n*/\nclass DLLExport EQSANSPatchSensitivity final : public API::Algorithm {\npublic:\n  /// Algorithm\'s name\n  const std::string name() const override { return "EQSANSPatchSensitivity"; }\n  /// Summary of algorithms purpose\n  const std::string summary() const

Time to build the database.

In [16]:
# Embedding function passed to Chroma below; it requests the `nomic-embed-text` model.
# NOTE(review): presumably needs a reachable Ollama server with this model
# already pulled — confirm before running the next cell.
ollama_emd = OllamaEmbeddings(model="nomic-embed-text")

In [17]:
# Build the vector database and persist it under ../vectorDB/mantid.
#
# Index the chunked `splits_*` lists, not the raw `docs_*` lists: the raw
# documents are whole files/pages, so embedding them directly bypasses the
# chunk_size/chunk_overlap configured on the splitters and leaves the splitting
# step above without any effect on the database.
chroma_db = Chroma.from_documents(
    documents=splits_cxx + splits_py + splits_doc + splits_devdoc,
    embedding=ollama_emd,
    persist_directory="../vectorDB/mantid",
)