# Overview

This notebook builds a Chroma vector database from the Mantid source code (the C++ and Python files under `Framework`) for use when working on the Mantid project.

In [1]:
import os
from pprint import pprint

from langchain.vectorstores import Chroma
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers import LanguageParser
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter

In [2]:
# Root of the local Mantid checkout. Set the MANTID_ROOT environment variable
# to point at a different checkout; the default keeps the original location so
# existing runs are unaffected.
_mantid_root = os.environ.get("MANTID_ROOT", "/Users/8cz/Github/mantid")

# Framework sources (C++ and Python) that the loaders below walk recursively.
mantid_dir = os.path.join(_mantid_root, "Framework")
# Documentation sources of the same checkout (not loaded by the cells shown here).
mantiddoc_dir = os.path.join(_mantid_root, "docs", "source")

In [3]:
def _source_loader(suffixes, language):
    """Build a GenericLoader rooted at ``mantid_dir`` for one source language.

    The glob matches everything below the directory; ``suffixes`` is passed
    through to the loader and ``language`` selects the ``LanguageParser``.
    """
    return GenericLoader.from_filesystem(
        mantid_dir,
        glob="**/*",
        suffixes=suffixes,
        parser=LanguageParser(language),
    )


# C and C++ sources and headers all go through the C++ parser.
loader_cxx = _source_loader(
    [".cpp", ".h", ".c", ".hpp", ".cxx", ".hxx", ".cc", ".hh"],
    Language.CPP,
)

# Python sources.
loader_py = _source_loader([".py"], Language.PYTHON)

In [4]:
# Run the C++ loader and keep the resulting documents for the splitting step.
docs_cxx = loader_cxx.load()

# Sanity check: show the first five parsed documents.
print("C++ documents:")
cxx_preview = docs_cxx[0:5]
pprint(cxx_preview)



C++ documents:
[Document(page_content='// Mantid Repository : https://github.com/mantidproject/mantid\n//\n// Copyright &copy; 2011 ISIS Rutherford Appleton Laboratory UKRI,\n//   NScD Oak Ridge National Laboratory, European Spallation Source,\n//   Institut Laue - Langevin & CSNS, Institute of High Energy Physics, CAS\n// SPDX - License - Identifier: GPL - 3.0 +\n#pragma once\n\n//----------------------------------------------------------------------\n// Includes\n//----------------------------------------------------------------------\n#include "MantidAPI/Algorithm.h"\n\nnamespace Mantid {\nnamespace WorkflowAlgorithms {\n/**\n    Calculate the detector sensitivity and patch the pixels that are masked in a\n   second workspace.\n*/\nclass DLLExport EQSANSPatchSensitivity final : public API::Algorithm {\npublic:\n  /// Algorithm\'s name\n  const std::string name() const override { return "EQSANSPatchSensitivity"; }\n  /// Summary of algorithms purpose\n  const std::string summary() co

In [5]:
# Run the Python loader and keep the resulting documents for the splitting step.
docs_py = loader_py.load()

# Sanity check: show the first five parsed documents.
print("Python documents:")
py_preview = docs_py[0:5]
pprint(py_preview)

Python documents:
[Document(page_content='class EQSANSQ2DTest(unittest.TestCase):\n    def setUp(self):\n\n        self.test_ws_name = "EQSANS_test_ws"\n        x = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0]\n        y = 491520 * [0.1]\n        CreateWorkspace(OutputWorkspace=self.test_ws_name, DataX=x, DataY=y, DataE=y, NSpec="49152", UnitX="Wavelength")\n        LoadInstrument(Workspace=self.test_ws_name, InstrumentName="EQSANS", RewriteSpectraMap=True)\n\n        run = mtd[self.test_ws_name].mutableRun()\n\n        run.addProperty("sample_detector_distance", 4000.0, "mm", True)\n        run.addProperty("beam_center_x", 96.0, "pixel", True)\n        run.addProperty("beam_center_y", 128.0, "pixel", True)\n        run.addProperty("wavelength_min", 1.0, "Angstrom", True)\n        run.addProperty("wavelength_max", 11.0, "Angstrom", True)\n        run.addProperty("is_frame_skipping", 0, True)\n        run.addProperty("wavelength_min_frame2", 5.0, "Angstrom", True)\n        

In [6]:
# Chunking parameters for C++: chunks of up to 1024 with an overlap of 128
# between neighbouring chunks.
_cxx_chunking = {"chunk_size": 1024, "chunk_overlap": 128}

# Splitter configured for the C++ language.
splitter_cxx = RecursiveCharacterTextSplitter.from_language(
    language=Language.CPP, **_cxx_chunking
)

In [7]:
# Chunking parameters for Python: chunks of up to 1024 with an overlap of 128
# between neighbouring chunks.
_py_chunking = {"chunk_size": 1024, "chunk_overlap": 128}

# Splitter configured for the Python language.
splitter_py = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON, **_py_chunking
)

In [8]:
# Break the loaded documents into chunks, each language with its own splitter.
splits_cxx, splits_py = (
    splitter_cxx.split_documents(docs_cxx),
    splitter_py.split_documents(docs_py),
)

In [9]:
# Preview the first five chunks per language, C++ first and then Python.
for heading, chunks in (
    ("C++ splits:", splits_cxx),
    ("Python splits:", splits_py),
):
    print(heading)
    pprint(chunks[:5])

C++ splits:
[Document(page_content='// Mantid Repository : https://github.com/mantidproject/mantid\n//\n// Copyright &copy; 2011 ISIS Rutherford Appleton Laboratory UKRI,\n//   NScD Oak Ridge National Laboratory, European Spallation Source,\n//   Institut Laue - Langevin & CSNS, Institute of High Energy Physics, CAS\n// SPDX - License - Identifier: GPL - 3.0 +\n#pragma once\n\n//----------------------------------------------------------------------\n// Includes\n//----------------------------------------------------------------------\n#include "MantidAPI/Algorithm.h"\n\nnamespace Mantid {\nnamespace WorkflowAlgorithms {\n/**\n    Calculate the detector sensitivity and patch the pixels that are masked in a\n   second workspace.\n*/', metadata={'source': '/Users/8cz/Github/mantid/Framework/WorkflowAlgorithms/inc/MantidWorkflowAlgorithms/EQSANSPatchSensitivity.h'}),
 Document(page_content='class DLLExport EQSANSPatchSensitivity final : public API::Algorithm {\npublic:\n  /// Algorithm\'s 

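## Build the database

Time to build the vector database: create the Ollama embedding function, then embed the documents and persist them in a Chroma store.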

In [10]:
# Embedding function handed to Chroma when the database is built.
# Constructed with no arguments, so the library's default model and server
# settings apply.
# NOTE(review): confirm which model/endpoint those defaults resolve to in the
# installed langchain_community version, and that an Ollama server is running.
ollama_emd = OllamaEmbeddings()

In [11]:
# Build the database from the *chunked* documents.
# The original cell embedded `docs_cxx + docs_py`, which left the chunks
# computed earlier (`splits_cxx`, `splits_py`) unused. Embedding the splits
# applies the configured chunk size and overlap to what gets indexed.
chroma_db = Chroma.from_documents(
    documents=splits_cxx + splits_py,
    embedding=ollama_emd,
    # Relative to the notebook's working directory.
    persist_directory="../vectorDB/mantid",
)