# In [None]:
import logging
from pathlib import Path
from tqdm import tqdm
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_core.text_splitter import RecursiveCharacterTextSplitter

# Module-level logger named after this module, followed by a request for
# INFO-level output from the root logger.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)


def initialize_literature_db():
    """
    Initialize the Chroma vector store using LangChain.

    Behaviour, in order:

    1. If the persist directory exists and ``self.force_rebuild`` is falsy
       (or absent), open the existing store and assign it to
       ``self.vector_store``.
    2. If the persist directory exists and a rebuild is forced, delete the
       directory tree first.
    3. Otherwise load every path in ``self.literature_paths`` through
       ``self._load_literature_file``, split the loaded documents into
       chunks, embed them and build a new persisted store, which is assigned
       to ``self.vector_store``.

    If no literature content could be loaded, an error is logged and the
    function returns without assigning ``self.vector_store``.

    Returns:
        None. The result is stored on ``self.vector_store``.

    NOTE(review): this function reads and writes ``self`` but does not accept
    it as a parameter. Presumably it was lifted out of a class or relies on a
    notebook-global ``self`` -- confirm, and add ``self`` to the signature if
    it is meant to be bound as a method.

    NOTE(review): ``RecursiveCharacterTextSplitter`` is imported at file level
    from ``langchain_core.text_splitter``; verify that import path against the
    installed LangChain version.
    """
    logger.info("Initializing LangChain Chroma vector store...")

    persist_directory = self.config.chroma_persist_directory
    embedding_function = SentenceTransformerEmbeddings(model_name=self.config.embedding_model)

    # Evaluate both conditions once; nothing below changes them before use.
    force_rebuild = getattr(self, 'force_rebuild', False)
    store_exists = Path(persist_directory).exists()

    # Reuse the database on disk unless a rebuild was explicitly requested.
    if store_exists and not force_rebuild:
        logger.info("Loading existing vector store from: %s", persist_directory)
        self.vector_store = Chroma(
            persist_directory=persist_directory,
            embedding_function=embedding_function
        )
        logger.info("Vector store loaded successfully.")
        return

    # Reaching this point with an existing directory implies a forced
    # rebuild, so clear the stale data before re-creating the store.
    if store_exists:
        import shutil
        logger.info("Rebuilding database, clearing directory: %s", persist_directory)
        shutil.rmtree(persist_directory)

    logger.info("No existing database found or rebuild forced. Creating new vector store...")

    # 1. Load all literature documents into memory.
    # Imported once here rather than on every loop iteration.
    from langchain_core.documents import Document

    all_docs_content = []
    for lit_path in tqdm(self.literature_paths, desc="Loading literature files"):
        content = self._load_literature_file(lit_path)
        if content:
            # Keep the origin of each document. The path is stringified
            # because Chroma only accepts primitive metadata values, so a
            # pathlib.Path entry would be rejected at insertion time.
            all_docs_content.append(
                Document(page_content=content, metadata={"source": str(lit_path)})
            )

    if not all_docs_content:
        logger.error("No literature content could be loaded. Aborting vector store creation.")
        return

    # 2. Split the documents into chunks, preferring paragraph, then line,
    # then sentence, then word boundaries before falling back to characters.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=self.config.chunk_size,
        chunk_overlap=self.config.chunk_overlap,
        separators=["\n\n", "\n", ". ", " ", ""]
    )
    chunks = text_splitter.split_documents(all_docs_content)
    logger.info("Split %d documents into %d chunks.", len(all_docs_content), len(chunks))

    # 3. Embed the chunks and create the Chroma vector store in the
    # persist directory.
    logger.info("Creating embeddings and persisting the vector store. This may take a while...")
    self.vector_store = Chroma.from_documents(
        documents=chunks,
        embedding=embedding_function,
        persist_directory=persist_directory
    )
    logger.info("Successfully created and persisted the vector store.")