In [None]:
import os
import uuid

import chromadb
from chromadb.config import Settings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.confluence import ConfluenceLoader

from modules.indexing import load_docs, split_documents

# Environment

In [None]:
# Confluence connection settings, read from the process environment.
# Each value is None when the corresponding variable is not set.
CONFLUENCE_PAT = os.environ.get("CONFLUENCE_PAT")  # personal access token
CONFLUENCE_SPACE_KEY = os.environ.get("CONFLUENCE_SPACE_KEY")  # space to index
CONFLUENCE_URL = os.environ.get("CONFLUENCE_URL")  # base URL of the wiki

In [None]:
# Indexing parameters (plain constants, not environment variables).
# CHUNK_SIZE and CHUNK_OVERLAP are passed to the text splitter below.
CHUNK_SIZE = 800
CHUNK_OVERLAP = 160
# The Chroma collection name is derived from the chunk size and the Confluence
# space key, so you can experiment with retrieving chunks of different sizes
# without the collections overwriting each other.
# NOTE: if CONFLUENCE_SPACE_KEY is unset (None), the name becomes "800_None".
COLLECTION_NAME = f"{CHUNK_SIZE}_{CONFLUENCE_SPACE_KEY}"

# Loading and splitting documents

In [None]:
# Configure a loader for the Confluence space selected above and load its
# documents. cloud=False targets a self-hosted (non-cloud) Confluence instance,
# authentication uses the personal access token, and attachments are skipped.
loader = ConfluenceLoader(
    url=CONFLUENCE_URL,
    space_key=CONFLUENCE_SPACE_KEY,
    token=CONFLUENCE_PAT,
    cloud=False,
    include_attachments=False,
)
# load_docs is a project helper from modules.indexing; its result is what the
# splitting step consumes as `docs`.
docs = load_docs(loader)

In [None]:
# Build the text splitter from the chunking constants and cut the loaded
# documents into overlapping chunks. add_start_index=True asks the splitter to
# record where each chunk starts within its source document.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    add_start_index=True,
)
# split_documents is a project helper from modules.indexing.
chunks = split_documents(splitter, docs)

In [None]:
# HTTP client for a running ChromaDB server. No host or port is passed, so the
# client library's defaults apply (expected to point at a local server).
# allow_reset=True opts this client into the destructive reset operation;
# drop it if the database must never be wiped from this notebook.
chroma_settings = Settings(allow_reset=True)
chroma_client = chromadb.HttpClient(settings=chroma_settings)

# Generate and save embeddings in ChromaDB

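Choose one of the options below for creating embeddings and run only that cell. Each option assigns `embeddings_function`, so if both cells are executed (e.g. with "Run All"), the one that runs last is used.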

## AWS Bedrock

In [None]:
# Imported here rather than at the top because this cell is an optional
# alternative; the Bedrock integration is only needed when it is chosen.
from langchain_community.embeddings.bedrock import BedrockEmbeddings

# Embedding function backed by AWS Bedrock. Profile, region and model can be
# overridden through environment variables.
# The default model must be an *embedding* model: "amazon.titan-text-express-v1"
# is a text-generation model and does not return embedding vectors, so the
# default is the Titan text embeddings model instead.
embeddings_function = BedrockEmbeddings(
    credentials_profile_name=os.getenv("AWS_CREDENTIALS_PROFILE_NAME"),
    region_name=os.getenv("AWS_REGION_NAME", "eu-central-1"),
    model_id=os.getenv("AWS_EMBEDDING_MODEL_ID", "amazon.titan-embed-text-v1"),
)

## Ollama embeddings

- [Blog post about embedding models (by ollama)](https://ollama.com/blog/embedding-models)
- [Ollama embedding model (langchain docs)](https://python.langchain.com/docs/integrations/text_embedding/ollama/)

In [None]:
# Imported here rather than at the top because this cell is an optional
# alternative to the AWS Bedrock cell.
from langchain_community.embeddings.ollama import OllamaEmbeddings

# Model name comes from the environment, falling back to "mxbai-embed-large".
OLLAMA_EMBEDDING_MODEL = os.environ.get(
    "OLLAMA_EMBEDDING_MODEL", "mxbai-embed-large"
)
# Rebinds `embeddings_function`; if the Bedrock cell was also run, this
# assignment replaces it.
embeddings_function = OllamaEmbeddings(model=OLLAMA_EMBEDDING_MODEL)

In [None]:
# Number of chunks embedded and written to Chroma per request. Batching avoids
# one embedding call plus one HTTP round trip per chunk.
EMBEDDING_BATCH_SIZE = 64

collection = chroma_client.get_or_create_collection(name=COLLECTION_NAME)

# Only populate an empty collection so re-running the cell does not insert
# duplicates. NOTE: a partially filled collection (e.g. after an interrupted
# run) is skipped as well; delete the collection to re-index from scratch.
if collection.count() == 0:
    # Materialize once so slicing works for any iterable of chunks.
    chunk_list = list(chunks)
    for batch_start in range(0, len(chunk_list), EMBEDDING_BATCH_SIZE):
        batch = chunk_list[batch_start : batch_start + EMBEDDING_BATCH_SIZE]
        texts = [chunk.page_content for chunk in batch]
        collection.add(
            # Random UUIDs: unlike uuid1, they do not embed the host's MAC
            # address or a timestamp in the stored IDs.
            ids=[str(uuid.uuid4()) for _ in batch],
            # Stored texts are embedded with the document-side API;
            # embed_query is meant for search queries at retrieval time.
            embeddings=embeddings_function.embed_documents(texts),
            documents=texts,
            metadatas=[chunk.metadata for chunk in batch],
        )