In [1]:
import os
import uuid

import chromadb
from chromadb.config import Settings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.confluence import ConfluenceLoader

from modules.indexing import load_docs, split_documents

# Environment

In [2]:
# Read the Confluence connection settings from the process environment.
# Each value is None when the corresponding variable is not set.
CONFLUENCE_PAT = os.environ.get("CONFLUENCE_PAT")
CONFLUENCE_SPACE_KEY = os.environ.get("CONFLUENCE_SPACE_KEY")
CONFLUENCE_URL = os.environ.get("CONFLUENCE_URL")

In [3]:
# Indexing parameters (plain constants, not environment variables).
# Both values are handed to the text splitter as chunk_size / chunk_overlap.
CHUNK_SIZE = 1024
CHUNK_OVERLAP = 32
# The Chroma collection name combines the chunk size with the Confluence
# space key, so collections built with different chunk sizes can coexist
# and retrieval can be compared across them.
COLLECTION_NAME = f"{CHUNK_SIZE}_{CONFLUENCE_SPACE_KEY}"

# Loading and splitting documents

In [4]:
# Build a loader for the configured Confluence space, authenticating with
# the personal access token read from the environment, then fetch the pages
# through the project's load_docs helper.
# NOTE(review): cloud=False presumably targets a self-hosted (non-Cloud)
# Confluence instance - confirm against the ConfluenceLoader docs.
loader = ConfluenceLoader(
    url=CONFLUENCE_URL,
    space_key=CONFLUENCE_SPACE_KEY,
    token=CONFLUENCE_PAT,
    cloud=False,
    include_attachments=False,
)
docs = load_docs(loader)

## Optional: save a local copy of documents

In [5]:
from typing import TypedDict
import textwrap

class Metadata(TypedDict):
    """Page metadata consumed by ``write_page_to_file``."""

    # Page title; becomes the leading part of the output file name.
    title: str
    # Page identifier; embedded in the output file name.
    id: str
    # Declared for completeness; not read by write_page_to_file.
    source: str
    # Timestamp string; the text before the first 'T' is used as the date.
    # NOTE(review): the loader may omit this key for some pages - confirm;
    # write_page_to_file tolerates its absence.
    when: str


def write_page_to_file(
    content: str,
    metadata: Metadata,
    base_directory: str = 'data/docu/ITI-CS',
) -> None:
    """Write one page's text to ``<base_directory>/<title>_<id>_<date>.txt``.

    The content is re-wrapped to 80 columns with ``textwrap.fill``; newlines
    in the content are replaced by spaces during wrapping, so the original
    paragraph breaks are not preserved. An existing file with the same name
    is overwritten.

    Args:
        content: Page text to store.
        metadata: Mapping providing ``title`` and ``id`` (required) and
            ``when`` (optional). If ``when`` is missing or has no date part,
            the file name uses ``undated`` instead of a date.
        base_directory: Directory to write into; created if it does not
            exist. Defaults to the location the notebook used originally.

    Raises:
        KeyError: If ``title`` or ``id`` is missing from ``metadata``.
        OSError: If the directory or file cannot be created or written.
    """
    # Everything before the first 'T' of the timestamp is the date; fall back
    # to a fixed label rather than raising when there is no timestamp.
    date_part = (metadata.get('when') or '').split('T')[0] or 'undated'

    # Path separators in the title would otherwise create sub-directories.
    safe_title = metadata['title'].replace('/', '_').replace('\\', '_')
    file_name = f"{safe_title}_{metadata['id']}_{date_part}.txt"

    os.makedirs(base_directory, exist_ok=True)
    full_path = os.path.join(base_directory, file_name)

    wrapped_content = textwrap.fill(content, width=80)

    # Explicit UTF-8: the platform default encoding may be unable to
    # represent non-ASCII characters in page text.
    with open(full_path, 'w', encoding='utf-8') as file:
        file.write(wrapped_content)

# Persist every loaded page as a text file, one file per document.
for doc in docs:
    write_page_to_file(doc.page_content, doc.metadata)

## Option 1: recursive character text splitting

In [6]:
# Configure a recursive character splitter from the indexing constants and
# cut the loaded pages into chunks via the project's split_documents helper.
# NOTE(review): add_start_index=True presumably records each chunk's start
# offset in its metadata - confirm against the splitter docs.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    add_start_index=True,
)
chunks = split_documents(splitter, docs)

Split 291 documents into 845 chunks.


## Initialize client for ChromaDB

In [7]:
# HTTP client for a ChromaDB server. No host or port is passed, so the
# client library's defaults apply; allow_reset=True is set on the settings
# object handed to the client.
chroma_settings = Settings(allow_reset=True)
chroma_client = chromadb.HttpClient(settings=chroma_settings)

# Generate and save embeddings in ChromaDB

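Run exactly one of the two cells below (AWS Bedrock or Ollama). Both assign `embeddings_function`, so whichever cell ran last is the one used when the embeddings are generated.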

## AWS Bedrock (embeddings)

In [8]:
from langchain_community.embeddings.bedrock import BedrockEmbeddings

# Embedding client backed by AWS Bedrock. Profile, region and model id are
# read from the environment; region and model id have fallbacks.
# The fallback model id must name an *embedding* model: the previous fallback
# ("amazon.titan-text-express-v1") is a Titan text-generation model.
embeddings_function = BedrockEmbeddings(
    credentials_profile_name=os.getenv("AWS_CREDENTIALS_PROFILE_NAME"),
    region_name=os.getenv("AWS_REGION_NAME", "eu-central-1"),
    model_id=os.getenv("AWS_EMBEDDING_MODEL_ID", "amazon.titan-embed-text-v1"),
)

## Ollama embeddings

- [Blog post about embedding models (by ollama)](https://ollama.com/blog/embedding-models)
- [Ollama embedding model (langchain docs)](https://python.langchain.com/docs/integrations/text_embedding/ollama/)

In [None]:
from langchain_community.embeddings.ollama import OllamaEmbeddings

# Embedding client backed by Ollama. The model name is taken from the
# OLLAMA_EMBEDDING_MODEL environment variable, with a fallback when unset.
OLLAMA_EMBEDDING_MODEL = os.environ.get("OLLAMA_EMBEDDING_MODEL", "mxbai-embed-large")
embeddings_function = OllamaEmbeddings(model=OLLAMA_EMBEDDING_MODEL)

## Actually generate and save embeddings

In [9]:
# Number of chunks embedded and uploaded per request. Batching avoids one
# HTTP round-trip to Chroma per chunk.
EMBEDDING_BATCH_SIZE = 64

collection = chroma_client.get_or_create_collection(name=COLLECTION_NAME)

# Only populate an empty collection, so re-running the cell does not insert
# duplicates. Caveat: if a run fails part-way the collection is left partly
# filled and later runs will skip it - delete the collection before retrying.
if collection.count() == 0:
    chunk_list = list(chunks)
    for start in range(0, len(chunk_list), EMBEDDING_BATCH_SIZE):
        batch = chunk_list[start:start + EMBEDDING_BATCH_SIZE]
        texts = [chunk.page_content for chunk in batch]
        collection.add(
            # One fresh id per chunk.
            ids=[str(uuid.uuid1()) for _ in batch],
            # embed_documents is the document-side entry point of the
            # embeddings interface; embed_query is meant for search queries
            # and some backends encode the two differently.
            embeddings=embeddings_function.embed_documents(texts),
            documents=texts,
            metadatas=[chunk.metadata for chunk in batch],
        )