# Basic indexing

Here, we split the Confluence documents recursively by character into smaller pieces (chunks), embed the resulting chunks, and store the embeddings in a Chroma database. This lets us later retrieve the chunks most similar to a query and provide them to an LLM as context. This is the most basic approach to document splitting.

In [None]:
import os

import chromadb
from chromadb.config import Settings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.confluence import ConfluenceLoader

from modules.indexing import load_docs, split_documents

## Environment

Provide `PYTHONPATH` and the other environment variables read by this notebook (e.g. `CONFLUENCE_URL`, `CONFLUENCE_SPACE_KEY`, `CONFLUENCE_PAT`) in a `.env` file.


In [None]:
import dotenv

# Load the variables from the `.env` file so that the `os.getenv` lookups in
# the following cells can find them.
dotenv.load_dotenv()

In [None]:
# Confluence connection settings, read from the environment.
# Each value is None when the corresponding variable is not set.
CONFLUENCE_PAT = os.environ.get("CONFLUENCE_PAT")
CONFLUENCE_SPACE_KEY = os.environ.get("CONFLUENCE_SPACE_KEY")
CONFLUENCE_URL = os.environ.get("CONFLUENCE_URL")

In [None]:
# Chunking parameters passed to the text splitter further down.
CHUNK_SIZE = 1024
CHUNK_OVERLAP = 128
# The Chroma collection name is derived from the chunk size and the space key,
# so you can experiment with retrieving chunks of different sizes.
COLLECTION_NAME = f"{CHUNK_SIZE}_{CONFLUENCE_SPACE_KEY}"

## Loading documents from the Confluence wiki

In [None]:
# Build a loader for the configured Confluence space, authenticating with the
# personal access token, then fetch the pages through the project helper.
loader = ConfluenceLoader(
    url=CONFLUENCE_URL,
    space_key=CONFLUENCE_SPACE_KEY,
    token=CONFLUENCE_PAT,
    cloud=False,  # the target is not a Confluence Cloud instance
    include_attachments=False,  # attachments are skipped
)
docs = load_docs(loader)

### Optional: save a local copy of documents

In [None]:
from typing import TypedDict
import textwrap

class Metadata(TypedDict):
    """Page metadata keys used when saving and listing pages in this notebook."""

    title: str  # page title; becomes part of the saved file name
    id: str  # page id; becomes part of the saved file name
    source: str  # printed as the page link in the sources listing
    when: str  # timestamp string; the part before "T" is used as the date

def write_page_to_file(content: str, metadata: Metadata) -> None:
    """Save one page as a wrapped text file under ``../data/docu/<space key>``.

    The file is named ``<title>_<id>_<date>.txt`` where ``<date>`` is the part
    of ``metadata['when']`` before the ``T``. The directory is created if it
    does not exist; an existing file with the same name is overwritten.

    Args:
        content: Page text to store.
        metadata: Page metadata; ``title``, ``id`` and ``when`` are read.

    Raises:
        KeyError: If ``title``, ``id`` or ``when`` is missing from ``metadata``.
    """
    target_dir = f'../data/docu/{CONFLUENCE_SPACE_KEY}'
    os.makedirs(target_dir, exist_ok=True)

    # Keep only the date portion of the timestamp.
    date_part = metadata['when'].split('T')[0]

    # Path separators in the title would otherwise create sub-directories.
    safe_title = metadata['title'].replace('/', '_').replace('\\', '_')

    file_name = f"{safe_title}_{metadata['id']}_{date_part}.txt"
    full_path = os.path.join(target_dir, file_name)

    # NOTE: textwrap.fill collapses every whitespace run (newlines included),
    # so the page is stored as a single paragraph wrapped at 80 columns.
    wrapped_content = textwrap.fill(content, width=80)

    # Write as UTF-8 explicitly: the platform default encoding may be unable
    # to represent non-ASCII characters in the wiki text.
    with open(full_path, 'w', encoding='utf-8') as file:
        file.write(wrapped_content)

# Dump every loaded page to disk, one text file per page.
for doc in docs:
    write_page_to_file(doc.page_content, doc.metadata)

## Recursive character text splitting

In [None]:
# Configure the recursive splitter from the chunking constants and apply it
# to the loaded pages via the project helper.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    add_start_index=True,  # keep each chunk's start offset in its metadata
)
chunks = split_documents(splitter, docs)

## Initialize client for ChromaDB

In [None]:
# HTTP client for a running Chroma server. No host or port is passed, so the
# library defaults apply (intended for a locally running server).
chroma_settings = Settings(allow_reset=True)
chroma_client = chromadb.HttpClient(settings=chroma_settings)

# Generate and save embeddings in ChromaDB

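Choose one of the options below for creating embeddings and run only that cell: both cells assign `embeddings_function`, so whichever cell runs last determines the embedding model that is used.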

## AWS Bedrock (embeddings)

In [None]:
from langchain_community.embeddings.bedrock import BedrockEmbeddings

# Embedding client for AWS Bedrock. The fallback model id must be an
# *embedding* model: "amazon.titan-text-express-v1" is a text-generation model
# and cannot serve embedding requests, so the default is the Titan embedding
# model that the LLM cell also names as AWS_EMBEDDING_MODEL_ID's default.
embeddings_function = BedrockEmbeddings(
    credentials_profile_name=os.getenv("AWS_CREDENTIALS_PROFILE_NAME"),
    region_name=os.getenv("AWS_REGION_NAME", "eu-central-1"),
    model_id=os.getenv("AWS_EMBEDDING_MODEL_ID", "amazon.titan-embed-text-v1"),
)

## Ollama embeddings

- [Blog post about embedding models (by ollama)](https://ollama.com/blog/embedding-models)
- [Ollama embedding model (langchain docs)](https://python.langchain.com/docs/integrations/text_embedding/ollama/)

In [None]:
from langchain_community.embeddings.ollama import OllamaEmbeddings

# Embedding model served by Ollama; override it via OLLAMA_EMBEDDING_MODEL.
OLLAMA_EMBEDDING_MODEL = os.environ.get(
    "OLLAMA_EMBEDDING_MODEL", "mxbai-embed-large"
)
embeddings_function = OllamaEmbeddings(model=OLLAMA_EMBEDDING_MODEL)

## Actually generate and save embeddings

In [None]:
import uuid

# Number of chunks embedded and written to Chroma per request.
EMBEDDING_BATCH_SIZE = 64

collection = chroma_client.get_or_create_collection(name=COLLECTION_NAME)

# Only populate an empty collection, so re-running this cell does not store
# the same chunks a second time.
if collection.count() == 0:
    chunk_list = list(chunks)
    for start in range(0, len(chunk_list), EMBEDDING_BATCH_SIZE):
        batch = chunk_list[start:start + EMBEDDING_BATCH_SIZE]
        texts = [chunk.page_content for chunk in batch]
        collection.add(
            ids=[str(uuid.uuid1()) for _ in batch],
            # Stored chunks are documents, so use the document-side method:
            # some embedding classes prepare query and document text
            # differently, and `embed_query` is what retrieval applies to the
            # question later on.
            embeddings=embeddings_function.embed_documents(texts),
            documents=texts,
            metadatas=[chunk.metadata for chunk in batch],
        )

# Test

In [None]:
from langchain_chroma import Chroma

# Re-create the client and wrap the populated collection in a LangChain
# vector store that embeds queries with the selected embedding function.
chroma_settings = Settings(allow_reset=True)
chroma_client = chromadb.HttpClient(settings=chroma_settings)

db = Chroma(
    client=chroma_client,
    collection_name=COLLECTION_NAME,
    embedding_function=embeddings_function,
)

# Sample question (German): "What does the Policy Reporter do?"
question = "Was macht der Policy Reporter?"

In [None]:
from langchain_core.prompts import ChatPromptTemplate
from modules.retrieval import PROMPT_TEMPLATE_DE, format_docs

# Fetch the three chunks most similar to the question ...
retriever = db.as_retriever(search_kwargs={"k": 3})
relevant_docs = retriever.invoke(question)

# ... and prepare the context string and prompt for the generation step.
context_text = format_docs(relevant_docs)
prompt = ChatPromptTemplate.from_template(PROMPT_TEMPLATE_DE)

## Initialize LLM

### Option 1: Bedrock

In [None]:
from langchain_aws.chat_models import ChatBedrock

# AWS settings; each can be overridden through the environment.
CREDENTIALS_PROFILE_NAME = os.environ.get("AWS_CREDENTIALS_PROFILE_NAME")
REGION_NAME = os.environ.get("AWS_REGION_NAME", "eu-central-1")
AWS_LANGUAGE_MODEL_ID = os.environ.get(
    "AWS_LANGUAGE_MODEL_ID", "amazon.titan-text-express-v1"
)
# Defined here for reference; this cell does not use it.
AWS_EMBEDDING_MODEL_ID = os.environ.get(
    "AWS_EMBEDDING_MODEL_ID", "amazon.titan-embed-text-v1"
)

# Chat model used by the generation chain: temperature 0.0 and a
# 2048-token cap on the output.
llm = ChatBedrock(
    model_id=AWS_LANGUAGE_MODEL_ID,
    region_name=REGION_NAME,
    credentials_profile_name=CREDENTIALS_PROFILE_NAME,
    model_kwargs={"temperature": 0.0, "maxTokenCount": 2048},
)

### Option 2: Ollama

In [None]:
from langchain_community.chat_models.ollama import ChatOllama

model_name = "mistral:7b"

# Bind the model to `llm`: the generation cell builds its chain from that
# name, so an unassigned instance would leave `llm` undefined (or still
# pointing at the Bedrock model from option 1).
llm = ChatOllama(model=model_name, temperature=0)

## Generation

In [None]:
import textwrap

from langchain_core.output_parsers import StrOutputParser

# Pipeline: fill the prompt, call the model, reduce the reply to a string.
chain = prompt | llm | StrOutputParser()
resp = chain.invoke({"question": question, "context": context_text})

# Wrap the answer at 80 columns for readable notebook output.
print(textwrap.fill(resp, width=80))

In [None]:
from datetime import datetime

def convert_iso_to_readable(date_iso: str) -> str:
    """Format an ISO 8601 timestamp as e.g. ``"May 02, 2024, 07:12 PM"``.

    Day and hour are zero-padded. The wall-clock time in the string is used
    as-is; no time-zone conversion is performed.

    Args:
        date_iso: ISO 8601 date-time string, optionally with a UTC offset or
            a trailing ``Z``.

    Returns:
        The timestamp formatted with ``"%B %d, %Y, %I:%M %p"``.

    Raises:
        ValueError: If ``date_iso`` is not a valid ISO 8601 string.
    """
    # `datetime.fromisoformat` only accepts the "Z" suffix from Python 3.11 on;
    # spell it as an explicit UTC offset so older interpreters parse it too.
    if date_iso.endswith("Z"):
        date_iso = date_iso[:-1] + "+00:00"

    return datetime.fromisoformat(date_iso).strftime("%B %d, %Y, %I:%M %p")

# List where the answer's context came from, one entry per retrieved chunk.
print("Sources:\n")
for doc in relevant_docs:
    meta = doc.metadata
    print(f"Title: {meta['title']}")
    print(f"Link: {meta['source']}")
    last_edited = convert_iso_to_readable(meta['when'])
    print(f"Last edited: {last_edited}")
    print("---")