# Multi representation indexing

Here we don't chunk up the documents. Instead, we create summaries with important keywords, which are optimized for retrieval. We store the full document corresponding to each summary in a Redis DB. Using a MultiVectorRetriever, we search for similar documents using the embedded summaries, but ultimately the full document (the parent document of the summary) is returned. Thus, we can later take advantage of LLMs with large context windows (like Amazon Titan Text Express with 8k tokens) and provide them with the full document to answer a user's question, instead of a set of chunks, which may not contain sufficient information.  

# Environment

Provide `PYTHONPATH` (and other environment variables) in a `.env` file.

In [None]:
import os
import dotenv

# Load the variables from the `.env` file into the process environment first;
# otherwise the lookups below would not see values that are defined only there.
dotenv.load_dotenv()

# set env vars for confluence wiki
CONFLUENCE_PAT = os.getenv("CONFLUENCE_PAT")
CONFLUENCE_SPACE_KEY = os.getenv("CONFLUENCE_SPACE_KEY")
CONFLUENCE_URL = os.getenv("CONFLUENCE_URL")

# Loading documents

In [None]:
from langchain_community.document_loaders.confluence import ConfluenceLoader
from modules.indexing import load_docs

# Set up the Confluence loader for the configured space. It talks to a
# non-cloud instance (cloud=False), authenticates with the token read from
# CONFLUENCE_PAT, and leaves page attachments out.
loader = ConfluenceLoader(
    space_key=CONFLUENCE_SPACE_KEY,
    url=CONFLUENCE_URL,
    token=CONFLUENCE_PAT,
    include_attachments=False,
    cloud=False,
)

# Hand the loader to the project helper to obtain the wiki documents.
confluence_docs = load_docs(loader)

## Create document summaries

In [None]:
from langchain_aws.chat_models import ChatBedrock

# Chat model served through Amazon Bedrock. Credentials profile, region and
# model id come from the environment; region and model id have fallbacks.
bedrock_llm = ChatBedrock(
    model_id=os.getenv("AWS_LANGUAGE_MODEL_ID", "amazon.titan-text-express-v1"),
    region_name=os.getenv("AWS_REGION_NAME", "eu-central-1"),
    credentials_profile_name=os.getenv("AWS_CREDENTIALS_PROFILE_NAME"),
    # inference parameters passed on to the model
    model_kwargs=dict(temperature=0.0, maxTokenCount=2048),
)

In [None]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from modules.indexing import SUMMARY_PROMPT_TEMPLATE_DE

# Build the summarisation pipeline step by step:
# 1. put the document text under the "doc" key and feed it to the summary prompt
chain = {"doc": lambda document: document.page_content} | ChatPromptTemplate.from_template(
    SUMMARY_PROMPT_TEMPLATE_DE
)
# 2. send the rendered prompt to the Bedrock chat model
chain = chain | bedrock_llm
# 3. reduce the model response to a plain string
chain = chain | StrOutputParser()

# Run the pipeline over all loaded documents; one summary per document.
summaries = chain.batch(confluence_docs)

## Initialize embedding model

In [None]:
from langchain_aws import BedrockEmbeddings

# init embeddings function (bedrock)
# The fallback must be an *embedding* model: "amazon.titan-text-express-v1" is
# a text-generation model and cannot produce embeddings, so the default is the
# Titan text embedding model instead. Set AWS_EMBEDDING_MODEL_ID to override.
bedrock_embeddings = BedrockEmbeddings(
    credentials_profile_name=os.getenv("AWS_CREDENTIALS_PROFILE_NAME"),
    region_name=os.getenv("AWS_REGION_NAME", "eu-central-1"),
    model_id=os.getenv("AWS_EMBEDDING_MODEL_ID", "amazon.titan-embed-text-v1"),
)

## Index summaries and save parent docs

### Init vector store

In [None]:
from chromadb import HttpClient
from chromadb.config import Settings
from langchain_community.vectorstores import Chroma

# The summaries live in a collection named after the Confluence space key.
COLLECTION_NAME = f"{CONFLUENCE_SPACE_KEY}_concise_summaries"

# Client for a local chromadb server (HttpClient connection defaults are
# used); the settings enable resetting the database through this client.
chroma_settings = Settings(allow_reset=True)
chroma_client = HttpClient(settings=chroma_settings)

# Vector store that indexes the summaries, embedding them with Bedrock.
vectorstore = Chroma(
    embedding_function=bedrock_embeddings,
    collection_name=COLLECTION_NAME,
    client=chroma_client,
)

### Init document store

In [None]:
from langchain.storage.redis import RedisStore
from redis import Redis

# Connect to the Redis server on localhost at port 6379.
redis_client = Redis(port=6379, host="localhost")

# Storage layer that keeps the parent documents (full confluence pages).
doc_store = RedisStore(client=redis_client)

## Init MultiVectorRetriever

[MultiVectorRetriever](https://python.langchain.com/docs/modules/data_connection/retrievers/multi_vector/) documentation from LangChain.

In [None]:
from langchain.retrievers import MultiVectorRetriever
from langchain_core.documents import Document

# Metadata key under which each summary document stores its parent's id.
ID_KEY = "parent_doc_id"

# The retriever searches the summary embeddings in the vector store and uses
# the id stored under ID_KEY to look up the parent document in the doc store.
# Only the single best match is requested (k=1).
retriever = MultiVectorRetriever(
    id_key=ID_KEY,
    byte_store=doc_store,
    vectorstore=vectorstore,
    search_kwargs=dict(k=1),
)

## Save embeddings and parent docs

In [None]:
# Ids of the full Confluence documents, in the same order as `confluence_docs`.
parent_doc_ids = [page.metadata["id"] for page in confluence_docs]

# Wrap every summary in a Document:
# - page_content holds the summary text
# - metadata carries "parent_doc_id", the id of the parent doc (full Confluence doc)
#   found at the same position in `parent_doc_ids`
summary_docs = [
    Document(metadata={ID_KEY: parent_doc_ids[idx]}, page_content=text)
    for idx, text in enumerate(summaries)
]

# Embed the summaries and add them to the vector store.
retriever.vectorstore.add_documents(summary_docs)

# Store each parent document (full Confluence doc) under its id in the document store.
retriever.docstore.mset(
    [(doc_id, full_doc) for doc_id, full_doc in zip(parent_doc_ids, confluence_docs)]
)

# Test

In [None]:
# define question
# NOTE: placeholder - fill in the question to ask before running the
# retrieval and generation cells below, which all read this variable.
question = ""

In [None]:
from langchain_core.prompts import ChatPromptTemplate
from modules.retrieval import PROMPT_TEMPLATE_DE, format_docs
import textwrap

# Step 1: query the vector store directly to see which summary matches best.
sub_docs = vectorstore.similarity_search(question, k=1)
print("Summary of the most similar document:")
# wrap the summary text at 80 columns for readability
print(textwrap.fill(sub_docs[0].page_content, 80))
print("-------------------------------------")

# Step 2: go through the retriever, which returns the parent document.
relevant_docs = retriever.invoke(question)
print("Contents of the actual confluence document:")
print(textwrap.fill(relevant_docs[0].page_content, 80))

# Step 3: prepare the prompt template and the context text for generation.
prompt = ChatPromptTemplate.from_template(PROMPT_TEMPLATE_DE)
context_text = format_docs(relevant_docs)

## Initialize LLM

### Option 1: Bedrock

We already initialized a Bedrock LLM when we created the summaries.

### Option 2: Ollama

Choose a model with a context window of at least 2048 tokens.

In [None]:
from langchain_community.chat_models.ollama import ChatOllama

model_name = "mistral:7b"
# Keep a reference to the model: the bare constructor call discarded the
# instance, so it could not be used afterwards. To generate with Ollama,
# use `ollama_llm` in place of `bedrock_llm` when building the chain.
ollama_llm = ChatOllama(model=model_name, temperature=0)

## Generation

In [None]:
from langchain_core.output_parsers import StrOutputParser

# Answering pipeline: rendered prompt -> Bedrock chat model -> plain string.
chain = prompt | bedrock_llm
chain = chain | StrOutputParser()

# Fill the prompt with the retrieved context and the question, then print
# the answer wrapped at 80 columns.
resp = chain.invoke(dict(context=context_text, question=question))
print(textwrap.fill(resp, 80))

In [None]:
from datetime import datetime

# print source
# Title and link are read from the metadata of the first retrieved parent document.
print(f"Title: {relevant_docs[0].metadata['title']}")
print(f"Link: {relevant_docs[0].metadata['source']}")

def convert_iso_to_readable(date_iso: str) -> str:
    """Convert an ISO 8601 timestamp into a human-readable string.

    Args:
        date_iso: Timestamp in ISO 8601 format, e.g.
            "2024-05-02T19:12:00+02:00" or "2024-05-02T17:12:00.000Z".

    Returns:
        The timestamp formatted as "%B %d, %Y, %I:%M %p", e.g.
        "May 02, 2024, 07:12 PM". Day and hour are zero-padded, and no
        time zone conversion is applied.

    Raises:
        ValueError: If `date_iso` is not a valid ISO 8601 timestamp.
    """
    # datetime.fromisoformat() only understands the trailing "Z" (UTC)
    # designator from Python 3.11 on; rewrite it as an explicit offset so the
    # function also works on older interpreters.
    if date_iso.endswith("Z"):
        date_iso = date_iso[:-1] + "+00:00"

    date_obj = datetime.fromisoformat(date_iso)
    return date_obj.strftime("%B %d, %Y, %I:%M %p")

# Show the `when` metadata value of the first retrieved document, converted
# to a readable date, as its last-edit time.
print(f"Last edited: {convert_iso_to_readable(relevant_docs[0].metadata['when'])}")