In [None]:
import nest_asyncio
nest_asyncio.apply()

from llama_index.core import (StorageContext, VectorStoreIndex, TreeIndex,
                              SimpleDirectoryReader, load_index_from_storage, PromptTemplate)
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.llms.openai import OpenAI
from llama_index.retrievers.bm25 import BM25Retriever

import os
import config
import Stemmer
import pandas as pd

In [None]:
# Local import keeps this cell self-contained; the notebook already imports
# some names next to their first use.
from llama_index.core import Settings

# Publish the key stored in the local `config` module through the
# environment so the OpenAI client can pick it up.
os.environ['OPENAI_API_KEY'] = config.key['API_key']

# Register gpt-4o as the global default LLM. No later cell passes `llm`
# explicitly, so without this assignment the indexes, the fusion retriever
# and the response synthesizer fall back to the library's default model.
llm = OpenAI(model="gpt-4o")
Settings.llm = llm

In [None]:
# Template handed to the indexes as `summary_prompt`; `{context_str}` is the
# only placeholder. The empty last item keeps the trailing newline.
SUMMARY_PROMPT = "\n".join([
    "Context information from multiple sources is given below. ",
    "---------------------",
    "{context_str}",
    "---------------------",
    "You are a heritage preservation expert. The relevant information is about the material, age, state, deterioration, restauration, etc. of all elements and structures in the Lausanne cathedral.",
    "Summarize the key points from the given context.",
    "",
])

# Template handed to the tree index as `insert_prompt`.
# NOTE(review): the placeholders are {context_str} and {new_info}; confirm
# these are the variables TreeIndex actually supplies when inserting a node.
INSERT_PROMPT = "\n".join([
    "The following hierarchy exists: ",
    "---------------------",
    "{context_str}",
    "---------------------",
    "It's about the Lausanne Cathedral and should follow the architectural, structural elements of the building similar to a BIM model: ",
    "{new_info} ",
    "Return the updated structure only.",
])

In [None]:
# Load the source documents from database/documents.
documents = SimpleDirectoryReader(input_dir="database/documents").load_data()

# Split the documents into nodes with a chunk size of 512; `nodes` is what the
# index-building cell consumes when no persisted index can be loaded.
node_parser = SimpleNodeParser.from_defaults(chunk_size=512)
nodes = node_parser.get_nodes_from_documents(documents)

In [None]:
# Reuse the persisted vector index when it can be loaded; otherwise build it
# from `nodes` and persist it for the next run.
try:
    vector_storage_context = StorageContext.from_defaults(persist_dir="database/indexes/vector")
    vector_index = load_index_from_storage(vector_storage_context)
    print("vector store loaded")
except Exception as load_error:
    # Catch `Exception` rather than using a bare `except:` so that
    # KeyboardInterrupt/SystemExit still propagate, and report why loading failed.
    print(f"Can't find persisted vector store ({load_error!r}), creating new index...")

    # Vector Index
    # NOTE(review): `summary_prompt` is kept from the original call; it is not
    # an argument VectorStoreIndex is known to use and is presumably ignored -
    # confirm before relying on it.
    vector_index = VectorStoreIndex(
        nodes,
        show_progress=True,
        summary_prompt=PromptTemplate(SUMMARY_PROMPT),
    )
    vector_index.storage_context.persist(persist_dir="database/indexes/vector")
    print("Vector store created")

# Reuse the persisted BM25 retriever when it can be loaded; otherwise build it
# from `nodes` and persist it for the next run.
try:
    bm25_retriever = BM25Retriever.from_persist_dir("database/indexes/bm25_retriever")
    # This branch loads an existing retriever, so report "loaded" (the message
    # previously claimed the store had been created).
    print("bm25 store loaded")
except Exception as load_error:
    # Catch `Exception` rather than using a bare `except:` so that
    # KeyboardInterrupt/SystemExit still propagate, and report why loading failed.
    print(f"Can't find persisted bm25 store ({load_error!r}), creating new index...")
    # We can pass in the index, docstore, or list of nodes to create the retriever
    bm25_retriever = BM25Retriever.from_defaults(
        nodes=nodes,
        similarity_top_k=10,
        stemmer=Stemmer.Stemmer("english"),
        language="english",
    )
    bm25_retriever.persist("database/indexes/bm25_retriever")
    print("BM25 store created")

# Reuse the persisted tree index when it can be loaded; otherwise build it
# from `nodes` with the custom prompts and persist it for the next run.
try:
    tree_storage_context = StorageContext.from_defaults(persist_dir="database/indexes/tree")
    tree_index = load_index_from_storage(tree_storage_context)
    print("tree store loaded")
except Exception as load_error:
    # Catch `Exception` rather than using a bare `except:` so that
    # KeyboardInterrupt/SystemExit still propagate, and report why loading failed.
    print(f"Can't find persisted tree store ({load_error!r}), creating new index...")

    # Wrap the prompt strings in fresh local templates instead of rebinding the
    # SUMMARY_PROMPT / INSERT_PROMPT constants: overwriting them changed their
    # type from str to PromptTemplate, so any cell wrapping them again
    # (including a re-run of this one) received a template instead of a string.
    summary_template = PromptTemplate(SUMMARY_PROMPT)
    insert_template = PromptTemplate(INSERT_PROMPT)

    tree_index = TreeIndex(
        nodes,
        insert_prompt=insert_template,
        summary_prompt=summary_template,
        show_progress=True,
    )
    tree_index.storage_context.persist(persist_dir="database/indexes/tree")
    print("Tree store created")

In [None]:
from llama_index.core.retrievers import QueryFusionRetriever

# Prompt used to expand the user's question into several search queries.
# It is built from adjacent string literals: the earlier triple-quoted version
# with backslash continuations left a stray '"' in the text, ran sentences
# together without proper spacing and glued "Query:" onto the end of the
# instruction line.
QUERY_GEN_PROMPT = (
    "You are a helpful assistant for a heritage preservation expert that generates multiple search queries based on a "
    "single input query. "
    "Users are interested in material, age, state, deterioration, restauration, etc. of all elements and structures in the Lausanne cathedral. "
    "Unless otherwise specified by the user, it is always helpful to retrieve information at various levels of details, i.e. zoomed out to zoomed in to the specific element requested by the user. "
    "Generate {num_queries} detailed search queries, one on each line related to the following input query:\n"
    "Query: {query}\n"
    "Queries:\n"
)

# Fuse the results of the vector, BM25 and tree retrievers.
# `query_gen_prompt` is passed as a plain format string with the
# {num_queries} and {query} placeholders, which is the documented form.
retriever = QueryFusionRetriever(
    [vector_index.as_retriever(verbose=True), bm25_retriever, tree_index.as_retriever(verbose=True)],
    similarity_top_k=15,
    num_queries=4,
    use_async=True,
    verbose=True,
    query_gen_prompt=QUERY_GEN_PROMPT,
)

In [None]:
from llama_index.core import get_response_synthesizer
from llama_index.core.postprocessor import SimilarityPostprocessor
from llama_index.core.query_engine import RetrieverQueryEngine

# Answers are synthesised with the "refine" response mode.
response_synthesizer = get_response_synthesizer(response_mode="refine")

# Query engine = fusion retriever + similarity filter + synthesizer.
# NOTE(review): the 0.7 cutoff is applied to whatever scores the fused
# retrievers report - confirm those scores are comparable across the vector,
# BM25 and tree retrievers.
query_engine = RetrieverQueryEngine(
    node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.7)],
    response_synthesizer=response_synthesizer,
    retriever=retriever,
)

Intuition questions

In [None]:
# Open-ended forecasting question; the second literal is the answer-style
# instruction appended to the question text.
query = (
    "What types and rates of degradation can be expected on the south tower of the transept over the next 50 years?"
    " give a detailed answer with justifications for heritage preservation experts"
)
response = query_engine.query(query)
print(response)

In [None]:
# NOTE(review): this cell asks exactly the same question as the cell before
# it - confirm whether a different question was intended here.
query = (
    "What types and rates of degradation can be expected on the south tower of the transept over the next 50 years?"
    " give a detailed answer with justifications for heritage preservation experts"
)
response = query_engine.query(query)
print(response)

Knowledge questions

In [None]:
# Factual question; the second literal is the answer-style instruction
# appended to the question text.
query = (
    "Where did the Horses of Notre Dame of Lausanne, the bronze statues of four horses, originally come from?"
    " give a precise, concise and truthful answer without inventing facts for heritage preservation experts"
)
response = query_engine.query(query)
print(response)

In [None]:
# Table of element names; its 'GivenOrientation GivenName GivenID' column is
# used in the following cells to drive one query per row.
df = pd.read_csv(filepath_or_buffer='element_names.csv')

In [None]:
# Start the results table as an independent one-column frame holding only the
# element identifier column.
rag_df = df[['GivenOrientation GivenName GivenID']].copy()

In [None]:
def get_response(element):
    """Ask the query engine what is known about one element.

    Args:
        element: Name of the element; it is interpolated twice into the
            question text.

    Returns:
        Whatever ``query_engine.query`` returns for the assembled question.
    """
    print(element)  # progress trace while the function is applied row by row
    # Adjacent literals instead of backslash continuations inside a single
    # f-string: the continuation lines' indentation used to end up inside the
    # question as long runs of spaces.
    query = (
        f"What information is available concerning the material used for {element}, "
        "the date when it was built (current age), "
        "current state and deterioration (including types and nature of the deterioration), "
        "and restauration efforts (including the types materials used, "
        f"and which deteriorations were targeted in which time period) for {element}"
    )
    return query_engine.query(query)

# Run one query per element and store each result next to its element name.
element_names = rag_df['GivenOrientation GivenName GivenID']
rag_df['rag response'] = element_names.map(get_response)
rag_df

In [None]:
# Create the output folder first: to_csv does not create missing directories,
# so a fresh checkout without `results/` would fail here.
os.makedirs('results', exist_ok=True)
rag_df.to_csv('results/rag_output.csv', index=False)

In [None]:
# Create the output folder first: to_excel does not create missing directories.
os.makedirs('results', exist_ok=True)
# The 'rag response' column holds the objects returned by the query engine.
# Excel writers only accept plain cell values, so write the column as text
# (the same text the CSV export produces) without modifying `rag_df` itself.
rag_df.astype({'rag response': str}).to_excel('results/rag_output.xlsx', index=False)