In [None]:
import os

import numpy as np
from langchain_cohere import CohereRerank

from llama_index.core import Document
from llama_index.core import SimpleDirectoryReader
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.node_parser import HierarchicalNodeParser
from llama_index.core.node_parser import get_leaf_nodes
from llama_index.core import StorageContext
from llama_index.core.retrievers import AutoMergingRetriever
from llama_index.core.indices.postprocessor import SentenceTransformerRerank
from llama_index.core import  load_index_from_storage
from llama_index.core import ServiceContext, VectorStoreIndex, StorageContext
from llama_index.core.node_parser import SentenceWindowNodeParser
from llama_index.core.indices.postprocessor import MetadataReplacementPostProcessor
from llama_index.core.indices.postprocessor import SentenceTransformerRerank
from llama_index.core import load_index_from_storage
from llama_index.core.node_parser import SentenceWindowNodeParser

from llama_index.core.schema import NodeWithScore
from copy import deepcopy
from llama_index.core.indices.postprocessor import SentenceTransformerRerank
from llama_index.core import QueryBundle
from llama_index.core.schema import TextNode, NodeWithScore
from llama_index.core.indices.postprocessor import MetadataReplacementPostProcessor
from llama_index.core.response.notebook_utils import display_response


from llama_index.llms.openai import OpenAI

In [None]:
# The original cell assigned `llm` and `embeddings` to themselves, which only
# works when an earlier kernel session already defined both names and raises
# NameError after Restart Kernel -> Run All. Define them explicitly instead.
# NOTE(review): the concrete models below are assumptions - replace them with
# the models you actually use. The OpenAI client needs an API key; supply it
# through the environment rather than hard-coding it in the notebook.
# If an index persisted on disk was built with a different embedding model,
# delete its directory so it is rebuilt with matching vectors.
llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)
embeddings = "local:BAAI/bge-small-en-v1.5"

In [None]:
from llama_index.core import Settings

# Register the notebook-wide LLM and embedding model on LlamaIndex's global
# Settings object. Both names must already be defined by an earlier cell.
Settings.llm = llm
Settings.embed_model = embeddings

In [None]:
from llama_index.core import SimpleDirectoryReader
# Load the source documents from the "valid" directory; the path is relative,
# so it is resolved against the notebook's current working directory.
documents = SimpleDirectoryReader("valid").load_data()

SENTENCE WINDOW RETRIEVAL

In [None]:
def build_sentence_window_index(
    documents,
    llm=None,
    embed_model=None,
    sentence_window_size=3,
    save_dir="sentence_index",
):
    """Build, or load from disk, a sentence-window vector index.

    Documents are parsed with a ``SentenceWindowNodeParser`` that stores the
    surrounding sentences under the ``"window"`` metadata key and the original
    sentence under ``"original_text"``.

    Args:
        documents: Documents to index. Only used when ``save_dir`` does not
            exist yet.
        llm: Accepted for backward compatibility and not used here. The
            deprecated ``ServiceContext`` that used to carry it has been
            removed, so query engines created from the returned index resolve
            their LLM from ``Settings.llm``.
        embed_model: Embedding model for the index. ``None`` (the default)
            defers to ``Settings.embed_model`` at call time instead of
            capturing a notebook global when the function is defined.
        sentence_window_size: Passed to the parser as ``window_size``.
        save_dir: Directory the index is persisted to / loaded from.

    Returns:
        The freshly built or re-loaded index.
    """
    node_parser = SentenceWindowNodeParser.from_defaults(
        window_size=sentence_window_size,
        window_metadata_key="window",
        original_text_metadata_key="original_text",
    )

    # An existing directory is treated as a valid cache: `documents` and
    # `sentence_window_size` are NOT re-applied to already persisted nodes.
    if os.path.exists(save_dir):
        return load_index_from_storage(
            StorageContext.from_defaults(persist_dir=save_dir),
            embed_model=embed_model,
            transformations=[node_parser],
        )

    # Parser and embedding model are handed to the index directly rather than
    # through ServiceContext, which newer llama-index-core releases reject.
    sentence_index = VectorStoreIndex.from_documents(
        documents,
        embed_model=embed_model,
        transformations=[node_parser],
    )
    sentence_index.storage_context.persist(persist_dir=save_dir)
    return sentence_index



In [None]:

def get_sentence_window_query_engine(
    sentence_index, similarity_top_k=6, rerank_top_n=2, reranker=None
):
    """Create a query engine for a sentence-window index.

    Retrieved nodes pass through two postprocessors, in order:

    1. ``MetadataReplacementPostProcessor`` reading the ``"window"`` metadata
       key (the key written by ``build_sentence_window_index``).
    2. A reranker that keeps the ``rerank_top_n`` best nodes.

    Args:
        sentence_index: Index to query.
        similarity_top_k: Number of nodes fetched by the retriever.
        rerank_top_n: Number of nodes kept by the default reranker.
        reranker: Optional LlamaIndex node postprocessor to use instead of the
            default ``SentenceTransformerRerank``. When supplied,
            ``rerank_top_n`` is ignored and the caller configures the cut-off
            on the reranker itself.

    Returns:
        The configured query engine.
    """
    postproc = MetadataReplacementPostProcessor(target_metadata_key="window")

    if reranker is None:
        # The previous implementation passed langchain_cohere.CohereRerank,
        # which is a LangChain document compressor and does not provide the
        # `postprocess_nodes` method LlamaIndex calls on every entry of
        # `node_postprocessors`, so queries failed. Use the LlamaIndex-native
        # reranker that the auto-merging engine in this notebook already uses.
        reranker = SentenceTransformerRerank(
            top_n=rerank_top_n, model="BAAI/bge-reranker-base"
        )

    return sentence_index.as_query_engine(
        similarity_top_k=similarity_top_k,
        node_postprocessors=[postproc, reranker],
    )

In [None]:
# Build the sentence-window index, or load it if ./sentence_index already
# exists (in which case `documents` is not re-indexed).
index = build_sentence_window_index(
    documents,
    llm=llm,
    save_dir="./sentence_index",
)

In [None]:
# Sentence-window engine: retrieve 6 candidates; rerank_top_n keeps its default.
query_engine = get_sentence_window_query_engine(index, similarity_top_k=6)

In [None]:
# NOTE(review): `question` is not defined in any visible cell - it must be
# assigned before this cell runs, otherwise this raises NameError.
response=query_engine.query(question)

AUTO-MERGING RETRIEVAL

In [None]:
def build_automerging_index(
    documents,
    llm=None,
    embed_model="local:BAAI/bge-small-en-v1.5",
    save_dir="merging_index",
    chunk_sizes=None,
):
    """Build, or load from disk, a hierarchical index for auto-merging retrieval.

    Documents are split by ``HierarchicalNodeParser`` using ``chunk_sizes``.
    Every node is stored in the docstore, while only the leaf nodes are
    embedded into the vector index.

    Args:
        documents: Documents to index. Only used when ``save_dir`` does not
            exist yet.
        llm: Accepted for backward compatibility and not used here. The
            deprecated ``ServiceContext`` that used to carry it has been
            removed, so query engines resolve their LLM from ``Settings.llm``.
        embed_model: Embedding model (instance or spec string) for the index.
        save_dir: Directory the index is persisted to / loaded from.
        chunk_sizes: Chunk sizes per hierarchy level; a falsy value selects
            ``[2048, 512, 128]``.

    Returns:
        The freshly built or re-loaded index.
    """
    # Check the cache first: the original parsed the whole corpus and filled a
    # docstore even when the result was about to be thrown away in favour of
    # the persisted index.
    if os.path.exists(save_dir):
        return load_index_from_storage(
            StorageContext.from_defaults(persist_dir=save_dir),
            embed_model=embed_model,
        )

    chunk_sizes = chunk_sizes or [2048, 512, 128]
    node_parser = HierarchicalNodeParser.from_defaults(chunk_sizes=chunk_sizes)
    nodes = node_parser.get_nodes_from_documents(documents)
    leaf_nodes = get_leaf_nodes(nodes)

    # The docstore receives all nodes, not just the leaves that get embedded.
    storage_context = StorageContext.from_defaults()
    storage_context.docstore.add_documents(nodes)

    # The embedding model is passed directly instead of via ServiceContext,
    # which newer llama-index-core releases reject.
    automerging_index = VectorStoreIndex(
        leaf_nodes,
        storage_context=storage_context,
        embed_model=embed_model,
    )
    automerging_index.storage_context.persist(persist_dir=save_dir)
    return automerging_index




In [None]:
def get_automerging_query_engine(
    automerging_index,
    similarity_top_k=12,
    rerank_top_n=6,
):
    """Create a query engine that retrieves through an ``AutoMergingRetriever``.

    Args:
        automerging_index: Index to query; its storage context is handed to
            the auto-merging retriever.
        similarity_top_k: Number of nodes fetched by the underlying retriever.
        rerank_top_n: Number of nodes kept by the sentence-transformer reranker.

    Returns:
        A ``RetrieverQueryEngine`` with the reranker as its only postprocessor.
    """
    # Plain vector retriever over the index, wrapped by the merging retriever.
    leaf_retriever = automerging_index.as_retriever(
        similarity_top_k=similarity_top_k
    )
    merging_retriever = AutoMergingRetriever(
        leaf_retriever,
        automerging_index.storage_context,
        verbose=True,
    )

    reranker = SentenceTransformerRerank(
        top_n=rerank_top_n,
        model="BAAI/bge-reranker-base",
    )
    return RetrieverQueryEngine.from_args(
        merging_retriever,
        node_postprocessors=[reranker],
    )

In [None]:
# Build (or load) the auto-merging index. No embed_model / save_dir is passed,
# so the function defaults apply: "local:BAAI/bge-small-en-v1.5" and
# "merging_index" - note this is not the `embeddings` object used above.
index_auto= build_automerging_index(
    documents,
    llm=llm,
)

In [None]:
# Query the auto-merging index built above. The original cell was a copy of
# the sentence-window cell and rebuilt that engine from `index`, so the
# auto-merging pipeline was never actually exercised.
query_engine = get_automerging_query_engine(index_auto, similarity_top_k=6)

In [None]:
# NOTE(review): `question` is not defined in any visible cell - it must be
# assigned before this cell runs, otherwise this raises NameError.
response=query_engine.query(question)