In [None]:
from llama_index.vector_stores import DeepLakeVectorStore


def create_upload_vectore_store(
    chunked_text: list,
    vector_store_path: Union[str, os.PathLike],
    filename: str,
    metadata: Optional[list[dict]] = None,
):
    """Create a Deep Lake vector store and upload ``chunked_text`` into it.

    The store is opened with ``overwrite=True`` and ``runtime={"tensor_db": True}``
    and declares four tensors: ``text``, ``embedding``, ``filename`` and
    ``metadata``.

    Args:
        chunked_text: Text chunks to upload. They are stored in the ``text``
            tensor and are also handed to ``embedding_function_text`` (a
            notebook-level name) as ``embedding_data``.
        vector_store_path: Dataset path passed to ``DeepLakeVectorStore``.
        filename: Value written to the ``filename`` tensor.
            NOTE(review): this is a single string while ``text`` is a list;
            confirm the store accepts one value for many samples.
        metadata: Optional list of dicts for the ``metadata`` tensor. ``None``
            or an empty list is forwarded as ``None``.

    Returns:
        None. The upload is performed for its side effect only.
    """
    llama_store = DeepLakeVectorStore(
        dataset_path=vector_store_path,
        runtime={"tensor_db": True},
        overwrite=True,
        tensor_params=[
            {"name": "text", "htype": "text"},
            {"name": "embedding", "htype": "embedding"},
            {"name": "filename", "htype": "text"},
            {"name": "metadata", "htype": "json"},
        ],
    )

    # The upload goes through the object exposed as `.vectorstore`, not the
    # LlamaIndex wrapper itself.
    deeplake_store = llama_store.vectorstore
    deeplake_store.add(
        text=chunked_text,
        embedding_function=embedding_function_text,
        filename=filename,
        embedding_data=chunked_text,
        rate_limiter={
            "enabled": True,
            "bytes_per_minute": 1500000,
            "batch_byte_size": 10000,
        },
        # Falsy metadata (None or an empty list) is normalised to None.
        metadata=metadata or None,
    )

In [None]:
def get_index_and_nodes_after_visual_similarity(filenames: list):
    """Build an index over the stored text chunks that belong to ``filenames``.

    The description vector store is queried with a TQL filter that matches any
    of the given file names. The matching ``text`` entries are turned into
    documents, parsed into nodes, and indexed.

    Args:
        filenames: File names to select. Any iterable of names is accepted.

    Returns:
        A 4-tuple ``(index, nodes, service_context, filtered_elements)`` where
        ``filtered_elements`` is the raw search result from the vector store.

    Raises:
        ValueError: If ``filenames`` is empty. The generated query would
            otherwise end in ``where`` with no condition after it.
    """
    # Materialise first so one-shot iterables can be checked for emptiness
    # before any store is loaded.
    wanted_names = list(filenames)
    if not wanted_names:
        raise ValueError(
            "filenames must contain at least one file name to build the TQL filter"
        )

    vector_store = load_vector_store(vector_store_path=VECTOR_STORE_PATH_DESCRIPTION)

    # NOTE(review): names are interpolated verbatim; a name containing a single
    # quote would break the query string. Confirm inputs are trusted and clean.
    conditions = " or ".join(f"filename == '{name}'" for name in wanted_names)
    tql_query = f"select * where {conditions}"

    filtered_elements = vector_store.vectorstore.search(query=tql_query)
    chunks = list(filtered_elements["text"])

    string_iterable_reader = download_loader("StringIterableReader")
    loader = string_iterable_reader()
    documents = loader.load_data(texts=chunks)
    node_parser = SimpleNodeParser.from_defaults(separator="\n")
    nodes = node_parser.get_nodes_from_documents(documents)

    # To ensure the same ids on every run, we set them manually.
    for idx, node in enumerate(nodes):
        node.id_ = f"node_{idx}"

    llm = OpenAI(model="gpt-4")

    # The service context is returned for the caller to use; it is not passed
    # to the index constructor here.
    service_context = ServiceContext.from_defaults(llm=llm)
    index = VectorStoreIndex(nodes=nodes)
    return index, nodes, service_context, filtered_elements

In [None]:
# Load the baseline vector store. The first returned value is discarded;
# `nodes` and `service_context` are kept as notebook-level names for later cells.
_, nodes, service_context = get_index_and_nodes_from_activeloop(
    vector_store_path=VECTOR_STORE_PATH_BASELINE
)

In [None]:
# BM25 retriever built directly from `nodes`, configured with similarity_top_k=10.
bm25_retriever = BM25Retriever.from_defaults(nodes=nodes, similarity_top_k=10)

In [None]:
# Run the BM25 retriever on `description` and keep the result for later cells.
nodes_bm25_response = bm25_retriever.retrieve(description)

In [None]:
class ClassicRetrieverBM25(BaseRetriever):
    """Retriever that delegates to a BM25 retriever and drops repeated nodes.

    Results keep the order in which the wrapped retriever produced them. When
    several results share a ``node_id``, only the first one is returned.
    """

    def __init__(self, bm25_retriever):
        """Store the wrapped retriever, then run the base-class initialiser."""
        self.bm25_retriever = bm25_retriever
        super().__init__()

    def _retrieve(self, query, **kwargs):
        """Return the wrapped retriever's results for ``query`` without duplicate node ids.

        Extra keyword arguments are forwarded unchanged to
        ``bm25_retriever.retrieve``.
        """
        seen_ids = set()
        unique_results = []
        for result in self.bm25_retriever.retrieve(query, **kwargs):
            result_id = result.node.node_id
            if result_id in seen_ids:
                continue
            seen_ids.add(result_id)
            unique_results.append(result)
        return unique_results

In [None]:
# Reranker using the "BAAI/bge-reranker-base" model, configured with top_n=4.
reranker = SentenceTransformerRerank(top_n=4, model="BAAI/bge-reranker-base")

# Rerank the nodes returned by the BM25 retriever.
# NOTE(review): retrieval used `description` while reranking uses `QUERY` —
# confirm both are meant to hold the same text.
reranked_nodes_bm25 = reranker.postprocess_nodes(
    nodes_bm25_response,
    query_bundle=QueryBundle(QUERY),
)

In [None]:
# NOTE(review): `self` is not bound at cell level, so this cell looks like an
# excerpt from a method of a class that is not shown here — confirm before
# running it as-is.
# Load the sequentially built store; the third returned value is discarded.
index, nodes, _ = get_index_and_nodes_from_activeloop(
    vector_store_path=VECTOR_STORE_PATH_COMPLETE_SEQUENTIALLY
)
# Retriever obtained from the index, configured with similarity_top_k=2.
self.vector_retriever = index.as_retriever(similarity_top_k=2)
# BM25 retriever over the same nodes, configured with similarity_top_k=10.
self.bm25_retriever = BM25Retriever.from_defaults(
    nodes=nodes, similarity_top_k=10
)

In [None]:
# NOTE(review): `self` is not bound at cell level, so this cell looks like an
# excerpt from a method of a class that is not shown here — confirm before
# running it as-is.

# Rerank the BM25 results and list their scores.
reranked_nodes_bm25 = self.reranker.postprocess_nodes(
    self.nodes_bm25_response,
    query_bundle=QueryBundle(QUERY),
)
print("Reranked Nodes BM25\n\n")
for el in reranked_nodes_bm25:
    print(f"{el.score}\n")

# Rerank the vector-retriever results and list their scores.
reranked_nodes_vector = self.reranker.postprocess_nodes(
    self.nodes_vector_response,
    query_bundle=QueryBundle(QUERY),
)
print("Reranked Nodes Vector\n\n")
for el in reranked_nodes_vector:
    print(f"{el.score}\n")

# Merge both reranked lists once, after the score listings. This used to sit
# inside the loop above, so it ran once per node and left `unique_nodes`
# undefined when the vector list was empty.
unique_nodes = keep_best_k_unique_nodes(
    reranked_nodes_bm25, reranked_nodes_vector
)
print("Unique Nodes\n\n")
for el in unique_nodes:
    print(f"{el.id} : {el.score}\n")

In [None]:
from llama_index.retrievers import BM25Retriever

# Retriever obtained from `index`, configured with similarity_top_k=2.
vector_retriever = index.as_retriever(similarity_top_k=2)

# BM25 retriever built from the index's docstore instead of an explicit node
# list. This rebinds `bm25_retriever`, replacing the one created from `nodes`
# earlier in the notebook.
bm25_retriever = BM25Retriever.from_defaults(
    docstore=index.docstore, similarity_top_k=2
)

In [None]:
from llama_index.retrievers import QueryFusionRetriever

# Combine the vector and BM25 retrievers into a single fusion retriever using
# the "reciprocal_rerank" mode.
# NOTE(review): use_async=True inside a notebook usually needs the running
# event loop to be patched first (e.g. nest_asyncio) — confirm an earlier cell
# does this.
retriever = QueryFusionRetriever(
    [vector_retriever, bm25_retriever],
    similarity_top_k=2,
    num_queries=4,  # set this to 1 to disable query generation
    mode="reciprocal_rerank",
    use_async=True,
    verbose=True,
)

In [None]:
# Run the fusion retriever on `description`; as the last expression in the
# cell, the result is shown as the cell output.
retriever.retrieve(description)