# Chains

> Chain-based functions for indexing.

In [None]:
# | default_exp indexing.chains


In [None]:
# | export

from langchain_ray.imports import *
from langchain_ray.chains import *
from langchain_ray.utils import *
from langchain_ray.indexing.utils import *
from langchain_ray.pdf.chains import *


In [None]:
#| hide

# Enable IPython's autoreload extension in mode 2 so edited modules are
# re-imported before each cell executes.
%load_ext autoreload
%autoreload 2
# NOTE(review): reloading right after loading looks redundant (see the
# "already loaded" message in the cell output) — confirm it is still needed.
%reload_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# | export


def add_ems_to_docs_chain(
    ems_model,
    input_variables=["docs"],
    output_variables=["ems_docs"],
    verbose=False,
):
    """
    Chain that takes a list of `Documents` and adds `embeddings` to each `Document`.

    Wraps `add_ems_to_docs` in a `transform_chain`. `ems_model` is passed as a fixed
    keyword argument, and the first entry of `input_variables` is registered in
    `vars_kwargs_mapping` under the kwarg name `docs`.
    """
    # Only the first input variable is mapped; it must exist.
    docs_var = input_variables[0]
    chain_kwargs = {
        "transform_kwargs": {"ems_model": ems_model},
        "input_variables": input_variables,
        "output_variables": output_variables,
        "vars_kwargs_mapping": {docs_var: "docs"},
        "verbose": verbose,
    }
    return transform_chain(add_ems_to_docs, **chain_kwargs)


def docs_to_faiss_chain(
    ems_model,
    index_folder,
    index_name,
    input_variables=["docs"],
    output_variables=["docs"],
    verbose=False,
):
    """
    Chain that takes a list of `Documents` and adds them to a `FAISS` index in `index_folder`.

    Wraps `docs_to_faiss` in a `transform_chain`. `ems_model`, `index_folder` and
    `index_name` are passed as fixed keyword arguments, and the first entry of
    `input_variables` is registered in `vars_kwargs_mapping` under the kwarg name `docs`.
    """
    # Arguments that stay constant for every call of the wrapped function.
    fixed_kwargs = dict(
        ems_model=ems_model,
        index_folder=index_folder,
        index_name=index_name,
    )
    # Only the first input variable is mapped; it must exist.
    docs_var = input_variables[0]
    return transform_chain(
        docs_to_faiss,
        transform_kwargs=fixed_kwargs,
        vars_kwargs_mapping={docs_var: "docs"},
        input_variables=input_variables,
        output_variables=output_variables,
        verbose=verbose,
    )


def pdf_to_faiss_chain(
    ems_model,  # The model to use for vectorstore embeddings.
    index_folder,  # The folder to store the FAISS index.
    index_name,  # The name of the FAISS index.
    input_variables=["pdf_path"],
    output_variables=["docs"],
    chunk_size=200,  # The number of characters per Document.
    chunk_overlap=20,  # The number of characters to overlap between Documents.
    block_size=1500,  # The number of PDFs to process in a single Ray task.
    num_cpus=12,  # The number of CPUs to use for Ray.
    num_gpus=1,  # The number of GPUs to use for Ray.
    verbose=False,
):
    """
    Chain that adds PDFs to `FAISS` indexes in `index_folder`.
    If there are more than `block_size` PDFs, indexing will be distributed using `Ray`.

    Two steps run in sequence: `pdf_to_docs_chain` produces the intermediate
    variable `dc`, which the `ray_chain`-wrapped `docs_to_faiss_chain` consumes.
    """
    # Intermediate variable that hands the output of step one to step two.
    handoff = "dc"

    split_step = pdf_to_docs_chain(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        input_variables=input_variables,
        output_variables=[handoff],
        verbose=verbose,
    )

    # Only the indexing step is wrapped with `ray_chain`.
    index_step = ray_chain(
        docs_to_faiss_chain(
            ems_model=ems_model,
            index_folder=index_folder,
            index_name=index_name,
            input_variables=[handoff],
            output_variables=output_variables,
            verbose=verbose,
        ),
        block_size=block_size,
        num_cpus=num_cpus,
        num_gpus=num_gpus,
        verbose=verbose,
    )

    pipeline = SequentialChain(
        chains=[split_step, index_step],
        input_variables=input_variables,
        output_variables=output_variables,
        verbose=verbose,
    )
    return pipeline


def index_query_chain(
    ems_model,  # The SentenceTransformer model to use for vectorstore embeddings.
    index_folder,  # The folder with the FAISS indexes.
    index_name,  # The name of the FAISS index.
    input_variables=["query", "k"],
    output_variables=["search_results"],
    block_size=10,  # The number of indexes to process in a single Ray task.
    num_cpus=12,  # The number of CPUs to use for Ray.
    num_gpus=1,  # The number of GPUs to use for Ray.
    verbose=False,
):
    """
    Chain that takes a `query` and returns the top `k` results from the `FAISS` indexes in `index_folder`.
    If there are more than `block_size` indexes, search will be distributed using `Ray`.

    Three steps run in sequence: `index_names` -> `search_faiss` (wrapped with
    `ray_chain`) -> a merge step that keeps the first `k` hits after sorting.
    `verbose` is forwarded to every sub-chain as well as to the outer `SequentialChain`.

    NOTE: the internal steps read the chain variables named `query` and `k`, so a
    custom `input_variables` must still provide those two names.
    """

    # Step 1: produce the `index_names` variable from `index_folder` / `index_name`.
    index_names_chain = transform_chain(
        index_names,
        transform_kwargs={"index_folder": index_folder, "index_name": index_name},
        input_variables=["k"],
        output_variables=["index_names"],
        verbose=verbose,
    )

    # Step 2: search the indexes; results land in the intermediate variable `res`.
    search_faiss_chain = transform_chain(
        search_faiss,
        transform_kwargs={"index_folder": index_folder, "ems_model": ems_model},
        input_variables=["index_names", "query", "k"],
        output_variables=["res"],
        verbose=verbose,
    )

    search_faiss_chain = ray_chain(
        search_faiss_chain,
        block_size=block_size,
        num_cpus=num_cpus,
        num_gpus=num_gpus,
        verbose=verbose,
    )

    def flatten_res(res, k):
        "Flatten `res`, sort ascending by each item's second element and keep the first `k`."
        # `k` may arrive wrapped in a list; use its first element in that case.
        if is_list(k):
            k = k[0]
        # presumably x[1] is the hit's score/distance — TODO confirm against `search_faiss`.
        # The result is wrapped in a one-element list.
        return [sorted(flatten_list(res), key=lambda x: x[1])[:k]]

    # Step 3: merge the per-index results into the caller-visible output variable(s).
    res_chain = transform_chain(
        flatten_res,
        input_variables=["res", "k"],
        output_variables=output_variables,
        verbose=verbose,
    )

    return SequentialChain(
        chains=[index_names_chain, search_faiss_chain, res_chain],
        input_variables=input_variables,
        output_variables=output_variables,
        verbose=verbose,
    )


In [None]:
#| eval: false

# Load the embedding model onto the first CUDA device (this cell needs a GPU).
ems_model = SentenceTransformer('HamzaFarhan/PDFSegs', device='cuda:0')

In [None]:
#| eval: false


# Demo pipeline: pdf -> docs -> ems_docs -> final_docs.
verbose = True
# Machine-specific folder handed to `docs_to_json_chain` below.
ems_folder = "/media/hamza/data2/faiss_data/saved_ems"
# Step 1: read the `pdf` variable and emit `docs`.
chain1 = pdf_to_docs_chain(input_variables=["pdf"], output_variables=["docs"], verbose=verbose)
# Step 2: add embeddings to `docs`, emitting `ems_docs`.
chain2 = add_ems_to_docs_chain(
    ems_model, input_variables=["docs"], output_variables=["ems_docs"], verbose=verbose
)
# Step 3: hand `ems_docs` to `docs_to_json_chain`, emitting `final_docs`.
# NOTE(review): presumably this writes JSON files into `ems_folder` — confirm in `docs_to_json_chain`.
chain3 = docs_to_json_chain(
    ems_folder,
    with_content=False,
    input_variables=["ems_docs"],
    output_variables=["final_docs"],
    verbose=verbose,
)
# Run the three steps in order; only `final_docs` is exposed as output.
chain = SequentialChain(
    chains=[chain1, chain2, chain3],
    input_variables=["pdf"],
    output_variables=["final_docs"],
    verbose=verbose,
)

In [None]:
#| eval: false

# Relative path to a folder of PDFs; it is passed as the chain's `pdf` input.
pdf = "../../resumes_5/"
res = chain(dict(pdf=pdf))



[1m> Entering new  chain...[0m


[1m> Entering new  chain...[0m

[1m> Finished chain.[0m


[1m> Entering new  chain...[0m

[1m> Finished chain.[0m


[1m> Entering new  chain...[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m


In [None]:
#| eval: false

# Inspect the first document of the first `final_docs` entry: its text and
# its metadata (which includes the `embeddings` list, as the output shows).
doc = res["final_docs"][0][0]
print(doc.page_content)
print(doc.metadata)

kirtan shah || || toronto, on profile summary data engineer with 3+ years of experience developing etl pipelines for both structured and
{'source': '../../resumes_5/0cf20170-8051-41ba-9060-1a82d43f4289.pdf', 'page': 0, 'start_index': 0, 'embeddings': [0.11462773382663727, -0.002190949860960245, -0.004557551816105843, -0.020429404452443123, 0.0013719613198190928, -0.05107765644788742, -0.006046806927770376, -0.010813955217599869, -0.05173639580607414, -0.032382167875766754, 0.008773038163781166, -0.041238754987716675, -0.011339863762259483, 0.04549701511859894, -0.027355432510375977, 0.014588315971195698, 0.01907215639948845, 0.028187287971377373, -0.0191810242831707, 0.013248850591480732, -0.004099718295037746, 0.01132342591881752, -0.023586222901940346, -0.0026138199027627707, -0.022449364885687828, -0.0128311887383461, -0.047567009925842285, 0.041630521416664124, 0.03388316556811333, 0.023792514577507973, -0.01731789857149124, 0.004202561918646097, -0.030093584209680557, 0.0015716723

In [None]:
# | eval: false


# Rebind `ems_model`: the FAISS pipeline is given a `SentenceTransformerEmbeddings`
# object instead of the raw `SentenceTransformer` loaded in the earlier cell.
ems_model = SentenceTransformerEmbeddings(
    model_name="HamzaFarhan/PDFSegs", model_kwargs={"device": "cuda:0"}
)
# Machine-specific folder and name for the FAISS index.
index_folder = "/media/hamza/data2/faiss_data/saved_indexes/"
index_name = "test"
# Rebind `chain` to the PDF -> FAISS pipeline; remaining parameters keep their defaults.
chain = pdf_to_faiss_chain(
    ems_model=ems_model,
    index_folder=index_folder,
    index_name=index_name,
    input_variables=["pdf_path"],
    output_variables=["docs"],
    verbose=True,
)


In [None]:
# | eval: false


# Run the PDF -> FAISS chain on the same folder; the input key must match
# the `pdf_path` input variable declared when the chain was built.
pdf = "../../resumes_5/"
res = chain(dict(pdf_path=pdf))


In [None]:
# | eval: false


# Display the first entry of the chain's `docs` output.
res["docs"][0]


In [None]:
# | hide
# Export the notebook's `# | export` cells to the module named by `default_exp`.
import nbdev

nbdev.nbdev_export()
