# Chains

> Chain-based functions for PDFs.

In [None]:
# | default_exp pdf.chains

In [None]:
# | export

from langchain_ray.imports import *
from langchain_ray.chains import *
from langchain_ray.pdf.utils import *

In [None]:
# | export


def pdf_docs_chain(
    chunk_size=200, chunk_overlap=20, verbose=False, input_key="pdf_folder", output_key="df"
):
    "Chain that runs `create_pdf_df` on a PDF folder, then `df_pdf_docs`, and returns a DataFrame of Documents."
    # Chunking options are forwarded untouched to `df_pdf_docs`.
    chunk_kwargs = dict(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    steps = [
        # Step 1: reads the value stored under `input_key` (the PDF folder).
        transform_chain(create_pdf_df, input_key=input_key),
        # Step 2: consumes the output of step 1 using the chunking options.
        transform_chain(df_pdf_docs, transform_kwargs=chunk_kwargs),
    ]
    return SimpleSequentialChain(
        chains=steps, input_key=input_key, output_key=output_key, verbose=verbose
    )


def pdf_cats_chain(cats_model, input_key="df", output_key="df"):
    "Chain that applies `df_docs_cat` with a SetFit `cats_model` to add categories to a DataFrame of Documents."
    # The same key is used for input and output by default, so the chain can be
    # placed between other DataFrame-to-DataFrame chains.
    io_keys = dict(input_key=input_key, output_key=output_key)
    return transform_chain(df_docs_cat, transform_kwargs={"cats_model": cats_model}, **io_keys)


def pdf_ems_chain(ems_model, ems_folder, input_key="df", output_key="df"):
    "Chain that takes a DataFrame of Documents and writes embeddings to `ems_folder` using `ems_model`."
    # The chain must be returned: callers such as `pdf_faiss_chain2` append the
    # result to a list of chains, and the original implementation built the
    # chain but implicitly returned `None`.
    return transform_chain(
        df_docs_ems,
        input_key=input_key,
        output_key=output_key,
        transform_kwargs={
            "ems_model": ems_model,
            "ems_folder": ems_folder,
        },
    )


def docs_faiss_chain(ems_model, index_folder, index_name, input_key="df", output_key="df"):
    "Chain that passes a DataFrame of Documents to `df_to_faiss`, adding them to a FAISS index in `index_folder`."
    # Everything `df_to_faiss` needs besides the DataFrame itself.
    faiss_kwargs = dict(ems_model=ems_model, index_folder=index_folder, index_name=index_name)
    return transform_chain(
        df_to_faiss, input_key=input_key, output_key=output_key, transform_kwargs=faiss_kwargs
    )


def pdf_faiss_chain(
    ems_model,  # The SentenceTransformer model to use for vectorstore embeddings.
    index_folder,  # The folder to store the FAISS index.
    index_name,  # The name of the FAISS index.
    input_key="pdf_folder",  # The input key for the PDF folder.
    output_key="df",  # The output key for the final DataFrame.
    chunk_size=200,  # The number of characters per Document.
    chunk_overlap=20,  # The number of characters to overlap between Documents.
    docs_block_size=1500,  # The number of Documents to process in a single Ray task.
    num_cpus=12,  # The number of CPUs to use for Ray.
    num_gpus=1,  # The number of GPUs to use for Ray.
    verbose=False,
):
    """
    Chain that takes a `pdf_folder`, extracts its Documents and adds them to FAISS indexes in `index_folder`.
    If there are more than `docs_block_size` Documents, they will be divided and distributed into multiple indexes using Ray.
    """
    # Resource and block settings handed to `ray_chain` for the indexing step.
    ray_opts = dict(block_size=docs_block_size, num_cpus=num_cpus, num_gpus=num_gpus)
    steps = [
        # PDF folder -> DataFrame of Documents.
        pdf_docs_chain(chunk_size=chunk_size, chunk_overlap=chunk_overlap, input_key=input_key),
        # DataFrame of Documents -> FAISS index(es), wrapped by `ray_chain`.
        ray_chain(docs_faiss_chain(ems_model, index_folder, index_name), **ray_opts),
    ]
    return SimpleSequentialChain(
        chains=steps, input_key=input_key, output_key=output_key, verbose=verbose
    )


def index_query_chain(
    ems_model,  # The SentenceTransformer model to use for vectorstore embeddings.
    index_folder,  # The folder with the FAISS indexes.
    index_name,  # The name of the FAISS index.
    input_key="query",  # The input key for the query.
    output_key="search_results",  # The output key for the search results.
    k=2,  # The number of results to return.
    block_size=10,  # The number of indexes to process in a single Ray task.
    num_cpus=12,  # The number of CPUs to use for Ray.
    num_gpus=1,  # The number of GPUs to use for Ray.
    verbose=False,
):
    """
    Chain that takes a query and returns the top `k` results from the FAISS indexes in `index_folder`.
    If there are more than `block_size` indexes, search will be distributed using Ray.
    """
    index_kwargs = {"index_folder": index_folder, "index_name": index_name}
    steps = [
        # Query -> DataFrame built by `create_idx_q_df` from the query and the index location.
        transform_chain(create_idx_q_df, input_key=input_key, transform_kwargs=index_kwargs),
        # Run `df_search_faiss` on every row of that DataFrame.
        transform_chain(
            lambda df: df.apply(df_search_faiss, axis=1, ems_model=ems_model, k=k),
        ),
        # Flatten `df.results`, sort ascending on element 1 of each result and keep the first `k`.
        # NOTE(review): element 1 is presumably a distance/score -- confirm against `df_search_faiss`.
        transform_chain(
            lambda df: sorted(flatten_list(df.results), key=lambda hit: hit[1])[:k],
        ),
    ]
    query_pipeline = SimpleSequentialChain(
        chains=steps, input_key=input_key, output_key=output_key, verbose=verbose
    )
    return ray_chain(
        query_pipeline, block_size=block_size, num_cpus=num_cpus, num_gpus=num_gpus
    )

In [None]:
# | export


def pdf_faiss_chain2(
    ems_model,  # The SentenceTransformer model to use for vectorstore embeddings.
    index_folder,  # The folder to store the FAISS index.
    index_name,  # The name of the FAISS index.
    input_key="pdf_folder",  # The input key for the PDF folder.
    output_key="df",  # The output key for the final DataFrame.
    chunk_size=200,  # The number of characters per Document.
    chunk_overlap=20,  # The number of characters to overlap between Documents.
    docs_block_size=1500,  # The number of Documents to process in a single Ray task.
    cats_model=None,  # The HuggingFace model to use for categorization.
    ems_chain_model=None,  # The SentenceTransformer model to use for chain embeddings.
    ems_folder=None,  # The folder to store the embeddings.
    verbose=False,
):
    """
    Chain that takes a PDF folder and adds its Documents to FAISS indexes in `index_folder`, with optional categorization and chain embeddings.
    If there are more than `docs_block_size` Documents, they will be divided and distributed into multiple indexes using Ray.
    """
    # Stage 1: PDF folder -> DataFrame of Documents.
    docs_stage = pdf_docs_chain(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap, input_key=input_key
    )

    # Stage 2: optional categorization and embeddings steps, always followed by FAISS indexing.
    index_chains = [pdf_cats_chain(cats_model)] if cats_model is not None else []
    # The embeddings step needs both a model and a destination folder.
    if not (ems_folder is None or ems_chain_model is None):
        index_chains += [pdf_ems_chain(ems_chain_model, ems_folder)]
    index_chains += [docs_faiss_chain(ems_model, index_folder, index_name)]

    index_stage = ray_chain(
        SimpleSequentialChain(chains=index_chains), block_size=docs_block_size, cuda=True
    )
    return SimpleSequentialChain(
        chains=[docs_stage, index_stage],
        input_key=input_key,
        output_key=output_key,
        verbose=verbose,
    )

## Usage Example

First, we load our embeddings model using LangChain's `SentenceTransformerEmbeddings`.

In [None]:
# | eval: false

# Device forwarded to the embeddings model through `model_kwargs`.
device = "cuda"
# Model identifier given to `SentenceTransformerEmbeddings`.
model_name = "HamzaFarhan/PDFSegs"

# Embeddings model passed as `ems_model` to the chains created below.
ems_model = SentenceTransformerEmbeddings(
    model_name=model_name, model_kwargs={"device": device}
)

Then we define the `index_folder` and `index_name`.

In [None]:
# | eval: false
# | output: false


# NOTE: machine-specific absolute path -- point this at a folder on your own system.
data_folder = Path("/media/hamza/data2/faiss_data/")
# Folder that holds the saved FAISS indexes.
index_folder = data_folder / "saved_indexes"
# Name of the FAISS index; it is also the filename prefix used for cleanup globbing in this notebook.
index_name = "chain_index"

In [None]:
# | hide
# | eval: false


# Delete every file in `index_folder` whose name starts with `index_name`,
# so the example starts without index files from a previous run.
for f in index_folder.glob(f"{index_name}*"):
    f.unlink()

Then we create a chain for creating FAISS index(es).

<br>We're using job resumes in our example and we want to split the text into chunks of 3 lines. A job resume typically has 60-80 characters per line, so we set `chunk_size` to 200. So for each PDF, we'll have (number of lines / 3) `Documents`.

<br>Also, let's suppose we have thousands of extracted `Documents` and we want to parallelize the indexing process.
<br>That's where `docs_block_size` comes in. It's the number of `Documents` processed by a single `Ray` task; the blocks are indexed in parallel, and each task creates a separate FAISS index.
<br>You can pass the `num_cpus` and `num_gpus` arguments to specify the number of CPUs and GPUs to use for indexing. Those resources will be distributed evenly across the tasks.


In [None]:
# | eval: false
# | output: false


# Reused by every chain built in this notebook.
verbose = True

# Chain that goes from a PDF folder to FAISS index(es) in `index_folder`.
faiss_chain = pdf_faiss_chain(
    ems_model=ems_model,
    index_folder=index_folder,
    index_name=index_name,
    chunk_size=200,
    chunk_overlap=20,
    docs_block_size=1500,
    num_cpus=4,
    num_gpus=0.4,
    verbose=verbose,
)

Let's run the chain on a sample folder of 5 PDFs.

In [None]:
# | eval: false

# Relative path to the sample folder of PDFs used in this example.
pdf_folder = Path("../../resumes_5/")

# Run the chain; the result is the DataFrame of extracted Documents.
faiss_df = faiss_chain.run(pdf_folder)



[1m> Entering new  chain...[0m
[36;1m[1;3m                                                   doc
0    page_content='Kirtan Shah (647) 997-9805 || ki...
1    page_content='both structured and unstructured...
2    page_content='Well-versed in database design &...
3    page_content='Built automated scripts to verif...
4    page_content='Databases & Libraries: Snowﬂake,...
..                                                 ...
143  page_content='specifications. \n Developed sec...
144  page_content='Developed the front end using HT...
145  page_content='Designed the databases and creat...
146  page_content='SQL Server. \n Configure Java se...
147  page_content='Jawaharlal Nehru Technological U...

[148 rows x 1 columns][0m
[33;1m[1;3m                                                   doc
0    page_content='Kirtan Shah (647) 997-9805 || ki...
1    page_content='both structured and unstructured...
2    page_content='Well-versed in database design &...
3    page_content='Built automa

The chain returned a DataFrame with the extracted `Documents`.
<br>Let's look at one of the extracted `Documents`.

In [None]:
# | eval: false

# Take the Document stored in the `doc` column of the second row and print it.
doc = faiss_df.iloc[1].doc
print_doc(doc)

[1mPage_Content:[0m both structured and unstructured datasets.

[1mMetadata:[0m {'source': '../../resumes_5/0cf20170-8051-41ba-9060-1a82d43f4289.pdf', 'page': 0, 'start_index': 171}



In [None]:
# | eval: false

# Number of rows (one per extracted Document) in the returned DataFrame.
print(len(faiss_df))

148


There were only 148 `Documents`, which is fewer than the `docs_block_size` of 1500, so Ray was not used. We can lower `docs_block_size` to force Ray to be used.

In [None]:
# | hide
# | eval: false


# Delete the index files written by the previous run (names starting with
# `index_name`) before indexing again.
for f in index_folder.glob(f"{index_name}*"):
    f.unlink()

In [None]:
# | eval: false

# Same configuration as the first chain except for a smaller `docs_block_size`.
faiss_chain2 = pdf_faiss_chain(
    ems_model=ems_model,
    index_folder=index_folder,
    index_name=index_name,
    chunk_size=200,
    chunk_overlap=20,
    docs_block_size=50,  # Changed: lowered from 1500 so the 148 Documents span several blocks.
    num_cpus=4,
    num_gpus=0.4,
    verbose=verbose,
)

In [None]:
# | eval: false
# | output: false


# Run the smaller-block chain on the same PDF folder.
faiss_df2 = faiss_chain2.run(pdf_folder)



[1m> Entering new  chain...[0m
[36;1m[1;3m                                                   doc
0    page_content='Kirtan Shah (647) 997-9805 || ki...
1    page_content='both structured and unstructured...
2    page_content='Well-versed in database design &...
3    page_content='Built automated scripts to verif...
4    page_content='Databases & Libraries: Snowﬂake,...
..                                                 ...
143  page_content='specifications. \n Developed sec...
144  page_content='Developed the front end using HT...
145  page_content='Designed the databases and creat...
146  page_content='SQL Server. \n Configure Java se...
147  page_content='Jawaharlal Nehru Technological U...

[148 rows x 1 columns][0m
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLEL

2023-07-07 22:46:12,379	INFO worker.py:1627 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)

[38;5;4mℹ Running chain on 3 blocks.[0m




Learn more here: https://docs.ray.io/en/master/data/faq.html#migrating-to-strict-mode[0m
2023-07-07 22:46:14,353	INFO streaming_executor.py:91 -- Executing DAG InputDataBuffer[Input] -> AllToAllOperator[Repartition] -> TaskPoolMapOperator[MapBatches(<lambda>)]
2023-07-07 22:46:14,354	INFO streaming_executor.py:92 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-07-07 22:46:14,354	INFO streaming_executor.py:94 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


- Repartition 1:   0%|          | 0/3 [00:00<?, ?it/s]

Repartition 2:   0%|          | 0/3 [00:00<?, ?it/s]

Running 0:   0%|          | 0/3 [00:00<?, ?it/s]

2023-07-07 22:46:21,307	INFO streaming_executor.py:149 -- Shutting down <StreamingExecutor(Thread-8, stopped daemon 140531103418112)>.


[33;1m[1;3m                                                   doc
0    page_content='Kirtan Shah (647) 997-9805 || ki...
1    page_content='both structured and unstructured...
2    page_content='Well-versed in database design &...
3    page_content='Built automated scripts to verif...
4    page_content='Databases & Libraries: Snowﬂake,...
..                                                 ...
143  page_content='specifications. \n Developed sec...
144  page_content='Developed the front end using HT...
145  page_content='Designed the databases and creat...
146  page_content='SQL Server. \n Configure Java se...
147  page_content='Jawaharlal Nehru Technological U...

[148 rows x 1 columns][0m

[1m> Finished chain.[0m


It's that simple! We can now use the FAISS indexes to search for similar Documents.

Create an `index_query_chain`.

In [None]:
# | eval: false

# Chain that searches the FAISS indexes named `index_name` in `index_folder`
# and returns the top `k` results for a query.
query_chain = index_query_chain(
    ems_model=ems_model,
    index_folder=index_folder,
    index_name=index_name,
    k=2,
    block_size=10,
    num_cpus=4,
    num_gpus=0.4,
    verbose=verbose,
)

In [None]:
# | eval: false

# Free-text query to search for in the indexed Documents.
query = "I got my degree from the University of Toronto"
search_res = query_chain.run(query)



[1m> Entering new  chain...[0m
[36;1m[1;3m                                  index_folder     index_name  \
0  /media/hamza/data2/faiss_data/saved_indexes  chain_index_2   
1  /media/hamza/data2/faiss_data/saved_indexes    chain_index   
2  /media/hamza/data2/faiss_data/saved_indexes  chain_index_1   

                                            query  
0  I got my degree from the University of Toronto  
1  I got my degree from the University of Toronto  
2  I got my degree from the University of Toronto  [0m
[33;1m[1;3m                                  index_folder     index_name  \
0  /media/hamza/data2/faiss_data/saved_indexes  chain_index_2   
1  /media/hamza/data2/faiss_data/saved_indexes    chain_index   
2  /media/hamza/data2/faiss_data/saved_indexes  chain_index_1   

                                            query  \
0  I got my degree from the University of Toronto   
1  I got my degree from the University of Toronto   
2  I got my degree from the University of Toro

In [None]:
#| eval: false

print("Search Results:\n")
for doc in search_res:
    # Separator line between results.
    print(f"+{'-'*100}+")
    print()
    # Each result is indexable; element 0 is what gets printed as a Document.
    print_doc(doc[0])

Search Results:

+----------------------------------------------------------------------------------------------------+

[1mPage_Content:[0m Bachelor of Commerce (B. Com) - University of Mumbai 2008 - 2011

[1mMetadata:[0m {'source': '../../resumes_5/0cf20170-8051-41ba-9060-1a82d43f4289.pdf', 'page': 0, 'start_index': 3474}

+----------------------------------------------------------------------------------------------------+

[1mPage_Content:[0m in 1997 
 
 B.A. from Punjab University, Lahore 
in 1991 
 
 
CE R T I F I C A T I O N S :
 
 
 CTLP (Certified Trade & Logistics 
Professional) from Dubai World, 
Dubai - UAE in 2012

[1mMetadata:[0m {'source': '../../resumes_5/0f479ee8-5fd9-4f55-b254-5e8feef08038.pdf', 'page': 0, 'start_index': 356}



In [None]:
# | hide
import nbdev

# Export this notebook's `# | export` cells to the library module.
nbdev.nbdev_export()

[2m[33m(raylet)[0m [2023-07-07 22:46:22,344 E 499132 499144] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-07-07_22-46-10_723096_498908 is over 95% full, available space: 23693135872; capacity: 502392610816. Object creation will fail if spilling is required.
[2m[33m(raylet)[0m [2023-07-07 22:46:32,356 E 499132 499144] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-07-07_22-46-10_723096_498908 is over 95% full, available space: 23692996608; capacity: 502392610816. Object creation will fail if spilling is required.
[2m[33m(raylet)[0m [2023-07-07 22:46:42,367 E 499132 499144] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-07-07_22-46-10_723096_498908 is over 95% full, available space: 23692918784; capacity: 502392610816. Object creation will fail if spilling is required.
[2m[33m(raylet)[0m [2023-07-07 22:46:52,380 E 499132 499144] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-07-07_22-46-10_723096_498908 is over 95% full, avail