# Chains

> Chain-based functions for PDFs.

In [None]:
# | default_exp pdf.chains

In [None]:
# | export

from langchain_ray.imports import *
from langchain_ray.chains import *
from langchain_ray.utils import *
from langchain_ray.pdf.utils import *

In [None]:
#| hide

# Development convenience: enable IPython's autoreload extension in mode 2 so
# edited modules are re-imported before each cell runs. The final
# `%reload_ext` re-loads the extension if it was already loaded.
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

In [None]:
# | export


def pdf_to_docs_chain(
    splitter=None,
    chunk_size=200,
    chunk_overlap=20,
    separators=["\n\n", "\n", r"(?<=\. )", " ", ""],
    add_start_index=True,
    proc=True,
    input_variables=["path"],
    output_variables=["docs"],
    verbose=False,
):
    """Chain that returns a list of `Documents` extracted from a PDF path.

    The path can be a single PDF path or a list of paths or a directory path.

    The chain is built with `transform_chain` around `pdf_to_docs`.

    Args:
        splitter: Forwarded unchanged to `pdf_to_docs` (default `None`).
        chunk_size: Forwarded unchanged to `pdf_to_docs`.
        chunk_overlap: Forwarded unchanged to `pdf_to_docs`.
        separators: Forwarded unchanged to `pdf_to_docs`. One default entry
            is a regex lookbehind for a period followed by a space.
        add_start_index: Forwarded unchanged to `pdf_to_docs`.
        proc: Forwarded unchanged to `pdf_to_docs`.
        input_variables: Input keys of the chain. The first entry is mapped
            to the `path` keyword of `pdf_to_docs`, so it must be non-empty.
        output_variables: Output keys of the chain, passed to
            `transform_chain`.
        verbose: Passed to `transform_chain`.

    Returns:
        Whatever `transform_chain` returns for this configuration.
    """
    # NOTE: the regex separator default is written as a raw string. The
    # previous non-raw form relied on the invalid escape `\.` being kept
    # verbatim, which emits a SyntaxWarning on recent Python versions. The
    # runtime value of the default is unchanged.
    # NOTE(review): the list defaults are shared between calls; this function
    # only reads them, but confirm downstream code does not mutate them.
    return transform_chain(
        pdf_to_docs,
        transform_kwargs=dict(
            splitter=splitter,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=separators,
            add_start_index=add_start_index,
            proc=proc,
        ),
        # Route the chain's first input key to `pdf_to_docs`'s `path` argument.
        vars_kwargs_mapping={input_variables[0]: "path"},
        input_variables=input_variables,
        output_variables=output_variables,
        verbose=verbose,
    )


def add_cats_to_docs_chain(
    cats_model,
    input_variables=["docs"],
    output_variables=["cat_docs"],
    verbose=False,
):
    """Chain that adds the categories to a list of `Documents` using `cats_model`.

    The chain is built with `transform_chain` around `add_cats_to_docs`. The
    first entry of `input_variables` is mapped to that function's `docs`
    keyword, and `cats_model` is supplied as a fixed keyword argument.
    """
    # Collect the fixed keyword arguments for the wrapped function first.
    fixed_kwargs = {"cats_model": cats_model}
    # The chain's first input key feeds the `docs` argument.
    key_mapping = {input_variables[0]: "docs"}
    return transform_chain(
        add_cats_to_docs,
        transform_kwargs=fixed_kwargs,
        input_variables=input_variables,
        output_variables=output_variables,
        vars_kwargs_mapping=key_mapping,
        verbose=verbose,
    )

In [None]:
# | eval: false

# Demo configuration shared by the cells below.
verbose = True
# Load the pretrained SetFit model and move it to the first CUDA device.
cats_model = SetFitModel.from_pretrained("HamzaFarhan/PDFSegs").to("cuda:0")
# Folder later passed as `json_folder` to `docs_to_json_chain`.
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
cats_folder = "/media/hamza/data2/faiss_data/saved_cats"
# Start clean: remove any output left in the folder by a previous run.
if os.path.exists(cats_folder):
    shutil.rmtree(cats_folder)

In [None]:
# | eval: false

# Step 1: PDF path(s) -> "docs".
chain1 = pdf_to_docs_chain(
    input_variables=["path"], output_variables=["docs"], verbose=verbose
)
# Step 2: "docs" -> "cat_docs", adding categories with `cats_model`.
chain2 = add_cats_to_docs_chain(
    cats_model=cats_model,
    input_variables=["docs"],
    output_variables=["cat_docs"],
    verbose=verbose,
)
# Wrap step 2 with `ray_chain`, rebinding `chain2` to the wrapped chain.
# NOTE(review): presumably this runs the chain over blocks on Ray workers
# with the given CPU/GPU resources - confirm against `ray_chain`.
chain2 = ray_chain(
    chain=chain2,
    block_size=2,
    num_cpus=6,
    num_gpus=0.6,
    verbose=verbose,
)
# Step 3: "cat_docs" -> "json_docs", using `cats_folder` as the JSON folder.
chain3 = docs_to_json_chain(
    json_folder=cats_folder,
    input_variables=["cat_docs"],
    output_variables=["json_docs"],
    verbose=verbose,
)
# Compose the three steps: takes "path" as input and exposes only "json_docs".
chain = SequentialChain(
    chains=[chain1, chain2, chain3],
    input_variables=["path"],
    output_variables=["json_docs"],
    verbose=verbose,
)

In [None]:
# | eval: false

# Directory containing the PDFs to process; supplied as the chain's `path` input.
pdf = "../../resumes_5/"
# Run the full pipeline; the result is read via its "json_docs" key below.
res = chain(dict(path=pdf))



[1m> Entering new SequentialChain chain...[0m


[1m> Entering new TransformChain chain...[0m

[1m> Finished chain.[0m


[1m> Entering new TransformChain chain...[0m


2023-08-12 02:13:21,472	INFO worker.py:1612 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8266 [39m[22m



[38;5;4mℹ Running chain on 3 blocks.[0m



2023-08-12 02:13:23,593	INFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> AllToAllOperator[Repartition] -> TaskPoolMapOperator[MapBatches(<lambda>)]
2023-08-12 02:13:23,594	INFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-08-12 02:13:23,595	INFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


- Repartition 1:   0%|          | 0/3 [00:00<?, ?it/s]

Split Repartition 2:   0%|          | 0/3 [00:00<?, ?it/s]

Running 0:   0%|          | 0/3 [00:00<?, ?it/s]

[2m[36m(MapBatches(<lambda>) pid=122934)[0m 
[2m[36m(MapBatches(<lambda>) pid=122934)[0m 
[2m[36m(MapBatches(<lambda>) pid=122934)[0m [1m> Entering new TransformChain chain...[0m
[2m[36m(MapBatches(<lambda>) pid=122934)[0m [1m> Finished chain.[0m

[1m> Finished chain.[0m


[1m> Entering new TransformChain chain...[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m


In [None]:
# | eval: false

# Inspect one result: the last item of the first "json_docs" entry.
doc = res["json_docs"][0][-1]
# Show its metadata, then its text content.
print(doc.metadata)
print(doc.page_content)

{'source': '../../resumes_5/0bedb223-262c-4388-9756-093dd7905428.pdf', 'page': 1, 'start_index': 1327, 'category': 'Work Experience'}
* red hat openshift workshop - containers and kubernetes for developer
* google cloud fundamentals: core infrastructure


In [None]:
# | hide
# Run nbdev's export step for this project.
# NOTE(review): presumably this writes the `# | export` cells to the module
# named by `default_exp` - confirm against the nbdev configuration.
import nbdev

nbdev.nbdev_export()