# PDF Mappers

In [None]:
# | default_exp pdf.mappers


In [None]:
# | export

from dreamai_ray.imports import *
from dreamai_ray.utils import *
from dreamai_ray.mapper import *
from dreamai_ray.pdf.extract import *
from dreamai_ray.pdf.df import *

In [None]:
#| hide

%load_ext autoreload
%autoreload 2
%reload_ext autoreload

In [None]:
# | export


class SegsMapper(Mapper):
    """
    A custom mapper for PDF segmentation. It uses the `df_segs` function to extract the segments from the PDF.

    The segmentation model is loaded once, when the mapper is constructed, and is
    handed to the `udf` through its keyword arguments under the key "segs_model".
    """

    def __init__(
        self,
        segs_model="HamzaFarhan/PDFSegs",
        udf=df_segs,
        udf_kwargs=dict(
            thresh=0.6,
            classes=["Work Experience", "Education", "Certifications", "Other"],
            other_class="Other",
        ),
        **kwargs,
    ):
        """
        Parameters
        ----------
        segs_model : str, optional
            The name of the model to use for segmentation, by default "HamzaFarhan/PDFSegs".
            It is passed to `load_segs_model` together with `default_device()`.
        udf : function, optional
            The function to use for extracting the segments, by default df_segs
        udf_kwargs : dict, optional
            The keyword arguments to pass to the `udf` function. The dict is not
            modified; a copy with the loaded model added as "segs_model" is used.
        **kwargs
            Forwarded to `Mapper.__init__` via `locals_to_params`.
        """
        segs_model = load_segs_model(segs_model, device=default_device())
        # Merge into a NEW dict instead of assigning into `udf_kwargs` in place.
        # The default `udf_kwargs` is a single dict created when the class is
        # defined; writing into it would store the loaded model in that shared
        # default (and in any dict supplied by the caller).
        # NOTE: deliberately rebinding the existing name rather than adding a new
        # local, because every local here is forwarded through `locals()` below.
        udf_kwargs = {**udf_kwargs, "segs_model": segs_model}
        super().__init__(**locals_to_params(locals()))

In [None]:
# | eval: false

# Two sample documents; each row of the "text" column holds that document's
# list of text lines to be segmented.
sample_texts = [
    ["I worked at Google", "I studied at Harvard"],
    ["I worked at Facebook", "I studied at MIT"],
]
df = pd.DataFrame({"text": sample_texts})

# Build the mapper with its default settings and apply it to the frame.
m = SegsMapper()
df = m(df)

# Display the mapped frame as the cell result.
df


ℹ DF BATCH SIZE: 2



Unnamed: 0,text,segs,preds,probs
0,"[I worked at Google, I studied at Harvard]","{'Work Experience': ['I worked at Google'], 'Education': ['I studied at Harvard'], 'Certifications': [], 'Other': []}","[Work Experience, Education]","[0.98256487, 0.98468]"
1,"[I worked at Facebook, I studied at MIT]","{'Work Experience': ['I worked at Facebook'], 'Education': ['I studied at MIT'], 'Certifications': [], 'Other': []}","[Work Experience, Education]","[0.9836285, 0.98444146]"


In [None]:
# | hide

# Run nbdev's notebook export step. Per the nbdev documentation this writes the
# cells marked `# | export` into the library's Python modules.
import nbdev

nbdev.nbdev_export()