# Mappers

> Mapper classes for PDF information extraction.

In [None]:
# | default_exp pdf.mappers

In [None]:
# | export

from dreamai_ray.imports import *
from dreamai_ray.utils import *
from dreamai_ray.mapper import *
from dreamai_ray.pdf.extract import *
from dreamai_ray.pdf.df import *


In [None]:
#| hide

%load_ext autoreload
%autoreload 2
%reload_ext autoreload

In [None]:
# | export


class SegsMapper(Mapper):
    """
    A custom mapper for PDF segmentation. It uses the `df_segs` function to extract the segments from the PDF.

    The segmentation model named by `segs_model` is loaded once, at construction time,
    and handed to the `udf` through the `"segs_model"` entry of `udf_kwargs`.
    Any extra `**kwargs` are forwarded to `Mapper.__init__`.
    """

    def __init__(
        self,
        segs_model="HamzaFarhan/PDFSegs",  # The name of the model to use for segmentation.
        udf=df_segs,  # The function to use for extracting the segments.
        udf_kwargs=dict(  # The keyword arguments to pass to the `udf`.
            thresh=0.6,
            classes=["Work Experience", "Education", "Certifications", "Other"],
            other_class="Other",
        ),
        **kwargs,
    ):
        # Replace the model name with the loaded model, placed on the default device.
        segs_model = load_segs_model(segs_model, device=default_device())
        # Build a *new* dict instead of assigning into `udf_kwargs`: the default dict is
        # created once at definition time and shared by every instance, and a dict passed
        # in by the caller should not be modified behind their back. `None` is treated as
        # "no extra keyword arguments". (Shallow copy: the values themselves are shared.)
        udf_kwargs = {**(udf_kwargs or {}), "segs_model": segs_model}
        # NOTE: every local defined above is forwarded via `locals()`, so do not introduce
        # additional local names in this method without checking `locals_to_params`.
        super().__init__(**locals_to_params(locals()))


## Usage Example

In [None]:
# | eval: false

# Build a toy DataFrame: one row per document, where `text` holds that
# document's list of text chunks to be classified into segments.
df = pd.DataFrame(
    {
        "text": [
            [
                "I worked at Google",
                "I studied at Harvard",
                "I a have a google cloud certification",
            ],
            [
                "I worked at Facebook",
                "I studied at MIT",
                "I have a salesforce certification",
            ],
        ]
    }
)

# Instantiating the mapper with defaults loads the default segmentation model.
m = SegsMapper()
# Calling the mapper on the DataFrame returns it with the extra
# `segs`, `preds` and `probs` columns shown in the output below.
df = m(df)
df


[38;5;4mℹ DF BATCH SIZE: 2[0m



Unnamed: 0,text,segs,preds,probs
0,"[I worked at Google, I studied at Harvard, I a have a google cloud certification]","{'Work Experience': ['I worked at Google'], 'Education': ['I studied at Harvard'], 'Certifications': ['I a have a google cloud certification'], 'Other': []}","[Work Experience, Education, Certifications]","[0.98256487, 0.98468, 0.9786084]"
1,"[I worked at Facebook, I studied at MIT, I have a salesforce certification]","{'Work Experience': ['I worked at Facebook'], 'Education': ['I studied at MIT'], 'Certifications': [' I have a salesforce certification'], 'Other': []}","[Work Experience, Education, Certifications]","[0.9836285, 0.98444146, 0.9522135]"


In [None]:
# | eval: false

# Pretty-print the segments dict of the first row: each class name with its
# item count, followed by the texts assigned to it.
print_segments(df["segs"][0])

Work Experience: 1
	'I worked at Google'

--------------------------------------------------------------------------------------------------------
Education: 1
	'I studied at Harvard'

--------------------------------------------------------------------------------------------------------
Certifications: 1
	'I a have a google cloud certification'

--------------------------------------------------------------------------------------------------------
Other: 0
--------------------------------------------------------------------------------------------------------


In [None]:
# | hide

import nbdev

# Write the cells marked `# | export` out to the library module
# (this notebook declares `default_exp pdf.mappers`).
nbdev.nbdev_export()
