# Mapper

In [None]:
# | default_exp mapper


In [None]:
# | export

from dreamai_ray.imports import *
from dreamai_ray.utils import *

In [None]:
#| hide

%load_ext autoreload
%autoreload 2
%reload_ext autoreload

In [None]:
# | export


class Callback:
    """Base class for callbacks for `Mapper`.

    Every hook is a no-op that accepts arbitrary keyword arguments and
    returns `None`; subclasses override only the hooks they care about.
    """

    def before_init(self, **kwargs):
        """Hook fired at the start of `Mapper.__init__`, before attributes are stored."""

    def after_init(self, **kwargs):
        """Hook fired at the end of `Mapper.__init__`, after attributes are stored."""

    def before_batch(self, **kwargs):
        """Hook fired once per `Mapper.__call__`, before the batch is mapped."""

    def before_batch_rows(self, **kwargs):
        """Hook fired once for every row of the batch, before the batch is mapped."""

    def after_batch_rows(self, **kwargs):
        """Hook fired once for every row of the mapped batch."""

    def after_batch(self, **kwargs):
        """Hook fired once per `Mapper.__call__`, after the batch has been mapped."""


class msg_cb(Callback):
    """A `Callback` that prints selected attributes of the calling object.

    `target` is an iterable of attribute names. On every overridden hook,
    each name that exists on `cls` is reported through `msg.info`; names
    that are missing are silently skipped. `after_init` is not overridden,
    so nothing is printed at that point.
    """

    def __init__(self, target=None):
        # Build the default per instance: a list literal in the signature
        # would be one shared object mutated across all `msg_cb` instances.
        self.target = [""] if target is None else target

    def printer(self, cls, **kwargs):
        "Print every attribute of `cls` whose name is listed in `self.target`."
        for t in self.target:
            if hasattr(cls, t):
                msg.info(f"MSG CB: {getattr(cls, t)}", spaced=True)

    def before_init(self, cls, **kwargs):
        self.printer(cls, **kwargs)

    def before_batch(self, cls, **kwargs):
        self.printer(cls, **kwargs)

    def before_batch_rows(self, cls, **kwargs):
        self.printer(cls, **kwargs)

    def after_batch_rows(self, cls, **kwargs):
        self.printer(cls, **kwargs)

    def after_batch(self, cls, **kwargs):
        self.printer(cls, **kwargs)


class block_counter_cb(Callback):
    """A `Callback` to count the number of blocks.

    The count is kept on the calling object as `cls.block_counter`: it is
    reset to 0 in `before_init` and incremented in `after_batch`.
    """

    def __init__(self, verbose=True):
        self.verbose = verbose

    def _announce(self, cls):
        "Print the current counter value unless this callback was created with `verbose=False`."
        if not self.verbose:
            return
        msg.info(f"BLOCK COUNTER: {cls.block_counter}", spaced=True)

    # The `verbose` keyword of the hooks below is accepted but never read;
    # printing is governed by `self.verbose` alone.
    def before_init(self, cls, verbose=True, **kwargs):
        cls.block_counter = 0
        self._announce(cls)

    def after_batch(self, cls, verbose=True, **kwargs):
        cls.block_counter += 1
        self._announce(cls)


class msg_bs_cb(Callback):
    """A `Callback` to print the batch size.

    Reports `len(df)` through `msg.info` before each batch is mapped.
    """

    def before_batch(self, df, **kwargs):
        batch_size = len(df)
        msg.info(f"DF BATCH SIZE: {batch_size}", spaced=True)


def init_cb(cb):
    """Return a callback instance for `cb`.

    If `cb` is a class it is instantiated with no arguments; anything else
    is assumed to already be an instance and is returned unchanged.
    """
    # `isinstance(cb, type)` also recognises classes created by a custom
    # metaclass (e.g. ABC-based callbacks); comparing `type(cb).__name__`
    # to "type" only matched classes whose metaclass is exactly `type`.
    return cb() if isinstance(cb, type) else cb


def init_cbs(cbs):
    "Return a new list with `init_cb` applied to every entry of `cbs`, in order."
    return list(map(init_cb, cbs))


def cbs_before_init(cbs, **kwargs):
    """Call the `before_init` hook of every callback in `cbs`, in order.

    Each entry is passed through `init_cb` first, so callback classes are
    instantiated on the fly. `kwargs` are forwarded to every hook.
    """
    # A plain loop: the hooks are run for their side effects only, so there
    # is no reason to build a throwaway list of their return values.
    for cb in cbs:
        init_cb(cb).before_init(**kwargs)


def cbs_after_init(cbs, **kwargs):
    """Call the `after_init` hook of every callback in `cbs`, in order.

    Each entry is passed through `init_cb` first, so callback classes are
    instantiated on the fly. `kwargs` are forwarded to every hook.
    """
    # A plain loop: the hooks are run for their side effects only, so there
    # is no reason to build a throwaway list of their return values.
    for cb in cbs:
        init_cb(cb).after_init(**kwargs)


def cbs_before_batch(cbs, **kwargs):
    """Call the `before_batch` hook of every callback in `cbs`, in order.

    Each entry is passed through `init_cb` first, so callback classes are
    instantiated on the fly. `kwargs` are forwarded to every hook.
    """
    # A plain loop: the hooks are run for their side effects only, so there
    # is no reason to build a throwaway list of their return values.
    for cb in cbs:
        init_cb(cb).before_batch(**kwargs)


def cbs_before_batch_rows(cbs, df, **kwargs):
    """Call the `before_batch_rows` hook of every callback once per row of `df`.

    `df` is used only for its length: neither the frame nor the individual
    rows are forwarded, so each hook receives just `kwargs`. For every row,
    all callbacks run in the order given by `cbs`. Entries are passed
    through `init_cb` on every call, so callback classes are instantiated
    anew each time.
    """
    # Plain loops: the hooks are run for their side effects only, so there
    # is no reason to build a throwaway list of their return values.
    for _ in range(len(df)):
        for cb in cbs:
            init_cb(cb).before_batch_rows(**kwargs)


def cbs_after_batch_rows(cbs, df, **kwargs):
    """Call the `after_batch_rows` hook of every callback once per row of `df`.

    `df` is used only for its length: neither the frame nor the individual
    rows are forwarded, so each hook receives just `kwargs`. For every row,
    all callbacks run in the order given by `cbs`. Entries are passed
    through `init_cb` on every call, so callback classes are instantiated
    anew each time.
    """
    # Plain loops: the hooks are run for their side effects only, so there
    # is no reason to build a throwaway list of their return values.
    for _ in range(len(df)):
        for cb in cbs:
            init_cb(cb).after_batch_rows(**kwargs)


def cbs_after_batch(cbs, **kwargs):
    """Call the `after_batch` hook of every callback in `cbs`, in order.

    Each entry is passed through `init_cb` first, so callback classes are
    instantiated on the fly. `kwargs` are forwarded to every hook.
    """
    # A plain loop: the hooks are run for their side effects only, so there
    # is no reason to build a throwaway list of their return values.
    for cb in cbs:
        init_cb(cb).after_batch(**kwargs)


class Mapper:
    """
    A class to map a function over the rows of a dataframe.
    The `udf` is applied to every row via `DataFrame.apply(axis=1)`, so it
    receives one row at a time and should return the (possibly extended) row.
    Calling the mapper on a dataframe runs the callbacks around the mapping
    and returns the resulting dataframe.
    """

    def __init__(
        self,
        udf=noop,  # A function that takes a dataframe row as input and returns the updated row.
        udf_kwargs=None,  # The keyword arguments to pass to the `udf`. `None` means no extra arguments.
        cbs=None,  # A list of `Callback`s to run before and after the mapping. It will add `block_counter_cb` and `msg_bs_cb` by default.
        **kwargs,
    ):
        # NOTE: `locals()` is forwarded to `store_attr` below, so the local
        # variable names in this method are kept stable on purpose.

        # Fresh containers per instance instead of mutable defaults in the
        # signature, which would be shared by every `Mapper`.
        udf_kwargs = {} if udf_kwargs is None else udf_kwargs
        cbs = [] if cbs is None else list(cbs)
        # cbs = init_cbs(cbs)
        # Callbacks may be given as instances or as classes; use the
        # callback's own class name in both cases so a class entry is not
        # mistaken for "type" and duplicated by a default below.
        cb_types = [
            cb.__name__ if isinstance(cb, type) else type(cb).__name__ for cb in cbs
        ]
        default_cbs = []
        if block_counter_cb.__name__ not in cb_types:
            default_cbs.append(block_counter_cb())
        if msg_bs_cb.__name__ not in cb_types:
            default_cbs.append(msg_bs_cb())
        # Defaults run first, followed by the user-supplied callbacks.
        cbs = default_cbs + cbs
        cbs_before_init(cbs, cls=self)
        # Bind the extra keyword arguments once so `map` can call `udf(row)`.
        udf = partial(udf, **udf_kwargs)
        # Presumably stores the constructor locals as attributes; `self.udf`
        # and `self.cbs` are read by `map` and `__call__` -- confirm against
        # `store_attr` / `locals_to_params`.
        store_attr(**locals_to_params(locals()))
        cbs_after_init(cbs, cls=self)

    def map(self, df):
        "Apply `self.udf` to every row of `df` and expand the results into columns."
        return df.apply(self.udf, axis=1, result_type="expand")

    def __call__(self, df):
        "Run the before-hooks, map `df`, run the after-hooks on the result and return it."
        cbs_before_batch(self.cbs, cls=self, df=df)
        cbs_before_batch_rows(self.cbs, cls=self, df=df)

        df = self.map(df)

        # The after-hooks see the mapped dataframe, not the input one.
        cbs_after_batch_rows(self.cbs, cls=self, df=df)
        cbs_after_batch(self.cbs, cls=self, df=df)

        return df

## Usage Example

In [None]:
# Example input: two rows, each holding a list of sentences in the "text" column.
df = pd.DataFrame(
    {
        "text": [
            ["I worked at Google", "I studied at Harvard"],
            ["I worked at Facebook", "I studied at MIT"],
        ]
    }
)


def ander(df):
    """
    Example user defined function: concatenate the strings stored under "text"
    with " and " in between, store the result under "joined" and hand the
    same object back. `Mapper` calls it once per row, so `df` is a single row here.
    """
    separator = " and "
    df["joined"] = separator.join(df["text"])
    return df


# Wrap `ander` in a `Mapper` (the default callbacks are added automatically),
# apply it to the example frame, and display the result.
m = Mapper(udf=ander)
df = m(df)
df



[38;5;4mℹ BLOCK COUNTER: 0[0m


[38;5;4mℹ DF BATCH SIZE: 2[0m


[38;5;4mℹ BLOCK COUNTER: 1[0m



Unnamed: 0,text,joined
0,"[I worked at Google, I studied at Harvard]",I worked at Google and I studied at Harvard
1,"[I worked at Facebook, I studied at MIT]",I worked at Facebook and I studied at MIT


In [None]:
# | hide

# Input records: each has a "text" list of sentences.
d = [
    {"text": ["I worked at Google", "I studied at Harvard"]},
    {"text": ["I worked at Facebook", "I studied at MIT"]},
]

# The same records with a "joined" entry holding the sentences joined by " and ".
d2 = [
    {
        "text": ["I worked at Google", "I studied at Harvard"],
        "joined": "I worked at Google and I studied at Harvard",
    },
    {
        "text": ["I worked at Facebook", "I studied at MIT"],
        "joined": "I worked at Facebook and I studied at MIT",
    },
]


def and_fn(d):
    "Placeholder to be filled in; as written it returns `d` unchanged."
    # Enter your code below that modifies `d` and returns it.

    return d


In [None]:
# | hide

# Run nbdev's export step for this notebook (presumably writes the cells
# marked `# | export` to the module named by `default_exp` -- see nbdev docs).
import nbdev

nbdev.nbdev_export()