In [None]:
# default_exp preproc_decorator
import os
%load_ext autoreload
%autoreload 2
# NOTE(review): presumably hides every GPU so the notebook runs on CPU only.
# It is set before the m3tl imports in the cells below -- confirm the ordering matters.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"


# Preprocessing Decorator

A decorator to simplify data preprocessing

## Imports

In [None]:
# export
from typing import Any, Callable, Iterable, Generator

from loguru import logger
from fastcore.basics import chunked, listify, partial
from fastcore.parallel import num_cpus
from joblib import Parallel, delayed
import pandas as pd

from m3tl.bert_preprocessing.create_bert_features import \
    create_multimodal_bert_features
from m3tl.special_tokens import TRAIN, PREDICT
from m3tl.utils import (get_or_make_label_encoder,
                        load_transformer_tokenizer, set_is_pyspark)
from m3tl.params import Params


## Decorator utility functions

In [None]:
# export
def has_key_startswith(d: dict, prefix: str) -> bool:
    """Tell whether at least one key of `d` begins with `prefix`.

    Args:
        d (dict): dictionary whose keys are inspected
        prefix (str): prefix to look for

    Returns:
        bool: True if any key starts with `prefix`, False otherwise
            (an empty dictionary therefore gives False)
    """
    return any(key.startswith(prefix) for key in d.keys())


def convert_legacy_output(inp: Iterable) -> Generator[dict, None, None]:
    """Convert legacy preproc output records to the dictionary format.

    Each record of `inp` is either already a dictionary, which is yielded
    untouched, or a legacy `(inputs, labels)` pair, which is converted:

    - `inputs` dict without any `inputs_` key: every key gets an `inputs_` prefix
    - `inputs` dict that already has an `inputs_` key: keys are kept as they are
    - any other `inputs`: stored under `inputs_text`
    - `labels` follows the same three rules with the `labels_` prefix, a
      non-dict label being stored under `labels`

    The dictionaries found inside a legacy pair are never modified; the
    yielded record is always a new dictionary.

    Args:
        inp (Iterable): records in legacy or new format

    Yields:
        dict: record in new format
    """
    def _any_key_has_prefix(mapping: dict, prefix: str) -> bool:
        # True when at least one key of `mapping` starts with `prefix`
        return any(key.startswith(prefix) for key in mapping.keys())

    for record in inp:

        if isinstance(record, dict):
            # already in the new format
            yield record
            continue

        inputs, labels = record

        if isinstance(inputs, dict):
            if _any_key_has_prefix(inputs, 'inputs_'):
                # copy: labels are merged in below and the caller's
                # dictionary must not pick them up as a side effect
                new_format_record = dict(inputs)
            else:
                new_format_record = {'inputs_{}'.format(
                    k): v for k, v in inputs.items()}
        else:
            new_format_record = {'inputs_text': inputs}

        if isinstance(labels, dict):
            if _any_key_has_prefix(labels, 'labels_'):
                new_format_record.update(labels)
            else:
                new_format_record.update({
                    'labels_{}'.format(k): v for k, v in labels.items()
                })
        else:
            new_format_record['labels'] = labels

        yield new_format_record


def input_format_check(inp: dict, mode: str):
    """Validate one preprocessed record.

    Args:
        inp (dict): record to validate
        mode (str): running mode; label keys are not required when it
            equals PREDICT

    Raises:
        ValueError: if `inp` is not a dict, if no key starts with
            "inputs", or (outside PREDICT mode) if no key starts with "labels"
    """
    if not isinstance(inp, dict):
        raise ValueError(
            "preproc outout content should be dict, got: {}".format(type(inp)))

    has_inputs = any(name.startswith('inputs') for name in inp.keys())
    if not has_inputs:
        raise ValueError(
            'inputs should has key with prefix "inputs", keys: {}'.format(inp.keys()))

    # labels are only mandatory when the record is not used for prediction
    if mode == PREDICT:
        return

    has_labels = any(name.startswith('labels') for name in inp.keys())
    if not has_labels:
        raise ValueError(
            'inputs should has key with prefix "labels", keys: {}'.format(inp.keys()))


In [None]:
# hide
from m3tl.utils import get_phase
from m3tl.predefined_problems.test_data import generate_fake_data


# Run convert_legacy_output on several output formats of generate_fake_data.
for input_format in ['gen_dict_tuple', 'dict', 'gen_list_tuple', 'gen_dict', 'dict_tuple']:
    fake_data = generate_fake_data(output_format=input_format)
    if input_format == 'dict_tuple':
        inp, lab = fake_data
        # one dict per example; the orient must be spelled exactly 'records',
        # recent pandas versions reject misspelled/abbreviated orients
        inp = pd.DataFrame(inp).to_dict('records')
        fake_data = zip(inp, lab)
    data_iter = convert_legacy_output(fake_data)

    # convert_legacy_output is lazy: only the two formats below actually
    # pull a record out of it and have their keys checked
    if input_format in ['gen_dict_tuple']:
        assert list(next(data_iter).keys()) == [
            'inputs_text', 'inputs_array', 'inputs_cate', 'inputs_cate_modal_type', 'inputs_cate_modal_info', 'labels']

    if input_format == 'gen_dict':
        assert list(next(data_iter).keys()) == ['inputs_record_id', 'inputs_text', 'inputs_array',
                                                'inputs_cate', 'inputs_cate_modal_type', 'inputs_cate_modal_info', 'labels']


In [None]:
# export

def none_generator(length: int = None) -> Generator[None, None, None]:
    """Produce a stream of `None` placeholders.

    Args:
        length (int, optional): number of `None` values to produce.
            When omitted (None) the stream never ends.

    Yields:
        None
    """
    if length is not None:
        # bounded stream
        yield from (None for _ in range(length))
        return

    # unbounded stream
    while True:
        yield None


def convert_data_to_features(problem: str, data_iter: Iterable, params: Params, label_encoder: Any, tokenizer: Any, mode=TRAIN) -> Iterable[dict]:
    """Turn raw records into features with `create_multimodal_bert_features`.

    Records are read from `data_iter` in buffers of `params.preprocess_buffer`
    items. Each buffer is split into one chunk per worker and the chunks are
    converted through joblib `Parallel`.

    Args:
        problem (str): problem name
        data_iter (Iterable): records to convert
        params (Params): params; `problem_type`, `preprocess_buffer` and
            `num_cpus` are read here
        label_encoder (Any): label encoder forwarded to the feature function
        tokenizer (Any): tokenizer forwarded to the feature function
        mode (str, optional): running mode. Defaults to TRAIN.

    Yields:
        dict: items produced by `create_multimodal_bert_features`, one at a time
    """

    if mode != PREDICT:
        problem_type = params.problem_type[problem]

        # whether this problem is sequential labeling
        # for sequential labeling, targets needs to align with any
        # change of inputs
        is_seq = problem_type in ['seq_tag']
    else:
        problem_type = 'cls'
        is_seq = False

    part_fn = partial(create_multimodal_bert_features, problem=problem,
                      label_encoder=label_encoder,
                      params=params,
                      tokenizer=tokenizer,
                      mode=mode,
                      problem_type=problem_type,
                      is_seq=is_seq)
    preprocess_buffer = params.preprocess_buffer

    # The worker count must NOT be stored in a local called `num_cpus`: that
    # would shadow the imported `num_cpus()` helper for the whole function and
    # the fallback call below would raise UnboundLocalError.
    n_workers = params.num_cpus if params.num_cpus > 0 else num_cpus()
    # no easy fix for prediction in multiprocessing
    # phase is not shared between processes
    n_workers = 1 if mode == PREDICT else n_workers
    for data_buffer_list in chunked(data_iter, chunk_sz=preprocess_buffer):
        per_cpu_chunk = listify(chunked(data_buffer_list, n_chunks=n_workers))
        res_gen = Parallel(n_workers)(delayed(part_fn)(example_list=d_list)
                                      for d_list in per_cpu_chunk)
        for d_list in res_gen:
            for d in d_list:
                yield d


def convert_data_to_features_pyspark(
        problem: str, dataframe, params: Params, label_encoder: Any, tokenizer: Any, mode=TRAIN):
    """Apply `convert_data_to_features` to every partition of `dataframe`.

    A deep copy of `params` without its `read_data_fn` attribute is what gets
    captured by the partition function, and that copy is forced to a single
    worker. The `params` object passed by the caller is left unchanged.

    Args:
        problem (str): problem name
        dataframe: object exposing `mapPartitions` (the decorator passes a pyspark RDD)
        params (Params): params
        label_encoder (Any): label encoder forwarded to `convert_data_to_features`
        tokenizer (Any): tokenizer forwarded to `convert_data_to_features`
        mode (str, optional): running mode. Defaults to TRAIN.

    Returns:
        the result of `dataframe.mapPartitions(...)`
    """
    from copy import deepcopy

    params_here = deepcopy(params)
    # NOTE(review): presumably dropped because the preprocessing callables
    # should not be shipped along with the partition function -- confirm
    del params_here.read_data_fn

    # The single-worker setting has to live on the copy used inside the
    # partitions; setting it on `params` would only alter the caller's object.
    params_here.num_cpus = 1

    def _convert_partition(partition):
        return convert_data_to_features(
            problem=problem, data_iter=partition, params=params_here,
            tokenizer=tokenizer, label_encoder=label_encoder, mode=mode)

    return dataframe.mapPartitions(_convert_partition)

def check_if_le_created(problem: str, params: 'Params'):
    """Make sure the label encoder of `problem` has already been requested.

    Reads the `label_encoder_called` problem info from `params`.

    Args:
        problem (str): problem name
        params (Params): params holding the problem info

    Raises:
        KeyError: if reading `label_encoder_called` raises KeyError
        ValueError: if `label_encoder_called` is falsy
    """
    message = ('If your preprocessing function returns'
               ' a generator or pyspark RDD, you have to call `m3tl.utils.get_or_make_label_encoder` manually. \n'
               'If you\'re implementing custom get or make label encoder fn, please specify '
               'num_classes. Example: \n'
               'params.set_problem_info(problem="{}", info_name="num_classes", info=100)'.format(problem))

    try:
        le_called: bool = params.get_problem_info(
            problem=problem, info_name='label_encoder_called')
    except KeyError as err:
        # the exception used to be built here without `raise`, which
        # silently swallowed the error
        raise KeyError(message) from err

    if not le_called:
        raise ValueError(message)

## Decorator

In [None]:
# export
def preprocessing_fn(func: Callable):
    """Usually used as a decorator.

    The input and output signature of decorated function should be:
    func(params: m3tl.Params,
         mode: str) -> Union[Generator[X, y], Tuple[List[X], List[y]], RDD]

    Where X can be:
    - Dictionary of 'a' and 'b' texts: {'a': 'a test', 'b': 'b test'}
    - Text: 'a test'
    - Dictionary of modalities: {'text': 'a test', 'image': np.array([1,2,3])}

    Where y can be:
    - Text or scalar: 'label_a'
    - List of text or scalar: ['label_a', 'label_a1'] (for seq2seq and seq_tag)

    The returned wrapper will do the following things:
    - load tokenizer
    - call func, save as example_list
    - get the label encoder (it is created from the targets when func returns
      a tuple or list; for generators and pyspark RDDs func has to create it,
      see `check_if_le_created`)
    - return the features converted from example_list, as an iterable or,
      for a pyspark RDD, as the result of `convert_data_to_features_pyspark`

    The wrapper keeps the name and docstring of `func`.

    Args:
        func (Callable): preprocessing function for problem; its `__name__`
            is used as the problem name

    Returns:
        Callable: wrapper(params, mode, get_data_num=False, write_tfrecord=True)
    """
    # function-scope import: functools is not imported at module level
    from functools import wraps

    @wraps(func)
    def wrapper(params, mode, get_data_num=False, write_tfrecord=True):
        # `get_data_num` and `write_tfrecord` are accepted but not used below
        problem = func.__name__

        tokenizer = load_transformer_tokenizer(
            params.transformer_tokenizer_name, params.transformer_tokenizer_loading)

        # proc func can return one of the following types:
        # - Generator
        # - Tuple[list] or list
        # - pyspark RDD
        example_list = func(params, mode)

        if isinstance(example_list, (tuple, list)):
            try:
                inputs_list, target_list = example_list
            except ValueError:
                # not an (inputs, targets) pair: inputs only
                inputs_list = example_list
                target_list = none_generator(len(inputs_list))

            if len(inputs_list) == 0:
                raise ValueError(
                    'problem {} preproc fn returns empty data'.format(problem))

            # ugly handling
            if isinstance(inputs_list, dict):
                inputs_list = pd.DataFrame(inputs_list).to_dict('records')

            example_list = zip(inputs_list, target_list)
            example_list = convert_legacy_output(example_list)

            if mode != PREDICT:
                label_encoder = get_or_make_label_encoder(
                    params, problem=problem, mode=mode, label_list=target_list)
            else:
                label_encoder = None

            return convert_data_to_features(
                problem=problem,
                data_iter=example_list,
                params=params,
                label_encoder=label_encoder,
                tokenizer=tokenizer,
                mode=mode
            )
        elif isinstance(example_list, Iterable):
            # trigger making label encoder
            # iter() lets this work for iterables that are not iterators;
            # for a generator it returns the generator itself
            try:
                next(iter(example_list))
            except StopIteration:
                # NOTE(review): the tuple/list branch raises ValueError for
                # the same condition; the type is kept here for compatibility
                raise StopIteration(
                    'problem {} preproc fn returns empty data'.format(problem))

            # one record was consumed above, so start again from a fresh iterable
            example_list = func(params, mode)
            example_list = convert_legacy_output(example_list)

            # create label encoder
            if mode != PREDICT:
                check_if_le_created(problem, params)
                label_encoder = get_or_make_label_encoder(
                    params, problem=problem, mode=mode, label_list=[], overwrite=False)
            else:
                label_encoder = None

            return convert_data_to_features(
                problem=problem,
                data_iter=example_list,
                params=params,
                label_encoder=label_encoder,
                tokenizer=tokenizer,
                mode=mode
            )
        else:
            try:
                from pyspark import RDD
            except ImportError:
                raise ImportError(
                    "pyspark is not installed, in this case, preproc "
                    "function should return a generator, a tuple or a list.")

            if not isinstance(example_list, RDD):
                raise ValueError("preproc function should return a generator, a tuple, "
                "a list or a pyspark RDD, got {} from problem {}".format(
                    type(example_list), problem))

            set_is_pyspark(True)
            if params.pyspark_output_path is None:
                raise ValueError(
                    "preproc function of {} returns RDD but "
                    "params.pyspark_output_path is not set.".format(problem))

            if mode != PREDICT:
                check_if_le_created(problem, params)
                label_encoder = get_or_make_label_encoder(
                    params, problem=problem, mode=mode, label_list=[], overwrite=False)
            else:
                label_encoder = None

            return convert_data_to_features_pyspark(
                problem=problem,
                dataframe=example_list,
                params=params,
                label_encoder=label_encoder,
                tokenizer=tokenizer,
                mode=mode
            )
    return wrapper


## User-Defined Preprocessing Function

The user-defined preprocessing function should return two elements: features and targets, except for `pretrain` problem type.

Features and targets can be returned in one of the following formats:
- tuple of list
- generator of tuple
- pyspark RDD of dictionaries (see the Pyspark section below)

Please note that if the preprocessing function returns a generator of tuples, the corresponding problem cannot be chained using `&`.

In [None]:
# hide
import m3tl
from m3tl.params import Params
from typing import Tuple
import shutil
import tempfile
import numpy as np
import os


In [None]:
# hide

# setup params for testing
from m3tl.test_base import TestBase
# `tb` is kept as a global; the `params` taken from it is the object the
# test cells below register their toy problems on
tb = TestBase()
params = tb.params


2021-06-22 20:19:16.587 | INFO     | m3tl.base_params:register_multiple_problems:538 - Adding new problem weibo_fake_ner, problem type: seq_tag
2021-06-22 20:19:16.588 | INFO     | m3tl.base_params:register_multiple_problems:538 - Adding new problem weibo_fake_multi_cls, problem type: multi_cls
2021-06-22 20:19:16.588 | INFO     | m3tl.base_params:register_multiple_problems:538 - Adding new problem weibo_fake_cls, problem type: cls
2021-06-22 20:19:16.589 | INFO     | m3tl.base_params:register_multiple_problems:538 - Adding new problem weibo_masklm, problem type: masklm
2021-06-22 20:19:16.589 | INFO     | m3tl.base_params:register_multiple_problems:538 - Adding new problem weibo_fake_regression, problem type: regression
2021-06-22 20:19:16.590 | INFO     | m3tl.base_params:register_multiple_problems:538 - Adding new problem weibo_fake_vector_fit, problem type: vector_fit
2021-06-22 20:19:16.590 | INFO     | m3tl.base_params:register_multiple_problems:538 - Adding new problem weibo_pre

In [None]:
# hide
from m3tl.special_tokens import TRAIN

# Pull the first record of every registered preprocessing function in TRAIN
# mode and show which feature keys it carries.
for problem_name, fn in params.read_data_fn.items():
    print(problem_name)
    first_record = next(fn(params, TRAIN))
    print(first_record.keys())


weibo_fake_ner
dict_keys(['text_input_ids', 'text_mask', 'text_segment_ids', 'weibo_fake_ner_label_ids'])
weibo_fake_multi_cls
dict_keys(['text_input_ids', 'text_mask', 'text_segment_ids', 'weibo_fake_multi_cls_label_ids', 'array_input_ids', 'array_mask', 'array_segment_ids', 'cate_input_ids', 'cate_mask', 'cate_segment_ids'])
weibo_fake_cls
dict_keys(['text_input_ids', 'text_mask', 'text_segment_ids', 'weibo_fake_cls_label_ids'])
weibo_masklm
dict_keys(['text_input_ids', 'text_mask', 'text_segment_ids', 'masked_lm_positions', 'masked_lm_ids', 'masked_lm_weights'])
weibo_fake_regression
dict_keys(['record_id', 'text_input_ids', 'text_mask', 'text_segment_ids', 'weibo_fake_regression_label_ids', 'array_input_ids', 'array_mask', 'array_segment_ids', 'cate_input_ids', 'cate_mask', 'cate_segment_ids'])
weibo_fake_vector_fit
dict_keys(['text_input_ids', 'text_mask', 'text_segment_ids', 'weibo_fake_vector_fit_label_ids'])
weibo_premask_mlm
dict_keys(['text_input_ids', 'text_mask', 'text_segm

### Tuple of List

#### Single Modal


In [None]:
@preprocessing_fn
def toy_cls(params: Params, mode: str) -> Tuple[list, list]:
    "Simple example to demonstrate single modal tuple of list return"
    # same target in every mode, only the input text differs
    text = 'this is a toy input' if mode == m3tl.TRAIN else 'this is a toy input for test'
    toy_input = [text] * 10
    toy_target = ['a'] * 10
    return toy_input, toy_target


In [None]:
# hide
import pandas as pd
from pyspark import RDD
from copy import copy

def preproc_dec_test(fn=None, run_train=True):
    """Register `fn` as problem `toy_cls` and run it once.

    Args:
        fn (Callable, optional): decorated preprocessing function. When
            omitted, the `toy_cls` currently defined in the notebook is used.
            It is looked up at call time, so a later redefinition of
            `toy_cls` is picked up (a `fn=toy_cls` default would stay bound
            to the first definition).
        run_train (bool, optional): whether to run TRAIN mode before PREDICT.
            Defaults to True.

    In TRAIN mode the first feature is pulled with `next`, or, when the
    result is an RDD, one element is printed and the function returns without
    running PREDICT. The PREDICT call's return value is not consumed.
    """
    if fn is None:
        fn = toy_cls
    params.register_problem(problem_name='toy_cls',
                            problem_type='cls', processing_fn=fn)
    copy_params = copy(params)
    copy_params.assign_problem('toy_cls')
    if run_train:
        res = fn(params=copy_params, mode=TRAIN)
        if isinstance(res, RDD):
            print(res.take(1))
            return
        next(res)
    fn(params=copy_params, mode=PREDICT)



preproc_dec_test()




In [None]:
# hide
# predict multiprocessing test
# more than one worker, to exercise the PREDICT path with num_cpus > 1
params.num_cpus = 2


@preprocessing_fn
def wrapper(params, mode):
    "Toy generator yielding input-only multi-modal records"
    toy_records = [{'text': 'this is a toy input',
                    'image': np.random.uniform(size=(16))} for _ in range(10)]
    yield from toy_records

preproc_dec_test(wrapper, run_train=False)



#### Multi-modal

In [None]:
@preprocessing_fn
def toy_cls(params: Params, mode: str) -> Tuple[list, list]:
    "Simple example to demonstrate multi-modal tuple of list return"
    text = 'this is a toy input' if mode == m3tl.TRAIN else 'this is a toy input for test'
    # every example gets its own random 'image' vector
    toy_input = [{'text': text,
                  'image': np.random.uniform(size=(16))} for _ in range(10)]
    toy_target = ['a'] * 10

    return toy_input, toy_target


In [None]:
# hide
# pass the `toy_cls` defined in the cell above explicitly so that this cell
# tests that definition
preproc_dec_test(toy_cls)




#### A, B Token Multi-modal

TODO: Implement this. Not working yet.

In [None]:
# hide
@preprocessing_fn
def toy_cls(params: Params, mode: str) -> Tuple[list, list]:
    "Simple example to demonstrate A, B token multi-modal tuple of list return"
    text = 'this is a toy input' if mode == m3tl.TRAIN else 'this is a toy input for test'

    def _one_side() -> dict:
        # one multi-modal segment with a fresh random 'image' vector
        return {
            'text': text,
            'image': np.random.uniform(size=(16))
        }

    toy_input = [{'a': _one_side(), 'b': _one_side()} for _ in range(10)]
    toy_target = ['a'] * 10

    return toy_input, toy_target


In [None]:
# # hide
# params.register_problem(problem_name='toy_cls', problem_type='cls', processing_fn=toy_cls)
# assert (10, 1)==toy_cls(params=params, mode=m3tl.TRAIN, get_data_num=True, write_tfrecord=False)

# shutil.rmtree(os.path.join(params.tmp_file_dir, 'toy_cls'))
# toy_cls(params=params, mode=m3tl.TRAIN, get_data_num=False, write_tfrecord=True)
# assert os.path.exists(os.path.join(params.tmp_file_dir, 'toy_cls', 'train_feature_desc.json'))


### Generator of Tuple

#### Single Modal

In [None]:
@preprocessing_fn
def toy_cls(params: Params, mode: str) -> Generator[tuple, None, None]:
    "Simple example to demonstrate single modal generator of tuple return"
    if mode == m3tl.TRAIN:
        toy_input = ['this is a toy input' for _ in range(10)]
    else:
        toy_input = ['this is a toy input for test' for _ in range(10)]
    toy_target = ['a' for _ in range(10)]
    # one (input, target) pair per example
    for i, t in zip(toy_input, toy_target):
        yield i, t


In [None]:
# hide
# pass the `toy_cls` defined in the cell above explicitly so that this cell
# tests that definition
preproc_dec_test(toy_cls)




#### Multi-modal

In [None]:
@preprocessing_fn
def toy_cls(params: Params, mode: str) -> Generator[tuple, None, None]:
    "Simple example to demonstrate multi-modal generator of tuple return"
    if mode == m3tl.TRAIN:
        toy_input = [{'text': 'this is a toy input',
                      'image': np.random.uniform(size=(16))} for _ in range(10)]
    else:
        toy_input = [{'text': 'this is a toy input for test',
                      'image': np.random.uniform(size=(16))} for _ in range(10)]
    toy_target = ['a' for _ in range(10)]
    # one (input, target) pair per example
    for i, t in zip(toy_input, toy_target):
        yield i, t


In [None]:
# hide
# pass the `toy_cls` defined in the cell above explicitly so that this cell
# tests that definition
preproc_dec_test(toy_cls)




### Pyspark RDD

#### single modal

In [None]:
# hide
import pandas as pd
from pyspark_crud.core import set_globals, get_globals

# NOTE(review): presumably initialises the Spark globals of pyspark_crud -- confirm.
# Only the third value returned by get_globals() is kept, as `sc`; the cells
# below use it to build RDDs with sc.parallelize.
set_globals()
_, _, sc, _ = get_globals()


@preprocessing_fn
def toy_cls(params: Params, mode: str) -> RDD:
    "Simple example to demonstrate single modal pyspark RDD return"
    text = 'this is a toy input' if mode == m3tl.TRAIN else 'this is a toy input for test'
    columns = {
        'inputs': [text for _ in range(10)],
        'labels': ['a' for _ in range(10)]
    }
    # column-oriented dict -> list with one dict per example
    records = pd.DataFrame(columns).to_dict('records')
    return sc.parallelize(records)


# the decorator refuses RDD output when params.pyspark_output_path is None
params.pyspark_output_path = tempfile.mkdtemp()
# pass the `toy_cls` defined in this cell explicitly so that the RDD-returning
# definition is the one being tested
preproc_dec_test(toy_cls)




#### multimodal

In [None]:
@preprocessing_fn
def toy_cls(params: Params, mode: str) -> RDD:
    "Simple example to demonstrate multi-modal pyspark RDD return"
    # with RDD output the label encoder has to be requested by the
    # preprocessing function itself
    get_or_make_label_encoder(params=params, problem='toy_cls', label_list=['a'], mode=mode)
    text = 'this is a toy input' if mode == m3tl.TRAIN else 'this is a toy input test'
    columns = {
        'inputs_text': [text for _ in range(10)],
        'inputs_image': [np.random.uniform(size=(16)).tolist() for _ in range(10)],
        'labels': ['a' for _ in range(10)]
    }
    # column-oriented dict -> list with one dict per example
    records = pd.DataFrame(columns).to_dict('records')
    return sc.parallelize(records)


# pass the `toy_cls` defined in this cell explicitly so that the RDD-returning
# definition is the one being tested
preproc_dec_test(toy_cls)


