In [None]:
# default_exp input_fn


In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
# Set before TensorFlow is imported in a later cell.
# NOTE(review): "-1" presumably hides every CUDA device so the notebook runs
# on CPU only -- confirm against the CUDA / TensorFlow documentation.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

# Function to Create Datasets

Functions to create datasets for training, evaluation and prediction.

## Train and Eval Dataset
We can get the train and eval datasets by passing a params object with problems assigned, together with a mode.

In [None]:
# export

from functools import partial
from itertools import tee
from typing import List, Union, Dict

import tensorflow as tf

from bert_multitask_learning.bert_preprocessing.create_bert_features import (
    create_bert_features_generator, create_multimodal_bert_features_generator)
from bert_multitask_learning.params import BaseParams
from bert_multitask_learning.read_write_tfrecord import read_tfrecord, write_tfrecord
from bert_multitask_learning.special_tokens import PREDICT, TRAIN
from bert_multitask_learning.utils import infer_shape_and_type_from_dict, load_transformer_tokenizer


def element_length_func(yield_dict: Dict[str, tf.Tensor]):  # pragma: no cover
    '''Return the size of the first dimension of the 'input_ids' feature.

    Used as the `element_length_func` of `bucket_by_sequence_length` in
    `train_eval_input_fn`.

    Arguments:
        yield_dict {dict} -- feature dict that contains an 'input_ids' tensor

    Returns:
        tf.Tensor -- scalar taken from position 0 of tf.shape(input_ids)
    '''
    input_ids_shape = tf.shape(yield_dict['input_ids'])
    return input_ids_shape[0]


def train_eval_input_fn(params: BaseParams, mode=TRAIN) -> tf.data.Dataset:
    '''
    This function will write and read tf record for training
    and evaluation.

    The per-problem datasets returned by `read_tfrecord` are mixed with
    `sample_from_datasets` using `params.problem_sampling_weight_dict`,
    shuffled in TRAIN mode, batched (bucketed when `params.dynamic_padding`
    is set, `padded_batch` otherwise) and finally prefetched.

    Arguments:
        params {Params} -- Params objects

    Keyword Arguments:
        mode {str} -- ModeKeys (default: {TRAIN})

    Returns:
        tf Dataset -- Tensorflow dataset
    '''
    write_tfrecord(params=params)

    dataset_dict = read_tfrecord(params=params, mode=mode)

    # make sure the order is correct: datasets and weights must be aligned
    dataset_dict_keys = list(dataset_dict.keys())
    dataset_list = [dataset_dict[key] for key in dataset_dict_keys]
    weight_list = [params.problem_sampling_weight_dict[key]
                   for key in dataset_dict_keys]

    logger = tf.get_logger()
    logger.info('sampling weights: ')
    for problem_chunk_name, weight in params.problem_sampling_weight_dict.items():
        logger.info('{0}: {1}'.format(problem_chunk_name, weight))

    dataset = tf.data.experimental.sample_from_datasets(
        datasets=dataset_list, weights=weight_list)

    use_dynamic_padding = params.dynamic_padding

    output_shapes = None
    if not use_dynamic_padding:
        # Peek at one example *before* shuffling: the example is only used to
        # infer shapes, and reading it through the shuffled dataset would
        # first have to fill the whole shuffle buffer.
        first_example = next(dataset.as_numpy_iterator())
        output_shapes, _ = infer_shape_and_type_from_dict(first_example)

    if mode == TRAIN:
        dataset = dataset.shuffle(params.shuffle_buffer)

    if use_dynamic_padding:
        dataset = dataset.apply(
            tf.data.experimental.bucket_by_sequence_length(
                element_length_func=element_length_func,
                bucket_batch_sizes=params.bucket_batch_sizes,
                bucket_boundaries=params.bucket_boundaries
            ))
    else:
        # every mode other than TRAIN uses twice the configured batch size
        if mode == TRAIN:
            batch_size = params.batch_size
        else:
            batch_size = params.batch_size*2
        dataset = dataset.padded_batch(batch_size, output_shapes)

    # Prefetch last so that complete batches (not single examples) are
    # prepared in the background while the consumer is busy.
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    return dataset






In [None]:
# hide
from bert_multitask_learning.test_base import TestBase
import bert_multitask_learning
import shutil
import numpy as np
# Shared fixture for the test cells below; `params` is reused by the
# train/eval and predict examples.
test_base = TestBase()
# NOTE(review): the problem string presumably uses '&' and '|' to combine
# problems into chunks -- confirm against `assign_problem`.
test_base.params.assign_problem(
    'weibo_fake_ner&weibo_fake_cls|weibo_fake_multi_cls|weibo_masklm')
params = test_base.params


Adding new problem weibo_ner, problem type: seq_tag
Adding new problem weibo_cws, problem type: seq_tag
Adding new problem weibo_fake_multi_cls, problem type: multi_cls
Adding new problem weibo_fake_cls, problem type: cls
Adding new problem weibo_masklm, problem type: masklm


In [None]:

# Build one dataset per mode from the shared test params.
train_dataset = bert_multitask_learning.train_eval_input_fn(
    params=params, mode=bert_multitask_learning.TRAIN)
eval_dataset = bert_multitask_learning.train_eval_input_fn(
    params=params, mode=bert_multitask_learning.EVAL
)

# Smoke test: pulling a single batch from each dataset must not raise.
_ = next(train_dataset.as_numpy_iterator())
_ = next(eval_dataset.as_numpy_iterator())


, 1, 1, 1, 1, 1, 1]
INFO:tensorflow:segment_ids: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
INFO:tensorflow:weibo_fake_multi_cls_label_ids: [1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
INFO:tensorflow:image_input: [[0.30820696 0.32793963 0.5366956  0.38816917 0.40036218 0.3959411
  0.04921183 0.05721554 0.53213454 0.78395215]
 [0.63024512 0.37783302 0.24188715 0.39396367 0.74710608 0.21944181
  0.9126956  0.591
INFO:tensorflow:image_mask: [1, 1, 1, 1, 1]
INFO:tensorflow:image_segment_ids: [0 0 0 0 0]
INFO:tensorflow:text: ['延', '参', '法', '师', '品', '味', '人', '生', '如', '同', '走', '进', '一', '片', '山', '水', '，', '静', '静', '的', '呼', '吸', '，', '安', '静', '的', '欣', '赏', '，', '这', '就', '是', '生', '活', '。']
INFO:tensorflow:image: [[0.13549992 0.80370666 0.65313431 0.93012199 0.72213015 0.8667986
  0.63323659 0.32622537 0.45220408 0.10011452]
 [0.69814168 0.75051269 0.09380045 0.2214851  0.03177556 0.06417007
  0.86614259 0.029
INFO:tensorflow:input_ids: [101, 2454, 1346, 3791, 2360, 1501, 1456, 782, 

In [None]:
# hide
# dynamic_padding disabled
# have to remove existing tfrecord
# NOTE(review): presumably the tfrecords written by the previous cell live
# under `tmpfiledir` and would otherwise be reused -- confirm.
shutil.rmtree(test_base.tmpfiledir)
# This mutates the shared params object, so the setting stays in effect for
# every cell that runs after this one.
test_base.params.dynamic_padding = False
train_dataset = bert_multitask_learning.train_eval_input_fn(
    params=test_base.params, mode=bert_multitask_learning.TRAIN)
eval_dataset = bert_multitask_learning.train_eval_input_fn(
    params=test_base.params, mode=bert_multitask_learning.EVAL
)

# Smoke test: pulling a single batch from each dataset must not raise.
_ = next(train_dataset.as_numpy_iterator())
_ = next(eval_dataset.as_numpy_iterator())

image: [[0.74383737 0.99553567 0.98171795 0.76857011 0.95838343 0.18724558
  0.66824515 0.22688816 0.15239881 0.17317303]
 [0.00720543 0.98959837 0.07113386 0.15324625 0.97135101 0.03987707
  0.43433203 0.13
INFO:tensorflow:input_ids: [101, 2454, 1346, 3791, 2360, 1501, 1456, 782, 4495, 1963, 1398, 6624, 6822, 671, 4275, 2255, 3717, 8024, 7474, 7474, 4638, 1461, 1429, 8024, 2128, 7474, 4638, 3615, 6605, 8024, 6821, 2218, 3221, 4495
INFO:tensorflow:input_mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
INFO:tensorflow:segment_ids: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
INFO:tensorflow:weibo_fake_multi_cls_label_ids: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0]
INFO:tensorflow:image_input: [[0.74383737 0.99553567 0.98171795 0.76857011 0.95838343 0.18724558
  0.66824515 0.22688816 0.15239881 0.17317303]
 [0.00720543 0.98959837 0.07113386 0.15

## Predict Dataset

We can create a predict dataset by passing a list or generator of inputs together with a params object that has problems assigned.

In [None]:
# export
def predict_input_fn(input_file_or_list: Union[str, List[str]],
                     params: BaseParams,
                     mode=PREDICT,
                     labels_in_input=False) -> tf.data.Dataset:
    '''Input function that takes a file path or list of string and
    convert it to tf.dataset

    Example:
        predict_fn = lambda: predict_input_fn('test.txt', params)
        pred = estimator.predict(predict_fn)

    Arguments:
        input_file_or_list {str or list} -- file path or list of string.
            A string is treated as the path of a utf8 text file that is read
            line by line. Any other value is iterated directly; if it is a
            one-shot iterator (e.g. a generator) the returned dataset can
            only be iterated once.
        params {Params} -- Params object

    Keyword Arguments:
        mode {str} -- ModeKeys (default: {PREDICT})
        labels_in_input {bool} -- when True every input element is expected
            to be a pair whose first item is the model input; the first item
            of the first element decides which feature generator is used
            (default: {False})

    Returns:
        tf dataset -- tf dataset

    Raises:
        ValueError -- if the input contains no example
    '''
    is_path = isinstance(input_file_or_list, str)

    # `iter_examples` returns an iterator over the raw input examples.
    # tf.data calls the generator function again for every pass over the
    # dataset, so re-iterable sources hand out a fresh iterator each time.
    if is_path:
        # if is string, treat it as path to file; the file is opened per
        # pass and closed as soon as the pass ends
        def iter_examples():
            with open(input_file_or_list, 'r', encoding='utf8') as input_file:
                for line in input_file:
                    yield line
        probe = iter_examples()
    elif iter(input_file_or_list) is input_file_or_list:
        # one-shot iterator: tee it so that peeking at the first element
        # does not remove it from the examples fed to the dataset
        probe, remaining = tee(input_file_or_list, 2)

        def iter_examples():
            return remaining
    else:
        def iter_examples():
            return iter(input_file_or_list)
        probe = iter_examples()

    try:
        first_raw = next(probe)
    except StopIteration:
        raise ValueError(
            'predict_input_fn received no input examples') from None
    finally:
        if is_path:
            # closing the generator leaves the `with` block and releases
            # the file handle used for peeking
            probe.close()

    if labels_in_input:
        first_element, _ = first_raw
    else:
        first_element = first_raw

    tokenizer = load_transformer_tokenizer(
        params.transformer_tokenizer_name, params.transformer_tokenizer_loading)
    # a dict without the key 'a' is handled as a multimodal example
    if isinstance(first_element, dict) and 'a' not in first_element:
        feature_generator = create_multimodal_bert_features_generator
    else:
        feature_generator = create_bert_features_generator
    part_fn = partial(feature_generator, problem='',
                      label_encoder=None,
                      params=params,
                      tokenizer=tokenizer,
                      mode=mode,
                      problem_type='cls',
                      is_seq=False)

    # Infer shapes and types from the features of the first example itself,
    # so that an input holding a single example works as well.
    first_dict = next(part_fn(example_list=iter([first_raw])))
    output_shapes, output_type = infer_shape_and_type_from_dict(first_dict)

    def gen():
        for d in part_fn(example_list=iter_examples()):
            yield d

    dataset = tf.data.Dataset.from_generator(
        gen, output_types=output_type, output_shapes=output_shapes)

    dataset = dataset.padded_batch(
        params.batch_size,
        output_shapes
    )

    return dataset

### Single-modal inputs

In [None]:

# Build a predict dataset from five identical plain-text examples.
single_inputs = ['this is a test'] * 5
single_dataset = predict_input_fn(single_inputs, params=params)
first_batch = next(single_dataset.as_numpy_iterator())

# Token ids expected for the first example of the batch.
expected_input_ids = [101, 8554, 8310, 143, 10060, 102]
assert first_batch['input_ids'].tolist()[0] == expected_input_ids



INFO:tensorflow:this is a test
INFO:tensorflow:input_ids: [101, 8554, 8310, 143, 10060, 102]
INFO:tensorflow:input_mask: [1, 1, 1, 1, 1, 1]
INFO:tensorflow:segment_ids: [0, 0, 0, 0, 0, 0]
INFO:tensorflow:this is a test
INFO:tensorflow:input_ids: [101, 8554, 8310, 143, 10060, 102]
INFO:tensorflow:input_mask: [1, 1, 1, 1, 1, 1]
INFO:tensorflow:segment_ids: [0, 0, 0, 0, 0, 0]
INFO:tensorflow:this is a test
INFO:tensorflow:input_ids: [101, 8554, 8310, 143, 10060, 102]
INFO:tensorflow:input_mask: [1, 1, 1, 1, 1, 1]
INFO:tensorflow:segment_ids: [0, 0, 0, 0, 0, 0]
INFO:tensorflow:this is a test
INFO:tensorflow:input_ids: [101, 8554, 8310, 143, 10060, 102]
INFO:tensorflow:input_mask: [1, 1, 1, 1, 1, 1]
INFO:tensorflow:segment_ids: [0, 0, 0, 0, 0, 0]
INFO:tensorflow:this is a test
INFO:tensorflow:input_ids: [101, 8554, 8310, 143, 10060, 102]
INFO:tensorflow:input_mask: [1, 1, 1, 1, 1, 1]
INFO:tensorflow:segment_ids: [0, 0, 0, 0, 0, 0]
INFO:tensorflow:this is a test
INFO:tensorflow:input_ids: [1

### Multi-modal inputs

In [None]:
# multi modal input
image_shape = (5, 10)
mm_example = {'text': 'this is a test',
              'image': np.zeros(shape=image_shape, dtype='float32')}
# the same example dict is repeated five times
mm_input = [mm_example] * 5
mm_dataset = bert_multitask_learning.predict_input_fn(
    mm_input, params=params)
first_batch = next(mm_dataset.as_numpy_iterator())

# Expected features of the first example in the batch.
expected_input_ids = [101, 8554, 8310, 143, 10060, 102]
expected_image_input = np.zeros(shape=image_shape, dtype='float32').tolist()
assert first_batch['input_ids'].tolist()[0] == expected_input_ids
assert first_batch['image_input'].tolist()[0] == expected_image_input

INFO:tensorflow:text: this is a test
INFO:tensorflow:image: [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
INFO:tensorflow:input_ids: [101, 8554, 8310, 143, 10060, 102]
INFO:tensorflow:input_mask: [1, 1, 1, 1, 1, 1]
INFO:tensorflow:segment_ids: [0, 0, 0, 0, 0, 0]
INFO:tensorflow:image_input: [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
INFO:tensorflow:image_mask: [1, 1, 1, 1, 1]
INFO:tensorflow:image_segment_ids: [0 0 0 0 0]
INFO:tensorflow:text: this is a test
INFO:tensorflow:image: [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
INFO:tensorflow:input_ids: [101, 8554, 8310, 143, 10060, 102]
INFO:tensorflow:input_mask: [1, 1, 1, 1, 1,