In [None]:
# default_exp params
%load_ext autoreload
%autoreload 2
from nbdev.showdoc import show_doc
import os
# Set CUDA_VISIBLE_DEVICES to "-1" before any deep learning framework is
# imported — presumably to hide all GPUs so this notebook runs on CPU only.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

# Params

`Params` is the main object that controls the whole modeling process. It is meant to be accessible from anywhere.

In [None]:
# export

from m3tl.base_params import BaseParams
from m3tl.embedding_layer.base import (DefaultMultimodalEmbedding,
                                       DuplicateAugMultimodalEmbedding)
from m3tl.loss_strategy.base import SumLossCombination
from m3tl.mtl_model.mmoe import MMoE
from m3tl.problem_types import cls as problem_type_cls
from m3tl.problem_types import (contrastive_learning, masklm, multi_cls,
                                premask_mlm, pretrain, regression, seq_tag,
                                vector_fit)


class Params(BaseParams):
    """`BaseParams` pre-populated with the built-in components.

    On construction this registers the pre-defined problem types, the MMoE
    multi-task model, the 'sum' loss combination strategy and the two
    multimodal embedding layers. It then assigns the 'sum' loss combination
    strategy, the data sampling strategy (called without arguments, so the
    base class default applies) and the 'default_embedding' embedding layer.
    """

    def __init__(self):
        super().__init__()

        # Pre-defined problem types, registered in exactly this order.
        # Each row: (name, top layer, label handling fn,
        #            label encoder fn, description)
        builtin_problem_types = (
            ('cls',
             problem_type_cls.Classification,
             problem_type_cls.cls_label_handling_fn,
             problem_type_cls.cls_get_or_make_label_encoder_fn,
             'Classification'),
            ('multi_cls',
             multi_cls.MultiLabelClassification,
             multi_cls.multi_cls_label_handling_fn,
             multi_cls.multi_cls_get_or_make_label_encoder_fn,
             'Multi-Label Classification'),
            ('seq_tag',
             seq_tag.SequenceLabel,
             seq_tag.seq_tag_label_handling_fn,
             seq_tag.seq_tag_get_or_make_label_encoder_fn,
             'Sequence Labeling'),
            ('masklm',
             masklm.MaskLM,
             masklm.masklm_label_handling_fn,
             masklm.masklm_get_or_make_label_encoder_fn,
             'Masked Language Model'),
            ('pretrain',
             pretrain.PreTrain,
             pretrain.pretrain_label_handling_fn,
             pretrain.pretrain_get_or_make_label_encoder_fn,
             'NSP+MLM(Deprecated)'),
            ('regression',
             regression.Regression,
             regression.regression_label_handling_fn,
             regression.regression_get_or_make_label_encoder_fn,
             'Regression'),
            ('vector_fit',
             vector_fit.VectorFit,
             vector_fit.vector_fit_label_handling_fn,
             vector_fit.vector_fit_get_or_make_label_encoder_fn,
             'Vector Fitting'),
            ('premask_mlm',
             premask_mlm.PreMaskMLM,
             premask_mlm.premask_mlm_label_handling_fn,
             premask_mlm.premask_mlm_get_or_make_label_encoder_fn,
             'Pre-masked Masked Language Model'),
            ('contrastive_learning',
             contrastive_learning.ContrastiveLearning,
             contrastive_learning.contrastive_learning_label_handling_fn,
             contrastive_learning.contrastive_learning_get_or_make_label_encoder_fn,
             'Contrastive Learning'),
        )
        for type_name, top_layer, label_fn, encoder_fn, description in builtin_problem_types:
            self.register_problem_type(
                problem_type=type_name,
                top_layer=top_layer,
                label_handling_fn=label_fn,
                get_or_make_label_encoder_fn=encoder_fn,
                description=description)

        # Built-in multi-task model and loss combination strategy.
        self.register_mtl_model(
            'mmoe', MMoE, include_top=False, extra_info='MMoE')
        self.register_loss_combination_strategy('sum', SumLossCombination)

        # Built-in embedding layers.
        self.register_embedding_layer(
            'duplicate_data_augmentation_embedding',
            DuplicateAugMultimodalEmbedding)
        self.register_embedding_layer(
            'default_embedding', DefaultMultimodalEmbedding)

        # Defaults used unless the caller assigns something else later.
        self.assign_loss_combination_strategy('sum')
        self.assign_data_sampling_strategy()
        self.assign_embedding_layer('default_embedding')


  return torch._C._cuda_getDeviceCount() > 0


In [None]:
# hide
# Reuse the shared test fixture: it supplies the params object and the
# checkpoint directory (presumably temporary) used by the cells below.
from m3tl.test_base import TestBase
tb = TestBase()
params = tb.params
tmp_model_dir = tb.tmpckptdir



Adding new problem weibo_fake_ner, problem type: seq_tag
Adding new problem weibo_cws, problem type: seq_tag
Adding new problem weibo_fake_multi_cls, problem type: multi_cls
Adding new problem weibo_fake_cls, problem type: cls
Adding new problem weibo_masklm, problem type: masklm
Adding new problem weibo_pretrain, problem type: pretrain
Adding new problem weibo_fake_regression, problem type: regression
Adding new problem weibo_fake_vector_fit, problem type: vector_fit
Adding new problem weibo_premask_mlm, problem type: premask_mlm


## Add Problems


In [None]:
# hide
# define a simple preprocessing function
import m3tl
from m3tl import preprocessing_fn
@preprocessing_fn
def toy_cls(params: Params, mode: str):
    """Toy single-modal classification data returned as a tuple of lists.

    Produces ten identical sentences, each labelled 'a'. The sentence text
    differs between training mode and every other mode.

    Args:
        params: params object (unused here).
        mode: run mode; compared against `m3tl.TRAIN`.

    Returns:
        Tuple `(inputs, targets)` of two lists with ten items each.
    """
    n_rows = 10
    if mode == m3tl.TRAIN:
        sentence = 'this is a toy input'
    else:
        sentence = 'this is a toy input for test'
    toy_input = [sentence] * n_rows
    toy_target = ['a'] * n_rows
    return toy_input, toy_target

@preprocessing_fn
def toy_seq_tag(params: Params, mode: str):
    """Toy single-modal sequence tagging data returned as a tuple of lists.

    Produces ten identical token sequences, each paired with a tag sequence
    of the same length. Training mode uses a five-token sentence; every
    other mode uses a seven-token sentence.

    Args:
        params: params object (unused here).
        mode: run mode; compared against `m3tl.TRAIN`.

    Returns:
        Tuple `(inputs, targets)` of two lists with ten items each, where
        every item is itself a list (tokens / tags).
    """
    n_rows = 10
    if mode == m3tl.TRAIN:
        tokens = 'this is a toy input'.split(' ')
        tags = ['a', 'b', 'c', 'd', 'e']
    else:
        tokens = 'this is a toy input for test'.split(' ')
        tags = ['a', 'b', 'c', 'd', 'e', 'e', 'e']
    # Copy per row so that rows never share the same list object.
    toy_input = [list(tokens) for _ in range(n_rows)]
    toy_target = [list(tags) for _ in range(n_rows)]
    return toy_input, toy_target

In [None]:
show_doc(BaseParams.register_problem)

<h4 id="BaseParams.register_problem" class="doc_header"><code>BaseParams.register_problem</code><a href="https://github.com/JayYip/m3tl/tree/master/m3tl/base_params.py#L474" class="source_link" style="float:right">[source]</a></h4>

> <code>BaseParams.register_problem</code>(**`problem_name`**:`str`, **`problem_type`**=*`'cls'`*, **`processing_fn`**:`Callable`=*`None`*)



Add problems.

Args:
- problem_name (str): problem name.
- problem_type (str, optional): One of the following problem types:
['cls', 'seq_tag', 'seq2seq_tag', 'seq2seq_text', 'multi_cls', 'pretrain'].
Defaults to 'cls'.
- processing_fn (Callable, optional): preprocessing function. Defaults to None.

Raises:
- ValueError: unexpected problem_type

In [None]:
# Register each toy problem individually: its name, its problem type and
# the preprocessing function that produces its data.
params.register_problem(problem_name='toy_cls', problem_type='cls', processing_fn=toy_cls)
params.register_problem(problem_name='toy_seq_tag', problem_type='seq_tag', processing_fn=toy_seq_tag)

In [None]:
show_doc(BaseParams.register_multiple_problems)

<h4 id="BaseParams.register_multiple_problems" class="doc_header"><code>BaseParams.register_multiple_problems</code><a href="https://github.com/JayYip/m3tl/tree/master/m3tl/base_params.py#L484" class="source_link" style="float:right">[source]</a></h4>

> <code>BaseParams.register_multiple_problems</code>(**`problem_type_dict`**:`Dict`\[`str`, `str`\], **`processing_fn_dict`**:`Dict`\[`str`, `Callable`\]=*`None`*)



Add multiple problems.

processing_fn_dict is optional, if it's not provided, processing fn will be set as None.

Args:
- problem_type_dict (Dict[str, str]): problem type dict
- processing_fn_dict (Dict[str, Callable], optional): problem type fn. Defaults to None.

In [None]:
# Build two mappings keyed by problem name (problem type and preprocessing
# function), then register all problems with a single call.
problem_type_dict = {'toy_cls': 'cls', 'toy_seq_tag': 'seq_tag'}
processing_fn_dict = {'toy_cls': toy_cls, 'toy_seq_tag': toy_seq_tag}
params.register_multiple_problems(problem_type_dict=problem_type_dict, processing_fn_dict=processing_fn_dict)

Adding new problem toy_cls, problem type: cls
Adding new problem toy_seq_tag, problem type: seq_tag


## Assign Problems

In [None]:
show_doc(BaseParams.assign_problem)

<h4 id="BaseParams.assign_problem" class="doc_header"><code>BaseParams.assign_problem</code><a href="https://github.com/JayYip/m3tl/tree/master/m3tl/base_params.py#L545" class="source_link" style="float:right">[source]</a></h4>

> <code>BaseParams.assign_problem</code>(**`flag_string`**:`str`, **`gpu`**=*`2`*, **`base_dir`**:`str`=*`None`*, **`dir_name`**:`str`=*`None`*, **`predicting`**=*`False`*)



Assign the actual run problem to param. This function will
do the following things:

1. parse the flag string to form the run_problem_list
2. create checkpoint saving path
3. calculate total number of training data and training steps
4. scale learning rate with the number of gpu linearly

Arguments:
- flag_string {str} -- run problem string
- example: cws|POS|weibo_ner&weibo_cws

Keyword Arguments:
- gpu {int} -- number of gpu use for training, this will affect the training steps and learning rate (default: {2})
- base_dir {str} -- base dir for ckpt, if None, then "models" is assigned (default: {None})
- dir_name {str} -- dir name for ckpt, if None, will be created automatically (default: {None})
- predicting {bool} -- whether is predicting

In [None]:
# Select both toy problems for this run; checkpoints go under the test
# fixture's directory instead of the default base dir.
params.assign_problem(flag_string='toy_seq_tag|toy_cls', base_dir=tmp_model_dir)
# Sanity check that the params object now reports problems as assigned.
assert params.problem_assigned



After the problems are assigned, the model directory should be created, with the tokenizer and label encoder files in it.

In [None]:
# hide
# assert os.listdir(params.ckpt_dir) == ['data_info.json',
#  'tokenizer',
#  'toy_cls_label_encoder.pkl',
#  'toy_seq_tag_label_encoder.pkl',
#  'bert_config']

## Register new problem type

You can also implement your own problem type. Essentially, a problem type has:
- name
- top layer
- label handling function
- label encoder creating function

Here we register a vector fitting (vector annealing) problem type as an example.

Note: This was originally designed as an internal API for development, so it is not very user-friendly.

In [None]:
show_doc(BaseParams.register_problem_type)

<h4 id="BaseParams.register_problem_type" class="doc_header"><code>BaseParams.register_problem_type</code><a href="https://github.com/JayYip/m3tl/tree/master/m3tl/base_params.py#L445" class="source_link" style="float:right">[source]</a></h4>

> <code>BaseParams.register_problem_type</code>(**`problem_type`**:`str`, **`top_layer`**:`Model`=*`None`*, **`label_handling_fn`**:`Callable`=*`None`*, **`get_or_make_label_encoder_fn`**:`Callable`=*`None`*, **`inherit_from`**:`str`=*`None`*)



API to register a new problem type

Args:
- problem_type: string, problem type name
- top_layer: a keras model with some specific reqirements
- label_handling_fn: function to convert labels to label ids
- get_or_make_label_encoder_fn: function to create label encoder, num_classes has to be specified here

In [None]:
from m3tl.problem_types.utils import BaseTop
from m3tl.problem_types.utils import empty_tensor_handling_loss, nan_loss_handling
import tensorflow as tf
from typing import Tuple, Dict
import numpy as np
# top layer
def cosine_wrapper(labels, logits, from_logits=True):
    """Cosine-similarity loss usable as the `loss_fn` of `empty_tensor_handling_loss`.

    Args:
        labels: target vectors.
        logits: predicted vectors.
        from_logits: accepted only for signature compatibility with other
            loss functions; it is ignored.

    Returns:
        The result of `tf.keras.losses.cosine_similarity(labels, logits)`.
    """
    # NOTE(review): assumes `empty_tensor_handling_loss` invokes the loss as
    # loss_fn(labels, logits[, from_logits=...]) — confirm against m3tl.
    return tf.keras.losses.cosine_similarity(labels, logits)


class VectorFit(BaseTop):
    """Example top layer that fits the pooled hidden feature to a target vector.

    A single dense layer projects the pooled hidden feature to
    `params.num_classes[problem_name]` outputs. Outside of prediction mode a
    cosine loss between the labels and that projection is added to the layer.
    """

    def __init__(self, params: Params, problem_name: str) -> None:
        super().__init__(params=params, problem_name=problem_name)
        # Output size is whatever the label encoder fn stored for this problem.
        self.num_classes = self.params.num_classes[problem_name]
        self.dense = tf.keras.layers.Dense(self.num_classes)

    def call(self, inputs: Tuple[Dict, Dict], mode: str):
        """Project the pooled feature and, unless predicting, add loss and metric.

        Args:
            inputs: pair `(feature, hidden_feature)`; `hidden_feature` must
                contain the key 'pooled' and, outside of prediction mode,
                `feature` must contain '<problem_name>_label_ids'.
            mode: run mode, compared against `tf.estimator.ModeKeys.PREDICT`.

        Returns:
            The output of the dense layer applied to the pooled feature.
        """
        feature, hidden_feature = inputs
        pooled_hidden = hidden_feature['pooled']

        logits = self.dense(pooled_hidden)
        if mode != tf.estimator.ModeKeys.PREDICT:
            # this is the same as the label_id returned by vector_fit_label_handling_fn
            label = feature['{}_label_ids'.format(self.problem_name)]

            loss = empty_tensor_handling_loss(label, logits, cosine_wrapper)
            loss = nan_loss_handling(loss)
            self.add_loss(loss)

            # The reported metric is the negated loss.
            self.add_metric(tf.math.negative(
                loss), name='{}_cos_sim'.format(self.problem_name), aggregation='mean')
        return logits

# label handling fn
def vector_fit_label_handling_fn(target, label_encoder=None, tokenizer=None, decoding_length=None):
    """Convert raw vector targets into a float32 array of label ids.

    No label encoding is needed for vector fitting, so the targets are
    simply converted to an array.

    Args:
        target: array-like of target vectors.
        label_encoder: unused.
        tokenizer: unused.
        decoding_length: unused.

    Returns:
        Tuple `(label_id, label_mask)` where `label_id` is a float32 numpy
        array built from `target` and `label_mask` is always None.
    """
    return np.array(target, dtype=np.float32), None

# make label encoder
def vector_fit_get_or_make_label_encoder_fn(params: Params, problem, mode, label_list):
    """Record the target vector size for `problem`; no label encoder is created.

    Args:
        params: params object whose `num_classes` mapping is updated.
        problem: problem name used as the key in `params.num_classes`.
        mode: unused.
        label_list: array-like of target vectors; the size of its last
            axis is stored as the number of classes.

    Returns:
        Always None, since vector fitting needs no label encoder.
    """
    vector_dim = np.array(label_list).shape[-1]
    params.num_classes[problem] = vector_dim
    return None

# Register the example components above under the problem type name 'vectorfit'.
params.register_problem_type(
    problem_type='vectorfit',
    top_layer=VectorFit,
    label_handling_fn=vector_fit_label_handling_fn,
    get_or_make_label_encoder_fn=vector_fit_get_or_make_label_encoder_fn,
)

## Utils

In [None]:
show_doc(BaseParams.from_json)

<h4 id="BaseParams.from_json" class="doc_header"><code>BaseParams.from_json</code><a href="https://github.com/JayYip/m3tl/tree/master/m3tl/base_params.py#L205" class="source_link" style="float:right">[source]</a></h4>

> <code>BaseParams.from_json</code>(**`json_path`**:`str`=*`None`*)

Load json file as params.

json_path could not be None if the problem is not assigned to params

Args:
    json_path (str, optional): Path to json file. Defaults to None.

Raises:
    AttributeError

In [None]:
show_doc(BaseParams.to_json)

<h4 id="BaseParams.to_json" class="doc_header"><code>BaseParams.to_json</code><a href="https://github.com/JayYip/m3tl/tree/master/m3tl/base_params.py#L191" class="source_link" style="float:right">[source]</a></h4>

> <code>BaseParams.to_json</code>()

Save the params as json files. Please note that processing_fn is not saved.
        

In [None]:
show_doc(BaseParams.parse_problem_string)

<h4 id="BaseParams.parse_problem_string" class="doc_header"><code>BaseParams.parse_problem_string</code><a href="https://github.com/JayYip/m3tl/tree/master/m3tl/base_params.py#L239" class="source_link" style="float:right">[source]</a></h4>

> <code>BaseParams.parse_problem_string</code>(**`flag_string`**:`str`)



Parse problem string

Arguments: flag_string {str} -- problem string

Returns: list -- problem list

In [None]:
# Compare the two separators: with '|' each problem ends up in its own
# chunk, with '&' both problems are grouped into the same chunk.
print('chained with |: ', params.parse_problem_string('toy_seq_tag|toy_cls'))
print('chained with &: ', params.parse_problem_string('toy_seq_tag&toy_cls'))

chained with |:  (['toy_cls', 'toy_seq_tag'], [['toy_seq_tag'], ['toy_cls']])
chained with &:  (['toy_cls', 'toy_seq_tag'], [['toy_seq_tag', 'toy_cls']])


In [None]:
show_doc(BaseParams.get_problem_type)

<h4 id="BaseParams.get_problem_type" class="doc_header"><code>BaseParams.get_problem_type</code><a href="https://github.com/JayYip/m3tl/tree/master/m3tl/base_params.py#L411" class="source_link" style="float:right">[source]</a></h4>

> <code>BaseParams.get_problem_type</code>(**`problem`**:`str`)



In [None]:
params.get_problem_type('toy_seq_tag')

'seq_tag'

In [None]:
show_doc(BaseParams.update_train_steps)

<h4 id="BaseParams.update_train_steps" class="doc_header"><code>BaseParams.update_train_steps</code><a href="https://github.com/JayYip/m3tl/tree/master/m3tl/base_params.py#L414" class="source_link" style="float:right">[source]</a></h4>

> <code>BaseParams.update_train_steps</code>(**`train_steps_per_epoch`**:`int`, **`epoch`**:`int`=*`None`*, **`warmup_ratio`**=*`0.1`*)

If the batch_size is dynamic, we have to loop through the tf.data.Dataset
to get the accurate number of training steps. In this case, we need a function to
update the train_steps which will be used to calculate learning rate schedule.

WARNING: updating should be called before the model is compiled!

Args:
    train_steps (int): new number of train_steps

If the batch_size is dynamic, we have to loop through the tf.data.Dataset
to get the accurate number of training steps. In this case, we need a function to
update the train_steps which will be used to calculate learning rate schedule.

WARNING: updating should be called before the model is compiled! 

Args:
- train_steps (int): new number of train_steps

In [None]:

# Recompute the schedule from 100 steps per epoch; `epoch` and
# `warmup_ratio` are left at their defaults.
params.update_train_steps(train_steps_per_epoch=100)
# Show the resulting total training steps and warmup steps.
print(params.train_steps, params.num_warmup_steps)

1500 150


In [None]:
show_doc(BaseParams.assign_data_sampling_strategy)

<h4 id="BaseParams.assign_data_sampling_strategy" class="doc_header"><code>BaseParams.assign_data_sampling_strategy</code><a href="https://github.com/JayYip/m3tl/tree/master/m3tl/base_params.py#L568" class="source_link" style="float:right">[source]</a></h4>

> <code>BaseParams.assign_data_sampling_strategy</code>(**`sampling_strategy_name`**=*`'data_balanced'`*, **`sampling_strategy_fn`**:`Callable`=*`None`*)



Set data sampling strategy for multi-task learning.

'data_balanced' and 'problem_balanced' is implemented by default.
data_balanced: sampling weight equals to number of rows of that problem chunk.
problem_balanced: sampling weight equals to 1 for every problem chunk.

Args:
- sampling_strategy (str, optional): sampling strategy. Defaults to 'data_balanced'.
- sampling_strategy_fn (Callable, optional): function to create weight dict. Defaults to None.

Raises:
- NotImplementedError: sampling_strategy_fn is not implemented yet
- ValueError: invalid sampling_strategy provided

Returns:
- Dict[str, float]: sampling weight for each problem_chunk

In [None]:
# Use the built-in 'problem_balanced' strategy (the argument's default
# value is 'data_balanced').
params.assign_data_sampling_strategy(sampling_strategy_name='problem_balanced')