In [None]:
# default_exp utils
%load_ext autoreload
%autoreload 2
import os
# CUDA_VISIBLE_DEVICES="-1" hides GPUs from CUDA (presumably so the cells
# below run on CPU only -- confirm if GPU execution is ever wanted here).
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

# Utils

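Utility functions: loaders for Hugging Face tokenizers, configs and models, label-encoder helpers, and small TensorFlow helpers.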

## Imports

In [None]:
# export
import os
import pickle
import re
from typing import Union
from inspect import getmembers
from collections import defaultdict

from loguru import logger
import logging
import warnings

import numpy as np
import tensorflow as tf
import transformers
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import AutoTokenizer, PreTrainedTokenizer, TFPreTrainedModel
from m3tl.special_tokens import TRAIN, EVAL, PREDICT

In [None]:
# hide
from m3tl.test_base import TestBase
import m3tl
import shutil
import numpy as np
# Shared fixture for the example cells below: `params` is taken from a
# TestBase instance (the log output of this cell shows the weibo_* problems
# being registered while it is built).
test_base = TestBase()
params = test_base.params

2021-06-05 23:39:46.452 | INFO     | m3tl.base_params:register_multiple_problems:476 - Adding new problem weibo_fake_ner, problem type: seq_tag
2021-06-05 23:39:46.453 | INFO     | m3tl.base_params:register_multiple_problems:476 - Adding new problem weibo_fake_multi_cls, problem type: multi_cls
2021-06-05 23:39:46.453 | INFO     | m3tl.base_params:register_multiple_problems:476 - Adding new problem weibo_fake_cls, problem type: cls
2021-06-05 23:39:46.454 | INFO     | m3tl.base_params:register_multiple_problems:476 - Adding new problem weibo_masklm, problem type: masklm
2021-06-05 23:39:46.454 | INFO     | m3tl.base_params:register_multiple_problems:476 - Adding new problem weibo_fake_regression, problem type: regression
2021-06-05 23:39:46.455 | INFO     | m3tl.base_params:register_multiple_problems:476 - Adding new problem weibo_fake_vector_fit, problem type: vector_fit
2021-06-05 23:39:46.455 | INFO     | m3tl.base_params:register_multiple_problems:476 - Adding new problem weibo_pre

In [None]:
# export
def load_transformer_tokenizer(tokenizer_name: str, load_module_name=None):
    """Load a tokenizer, optionally through an explicitly named class.

    Some tokenizers cannot be loaded with ``AutoTokenizer``; passing the name
    of a tokenizer class exposed by the ``transformers`` package works around
    that situation.

    Args:
        tokenizer_name (str): name or path handed to ``from_pretrained``.
        load_module_name (str, optional): attribute of the ``transformers``
            package (e.g. ``'BertTokenizer'``) whose ``from_pretrained`` is
            used. When falsy, ``AutoTokenizer`` is used. Defaults to None.

    Returns:
        Whatever the selected class's ``from_pretrained`` returns.
    """
    if load_module_name:
        tokenizer_cls = getattr(transformers, load_module_name)
    else:
        tokenizer_cls = AutoTokenizer
    return tokenizer_cls.from_pretrained(tokenizer_name)

In [None]:
# Example: load this checkpoint's tokenizer through the explicitly named
# `BertTokenizer` class instead of AutoTokenizer.
load_transformer_tokenizer(
            'voidful/albert_chinese_tiny', 'BertTokenizer')

PreTrainedTokenizer(name_or_path='voidful/albert_chinese_tiny', vocab_size=21128, model_max_len=1000000000000000019884624838656, is_fast=False, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [None]:
# export
def load_transformer_config(config_name_or_dict, load_module_name=None):
    """Build a transformers config from a name/path or from a plain dict.

    Some models need an explicit config class; ``load_module_name`` selects it.
    ``output_attentions=True`` and ``output_hidden_states=True`` are always
    passed to the loader.

    Args:
        config_name_or_dict (str | dict): a string is handed to
            ``from_pretrained``; a dict is handed to ``from_dict``.
        load_module_name (str, optional): attribute of the ``transformers``
            package to load with. When falsy, ``transformers.AutoConfig`` is
            used. Defaults to None.

    Returns:
        config: the object produced by the selected loader.

    Raises:
        ValueError: if ``config_name_or_dict`` is neither a str nor a dict.
    """
    # Resolve the loading class first so a bad class name fails before the
    # input type is inspected.
    load_module = (
        getattr(transformers, load_module_name)
        if load_module_name
        else transformers.AutoConfig
    )

    if isinstance(config_name_or_dict, str):
        build_config = load_module.from_pretrained
    elif isinstance(config_name_or_dict, dict):
        build_config = load_module.from_dict
    else:
        raise ValueError('config_name_or_dict should be str or dict')

    return build_config(
        config_name_or_dict, output_attentions=True, output_hidden_states=True)

In [None]:
# Load a config by name (default loader: AutoConfig)
config = load_transformer_config(
    'bert-base-chinese')
config_dict = config.to_dict()
# Round-trip: rebuild the config from its dict form using an explicit config class
config = load_transformer_config(
    config_dict, load_module_name='BertConfig')

In [None]:
# export
def load_transformer_model(model_name_or_config, load_module_name=None):
    """Load a transformers model from a name/path, or build one from a config.

    Args:
        model_name_or_config: a string is loaded with ``from_pretrained``
            (with ``output_attentions`` and ``output_hidden_states`` enabled);
            anything else is passed directly to the model class constructor.
        load_module_name (str, optional): attribute of the ``transformers``
            package to use as the model class. When falsy,
            ``transformers.TFAutoModel`` is used. Defaults to None.

    Returns:
        The loaded or constructed model.
    """
    if load_module_name:
        model_cls = getattr(transformers, load_module_name)
    else:
        model_cls = transformers.TFAutoModel

    if not isinstance(model_name_or_config, str):
        # Built from a config object: no pretrained weights are loaded here.
        # NOTE(review): with the default TFAutoModel this direct call may not
        # be supported -- confirm callers always pass a concrete class.
        return model_cls(model_name_or_config)

    shared_kwargs = dict(output_attentions=True, output_hidden_states=True)
    try:
        return model_cls.from_pretrained(model_name_or_config, **shared_kwargs)
    except OSError:
        # Retry asking for conversion from PyTorch weights.
        return model_cls.from_pretrained(
            model_name_or_config, from_pt=True, **shared_kwargs)

In [None]:
# Load by name (weights are loaded).
# The captured output of this cell shows a 404 for this checkpoint's
# tf_model.h5 followed by initialization from the PyTorch model.
model = load_transformer_model(
    'voidful/albert_chinese_tiny')

# Load from a config object (weights are not loaded)
model = load_transformer_model(load_transformer_config(
    'bert-base-chinese'), 'TFBertModel')

404 Client Error: Not Found for url: https://huggingface.co/voidful/albert_chinese_tiny/resolve/main/tf_model.h5
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFAlbertModel: ['predictions.decoder.weight', 'predictions.dense.bias', 'predictions.bias', 'predictions.LayerNorm.bias', 'predictions.dense.weight', 'predictions.decoder.bias', 'predictions.LayerNorm.weight']
- This IS expected if you are initializing TFAlbertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFAlbertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFAlbertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was tr

In [None]:
# export 
def get_label_encoder_save_path(params, problem: str) -> str:
    """Return the pickle path of the label encoder for ``problem``.

    The checkpoint directory ``params.ckpt_dir`` is created if needed and the
    returned path is ``<ckpt_dir>/<problem>_label_encoder.pkl``.

    Args:
        params: object exposing ``ckpt_dir`` and ``problem_type``.
        problem (str): problem name.

    Returns:
        str: path of the label encoder file.

    Raises:
        ValueError: if ``params`` has no ``ckpt_dir`` attribute.
    """
    if not hasattr(params, 'ckpt_dir'):
        raise ValueError('Seems no problem assigned to params. Please check.')

    ckpt_dir = params.ckpt_dir
    create_path(ckpt_dir)
    # The looked-up value is unused; the lookup is kept because it fails for
    # a problem that is not registered in params.problem_type.
    params.problem_type[problem]
    return os.path.join(ckpt_dir, '%s_label_encoder.pkl' % problem)

In [None]:
# export
class LabelEncoder(BaseEstimator, TransformerMixin):
    """Bidirectional mapping between labels and consecutive integer ids.

    ``fit`` assigns ids 0..n-1 to the sorted distinct labels. The mapping is
    kept in ``encode_dict`` (label -> id) and ``decode_dict`` (id -> label).
    """

    def fit(self, y):
        """Fit label encoder
        Parameters
        ----------
        y : array-like of shape (n_samples,)
            Target values. Items must be hashable and mutually sortable.
        Returns
        -------
        self : returns an instance of self.
        """
        self.encode_dict = {}
        self.decode_dict = {}

        # Sorting the distinct labels makes the id assignment deterministic.
        ordered_labels = sorted(set(y))
        self.encode_dict.update(
            (label, index) for index, label in enumerate(ordered_labels))
        self.decode_dict.update(enumerate(ordered_labels))
        return self

    def fit_transform(self, y):
        """Fit label encoder and return encoded labels
        Parameters
        ----------
        y : array-like of shape [n_samples]
            Target values.
        Returns
        -------
        y : numpy array of shape [n_samples]
        """
        self.fit(y)
        return self.transform(y)

    def transform(self, y):
        """Transform labels to normalized encoding.
        Parameters
        ----------
        y : array-like of shape [n_samples]
            Target values. A label not seen during fit raises KeyError.
        Returns
        -------
        y : numpy array of shape [n_samples]
        """
        return np.array([self.encode_dict[label] for label in y])

    def inverse_transform(self, y):
        """Transform labels back to original encoding.
        Parameters
        ----------
        y : iterable of encoded ids, shape [n_samples]
            Encoded values. An unknown id raises KeyError.
        Returns
        -------
        y : numpy array of shape [n_samples]
        """
        return np.array([self.decode_dict[index] for index in y])

    def dump(self, path):
        """Pickle ``decode_dict`` to ``path`` (``encode_dict`` is rebuilt on load)."""
        with open(path, 'wb') as out_file:
            pickle.dump(self.decode_dict, out_file)

    def load(self, path):
        """Restore both dictionaries from a file written by ``dump``."""
        # NOTE(review): pickle.load can execute arbitrary code contained in
        # the file; only load encoder files from trusted locations.
        with open(path, 'rb') as in_file:
            self.decode_dict = pickle.load(in_file)

        self.encode_dict = {
            label: index for index, label in self.decode_dict.items()}


def create_path(path):
    """Create directory ``path`` (including parents) unless something already exists there."""
    if os.path.exists(path):
        # Anything already at `path` (directory or file) is left untouched.
        return
    os.makedirs(path, exist_ok=True)

def need_make_label_encoder(mode: str, le_path: str, overwrite=False):
    """Tell whether a label encoder has to be (re)created.

    Args:
        mode (str): current mode; only ``TRAIN`` can require creation.
        le_path (str): path where the encoder file is expected.
        overwrite: when truthy, an existing file does not prevent creation.

    Returns:
        False outside TRAIN mode. In TRAIN mode: True if ``le_path`` does not
        exist, otherwise the value of ``overwrite``.
    """
    is_train = mode == TRAIN
    if not is_train:
        return is_train
    encoder_missing = not os.path.exists(le_path)
    return encoder_missing or overwrite

def get_or_make_label_encoder(params, problem: str, mode: str, label_list=None, overwrite=True) -> Union[LabelEncoder, MultiLabelBinarizer, PreTrainedTokenizer]:
    """Unified entry point to get or create the label encoder of a problem.

    The actual work is delegated to the function registered for the problem's
    type in ``params.get_or_make_label_encoder_fn_dict``. According to the
    declared return type, the result is a LabelEncoder, a
    MultiLabelBinarizer or a PreTrainedTokenizer depending on problem type:

    cls: LabelEncoder
    seq_tag: LabelEncoder
    multi_cls: MultiLabelBinarizer
    seq2seq_text: Tokenizer

    Arguments:
        params -- params object exposing ``ckpt_dir``, ``problem_type``,
            ``get_or_make_label_encoder_fn_dict`` and ``set_problem_info``
        problem {str} -- problem name
        mode {str} -- mode

    Keyword Arguments:
        label_list {list} -- label list to fit the encoder (default: {None})
        overwrite {bool} -- forwarded as-is to the problem-type specific
            function (default: {True})

    Returns:
        label encoder returned by the problem-type specific function
    """
    # Make sure the checkpoint directory exists before delegating.
    create_path(params.ckpt_dir)

    encoder_factory = params.get_or_make_label_encoder_fn_dict[
        params.problem_type[problem]]
    encoder = encoder_factory(
        params, problem, mode, label_list, overwrite=overwrite)

    # Record on params that the encoder of this problem has been requested.
    params.set_problem_info(
        problem=problem, info_name='label_encoder_called', info=True)
    return encoder

In [None]:
# seq_tag problem: nested label lists
le_train = get_or_make_label_encoder(
    params=params, problem='weibo_fake_ner', mode=m3tl.TRAIN, label_list=[['a', 'b'], ['c']]
)
# seq_tag will add [PAD]: 3 distinct labels -> 4 entries
assert len(le_train.encode_dict) == 4, le_train.encode_dict

# PREDICT mode must give back the same mapping that was built in TRAIN mode
le_predict = get_or_make_label_encoder(
    params=params, problem='weibo_fake_ner', mode=m3tl.PREDICT)
assert le_predict.encode_dict==le_train.encode_dict

# cls problem: flat label list
le_train = get_or_make_label_encoder(
    params=params, problem='weibo_fake_cls', mode=m3tl.TRAIN, label_list=['a', 'b', 'c']
)
# for cls the mapping holds exactly the 3 given labels (no extra entry)
assert len(le_train.encode_dict) == 3

le_predict = get_or_make_label_encoder(
    params=params, problem='weibo_fake_cls', mode=m3tl.PREDICT)
assert le_predict.encode_dict==le_train.encode_dict

# text problem (masklm): the "label encoder" is a tokenizer
le_train = get_or_make_label_encoder(
    params=params, problem='weibo_masklm', mode=m3tl.TRAIN)
assert isinstance(le_train, transformers.PreTrainedTokenizer)
le_predict = get_or_make_label_encoder(
    params=params, problem='weibo_masklm', mode=m3tl.PREDICT)
assert isinstance(le_predict, transformers.PreTrainedTokenizer)


In [None]:
# export
def cluster_alphnum(text: str) -> list:
    """Split text into characters while keeping ASCII alphanumeric runs together.

    Consecutive characters from ``[a-zA-Z0-9]`` and ``[`` are merged into one
    element. A ``]`` is glued onto the previous element whatever that element
    is (so ``[PAD]`` stays a single element) and ends the current run. Every
    other character becomes its own element.

    Arguments:
        text {str} -- input text

    Returns:
        list -- list of strings, e.g. ``'我a1[PAD]'`` -> ``['我', 'a1[PAD]']``
    """
    # Raw string: the original non-raw literal contained the invalid escape
    # sequence `\[`. Compiled once instead of once per character.
    alphnum_pattern = re.compile(r'^[a-zA-Z0-9\[]+$')

    return_list = []
    last_is_alphnum = False

    for char in text:
        if alphnum_pattern.match(char):
            if last_is_alphnum:
                # Extend the run that is currently open.
                return_list[-1] += char
            else:
                return_list.append(char)
                last_is_alphnum = True
            continue

        if char == ']' and return_list:
            # Closing bracket attaches to the previous element.
            return_list[-1] += char
        else:
            return_list.append(char)
        last_is_alphnum = False

    return return_list


def filter_empty(input_list, target_list):
    """Drop every (input, target) pair in which either side is falsy.

    Pairs are formed with ``zip``, so extra items of the longer list are
    ignored.

    Arguments:
        input_list {list} -- input list
        target_list {list} -- target list

    Returns:
        input_list, target_list -- two new lists with the surviving pairs
    """
    kept_pairs = [
        (inp, tar) for inp, tar in zip(input_list, target_list) if inp and tar
    ]
    kept_inputs = [pair[0] for pair in kept_pairs]
    kept_targets = [pair[1] for pair in kept_pairs]
    return kept_inputs, kept_targets

In [None]:
# export 

def infer_shape_and_type_from_dict(inp_dict: dict, fix_dim_for_high_rank_tensor=True):
    """Infer per-feature shapes and TensorFlow dtypes from one example dict.

    Lists are converted to numpy arrays first. Nested dicts are not handled.

    Args:
        inp_dict (dict): feature name -> list, numpy array or scalar.
        fix_dim_for_high_rank_tensor (bool): for arrays of rank > 1, keep the
            sizes of all but the first axis; otherwise every axis is ``None``.

    Returns:
        (shape_dict, type_dict): arrays get a list shape whose first axis is
        ``None``, scalars get ``[]``. Integer data maps to ``tf.int32``,
        floating data to ``tf.float32``, any other scalar to ``tf.string``.
        An array whose dtype is neither integer nor floating gets a shape
        entry but no type entry.
    """
    shape_dict, type_dict = {}, {}

    for name, value in inp_dict.items():
        if type(value) is list:
            value = np.array(value)

        if type(value) is np.ndarray:
            if np.issubdtype(value.dtype, np.integer):
                type_dict[name] = tf.int32
            elif np.issubdtype(value.dtype, np.floating):
                type_dict[name] = tf.float32

            keep_inner_dims = len(value.shape) > 1 and fix_dim_for_high_rank_tensor
            if keep_inner_dims:
                shape_dict[name] = [None, *value.shape[1:]]
            else:
                shape_dict[name] = [None] * len(value.shape)
            continue

        # Scalar (non-array) features all have an empty shape.
        scalar_type = type(value)
        if np.issubdtype(scalar_type, np.floating):
            type_dict[name] = tf.float32
        elif np.issubdtype(scalar_type, np.integer):
            type_dict[name] = tf.int32
        else:
            if isinstance(value, str):
                # The encoded bytes are not used; the call is kept because it
                # fails on text that cannot be encoded as UTF-8.
                value.encode('utf8')
            type_dict[name] = tf.string
        shape_dict[name] = []

    return shape_dict, type_dict

In [None]:
# does not support nested dicts
test_dict = {
    'test1': np.random.uniform(size=(64, 32)),
    'test2': np.array([1, 2, 3], dtype='int32'),
    'test5': 5
}
desc_dict = infer_shape_and_type_from_dict(
    test_dict)
# expected: first axis is None for arrays, scalars get an empty shape
assert desc_dict == ({'test1': [None, 32], 'test2': [None], 'test5': []}, {
                    'test1': tf.float32, 'test2': tf.int32, 'test5': tf.int32})

In [None]:
# export
def get_transformer_main_model(model, key='embeddings'):
    """Find the object inside a Hugging Face model that owns attribute ``key``.

    The members of ``model`` are scanned in the order given by
    ``inspect.getmembers``. If a member is itself named ``key``, ``model`` is
    returned; otherwise the first member that has an attribute ``key`` is
    returned.

    Args:
        model (Model): Huggingface transformers model
        key (str, optional): Attribute name identifying the main model.
            Defaults to 'embeddings'.

    Returns:
        ``model``, one of its members, or None when nothing matches.
    """
    matches = (
        model if member_name == key else member
        for member_name, member in getmembers(model)
        if member_name == key or hasattr(member, key)
    )
    return next(matches, None)

In [None]:
# Example: for this ALBERT checkpoint the object found is the ALBERT main layer
model = load_transformer_model(
    'voidful/albert_chinese_tiny')
main_model = get_transformer_main_model(model)
# displayed as the cell output (True in the captured output)
isinstance(main_model, transformers.TFAlbertMainLayer)

404 Client Error: Not Found for url: https://huggingface.co/voidful/albert_chinese_tiny/resolve/main/tf_model.h5
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFAlbertModel: ['predictions.decoder.weight', 'predictions.dense.bias', 'predictions.bias', 'predictions.LayerNorm.bias', 'predictions.dense.weight', 'predictions.decoder.bias', 'predictions.LayerNorm.weight']
- This IS expected if you are initializing TFAlbertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFAlbertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFAlbertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was tr

True

In [None]:
# export 
def get_embedding_table_from_model(model: TFPreTrainedModel):
    """Return the first weight of the model's input-embedding layer.

    Args:
        model (TFPreTrainedModel): model exposing ``get_input_embeddings()``.

    Returns:
        ``weights[0]`` of the layer returned by ``get_input_embeddings()``
        (presumably the token embedding matrix -- confirm for layers that
        hold several weights).
    """
    return model.get_input_embeddings().weights[0]

In [None]:
# `model` is the ALBERT model loaded in the previous example cell; 21128
# matches the vocab_size shown in the tokenizer output earlier in this notebook.
embedding = get_embedding_table_from_model(
    model)
assert embedding.shape == (21128, 128)

In [None]:
# export
def get_shape_list(tensor, expected_rank=None, name=None):
    """Returns a list of the shape of tensor, preferring static dimensions.

    Args:
      tensor: A tf.Tensor object to find the shape of.
      expected_rank: (optional) accepted for API compatibility; this
        implementation does not check it.
      name: (optional) accepted for API compatibility; this implementation
        does not use it.

    Returns:
      A list of dimensions of the shape of tensor. All static dimensions will
      be returned as python integers, and dynamic dimensions will be returned
      as tf.Tensor scalars.
    """
    dims = tensor.shape.as_list()
    dynamic_axes = [axis for axis, size in enumerate(dims) if size is None]

    # Fully static shape: no need to touch the dynamic shape at all.
    if not dynamic_axes:
        return dims

    runtime_shape = tf.shape(input=tensor)
    for axis in dynamic_axes:
        dims[axis] = runtime_shape[axis]
    return dims


def gather_indexes(sequence_tensor, positions):
    """Gathers the vectors at the specific positions over a minibatch.

    Args:
      sequence_tensor: tensor whose first three shape entries are read as
        (batch_size, seq_length, width).
      positions: per-example positions along the sequence axis; added to
        int64 row offsets (assumes an int64 tensor of shape
        [batch_size, n_positions] -- TODO confirm against callers).

    Returns:
      The rows of the flattened [batch_size * seq_length, width] tensor
      selected by the flattened, offset positions.
    """
    shape = get_shape_list(sequence_tensor, expected_rank=3)
    batch_size = shape[0]
    seq_length = shape[1]
    width = shape[2]

    # Start index of every example inside the flattened sequence tensor.
    example_starts = tf.range(0, batch_size, dtype=tf.int32) * seq_length
    column_offsets = tf.cast(tf.reshape(example_starts, [-1, 1]), tf.int64)
    flat_positions = tf.reshape(positions + column_offsets, [-1])

    flat_sequence = tf.reshape(sequence_tensor, [batch_size * seq_length, width])
    # Multiplying by 1 is a workaround, see
    # https://github.com/tensorflow/tensorflow/issues/36236
    return tf.gather(flat_sequence * 1, flat_positions)

In [None]:
# export 
def dispatch_features(features, hidden_feature, problem, mode):
    """Select the records of a batch that belong to ``problem``.

    In PREDICT mode both inputs are returned unchanged. Otherwise the records
    whose ``'<problem>_loss_multiplier'`` entry casts to True are selected
    from every entry of ``features`` and from every entry of
    ``hidden_feature`` except ``'embed_table'``, which is passed through.

    Args:
        features: dict of tensors; outside PREDICT mode it must contain the
            key ``'<problem>_loss_multiplier'``.
        hidden_feature: dict of tensors keyed by hidden feature name.
        problem: problem name used to build the multiplier key.
        mode: compared against ``tf.estimator.ModeKeys.PREDICT``.
            NOTE(review): other code in this file uses the m3tl
            TRAIN/EVAL/PREDICT constants -- confirm both use the same values.

    Returns:
        (feature_this_round, hidden_feature_this_round)
    """
    # get features with ind == 1
    if mode == tf.estimator.ModeKeys.PREDICT:
        feature_this_round = features
        hidden_feature_this_round = hidden_feature
    else:
        multiplier_name = '%s_loss_multiplier' % problem

        # Earlier variant, kept for reference: squeezed the multiplier first.
        # record_ind = tf.where(tf.cast(
        #     tf.squeeze(features[multiplier_name]), tf.bool))

        # Indices of the records whose multiplier is non-zero
        record_ind = tf.where(tf.cast(features[multiplier_name], tf.bool))

        hidden_feature_this_round = {}
        for hidden_feature_name in hidden_feature:
            if hidden_feature_name != 'embed_table':
                # Gathering with record_ind adds an axis at position 1, which
                # is squeezed away again (assumes the multiplier is rank 1 so
                # that axis has size 1 -- TODO confirm).
                hidden_feature_this_round[hidden_feature_name] = tf.squeeze(tf.gather(
                    hidden_feature[hidden_feature_name], record_ind, axis=0
                ), axis=1)
                # Re-attach the static shape of the non-batch axes
                hidden_feature_this_round[hidden_feature_name].set_shape(
                    [None, *hidden_feature[hidden_feature_name].shape.as_list()[1:]])
            else:
                # The embedding table is not per-record: pass it through
                hidden_feature_this_round[hidden_feature_name] = hidden_feature[hidden_feature_name]

        feature_this_round = {}
        for features_name in features:
            feature_this_round[features_name] = tf.gather_nd(
                features[features_name],
                record_ind)

    return feature_this_round, hidden_feature_this_round

In [None]:
# export
def _add_to_dict(loss_dict, model, added_name, ele_name):
    ele_list = getattr(model, ele_name)
    for ele in ele_list:
        try:
            ele_tf_name = ele.name
            if ele_tf_name not in added_name:
                loss_dict[ele_name].append(ele)
                added_name.append(ele_tf_name)
        except AttributeError:
            if ele not in added_name:
                loss_dict[ele_name].append(ele)
                added_name.append(ele)
    return loss_dict, added_name

def create_dict_from_nested_model(model: tf.keras.Model, loss_dict=None, ele_name='losses', added_name=None) -> dict:
    """Collect ``<ele_name>`` elements of a (possibly nested) Keras model.

    For a ``tf.keras.Model`` the function recurses into ``model.layers`` and
    stores each layer's result under the layer's name, then adds the model's
    own elements under the key ``ele_name``. For anything else only the
    object's own elements are added. Entries that end up empty are removed.

    Args:
        model: model or layer to inspect.
        loss_dict: dict to fill; a ``defaultdict(list)`` is created when None.
        ele_name (str): attribute to collect (default ``'losses'``).
        added_name (list): identifiers already collected. The same list is
            passed down the recursion so an element is only added once.

    Returns:
        dict: nested mapping of layer names to collected elements.
    """
    if loss_dict is None:
        loss_dict = defaultdict(list)

    if added_name is None:
        added_name = []
    if not isinstance(model, tf.keras.Model):
        # Leaf (not a Keras model): collect its own elements only
        loss_dict, added_name = _add_to_dict(loss_dict, model, added_name, ele_name)

    else:
        for layer in model.layers:
            # Reuse an existing sub-dict for this layer name if there is one
            layer_loss_dict = loss_dict.get(layer.name, None)
            layer_loss_dict = create_dict_from_nested_model(layer, layer_loss_dict, ele_name, added_name)
            loss_dict[layer.name] = layer_loss_dict
        # Layers go first, so elements already attributed to a layer are
        # skipped here through the shared added_name list
        loss_dict, added_name = _add_to_dict(loss_dict, model, added_name, ele_name)

    # remove empty
    loss_dict_keys = list(loss_dict.keys())
    for k in loss_dict_keys:
        if not loss_dict[k]:
            del loss_dict[k]
    return loss_dict



In [None]:
# export

def variable_summaries(var, name):
    """Attach a lot of summaries to a Tensor (for TensorBoard visualization).

    Writes scalar summaries for mean, standard deviation, max and min of
    ``var`` plus a histogram, all inside the name scope ``name``.

    Args:
        var: tensor to summarize.
        name: name scope under which the summaries are created.
    """
    with tf.name_scope(name):
        mean = tf.reduce_mean(input_tensor=var)
        tf.summary.scalar('mean', mean)
        # Only the stddev computation lives in the nested scope; its summary
        # is written from the outer scope.
        with tf.name_scope('stddev'):
            stddev = tf.sqrt(tf.reduce_mean(
                input_tensor=tf.square(var - mean)))
        tf.summary.scalar('stddev', stddev)
        tf.summary.scalar('max', tf.reduce_max(input_tensor=var))
        tf.summary.scalar('min', tf.reduce_min(input_tensor=var))
        tf.summary.histogram('histogram', var)

In [None]:
# export
# Module-level current phase; starts as TRAIN and is replaced by set_phase().
M3TL_PHASE = TRAIN
def set_phase(phase: str):
    """Set the module-level phase.

    Args:
        phase (str): one of TRAIN, EVAL or PREDICT.

    Raises:
        ValueError: if ``phase`` is not one of the three allowed values; the
            current phase is left unchanged in that case.
    """
    global M3TL_PHASE
    allowed_phases = [TRAIN, EVAL, PREDICT]
    if phase not in allowed_phases:
        raise ValueError('expect one of following values: {}, got: {}'.format(allowed_phases, phase))
    logger.info('Setting phase to {}'.format(phase))
    M3TL_PHASE = phase

def get_phase() -> str:
    """Return the current module-level phase (the value of ``M3TL_PHASE``)."""
    return M3TL_PHASE

# Module-level PySpark flag; False until set_is_pyspark() changes it.
IS_PYSPARK = False
def set_is_pyspark(is_pyspark: bool):
    """Store ``is_pyspark`` in the module-level ``IS_PYSPARK`` flag."""
    global IS_PYSPARK
    IS_PYSPARK = is_pyspark

def get_is_pyspark() -> bool:
    """Return the module-level ``IS_PYSPARK`` flag."""
    return IS_PYSPARK

In [None]:
# export

class TFRedundantWarningFilter(logging.Filter):
    """Logging filter that drops a fixed set of noisy warning messages.

    Only records whose level name is ``'WARNING'`` or ``'U'`` are inspected;
    such a record is dropped when its message contains one of the known
    fragments. Every other record passes through.
    """

    # Substrings identifying the warnings that are suppressed.
    _SUPPRESSED_FRAGMENTS = (
        ".optimizer's state",
        "AutoGraph could not",
        "Converting sparse IndexedSlices",
    )

    def filter(self, record: logging.LogRecord) -> bool:
        """Return False to drop ``record``, True to keep it."""
        if record.levelname not in ('WARNING', 'U'):
            return True

        # record.msg is not guaranteed to be a str (logging accepts arbitrary
        # objects as messages); coerce it so the substring search cannot
        # raise TypeError inside the logging pipeline.
        message = str(record.msg)
        return not any(
            fragment in message for fragment in self._SUPPRESSED_FRAGMENTS)

def compress_tf_warnings():
    """Silence Python warnings and filter redundant TensorFlow log warnings.

    Side effects only:
    - ``warnings.simplefilter('ignore')`` is called without a category, so it
      applies to every Python warning in the process, not only TensorFlow's.
    - a new ``TFRedundantWarningFilter`` instance is added to the TensorFlow
      logger on every call.
    """
    warnings.simplefilter('ignore')
    tf.get_logger().addFilter(TFRedundantWarningFilter())