# Run Pre-defined Problems

This notebook was run on Google Colab. You need to manually upload the data to the runtime you are using (e.g. Colab or Kaggle) and pass the correct path to the pre-processing function.

In [None]:
!pip install tensorflow-gpu
!pip install tensorflow-addons==0.11.2
!pip install bert-multitask-learning==0.5.7b8
!pip install transformers==3.5.1



In [None]:
import sys
sys.path.insert(0, "../")
import tensorflow as tf
import transformers
from bert_multitask_learning import train_bert_multitask, train_eval_input_fn, BertMultiTask, DynamicBatchSizeParams
from bert_multitask_learning.predefined_problems import get_weibo_ner_fn, get_weibo_cws_fn

In [None]:
# Show the GPUs visible to TensorFlow; an empty list means no GPU was detected.
tf.config.list_physical_devices(device_type='GPU')

In [None]:
# Problem name -> problem type; both problems are declared as 'seq_tag'.
problem_type_dict = dict.fromkeys(('weibo_cws', 'weibo_ner'), 'seq_tag')

# Problem name -> pre-processing function.
# Both functions are built from the same file pattern.
processing_fn_dict = dict(
    weibo_ner=get_weibo_ner_fn(file_path='../data/ner/weiboNER_2nd_conll*'),
    weibo_cws=get_weibo_cws_fn(file_path='../data/ner/weiboNER_2nd_conll*'),
)

## Train Models
If you don't want to control everything yourself, you can simply call the `train_bert_multitask` function. Please note that starting from 0.4.2, the transformer models (the body model) are implemented using [huggingface transformers](https://github.com/huggingface/transformers); because of that, we can now use basically any transformer model by specifying the following params (the values below are the defaults):

    params.transformer_model_name = 'bert-base-chinese'
    params.transformer_tokenizer_name = 'bert-base-chinese'
    params.transformer_config_name = 'bert-base-chinese'
    params.transformer_model_loading = 'TFAutoModel'
    params.transformer_config_loading = 'BertConfig'
    params.transformer_tokenizer_loading = 'AutoTokenizer'

  And for the decoder:

    params.transformer_decoder_model_name = None
    params.transformer_decoder_config_name = None
    params.transformer_decoder_tokenizer_name = None
    params.transformer_decoder_model_loading = 'TFAutoModel'
    params.transformer_decoder_config_loading = 'BertConfig'
    params.transformer_decoder_tokenizer_loading = 'AutoTokenizer'


In [5]:
# Start from the default params; the body model stays bert-base-chinese.
params = DynamicBatchSizeParams()
# Load the config through BertConfig because AutoConfig cannot load from dict.
params.transformer_config_loading = 'BertConfig'
params.transformer_model_name = 'bert-base-chinese'
params.transformer_tokenizer_name = 'bert-base-chinese'
params.transformer_tokenizer_loading = 'BertTokenizer'
# Train on the 'weibo_ner&weibo_cws' problem string for 10 epochs with one GPU.
# The call is the last expression of the cell, so its return value is displayed.
train_bert_multitask(
    problem='weibo_ner&weibo_cws',
    params=params,
    problem_type_dict=problem_type_dict,
    processing_fn_dict=processing_fn_dict,
    num_gpus=1,
    num_epochs=10,
)



Adding new problem weibo_cws, problem type: seq_tag
Adding new problem weibo_ner, problem type: seq_tag
INFO:tensorflow:sampling weights: 
INFO:tensorflow:weibo_cws_weibo_ner: 1.0
INFO:tensorflow:sampling weights: 
INFO:tensorflow:weibo_cws_weibo_ner: 1.0
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)


Some layers from the model checkpoint at bert-base-chinese were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-chinese.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


INFO:tensorflow:Initial lr: 2e-05
INFO:tensorflow:Train steps: 80
INFO:tensorflow:Warmup steps: 8
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
Epoch 1/10
Instructions for updating:
Use `tf.data.Iterator.ge

<bert_multitask_learning.model_fn.BertMultiTask at 0x7fe707e5e128>

If you want more control over the training process, you can use the lower-level API.

In [6]:
import tensorflow as tf
from bert_multitask_learning import train_eval_input_fn

# Problem string and GPU count used by the lower-level API cells.
problem = 'weibo_ner&weibo_cws'
num_gpus = 1

# Fresh params object configured for bert-base-chinese.
bert_multitask_params = DynamicBatchSizeParams()
bert_multitask_params.transformer_config_loading = 'BertConfig'
bert_multitask_params.transformer_model_name = 'bert-base-chinese'
bert_multitask_params.transformer_tokenizer_name = 'bert-base-chinese'
bert_multitask_params.transformer_tokenizer_loading = 'BertTokenizer'

# Register the problem types and their pre-processing functions on the params.
bert_multitask_params.add_multiple_problems(
    problem_type_dict=problem_type_dict, processing_fn_dict=processing_fn_dict)

# assign problem to params
bert_multitask_params.train_epoch = 1
# Pass num_gpus instead of a hard-coded 1 so the setting declared above is
# the single place where the GPU count is configured.
bert_multitask_params.assign_problem(problem, gpu=num_gpus)



Adding new problem weibo_cws, problem type: seq_tag
Adding new problem weibo_ner, problem type: seq_tag


In [7]:

# Strategy used to distribute the datasets and to scope model creation.
dist_trategy = tf.distribute.MirroredStrategy()

# Build the train and eval input datasets from the params ...
train_dataset = train_eval_input_fn(bert_multitask_params)
eval_dataset = train_eval_input_fn(
    bert_multitask_params, mode=tf.estimator.ModeKeys.EVAL)

# ... then hand both of them to the strategy for distribution.
train_dataset = dist_trategy.experimental_distribute_dataset(train_dataset)
eval_dataset = dist_trategy.experimental_distribute_dataset(eval_dataset)

# The model is created, compiled and fitted inside the strategy scope.
with dist_trategy.scope():
    model = BertMultiTask(params=bert_multitask_params)
    model.compile()
    # Short demonstration run: 1 epoch of 8 steps with a single validation step.
    model.fit(x=train_dataset, validation_data=eval_dataset,
              epochs=1, steps_per_epoch=8, validation_steps=1)


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)
INFO:tensorflow:sampling weights: 
INFO:tensorflow:weibo_cws_weibo_ner: 1.0
INFO:tensorflow:sampling weights: 
INFO:tensorflow:weibo_cws_weibo_ner: 1.0


Some layers from the model checkpoint at bert-base-chinese were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-chinese.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


INFO:tensorflow:Initial lr: 2e-05
INFO:tensorflow:Train steps: 84
INFO:tensorflow:Warmup steps: 8


In [None]:
# The saved checkpoint also stores optimizer variables, which prediction does
# not need; dropping them gives a smaller checkpoint.
from bert_multitask_learning import trim_checkpoint_for_prediction
trim_checkpoint_for_prediction(
    problem=problem,
    input_dir='./models/weibo_cws_weibo_ner_ckpt',
    output_dir='./models/trimmed_ckpt',
    overwrite=True,
    problem_type_dict=problem_type_dict,
)

In [None]:
! du -sh ./models/weibo_cws_weibo_ner_ckpt
! du -sh ./models/trimmed_ckpt

## Evaluate and Predict

~~For NER and CWS, we need different evaluation logic.~~ Evaluation currently has a bug that has not been fixed yet.

In [10]:
from bert_multitask_learning import predict_bert_multitask

In [11]:
# predict
import numpy as np
from bert_multitask_learning.utils import get_or_make_label_encoder

# Run prediction on ten copies of the sample sentence and also get back the
# model that was built for it (return_model=True).
pred_prob, model = predict_bert_multitask(
    inputs=['中国和美国在打贸易战']*10,
    problem='weibo_cws&weibo_ner',
    processing_fn_dict=processing_fn_dict,
    problem_type_dict=problem_type_dict,
    model_dir='./models/trimmed_ckpt',
    return_model=True)
predict_params = model.params

# One label encoder per problem.
ner_label_encoder = get_or_make_label_encoder(params=predict_params, problem='weibo_ner', mode='predict', label_list=[])
cws_label_encoder = get_or_make_label_encoder(params=predict_params, problem='weibo_cws', mode='predict', label_list=[])
# Assumes pred_prob is keyed by problem name ('weibo_ner' / 'weibo_cws'),
# as the loop variable below suggests — TODO confirm.
label_encoders = {
    'weibo_ner': ner_label_encoder,
    'weibo_cws': cws_label_encoder,
}

for problem_name, prob in pred_prob.items():
    # Highest-scoring label id along the last axis.
    pred_ids = np.argmax(prob, axis=-1)
    # Decode with the encoder that belongs to this problem; previously every
    # problem was decoded with the NER encoder and the CWS encoder was unused.
    decoded = label_encoders[problem_name].inverse_transform(pred_ids[0].tolist())
    print(problem_name, decoded)

Adding new problem weibo_cws, problem type: seq_tag
Adding new problem weibo_ner, problem type: seq_tag
INFO:tensorflow:Checkpoint dir: ./models/trimmed_ckpt
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)
INFO:tensorflow:中国和美国在打贸易战
INFO:tensorflow:input_ids: [101, 704, 1744, 1469, 5401, 1744, 1762, 2802, 6588, 3211, 2773, 102]
INFO:tensorflow:input_mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
INFO:tensorflow:segment_ids: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
INFO:tensorflow:中国和美国在打贸易战
INFO:tensorflow:input_ids: [101, 704, 1744, 1469, 5401, 1744, 1762, 2802, 6588, 3211, 2773, 102]
INFO:tensorflow:input_mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
INFO:tensorflow:segment_ids: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
INFO:tensorflow:中国和美国在打贸易战
INFO:tensorflow:input_ids: [101, 704, 1744, 1469, 5401, 1744, 1762, 2802, 6588, 3211, 2773, 102]
INFO:tensorflow:input_mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
INFO:tensorflow:segment_ids: [0, 0, 0, 0,

In [None]:
# you can also make prediction using model directly
from bert_multitask_learning import predict_input_fn
# Build the inputs from the params attached to the model being used, rather than
# from the `params` global of the training cell, which may be undefined or
# out of sync with this model if training was not run in this session.
predict_dataset = predict_input_fn(['中国和美国在打贸易战']*10, model.params)
pred_prob = model.predict(predict_dataset)