# Finetuning of the pretrained Japanese BERT model

Finetune the pretrained model to solve multi-class classification problems.  
This notebook requires the following objects:
- trained sentencepiece model (model and vocab files)
- pretrained Japanese BERT model

We split the data into test:dev:train = 1:1:8 datasets (the commented-out sampled variant below uses 2:2:6 split boundaries instead).

In [1]:
import configparser
import glob
import os
import pandas as pd
import subprocess
import sys
import tarfile
import pandas as pd
from urllib.request import urlretrieve

# Locate the project-level config.ini, which sits one directory above the
# notebook's working directory.
CURDIR = os.getcwd()
CONFIGPATH = os.path.join(CURDIR, os.pardir, 'config.ini')
config = configparser.ConfigParser()

# ConfigParser.read() skips files it cannot open and only reports the ones it
# actually parsed, so fail loudly here instead of hitting an opaque KeyError on
# the first `config[...]` lookup further down.
loaded_config_files = config.read(CONFIGPATH)
if not loaded_config_files:
    raise FileNotFoundError("Could not read config file: {}".format(CONFIGPATH))

# Keep the list of parsed files as the cell output, as before.
loaded_config_files

['/home/ubuntu/workspace/bert-japanese/notebook/../config.ini']

## Data preparing

You need to execute the following cells only once.

In [2]:
# Look up the [HIS-DATA] section once and read both locations from it.
his_data_section = config['HIS-DATA']
FILEURL = his_data_section['DATADIR']      # location of the raw tab-separated file loaded below
EXTRACTDIR = his_data_section['TEXTDIR']   # directory the split TSV files are written to

In [3]:
# Load the raw tab-separated data set into a DataFrame.
df = pd.read_csv(FILEURL, sep='\t')

In [4]:
# Preview the first rows of the loaded data.
df.head()

Unnamed: 0,label,text
0,532,ソウルの渡航条件は？
1,503,今のおススメはある？
2,65,主人と別々に会員登録したいんですけど、２人のメアドがいっしょでもできますか？
3,110,海外お土産や旅行用品を希望の日時に郵送してもらいたいのですが。
4,397,旅行のキャンセル料が、旅行代金より高くなることはありますか？


Save data as tsv files.  
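The active code splits the rows test:dev:train = 1:1:8. To check the usability of finetuning, we also prepare (commented out) a sampled variant: it uses 2:2:6 split boundaries and keeps only a 1/5 sample of the training portion.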

In [5]:
# Make sure the output directory exists so that a first run does not fail
# inside to_csv(). An empty EXTRACTDIR means "current directory".
if EXTRACTDIR:
    os.makedirs(EXTRACTDIR, exist_ok=True)

# Split boundaries as row positions: first 10% -> test, next 10% -> dev,
# remaining 80% -> train.
# NOTE(review): rows are split in file order without shuffling — presumably the
# source file is already shuffled; confirm.
n_rows = len(df)
test_end = n_rows // 10
dev_end = n_rows * 2 // 10

df.iloc[:test_end].to_csv(os.path.join(EXTRACTDIR, "test.tsv"), sep='\t', index=False)
df.iloc[test_end:dev_end].to_csv(os.path.join(EXTRACTDIR, "dev.tsv"), sep='\t', index=False)
df.iloc[dev_end:].to_csv(os.path.join(EXTRACTDIR, "train.tsv"), sep='\t', index=False)

### 1/5 of full training data.
# df[:len(df) // 5].to_csv( os.path.join(EXTRACTDIR, "test.tsv"), sep='\t', index=False)
# df[len(df) // 5:len(df)*2 // 5].to_csv( os.path.join(EXTRACTDIR, "dev.tsv"), sep='\t', index=False)
# df[len(df)*2 // 5:].sample(frac=0.2, random_state=23).to_csv( os.path.join(EXTRACTDIR, "train.tsv"), sep='\t', index=False)

## Finetune pre-trained model

It will take many hours to execute the following cells in a CPU environment.  
You can also use Colab to take advantage of TPUs. You need to upload the created data to your GCS bucket.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1zZH2GWe0U-7GjJ2w2duodFfEUptvHjcx)

In [2]:
# Checkpoint prefix that is passed to run_classifier.py as --init_checkpoint.
PRETRAINED_MODEL_PATH = '../model/model.ckpt-1000000'
# Directory passed to run_classifier.py as --output_dir; fine-tuned checkpoints
# are looked up here.
FINETUNE_OUTPUT_DIR = '../model/his_output'

In [3]:
# Resume from the newest fine-tuning checkpoint if a previous run left one in
# FINETUNE_OUTPUT_DIR; otherwise keep the current PRETRAINED_MODEL_PATH.
output_ckpts = glob.glob("{}/model.ckpt-*.data-*".format(FINETUNE_OUTPUT_DIR))
if output_ckpts:
    # Compare by numeric global step: a plain string sort would rank
    # "model.ckpt-9000" after "model.ckpt-39000".
    latest_ckpt = max(
        output_ckpts,
        key=lambda path: int(path.rsplit('model.ckpt-', 1)[1].split('.')[0]))
    # Strip the ".data-XXXXX-of-YYYYY" shard suffix to get the checkpoint prefix.
    PRETRAINED_MODEL_PATH = latest_ckpt.rsplit('.data-', 1)[0]

In [4]:
# Show which checkpoint will be passed as --init_checkpoint in the next cell.
PRETRAINED_MODEL_PATH

'../model/his_output/model.ckpt-39000'

In [5]:
%%time
# Fine-tune and evaluate the classifier with ../src/run_classifier.py, starting
# from PRETRAINED_MODEL_PATH and writing its output to FINETUNE_OUTPUT_DIR.
# It will take many hours in a CPU environment.

!python3 ../src/run_classifier.py \
  --task_name=his \
  --do_train=true \
  --do_eval=true \
  --data_dir=../data/his \
  --model_file=../model/wiki-ja.model \
  --vocab_file=../model/wiki-ja.vocab \
  --init_checkpoint={PRETRAINED_MODEL_PATH} \
  --max_seq_length=32 \
  --train_batch_size=16 \
  --learning_rate=5e-5 \
  --num_train_epochs=20 \
  --output_dir={FINETUNE_OUTPUT_DIR}

Loaded a trained SentencePiece model.
INFO:tensorflow:Using config: {'_model_dir': '../model/his_output', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 1000, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f8749da2940>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=1000, num_shards=8, num_cores_per_replica=None, per_host_input_for_training=3, tpu_job_name

## Predict using the finetuned model

Let's predict test data using the finetuned model.  

In [6]:
import sys
sys.path.append("../src")

import tokenization_sentencepiece as tokenization
from run_classifier import HISProcessor
from run_classifier import model_fn_builder
from run_classifier import file_based_input_fn_builder
from run_classifier import file_based_convert_examples_to_features
from utils import str_to_value

In [7]:
sys.path.append("../bert")

import modeling
import optimization
import tensorflow as tf

In [8]:
import configparser
import json
import glob
import os
import pandas as pd
import tempfile

# BertConfig.from_json_file() takes a path, so dump the [BERT-CONFIG] section
# into a temporary JSON file first. The context manager closes (and thereby
# removes) the temporary file as soon as the config object has been built,
# instead of keeping the handle open for the lifetime of the kernel.
# NOTE(review): assumes from_json_file() reads the file eagerly — confirm.
with tempfile.NamedTemporaryFile(mode='w+t', encoding='utf-8', suffix='.json') as bert_config_file:
    bert_config_file.write(json.dumps({k:str_to_value(v) for k,v in config['BERT-CONFIG'].items()}))
    # Flush explicitly so the JSON is on disk before the file is re-opened by name.
    bert_config_file.flush()
    bert_config = modeling.BertConfig.from_json_file(bert_config_file.name)

In [9]:
# Pick the newest fine-tuned checkpoint written to FINETUNE_OUTPUT_DIR.
output_ckpts = glob.glob("{}/model.ckpt-*.data-*".format(FINETUNE_OUTPUT_DIR))
if not output_ckpts:
    # Prediction needs a fine-tuned model; say so instead of raising IndexError.
    raise FileNotFoundError(
        "No fine-tuned checkpoint found in {}".format(FINETUNE_OUTPUT_DIR))

# Compare by numeric global step: a plain string sort would rank
# "model.ckpt-9000" after "model.ckpt-39000".
latest_ckpt = max(
    output_ckpts,
    key=lambda path: int(path.rsplit('model.ckpt-', 1)[1].split('.')[0]))
# Strip the ".data-XXXXX-of-YYYYY" shard suffix to get the checkpoint prefix.
FINETUNED_MODEL_PATH = latest_ckpt.rsplit('.data-', 1)[0]

In [10]:
# Uncomment the next line to pin a specific checkpoint instead of the one selected above.
# FINETUNED_MODEL_PATH = '../model/his_output/model.ckpt-33000'
FINETUNED_MODEL_PATH

'../model/his_output/model.ckpt-40000'

In [11]:
class FLAGS(object):
    '''Settings container for prediction.

    Attribute names mirror the flags passed to run_classifier.py when
    fine-tuning; the cells below read them to build the tokenizer, RunConfig,
    model_fn and estimator.
    '''
    def __init__(self):
        # --- Tokenizer files ---
        self.vocab_file = "../model/wiki-ja.vocab"
        self.model_file = "../model/wiki-ja.model"
        self.do_lower_case = True

        # --- Data and model used for prediction ---
        self.data_dir = "../data/his"
        self.init_checkpoint = FINETUNED_MODEL_PATH
        # NOTE(review): fine-tuning above ran with --max_seq_length=32;
        # confirm that 64 is intended here.
        self.max_seq_length = 64
        self.predict_batch_size = 4
        self.use_tpu = False
        self.output_dir = "/dummy"

        # --- Not used for prediction; only needed to create the RunConfig ---
        self.master = None
        self.save_checkpoints_steps = self.iterations_per_loop = self.num_tpu_cores = 1
        self.learning_rate = self.num_warmup_steps = self.num_train_steps = 0
        self.train_batch_size = self.eval_batch_size = 0

In [12]:
# Replace the class bound to `FLAGS` with an instance of it. Because the name is
# rebound, a plain `FLAGS()` would fail when this cell is re-run (instances are
# not callable); in that case build a fresh instance from the instance's class
# so the cell stays idempotent and picks up the current FINETUNED_MODEL_PATH.
FLAGS = FLAGS() if isinstance(FLAGS, type) else type(FLAGS)()

In [13]:
# Processor for the HIS task (imported from run_classifier) and the labels it
# declares. label_list is used below to size the classifier (num_labels) and to
# map predicted class indices back to labels.
processor = HISProcessor()
label_list = processor.get_labels()

In [14]:
# Tokenizer built from the SentencePiece model/vocab files named in FLAGS.
tokenizer = tokenization.FullTokenizer(
    vocab_file=FLAGS.vocab_file,
    model_file=FLAGS.model_file,
    do_lower_case=FLAGS.do_lower_case)

# No TPU cluster is resolved in this notebook.
tpu_cluster_resolver = None

is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2

# Build the TPU settings first, then hand them to the RunConfig.
tpu_config = tf.contrib.tpu.TPUConfig(
    per_host_input_for_training=is_per_host,
    num_shards=FLAGS.num_tpu_cores,
    iterations_per_loop=FLAGS.iterations_per_loop)

run_config = tf.contrib.tpu.RunConfig(
    tpu_config=tpu_config,
    model_dir=FLAGS.output_dir,
    save_checkpoints_steps=FLAGS.save_checkpoints_steps,
    master=FLAGS.master,
    cluster=tpu_cluster_resolver)

Loaded a trained SentencePiece model.


In [15]:
# Model function initialised from the fine-tuned checkpoint. The optimiser
# settings come from FLAGS, where they are placeholder values.
model_fn = model_fn_builder(
    bert_config=bert_config,
    init_checkpoint=FLAGS.init_checkpoint,
    num_labels=len(label_list),
    use_tpu=FLAGS.use_tpu,
    use_one_hot_embeddings=FLAGS.use_tpu,
    learning_rate=FLAGS.learning_rate,
    num_train_steps=FLAGS.num_train_steps,
    num_warmup_steps=FLAGS.num_warmup_steps)

# Estimator used for prediction only; FLAGS.use_tpu is False in this notebook.
estimator = tf.contrib.tpu.TPUEstimator(
    model_fn=model_fn,
    config=run_config,
    use_tpu=FLAGS.use_tpu,
    predict_batch_size=FLAGS.predict_batch_size,
    train_batch_size=FLAGS.train_batch_size,
    eval_batch_size=FLAGS.eval_batch_size)

INFO:tensorflow:Using config: {'_model_dir': '/dummy', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 1, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f6871d8d898>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=1, num_shards=1, num_cores_per_replica=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=None, input_partition_di

In [16]:
# Load the test examples and write their features to a temporary file.
# `predict_file` has to stay open: a NamedTemporaryFile is removed once it is
# closed, and the input function below reads it by name.
predict_examples = processor.get_test_examples(FLAGS.data_dir)
predict_file = tempfile.NamedTemporaryFile(mode='w+t', encoding='utf-8', suffix='.tf_record')
file_based_convert_examples_to_features(
    predict_examples, label_list, FLAGS.max_seq_length, tokenizer, predict_file.name)

# The remainder of the last batch is only dropped when running on TPU.
predict_drop_remainder = bool(FLAGS.use_tpu)

predict_input_fn = file_based_input_fn_builder(
    input_file=predict_file.name,
    seq_length=FLAGS.max_seq_length,
    drop_remainder=predict_drop_remainder,
    is_training=False)

INFO:tensorflow:Writing example 0 of 4296
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: test-1
INFO:tensorflow:tokens: [CLS] ▁ ソウル の 渡航 条件 は ? [SEP]
INFO:tensorflow:input_ids: 4 9 4421 10 22047 1031 11 3017 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:label: 532 (id = 532)
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: test-2
INFO:tensorflow:tokens: [CLS] ▁今 のお ス ス メ は ある ? [SEP]
INFO:tensorflow:input_ids: 4 9099 4992 60 60 401 11 382 3017 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:input_mask: 1 1 1 1

In [17]:
# Set up prediction on the test input. No inference log is emitted by this
# cell; it appears when `result` is turned into a list in the next cell.
result = estimator.predict(input_fn=predict_input_fn)

In [18]:
%%time
# Materialise all predictions into a list; the inference log is emitted here.
# It will take a few hours in a CPU environment.

result = list(result)

INFO:tensorflow:Could not find trained model in model_dir: /dummy, running initialization to predict.
Instructions for updating:
Use `tf.data.experimental.map_and_batch(...)`.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Running infer on CPU
INFO:tensorflow:*** Features ***
INFO:tensorflow:  name = input_ids, shape = (?, 64)
INFO:tensorflow:  name = input_mask, shape = (?, 64)
INFO:tensorflow:  name = is_real_example, shape = (?,)
INFO:tensorflow:  name = label_ids, shape = (?,)
INFO:tensorflow:  name = segment_ids, shape = (?, 64)
INFO:tensorflow:**** Trainable Variables ****
INFO:tensorflow:  name = bert/embeddings/word_embeddings:0, shape = (32000, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/embeddings/token_type_embeddings:0, shape = (2, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/embeddings/position_embeddings:0, shape = (512, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/embeddings/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  

In [19]:
# Inspect the first two predictions; each is a dict holding a 'probabilities' array.
result[:2]

[{'probabilities': array([4.85639612e-04, 6.14706893e-04, 4.16122144e-04, 1.23017665e-03,
         2.44979965e-05, 2.90688733e-03, 4.99869976e-03, 1.45811234e-02,
         5.02463547e-04, 4.87088400e-04, 1.95992776e-04, 1.76234171e-04,
         1.42338577e-05, 6.28136331e-04, 4.00650053e-04, 1.15163450e-03,
         4.06570500e-04, 3.39805876e-04, 5.63080609e-03, 1.51450816e-03,
         2.91109900e-04, 1.97184039e-04, 1.29429638e-04, 3.00003216e-03,
         2.42308085e-03, 1.56356327e-04, 6.43298030e-04, 2.13932639e-04,
         6.10648131e-05, 3.71956194e-05, 1.84467397e-04, 1.99013331e-04,
         6.56422751e-04, 1.35776791e-05, 2.00152892e-04, 4.97852561e-05,
         7.68852042e-05, 1.43743091e-05, 1.03230227e-03, 4.66304744e-04,
         7.00645789e-04, 1.72082437e-05, 8.09362798e-04, 2.83179659e-04,
         3.55601187e-05, 4.40805416e-05, 5.46655501e-05, 1.46876162e-04,
         1.28057401e-03, 9.74849245e-05, 8.07984639e-03, 4.51951623e-02,
         1.86495134e-03, 7.5142452

Read test data set and add prediction results.

In [20]:
import pandas as pd

In [21]:
# Read the test split from the same data directory that was handed to
# processor.get_test_examples() (FLAGS.data_dir), so the rows and the
# predictions cannot drift apart if that setting is changed.
# NOTE(review): assumes the processor reads `test.tsv` from that directory — confirm.
test_df = pd.read_csv(os.path.join(FLAGS.data_dir, "test.tsv"), sep='\t')

In [22]:
# Map the most probable class index of every prediction back to its label and
# attach the result as a new column.
predicted_labels = []
for prediction in result:
    best_index = prediction['probabilities'].argmax()
    predicted_labels.append(label_list[best_index])
test_df['predict'] = predicted_labels

In [23]:
# Compare gold labels and predictions for the first ten test rows.
test_df.head(10)

Unnamed: 0,label,text,predict
0,532,ソウルの渡航条件は？,342
1,503,今のおススメはある？,503
2,65,主人と別々に会員登録したいんですけど、２人のメアドがいっしょでもできますか？,65
3,110,海外お土産や旅行用品を希望の日時に郵送してもらいたいのですが。,110
4,397,旅行のキャンセル料が、旅行代金より高くなることはありますか？,397
5,301,当日のタイムスケジュールが知りたい。,301
6,313,レンタカーのキャンセル料はいつからかかりますか,312
7,32,インターネットでカード番号を入れるのは安全なんですか？,32
8,86,webサイトの会員の退会方法が知りたいです。,86
9,250,スケジュールが変更になると、eチケットも新しくなるんですか？,250


In [24]:
# Accuracy = share of rows whose predicted label equals the gold label.
# The vectorised mean of the boolean mask replaces a Python-level sum()/len(),
# and yields NaN rather than ZeroDivisionError for an empty frame.
float((test_df['label'] == test_df['predict']).mean())

0.845903165735568

A little more detailed check using `sklearn.metrics`.

In [25]:
# Install scikit-learn for the metrics imported in the next cell.
# NOTE(review): the version is not pinned — consider pinning it for reproducibility.
!pip install scikit-learn

[33mYou are using pip version 10.0.1, however version 19.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [26]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [27]:
# Per-label precision / recall / F1 of the fine-tuned model on the test set.
bert_report = classification_report(test_df['label'], test_df['predict'])
print(bert_report)

             precision    recall  f1-score   support

          0       1.00      1.00      1.00         8
          1       1.00      0.89      0.94         9
          2       0.57      0.80      0.67         5
          3       1.00      0.50      0.67         2
          4       1.00      0.64      0.78        11
          5       1.00      1.00      1.00         5
          6       0.60      0.75      0.67         8
          7       0.71      0.71      0.71         7
          8       0.88      0.70      0.78        10
          9       1.00      1.00      1.00         6
         10       0.89      0.73      0.80        11
         11       0.67      0.67      0.67         6
         12       0.80      0.73      0.76        11
         13       0.75      0.82      0.78        11
         14       0.88      0.88      0.88         8
         15       0.75      0.60      0.67         5
         16       1.00      0.62      0.77         8
         17       0.29      0.71      0.42   

  'precision', 'predicted', average, warn_for)


In [28]:
# Confusion matrix of the fine-tuned model (gold labels first, predictions second).
bert_confusion = confusion_matrix(test_df['label'], test_df['predict'])
print(bert_confusion)

[[8 0 0 ... 0 0 0]
 [0 8 1 ... 0 0 0]
 [0 0 4 ... 0 0 0]
 ...
 [0 0 0 ... 9 0 0]
 [0 0 0 ... 0 6 0]
 [0 0 0 ... 0 0 4]]


### Simple baseline model.

In [25]:
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [26]:
def read_split(path):
    """Read one tab-separated split file into a DataFrame."""
    return pd.read_csv(path, sep='\t')


# Reload all three splits from disk for the baseline.
train_df = read_split("../data/his/train.tsv")
dev_df = read_split("../data/his/dev.tsv")
test_df = read_split("../data/his/test.tsv")

In [28]:
# Install MeCab, its development files and the IPA dictionaries via apt (requires sudo).
!sudo apt-get install -q -y mecab libmecab-dev mecab-ipadic mecab-ipadic-utf8

Reading package lists...
Building dependency tree...
Reading state information...
libmecab-dev is already the newest version (0.996-1.2ubuntu1).
mecab is already the newest version (0.996-1.2ubuntu1).
mecab-ipadic is already the newest version (2.7.0-20070801+main-1).
mecab-ipadic-utf8 is already the newest version (2.7.0-20070801+main-1).
The following packages were automatically installed and are no longer required:
  libnvidia-container-tools libnvidia-container1 nvidia-container-runtime
  nvidia-container-runtime-hook
Use 'sudo apt autoremove' to remove them.
0 upgraded, 0 newly installed, 0 to remove and 0 not upgraded.


In [29]:
# Install the MeCab Python binding, pinned to version 0.7.
!pip install mecab-python3==0.7

[33mYou are using pip version 10.0.1, however version 19.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
import MeCab

In [31]:
# MeCab tagger used to tokenize the text for the baseline model.
# NOTE(review): "-Owakati" presumably selects space-separated word output — verify against the MeCab docs.
m = MeCab.Tagger("-Owakati")

In [32]:
# Stack the train and dev rows; the baseline is fitted on both together.
train_dev_df = pd.concat(objs=[train_df, dev_df], axis=0)

In [33]:
def tokenize_with_mecab(text):
    """Return the MeCab tagger's parse output for one text."""
    return m.parse(text)


# Inputs: tokenized text of each split.
train_dev_xs = train_dev_df['text'].apply(tokenize_with_mecab)
test_xs = test_df['text'].apply(tokenize_with_mecab)

# Targets: the label column of each split.
train_dev_ys = train_dev_df['label']
test_ys = test_df['label']

In [34]:
# Fit the TF-IDF vectorizer (capped at 750 features) on train+dev only, then
# reuse the fitted vectorizer to transform the test texts.
# NOTE(review): TfidfVectorizer's default token_pattern presumably ignores
# single-character tokens, which may drop many Japanese words — confirm.
vectorizer = TfidfVectorizer(max_features=750)
train_dev_xs_ = vectorizer.fit_transform(train_dev_xs)
test_xs_ = vectorizer.transform(test_xs)

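The following setup is not exactly identical to that of BERT: the commented-out `GradientBoostingClassifier` variants split off their validation data internally via `train_test_split` with shuffling, and the active `LinearSVC` is simply trained on train+dev.  
In addition, the parameters are not well tuned; however, we think this is enough to check the power of BERT.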

In [54]:
%%time

# model = GradientBoostingClassifier(n_estimators=200,
#                                    validation_fraction=len(train_df)/len(dev_df),
#                                    n_iter_no_change=5,
#                                    tol=0.01,
#                                    random_state=23)

### 1/5 of full training data.
# model = GradientBoostingClassifier(n_estimators=200,
#                                    validation_fraction=len(dev_df)/len(train_df),
#                                    n_iter_no_change=5,
#                                    tol=0.01,
#                                    random_state=23)

from sklearn.svm import LinearSVC
# from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import RandomForestClassifier

model = LinearSVC()

model.fit(train_dev_xs_, train_dev_ys)

CPU times: user 11.9 s, sys: 248 ms, total: 12.2 s
Wall time: 12.2 s


In [55]:
# Per-label precision / recall / F1 of the baseline model on the test set.
baseline_predictions = model.predict(test_xs_)
print(classification_report(test_ys, baseline_predictions))

             precision    recall  f1-score   support

          0       0.88      0.88      0.88         8
          1       0.89      0.89      0.89         9
          2       0.75      0.60      0.67         5
          3       0.17      0.50      0.25         2
          4       0.88      0.64      0.74        11
          5       1.00      1.00      1.00         5
          6       0.86      0.75      0.80         8
          7       1.00      0.86      0.92         7
          8       0.88      0.70      0.78        10
          9       0.75      1.00      0.86         6
         10       0.89      0.73      0.80        11
         11       0.62      0.83      0.71         6
         12       0.89      0.73      0.80        11
         13       0.86      0.55      0.67        11
         14       0.88      0.88      0.88         8
         15       0.80      0.80      0.80         5
         16       0.83      0.62      0.71         8
         17       0.50      0.57      0.53   

In [56]:
# Confusion matrix of the baseline model (gold labels first, predictions second).
baseline_confusion = confusion_matrix(test_ys, model.predict(test_xs_))
print(baseline_confusion)

[[7 0 0 ... 0 0 0]
 [0 8 1 ... 0 0 0]
 [0 1 3 ... 0 0 0]
 ...
 [0 0 0 ... 6 0 0]
 [0 0 0 ... 0 6 0]
 [0 0 0 ... 0 0 3]]
