In [1]:
# Colab setup: install pyarrow into the runtime.
# NOTE(review): the version is unpinned (the captured run resolved 0.11.1) and
# no cell visible here imports pyarrow — confirm it is still needed / pin it.
!pip install pyarrow

Collecting pyarrow
[?25l  Downloading https://files.pythonhosted.org/packages/36/94/23135312f97b20d6457294606fb70fad43ef93b7bffe567088ebe3623703/pyarrow-0.11.1-cp36-cp36m-manylinux1_x86_64.whl (11.6MB)
[K    100% |████████████████████████████████| 11.6MB 2.9MB/s 
Installing collected packages: pyarrow
Successfully installed pyarrow-0.11.1


In [2]:
import sys

!test -d bert_repo || git clone https://github.com/google-research/bert bert_repo
if not 'bert_repo' in sys.path:
  sys.path += ['bert_repo']  

Cloning into 'bert_repo'...
remote: Enumerating objects: 299, done.[K
remote: Total 299 (delta 0), reused 0 (delta 0), pack-reused 299[K
Receiving objects: 100% (299/299), 184.80 KiB | 7.39 MiB/s, done.
Resolving deltas: 100% (178/178), done.


In [0]:
import os
import tokenization
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

from google.colab import auth
# Interactive Colab sign-in; presumably required for the gs:// bucket reads and
# writes performed by later cells — TODO confirm.
auth.authenticate_user()

In [0]:
# Directory holding the pre-trained checkpoint; vocab.txt is read from it.
BERT_PRETRAINED_DIR = 'gs://dev-test-bert-tpu/chinese_bert/'
VOCAB_FILE = os.path.join(BERT_PRETRAINED_DIR, 'vocab.txt')
BERT_MODEL = 'chinese_L-12_H-768_A-12'
# The Chinese checkpoint is a lower-cased model, like the "uncased" English
# ones (the BERT repo lists it among its lower-cased checkpoints in
# `tokenization.validate_case_config_from_checkpoint`). Checking only for the
# 'uncased' prefix evaluated to False for this model, so upper-case Latin
# letters were not normalized before the vocabulary lookup.
DO_LOWER_CASE = BERT_MODEL.startswith(('uncased', 'chinese'))
tokenizer = tokenization.FullTokenizer(vocab_file=VOCAB_FILE, do_lower_case=DO_LOWER_CASE)

In [5]:
!mkdir -p ./dataset && gsutil rsync gs://dev-test-bert-tpu/dataset ./dataset
  
article_contents = pd.read_csv('./dataset/article_contents.csv').set_index('article_id')
article_contents = article_contents[~article_contents.main_content.isnull()]
article_contents.head()

train, test = train_test_split(article_contents, test_size=0.2, shuffle=False)

class InputExample(object):
  """Plain container for one raw example of a sequence classification task."""

  def __init__(self, guid, text_a, text_b=None, label=None):
    """Stores the given fields unchanged on the instance.

    Args:
      guid: Unique id for the example.
      text_a: string. Untokenized text of the first sequence; the only text
        needed for single-sequence tasks.
      text_b: (Optional) string. Untokenized text of the second sequence,
        needed only for sequence-pair tasks.
      label: (Optional) string. Label of the example; expected for train and
        dev examples, not for test examples.
    """
    self.guid, self.text_a = guid, text_a
    self.text_b, self.label = text_b, label

def get_train_examples():
  """Builds one `InputExample` per row of the module-level `train` frame.

  Returns:
    A list of `InputExample` in row order; `guid` is the row's index label
    and `text_a` is its `main_content` value.
  """
  # Iterate the column directly rather than collecting results through a
  # side-effecting `train.apply(..., axis=1)`: pandas documents that `apply`
  # may invoke the function twice on the first row (which would duplicate the
  # first example), and it also builds a full Series object for every row.
  return [InputExample(guid, text)
          for guid, text in train['main_content'].items()]


def get_test_examples():
  """Builds one `InputExample` per row of the module-level `test` frame.

  Returns:
    A list of `InputExample` in row order; `guid` is the row's index label
    and `text_a` is its `main_content` value.
  """
  # Iterate the column directly rather than collecting results through a
  # side-effecting `test.apply(..., axis=1)`: pandas documents that `apply`
  # may invoke the function twice on the first row (which would duplicate the
  # first example), and it also builds a full Series object for every row.
  return [InputExample(guid, text)
          for guid, text in test['main_content'].items()]



# Materialize both splits as lists of InputExample objects.
train_examples = get_train_examples()
test_examples = get_test_examples()
# Cell output: the first ten train examples and the total train count.
train_examples[:10], len(train_examples)

Building synchronization state...
Starting synchronization...
Copying gs://dev-test-bert-tpu/dataset/article_contents.csv...
Copying gs://dev-test-bert-tpu/dataset/article_contents.txt...
Copying gs://dev-test-bert-tpu/dataset/train_features.pkl...
| [3 files][  1.7 GiB/  1.7 GiB]   75.0 MiB/s                                   
Operation completed over 3 objects/1.7 GiB.                                      


([<__main__.InputExample at 0x7fe2a9272f98>,
  <__main__.InputExample at 0x7fe2a926f160>,
  <__main__.InputExample at 0x7fe2a926f1d0>,
  <__main__.InputExample at 0x7fe2a926f278>,
  <__main__.InputExample at 0x7fe2a926f9e8>,
  <__main__.InputExample at 0x7fe2a926f940>,
  <__main__.InputExample at 0x7fe2a926f3c8>,
  <__main__.InputExample at 0x7fe2a926f0f0>,
  <__main__.InputExample at 0x7fe2a926f438>,
  <__main__.InputExample at 0x7fe2a926f048>],
 174664)

In [6]:
# Character-level frequency table of the training text; `choice` and `p` are
# the sampling population and probabilities used when corrupting tokens.
keras_tokenizer = tf.keras.preprocessing.text.Tokenizer(
    char_level=True, oov_token='<oov>', split='')
keras_tokenizer.fit_on_texts(train.main_content)
len(keras_tokenizer.index_word), keras_tokenizer.word_index['<oov>']

# Keep only the characters that the BERT vocabulary contains, in the order
# the Keras tokenizer recorded them.
in_vocab_counts = [(char, count)
                   for char, count in keras_tokenizer.word_counts.items()
                   if char in tokenizer.vocab]
choice = [char for char, _ in in_vocab_counts]
p = [count for _, count in in_vocab_counts]

# Normalize raw counts into a probability vector (numpy array).
p = np.asarray(p) / np.sum(p)

choice[:10], p[:10]

(['英', '超', '曼', '聯', '今', '晨', '主', '場', '出', '擊'],
 array([0.00076436, 0.00048691, 0.00012413, 0.00073227, 0.00169406,
        0.00012515, 0.00210093, 0.00184991, 0.00373265, 0.00049875]))

In [7]:
class InputFeatures(object):
  """Container for the model-ready features of one example.

  Attributes are stored exactly as passed: `input_ids`, `input_mask`,
  `segment_ids`, `truths` and the `is_real_example` flag (default True).
  """

  def __init__(self, input_ids, input_mask, segment_ids, truths,
               is_real_example=True):
    self.input_ids, self.input_mask = input_ids, input_mask
    self.segment_ids, self.truths = segment_ids, truths
    self.is_real_example = is_real_example

    
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
  """Truncates a sequence pair in place to the maximum length."""

  # This is a simple heuristic which will always truncate the longer sequence
  # one token at a time. This makes more sense than truncating an equal percent
  # of tokens from each, since if one sequence is very short then each token
  # that's truncated likely contains more information than a longer sequence.
  while True:
    total_length = len(tokens_a) + len(tokens_b)
    if total_length <= max_length:
      break
    if len(tokens_a) > len(tokens_b):
      tokens_a.pop()
    else:
      tokens_b.pop()

def convert_single_example(ex_index, example, max_seq_length,
                           tokenizer):
  """Converts a single `InputExample` into a single `InputFeatures`.

  Besides the usual BERT inputs, the first sequence is corrupted: each of its
  tokens is replaced, with probability `MASK_PROB`, by a character sampled
  from the module-level `choice` list using the probabilities in `p`.
  `truths` holds 1.0 at positions where a replacement was drawn and 0.0
  everywhere else ([CLS], [SEP], the second sequence and padding).

  Args:
    ex_index: Position of the example in its list; the first three examples
      are logged in full.
    example: `InputExample` providing `guid`, `text_a` and optional `text_b`.
    max_seq_length: Exact length of every list stored in the result.
    tokenizer: Object exposing `tokenize` and `convert_tokens_to_ids`.

  Returns:
    An `InputFeatures` whose `input_ids`, `input_mask`, `segment_ids` and
    `truths` all have length `max_seq_length`.
  """
  MASK_PROB = 0.15 #@param {type:"number"}
  assert 0. <= MASK_PROB <= 1.

  tokens_a = tokenizer.tokenize(example.text_a)

  tokens_b = None
  if example.text_b:
    tokens_b = tokenizer.tokenize(example.text_b)

  # Truncate BEFORE corrupting: the token sequences are still plain lists here
  # (so `_truncate_seq_pair` can pop from them), and random draws are made
  # only for tokens that are actually kept.
  if tokens_b:
    # Modifies `tokens_a` and `tokens_b` in place so that the total
    # length is less than the specified length.
    # Account for [CLS], [SEP], [SEP] with "- 3"
    _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
  else:
    # Account for [CLS] and [SEP] with "- 2"
    if len(tokens_a) > max_seq_length - 2:
      tokens_a = tokens_a[0:(max_seq_length - 2)]

  random_tokens = np.random.choice(choice, len(tokens_a), p=p)
  random_mask = np.random.choice([0., 1.], len(tokens_a), p=[1 - MASK_PROB, MASK_PROB])

  # Substitute with ordinary Python strings instead of a numpy masked array,
  # whose fixed-width string dtype could silently truncate a replacement.
  # NOTE(review): a sampled replacement may equal the original token; such a
  # position is still labelled 1.0 in `truths`.
  tokens_a = [str(replacement) if flag else token
              for token, replacement, flag
              in zip(tokens_a, random_tokens, random_mask)]
  tokens_a_truth = [float(flag) for flag in random_mask]

  # The convention in BERT is:
  # (a) For sequence pairs:
  #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
  #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
  # (b) For single sequences:
  #  tokens:   [CLS] the dog is hairy . [SEP]
  #  type_ids: 0     0   0   0  0     0 0
  #
  # Where "type_ids" are used to indicate whether this is the first
  # sequence or the second sequence. The embedding vectors for `type=0` and
  # `type=1` were learned during pre-training and are added to the wordpiece
  # embedding vector (and position vector). This is not *strictly* necessary
  # since the [SEP] token unambiguously separates the sequences, but it makes
  # it easier for the model to learn the concept of sequences.
  tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
  segment_ids = [0] * len(tokens)
  truths = [0.] + tokens_a_truth + [0.]

  if tokens_b:
    tokens += tokens_b + ["[SEP]"]
    segment_ids += [1] * (len(tokens_b) + 1)
    # The second sequence is never corrupted, but `truths` must still cover
    # it so that all feature lists stay aligned.
    truths += [0.] * (len(tokens_b) + 1)

  input_ids = tokenizer.convert_tokens_to_ids(tokens)

  # The mask has 1 for real tokens and 0 for padding tokens. Only real
  # tokens are attended to.
  input_mask = [1] * len(input_ids)

  # Zero-pad up to the sequence length.
  padding = max_seq_length - len(input_ids)
  if padding > 0:
    input_ids.extend([0] * padding)
    input_mask.extend([0] * padding)
    segment_ids.extend([0] * padding)
    truths.extend([0.] * padding)

  assert len(input_ids) == max_seq_length
  assert len(input_mask) == max_seq_length
  assert len(segment_ids) == max_seq_length
  assert len(truths) == max_seq_length

  if ex_index < 3:
    tf.logging.info("*** Example ***")
    tf.logging.info("guid: %s" % (example.guid))
    tf.logging.info("tokens: %s" % " ".join(
        [tokenization.printable_text(x) for x in tokens]))
    tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
    tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
    tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
    tf.logging.info("truths: %s" % " ".join([str(x) for x in truths]))

  feature = InputFeatures(
      input_ids=input_ids,
      input_mask=input_mask,
      segment_ids=segment_ids,
      truths=truths
  )
  return feature


def convert_examples_to_features(examples, max_seq_length,
                                 tokenizer):
  """Turns a sequence of `InputExample`s into a list of `InputFeatures`.

  Each example is passed, together with its position, to
  `convert_single_example`; a progress line is logged on every 1000th
  example (starting with the first).
  """
  converted = []
  for position, item in enumerate(examples):
    if not position % 1000:
      tf.logging.info("Writing example %d of %d" % (position, len(examples)))
    converted.append(
        convert_single_example(position, item, max_seq_length, tokenizer))
  return converted

# The old message referred to MRPC/CoLA fine-tuning, which this notebook does
# not run; describe the step that is actually executed.
print('Converting examples to features; this can take a while on the full dataset. Please wait...')

MAX_SEQ_LENGTH = 128 #@param {'type': 'number'}
RANDOM_SEED = 42 #@param {'type': 'number'}

# The token corruption in `convert_single_example` draws from numpy's global
# RNG; seed it so a re-run produces the same features.
np.random.seed(RANDOM_SEED)

train_features = convert_examples_to_features(train_examples, MAX_SEQ_LENGTH, tokenizer)
test_features = convert_examples_to_features(test_examples, MAX_SEQ_LENGTH, tokenizer)
# Cell output: the first ten train features and the total train count.
train_features[:10], len(train_features)

MRPC/CoLA on BERT base model normally takes about 2-3 minutes. Please wait...
INFO:tensorflow:Writing example 0 of 174664
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: 3
INFO:tensorflow:tokens: [CLS] 弟 超 曼 聯 論 晨 主 場 出 擊 ， 只 能 與 剛 剛 上 任 8 車 路 士 的 領 隊 軒 迪 克 飼 和 0 : 0 ， 近 8 場 未 嘗 勝 績 ， 備 受 批 評 的 紅 4 領 也 雲 高 爾 賽 後 卻 表 示 絕 不 辭 職 。 性 ， 曼 聯 有 中 場 舒 韋 恩 史 迪 加 停 賽 復 出 ， 而 兩 路 士 亦 有 中 場 夏 薩 特 能 任 為 選 ， 主 隊 地 段 便 獲 得 宣 金 對 會 ， 但 麼 場 接 馬 達 及 清 鋒 會 籌 尼 馬 迪 爾 先 [SEP]
INFO:tensorflow:input_ids: 101 2475 6631 3294 5474 6316 3247 712 1842 1139 3080 8024 1372 5543 5645 1190 1190 677 818 129 6722 6662 1894 4638 7526 7386 6726 6832 1046 7615 1469 121 131 121 8024 6818 129 1842 3313 1655 1245 5245 8024 991 1358 2821 6268 4638 5148 125 7526 738 7437 7770 4273 6555 2527 1320 6134 4850 5179 679 6798 5480 511 2595 8024 3294 5474 3300 704 1842 5653 7500 2617 1380 6832 1217 977 6555 2541 1139 8024 5445 1060 6662 1894 771 3300 704 1842 1909 5958 4294 5543 818 4158 6908 8024 712 7386 1765 3667 912 4363 253

([<__main__.InputFeatures at 0x7fe2a6566e10>,
  <__main__.InputFeatures at 0x7fe2a6566f28>,
  <__main__.InputFeatures at 0x7fe2a6566dd8>,
  <__main__.InputFeatures at 0x7fe2a6566fd0>,
  <__main__.InputFeatures at 0x7fe2a6566eb8>,
  <__main__.InputFeatures at 0x7fe2a6566d68>,
  <__main__.InputFeatures at 0x7fe2a6566da0>,
  <__main__.InputFeatures at 0x7fe2a6566f98>,
  <__main__.InputFeatures at 0x7fe2a6566b38>,
  <__main__.InputFeatures at 0x7fe2a65669e8>],
 174664)

In [9]:
import pickle

with open('./dataset/train_features_%s.pkl'%MAX_SEQ_LENGTH, 'wb') as f:
  pickle.dump(train_features, f)

  
with open('./dataset/test_features_%s.pkl'%MAX_SEQ_LENGTH, 'wb') as f:
  pickle.dump(test_features, f)

  
!gsutil rsync ./dataset gs://dev-test-bert-tpu/dataset

Building synchronization state...
Starting synchronization...
Copying file://./dataset/test_features_128.pkl [Content-Type=application/octet-stream]...
==> NOTE: You are uploading one or more large file(s), which would run
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

Copying file://./dataset/train_features_128.pkl [Content-Type=application/octet-stream]...
-
Operation completed over 2 objects/1009.5 MiB.                                 