In [0]:
!pip install pyarrow

In [0]:
import sys

!test -d bert_repo || git clone https://github.com/google-research/bert bert_repo
if not 'bert_repo' in sys.path:
  sys.path += ['bert_repo']  

In [0]:
import os
import tokenization
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

from google.colab import auth
auth.authenticate_user()

In [0]:
BERT_PRETRAINED_DIR = 'gs://dev-test-bert-tpu/chinese_bert/'
VOCAB_FILE = os.path.join(BERT_PRETRAINED_DIR, 'vocab.txt')
BERT_MODEL = 'chinese_L-12_H-768_A-12'
DO_LOWER_CASE = BERT_MODEL.startswith('uncased')
tokenizer = tokenization.FullTokenizer(vocab_file=VOCAB_FILE, do_lower_case=DO_LOWER_CASE)

In [15]:
!mkdir -p ./dataset && gsutil rsync gs://dev-test-bert-tpu/dataset ./dataset
  
article_contents = pd.read_csv('./dataset/article_contents.csv').set_index('article_id')
article_contents = article_contents[~article_contents.main_content.isnull()]
article_contents.head()

train, test = train_test_split(article_contents, test_size=0.2, shuffle=False)

class InputExample(object):
  """A single training/test example for simple sequence classification."""

  def __init__(self, guid, text_a, text_b=None, label=None):
    """Constructs a InputExample.
    Args:
      guid: Unique id for the example.
      text_a: string. The untokenized text of the first sequence. For single
        sequence tasks, only this sequence must be specified.
      text_b: (Optional) string. The untokenized text of the second sequence.
        Only must be specified for sequence pair tasks.
      label: (Optional) string. The label of the example. This should be
        specified for train and dev examples, but not for test examples.
    """
    self.guid = guid
    self.text_a = text_a
    self.text_b = text_b
    self.label = label

TRAIN_SIZE = -1 #@param {type:"number"}
    
def get_train_examples():
  examples = []
  def append_example(row):
    examples.append(InputExample(row.name, row.main_content))
  
  train[:TRAIN_SIZE].apply(append_example, axis=1)
  return examples


train_examples = get_train_examples()
train_examples[:10], len(train_examples)

Building synchronization state...
Starting synchronization...


([<__main__.InputExample at 0x7fa8780aa9e8>,
  <__main__.InputExample at 0x7fa8780aa978>,
  <__main__.InputExample at 0x7fa8780aa908>,
  <__main__.InputExample at 0x7fa8780aac18>,
  <__main__.InputExample at 0x7fa8780aac50>,
  <__main__.InputExample at 0x7fa8780aaba8>,
  <__main__.InputExample at 0x7fa8780aac88>,
  <__main__.InputExample at 0x7fa8780aab70>,
  <__main__.InputExample at 0x7fa8780aacc0>,
  <__main__.InputExample at 0x7fa8780aacf8>],
 174663)

In [16]:
# For char count frequency
keras_tokenizer = tf.keras.preprocessing.text.Tokenizer(char_level=True, oov_token='<oov>', split='')
keras_tokenizer.fit_on_texts(train[:TRAIN_SIZE].main_content)
len(keras_tokenizer.index_word), keras_tokenizer.word_index['<oov>']

choice = []
p = []

for k, v in keras_tokenizer.word_counts.items():
  if k in tokenizer.vocab:
    choice.append(k)
    p.append(v)

p = p / np.sum(p)

choice[:10], p[:10]

(['英', '超', '曼', '聯', '今', '晨', '主', '場', '出', '擊'],
 array([0.00076436, 0.00048691, 0.00012413, 0.00073225, 0.00169406,
        0.00012515, 0.00210092, 0.00184992, 0.00373267, 0.00049875]))

In [17]:
class InputFeatures(object):
  """A single set of features of data."""

  def __init__(self,
               input_ids,
               input_mask,
               segment_ids,
               truths,
               is_real_example=True):
    self.input_ids = input_ids
    self.input_mask = input_mask
    self.segment_ids = segment_ids
    self.is_real_example = is_real_example
    self.truths = truths

    
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
  """Truncates a sequence pair in place to the maximum length."""

  # This is a simple heuristic which will always truncate the longer sequence
  # one token at a time. This makes more sense than truncating an equal percent
  # of tokens from each, since if one sequence is very short then each token
  # that's truncated likely contains more information than a longer sequence.
  while True:
    total_length = len(tokens_a) + len(tokens_b)
    if total_length <= max_length:
      break
    if len(tokens_a) > len(tokens_b):
      tokens_a.pop()
    else:
      tokens_b.pop()

def convert_single_example(ex_index, example, max_seq_length,
                           tokenizer):
  """Converts a single `InputExample` into a single `InputFeatures`."""
  tokens_a = tokenizer.tokenize(example.text_a)
  
  MASK_PROB = 0.15 #@param {type:"number"}
  assert MASK_PROB >= 0. and MASK_PROB <= 1
  MASK_ERROR_PROB = 0.8 #@param {type:"number"}

  random_tokens = np.random.choice(choice, len(tokens_a), p=p)
  random_mask = np.random.choice([0., 1.], len(tokens_a), p=[1-MASK_PROB, MASK_PROB])

  aug_tokens_a = np.ma.array(tokens_a, mask=random_mask).filled(random_tokens)
  
  tokens_a = aug_tokens_a
  tokens_a_truth = random_mask
  
  tokens_b = None
  if example.text_b:
    tokens_b = tokenizer.tokenize(example.text_b)

  if tokens_b:
    # Modifies `tokens_a` and `tokens_b` in place so that the total
    # length is less than the specified length.
    # Account for [CLS], [SEP], [SEP] with "- 3"
    _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
  else:
    # Account for [CLS] and [SEP] with "- 2"
    if len(tokens_a) > max_seq_length - 2:
      tokens_a = tokens_a[0:(max_seq_length - 2)]

  # The convention in BERT is:
  # (a) For sequence pairs:
  #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
  #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
  # (b) For single sequences:
  #  tokens:   [CLS] the dog is hairy . [SEP]
  #  type_ids: 0     0   0   0  0     0 0
  #
  # Where "type_ids" are used to indicate whether this is the first
  # sequence or the second sequence. The embedding vectors for `type=0` and
  # `type=1` were learned during pre-training and are added to the wordpiece
  # embedding vector (and position vector). This is not *strictly* necessary
  # since the [SEP] token unambiguously separates the sequences, but it makes
  # it easier for the model to learn the concept of sequences.
  #
  # For classification tasks, the first vector (corresponding to [CLS]) is
  # used as as the "sentence vector". Note that this only makes sense because
  # the entire model is fine-tuned.
  tokens = []
  segment_ids = []
  truths = []
  
  tokens.append("[CLS]")
  segment_ids.append(0)
  truths.append(0.)
  
  for token, truth in zip(tokens_a, tokens_a_truth):
    tokens.append(token)
    segment_ids.append(0)
    truths.append(truth)
    
  tokens.append("[SEP]")
  segment_ids.append(0)
  truths.append(0.)

  if tokens_b:
    for token in tokens_b:
      tokens.append(token)
      segment_ids.append(1)
    tokens.append("[SEP]")
    segment_ids.append(1)

  input_ids = tokenizer.convert_tokens_to_ids(tokens)

  # The mask has 1 for real tokens and 0 for padding tokens. Only real
  # tokens are attended to.
  input_mask = [1] * len(input_ids)

  # Zero-pad up to the sequence length.
  while len(input_ids) < max_seq_length:
    input_ids.append(0)
    input_mask.append(0)
    segment_ids.append(0)
    truths.append(0.)

  assert len(input_ids) == max_seq_length
  assert len(input_mask) == max_seq_length
  assert len(segment_ids) == max_seq_length
  assert len(truths) == max_seq_length

  if ex_index < 3:
    tf.logging.info("*** Example ***")
    tf.logging.info("guid: %s" % (example.guid))
    tf.logging.info("tokens: %s" % " ".join(
        [tokenization.printable_text(x) for x in tokens]))
    tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
    tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
    tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
    tf.logging.info("truths: %s" % " ".join([str(x) for x in truths]))

  feature = InputFeatures(
      input_ids=input_ids,
      input_mask=input_mask,
      segment_ids=segment_ids,
      truths=truths
  )
  return feature


def convert_examples_to_features(examples, max_seq_length,
                                 tokenizer):
  """Convert a set of `InputExample`s to a list of `InputFeatures`."""

  features = []
  for (ex_index, example) in enumerate(examples):
    if ex_index % 1000 == 0:
      tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))

    feature = convert_single_example(ex_index, example,
                                     max_seq_length, tokenizer)

    features.append(feature)
  return features  

print('MRPC/CoLA on BERT base model normally takes about 2-3 minutes. Please wait...')

MAX_SEQ_LENGTH = 128 #@param {'type': 'number'}

train_features = convert_examples_to_features(train_examples, MAX_SEQ_LENGTH, tokenizer)
train_features[:10], len(train_features)

MRPC/CoLA on BERT base model normally takes about 2-3 minutes. Please wait...
INFO:tensorflow:Writing example 0 of 174663
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: 3
INFO:tensorflow:tokens: [CLS] 英 超 完 聯 今 晨 主 場 出 擊 ， 只 能 大 剛 明 上 在 ， 車 路 士 的 體 隊 軒 迪 克 打 和 0 : 0 示 近 8 場 干 嘗 勝 績 ， 備 受 批 評 的 紅 魔 領 隊 雲 高 爾 賽 後 上 表 示 絕 不 辭 職 。 今 運 曼 聯 有 中 場 舒 韋 恩 有 迪 加 停 賽 c 出 ， 而 車 路 士 亦 有 中 場 用 炎 特 出 可 正 選 ， 主 隊 早 u 便 獲 省 黃 金 機 會 ， 但 中 場 桑 馬 士 及 前 鋒 安 東 尼 馬 迪 爾 先 [SEP]
INFO:tensorflow:input_ids: 101 5739 6631 2130 5474 791 3247 712 1842 1139 3080 8024 1372 5543 1920 1190 3209 677 1762 8024 6722 6662 1894 4638 7768 7386 6726 6832 1046 2802 1469 121 131 121 4850 6818 129 1842 2397 1655 1245 5245 8024 991 1358 2821 6268 4638 5148 7795 7526 7386 7437 7770 4273 6555 2527 677 6134 4850 5179 679 6798 5480 511 791 6880 3294 5474 3300 704 1842 5653 7500 2617 3300 6832 1217 977 6555 145 1139 8024 5445 6722 6662 1894 771 3300 704 1842 4500 4142 4294 1139 1377 3633 6908 8024 712 7386 3193 163 912 4363 468

([<__main__.InputFeatures at 0x7fa8780b90f0>,
  <__main__.InputFeatures at 0x7fa874faaf98>,
  <__main__.InputFeatures at 0x7fa874faafd0>,
  <__main__.InputFeatures at 0x7fa8783adeb8>,
  <__main__.InputFeatures at 0x7fa8780b93c8>,
  <__main__.InputFeatures at 0x7fa8780b9320>,
  <__main__.InputFeatures at 0x7fa8780b9400>,
  <__main__.InputFeatures at 0x7fa874fb20f0>,
  <__main__.InputFeatures at 0x7fa874fb2470>,
  <__main__.InputFeatures at 0x7fa874fb24a8>],
 174663)

In [18]:
import pickle

with open('./dataset/train_features.pkl', 'wb') as f:
  pickle.dump(train_features, f)
  
!gsutil rsync ./dataset gs://dev-test-bert-tpu/dataset

Building synchronization state...
Starting synchronization...
Copying file://./dataset/train_features.pkl [Content-Type=application/octet-stream]...
==> NOTE: You are uploading one or more large file(s), which would run
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

\
Operation completed over 1 objects/807.2 MiB.                                    


In [0]:
# def generate_dataset(corpus, error_prob):
#   for article in corpus:
#     seqs = tokenizer.tokenize(article)
    
#   padded_seqs = tf.keras.preprocessing.sequence.pad_sequences(seqs, maxlen=SEQ_LENGTH, padding='post', truncating='post', value=0.0)
  
#   index_word_count = [(tokenizer.word_index[k], v) for k, v in tokenizer.word_counts.items()]
#   index_word_count.append((0, sum([x[1] for x in index_word_count]) / (error_prob) * (1 - error_prob)))
  
#   choice = np.array([x[0] for x in index_word_count])

#   p = np.array([x[1] for x in index_word_count])
#   p = p / np.sum(p)
  
#   # for each sequence, randomly replace the index
#   random_error = np.random.choice(choice, padded_seqs.shape, p=p)
#   random_error_mask = (random_error > 0).astype(int)
  
#   inp = padded_seqs * (1-random_error_mask) + random_error
#   out = random_error_mask
  
#   return inp, out

# ERROR_PROB = 0.15 #@param {'type': 'number'}
# generate_dataset(article_contents.main_content, ERROR_PROB)

In [11]:



# dataset = tf.data.TextLineDataset(['gs://dev-test-bert-tpu/dataset/article_contents.txt'])
# # dataset = dataset.map(tf.map_fn(tokenize, ))

# def tokenize(line):
#   return tokenizer.tokenize(line)

# dataset = dataset.map(lambda article: tf.py_func(tokenize, [article], [tf.string]))
# # dataset.output_classes == (tf.Tensor,)
# # dataset.output_types == (tf.string,)
# # dataset.output_shapes == ([128],)

# # dataset = dataset.map(lambda article: tf.py_func(tokenize, [article], [list]))
# # dataset.output_classes == (tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor)
# # dataset.output_types == (tf.string, tf.float32, tf.string, tf.float64)
# # dataset.output_shapes == ([], [], [3], [2])

# # dataset.map(lambda x: tokenizer.tokenize(x))

# iterator = dataset.make_one_shot_iterator()
# next_element = iterator.get_next()

# with tf.Session() as sess:
#   for i in range(10):
#     value = sess.run(next_element)
#     print(value.decode('utf-8'))

InvalidArgumentError: ignored