# Tokenizing for Tensorflow 2.x

## 1. Import libraries

In [1]:
import sys
sys.path.append('..')

In [2]:
from pprint import pprint
from pathlib import Path
import re
import numpy as np
import neologdn
import tensorflow as tf

import meguru_tokenizer
from meguru_tokenizer.vocab import Vocab
from meguru_tokenizer.whitespace_tokenizer import WhitespaceTokenizer
from meguru_tokenizer.sentencepiece_tokenizer import SentencePieceTokenizer
from meguru_tokenizer.sudachi_tokenizer import SudachiTokenizer

# Echo the library versions this notebook was executed with (label typo
# "tenosrflow" corrected in both the statement and its recorded output).
"tensorflow", tf.__version__, "tokenizer", meguru_tokenizer.__version__


('tensorflow', '2.2.0', 'tokenizer', '0.2.0')

## Whitespace Tokenizer for languages such as English and German, whose sentences are split by spaces

e.g. "word word word blah blah blah"


In [4]:
# Whitespace tokenizer configured for English with lower=True (the tokens in
# the outputs below come out lower-cased), plus an empty vocabulary to fill.
tokenizer = WhitespaceTokenizer(lower=True, language="en")
vocab = Vocab()

# Tiny English corpus reused by the following cells.
sentences = [
    "Hello, I don't know how to use it?",
    "Tensorflow is awesome!",
    "it is good framework.",
]

### Building the vocabulary

`min_freq`: the vocabulary will contain only the words whose frequency is >= `min_freq`.

`vocab_size`: the vocabulary size will be <= `vocab_size`.

In [6]:
# Feed the tokens of each sentence into the vocabulary counter, one sentence
# at a time (map is lazy, so tokenizing and adding stay interleaved).
for tokens in map(tokenizer.tokenize, sentences):
    vocab.add_vocabs(tokens)

# Both limits are passed as None — presumably "no filtering"; confirm against
# the Vocab.build_vocab documentation.
vocab.build_vocab(min_freq=None, vocab_size=None)

In [7]:
# Hand the built vocabulary to the tokenizer; the encode/decode cells below
# are run after this assignment.
tokenizer.vocab = vocab

### dump and load vocab

In [8]:
# Write the vocabulary to vocab.txt in the working directory, then print the
# file through the shell (the output below shows one "token<TAB>index" pair
# per line). `!cat` needs a Unix-like shell.
vocab.dump_vocab(Path("vocab.txt"))
!cat vocab.txt

<pad>	0
<s>	1
</s>	2
<unk>	3
<mask>	4
it	5
is	6
hello	7
,	8
i	9
do	10
n't	11
know	12
how	13
to	14
use	15
?	16
tensorflow	17
awesome	18
!	19
good	20
framework	21
.	22


In [11]:
# Rebuild the vocabulary from the dumped file instead of from the corpus,
# starting from a fresh Vocab, and attach it to the tokenizer again.
vocab_path = Path("vocab.txt")
vocab = Vocab()
vocab.load_vocab(vocab_path)
tokenizer.vocab = vocab

### encode text

In [10]:
# Tokenize the whole corpus in one call; the bare name on the last line makes
# the notebook display the result.
tokenized_sentences = tokenizer.tokenize_list(sentences)
tokenized_sentences

[['hello', ',', 'i', 'do', "n't", 'know', 'how', 'to', 'use', 'it', '?'],
 ['tensorflow', 'is', 'awesome', '!'],
 ['it', 'is', 'good', 'framework', '.']]

In [14]:
# Round-trip every sentence: text -> token ids -> text.
for sentence in sentences:
    # Ids produced with the vocabulary currently attached to the tokenizer.
    enc = tokenizer.encode(sentence)
    # Decoding gives the lower-cased, space-joined tokens (see output below),
    # not the original string.
    dec = tokenizer.decode(enc)
    print(f'{sentence} -> {enc} -> {dec}')

Hello, I don't know how to use it? -> [7, 8, 9, 10, 11, 12, 13, 14, 15, 5, 16] -> hello , i do n't know how to use it ?
Tensorflow is awesome! -> [17, 6, 18, 19] -> tensorflow is awesome !
it is good framework. -> [5, 6, 20, 21, 22] -> it is good framework .


### Encode text for tensorflow's dataset

In [19]:
# Persist the corpus as UTF-8, one sentence per line, so that it can be read
# back line by line as a dataset.
source_path = Path("source.txt")
with source_path.open("w", encoding="utf-8") as sink:
    sink.writelines(sentence + "\n" for sentence in sentences)

# Each element of the dataset is one line of the file as a scalar string.
ds = tf.data.TextLineDataset("source.txt")
ds

<TextLineDatasetV2 shapes: (), types: tf.string>

In [29]:
# Id of the vocabulary's BOS token; `encode` below prepends it to every
# encoded sentence (it is 1 in the batches printed further down).
bos_id = tokenizer.vocab.word2idx(tokenizer.vocab.bos)

def encode(text):
    """Encode one line of text into token ids prefixed with the BOS id.

    Intended to be called through ``tf.py_function``.

    Args:
        text: object exposing ``.numpy()`` that returns the line as bytes
            (the eager string tensor handed over by ``tf.py_function``).

    Returns:
        1-D ``np.int64`` array ``[bos_id, id_0, id_1, ...]``. Uses the
        notebook-level ``tokenizer`` and ``bos_id`` current at call time.
    """
    encoded_text = tokenizer.encode(text.numpy().decode())
    # Force int64: np.concatenate promotes to float64 when the encoding is
    # empty (e.g. a blank line), and NumPy's default integer width is
    # platform dependent, while the caller declares Tout=tf.int64.
    return np.concatenate(([bos_id], encoded_text)).astype(np.int64)

def encoded_map_fn(text):
    """Dataset map function: run the Python-side ``encode`` on one element.

    Args:
        text: the dataset element passed to ``tf.py_function`` as its input.

    Returns:
        int64 tensor whose static shape is set to 1-D with unknown length.
    """
    token_ids = tf.py_function(encode, inp=[text], Tout=tf.int64)
    # Declare the result as rank 1 with an unknown length.
    token_ids.set_shape([None])
    return token_ids

# encode each line -> shuffle over the whole corpus -> pad into batches of 3
# -> prefetch with an auto-tuned buffer.
encoded_ds = (
    ds.map(encoded_map_fn)
    .shuffle(buffer_size=len(sentences))
    .padded_batch(3)
    .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
)
encoded_ds

<PrefetchDataset shapes: (None, None), types: tf.int64>

In [31]:
# Iterate the dataset for three epochs. The three-sentence corpus fits in a
# single batch of 3; the pipeline's shuffle step can reorder rows each epoch.
for epoch in range(3):
    print("epoch :", epoch)
    for batch in encoded_ds:
        # Header line, newline, then the tensor — same text as two prints.
        print("[padded batch]", batch, sep="\n")


epoch : 0
[padded batch]
tf.Tensor(
[[ 1  5  6 20 21 22  0  0  0  0  0  0]
 [ 1 17  6 18 19  0  0  0  0  0  0  0]
 [ 1  7  8  9 10 11 12 13 14 15  5 16]], shape=(3, 12), dtype=int64)
epoch : 1
[padded batch]
tf.Tensor(
[[ 1  7  8  9 10 11 12 13 14 15  5 16]
 [ 1 17  6 18 19  0  0  0  0  0  0  0]
 [ 1  5  6 20 21 22  0  0  0  0  0  0]], shape=(3, 12), dtype=int64)
epoch : 2
[padded batch]
tf.Tensor(
[[ 1  5  6 20 21 22  0  0  0  0  0  0]
 [ 1  7  8  9 10 11 12 13 14 15  5 16]
 [ 1 17  6 18 19  0  0  0  0  0  0  0]], shape=(3, 12), dtype=int64)


## Sentencepiece Tokenizer for Any language

In [41]:
tokenizer = SentencePieceTokenizer(lower=True, language="en")
sentences = [
    "Hello, I don't know how to use it?",
    "Tensorflow is awesome!",
    "it is good framework.",
]

source_file = Path("source.txt")
with source_file.open("w", encoding="utf-8") as f:
    for s in sentences:
        f.write(s + "\n")

tokenizer.train_sp(source_file, vocab_size=37)

!cat m.vocab

<pad>	0
<s>	0
</s>	0
<unk>	0
<mask>	0
▁	-1.85354
o	-2.41476
w	-2.69918
s	-2.92457
e	-2.94918
n	-3.28251
t	-3.74824
▁i	-3.74824
d	-3.78251
a	-3.78251
f	-3.78251
me	-3.78251
k	-3.78251
▁it	-3.81797
lo	-3.83735
or	-4.00886
r	-4.35741
l	-4.65398
!	-4.78251
'	-4.78251
,	-4.78251
.	-4.78251
?	-4.78251
H	-4.78251
I	-4.78251
g	-4.78251
h	-4.78251
u	-4.78251
T	-4.78251
m	-4.93682
i	-4.93692
so	-4.93692


In [42]:
# Id of the BOS token in the tokenizer's current vocabulary.
bos_id = tokenizer.vocab.word2idx(tokenizer.vocab.bos)

# Rewrite the corpus file and, for each sentence, print its tokenization.
source_path = Path("source.txt")
with source_path.open("w", encoding="utf-8") as sink:
    for sentence in sentences:
        sink.write(f"{sentence}\n")
        print(tokenizer.tokenize(sentence))

# One scalar string element per line of the file.
ds = tf.data.TextLineDataset("source.txt")
ds

['▁', 'h', 'e', 'l', 'lo', ',', '▁i', '▁', 'd', 'o', 'n', "'", 't', '▁', 'k', 'n', 'o', 'w', '▁', 'h', 'o', 'w', '▁', 't', 'o', '▁', 'u', 's', 'e', '▁it', '?']
['▁', 't', 'e', 'n', 's', 'or', 'f', 'lo', 'w', '▁i', 's', '▁', 'a', 'w', 'e', 'so', 'me', '!']
['▁it', '▁i', 's', '▁', 'g', 'o', 'o', 'd', '▁', 'f', 'r', 'a', 'me', 'w', 'or', 'k', '.']


<TextLineDatasetV2 shapes: (), types: tf.string>

In [43]:
def encode(text):
    """Encode one line of text into token ids prefixed with the BOS id.

    Intended to be called through ``tf.py_function``.

    Args:
        text: object exposing ``.numpy()`` that returns the line as bytes
            (the eager string tensor handed over by ``tf.py_function``).

    Returns:
        1-D ``np.int64`` array ``[bos_id, id_0, id_1, ...]``. Uses the
        notebook-level ``tokenizer`` and ``bos_id`` current at call time.
    """
    encoded_text = tokenizer.encode(text.numpy().decode())
    # Force int64: np.concatenate promotes to float64 when the encoding is
    # empty (e.g. a blank line), and NumPy's default integer width is
    # platform dependent, while the caller declares Tout=tf.int64.
    return np.concatenate(([bos_id], encoded_text)).astype(np.int64)

def encoded_map_fn(text):
    """Dataset map function: run the Python-side ``encode`` on one element.

    Args:
        text: the dataset element passed to ``tf.py_function`` as its input.

    Returns:
        int64 tensor whose static shape is set to 1-D with unknown length.
    """
    token_ids = tf.py_function(encode, inp=[text], Tout=tf.int64)
    # Declare the result as rank 1 with an unknown length.
    token_ids.set_shape([None])
    return token_ids

# encode each line -> shuffle over the whole corpus -> pad into batches of 3
# -> prefetch with an auto-tuned buffer.
encoded_ds = (
    ds.map(encoded_map_fn)
    .shuffle(buffer_size=len(sentences))
    .padded_batch(3)
    .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
)
encoded_ds

<PrefetchDataset shapes: (None, None), types: tf.int64>

In [44]:
# Iterate the dataset for three epochs. The three-sentence corpus fits in a
# single batch of 3; the pipeline's shuffle step can reorder rows each epoch.
for epoch in range(3):
    print("epoch :", epoch)
    for batch in encoded_ds:
        # Header line, newline, then the tensor — same text as two prints.
        print("[padded batch]", batch, sep="\n")

epoch : 0
[padded batch]
tf.Tensor(
[[ 1  5 11  9 10  8 20 15 19  7 12  8  5 14  7  9 36 16 23  0  0  0  0  0
   0  0  0  0  0  0  0  0]
 [ 1 18 12  8  5 30  6  6 13  5 15 21 14 16  7 20 17 26  0  0  0  0  0  0
   0  0  0  0  0  0  0  0]
 [ 1  5 31  9 22 19 25 12  5 13  6 10 24 11  5 17 10  6  7  5 31  6  7  5
  11  6  5 32  8  9 18 27]], shape=(3, 32), dtype=int64)
epoch : 1
[padded batch]
tf.Tensor(
[[ 1  5 11  9 10  8 20 15 19  7 12  8  5 14  7  9 36 16 23  0  0  0  0  0
   0  0  0  0  0  0  0  0]
 [ 1 18 12  8  5 30  6  6 13  5 15 21 14 16  7 20 17 26  0  0  0  0  0  0
   0  0  0  0  0  0  0  0]
 [ 1  5 31  9 22 19 25 12  5 13  6 10 24 11  5 17 10  6  7  5 31  6  7  5
  11  6  5 32  8  9 18 27]], shape=(3, 32), dtype=int64)
epoch : 2
[padded batch]
tf.Tensor(
[[ 1 18 12  8  5 30  6  6 13  5 15 21 14 16  7 20 17 26  0  0  0  0  0  0
   0  0  0  0  0  0  0  0]
 [ 1  5 31  9 22 19 25 12  5 13  6 10 24 11  5 17 10  6  7  5 31  6  7  5
  11  6  5 32  8  9 18 27]
 [ 1  5 11  9 10  8 20 15 19  7 12  8  5 14  7  9 36 16 23  0  0  0  0  0
   0  0  0  0  0  0  0  0]], shape=(3, 32), dtype=int64)

## Sudachi Tokenizer for Japanese

In [36]:
# Japanese sample corpus.
sentences = [
    "銀座でランチをご一緒しましょう。",
    "締切間に合いますか？",
    "トークナイザを作りました。",
]

tokenizer = SudachiTokenizer(language="ja")
vocab = Vocab()

# Feed the tokens of each sentence into the vocabulary, then build it with
# the default limits.
for tokens in map(tokenizer.tokenize, sentences):
    vocab.add_vocabs(tokens)
vocab.build_vocab()

tokenizer.vocab = vocab
# Vocabulary size, displayed as the cell result.
len(vocab)

23

In [40]:
# Id of the BOS token in the tokenizer's current vocabulary.
bos_id = tokenizer.vocab.word2idx(tokenizer.vocab.bos)

# Rewrite the corpus file and, for each sentence, print its tokenization.
source_path = Path("source.txt")
with source_path.open("w", encoding="utf-8") as sink:
    for sentence in sentences:
        sink.write(f"{sentence}\n")
        print(tokenizer.tokenize(sentence))

# One scalar string element per line of the file.
ds = tf.data.TextLineDataset("source.txt")
ds


('銀座', 'で', 'ランチ', 'を', 'ご', '一緒', 'し', 'ましょう', '。')
('締切', '間に合い', 'ます', 'か', '?')
('トークナイザ', 'を', '作り', 'まし', 'た', '。')


<TextLineDatasetV2 shapes: (), types: tf.string>

In [34]:
def encode(text):
    """Encode one line of text into token ids prefixed with the BOS id.

    Intended to be called through ``tf.py_function``.

    Args:
        text: object exposing ``.numpy()`` that returns the line as bytes
            (the eager string tensor handed over by ``tf.py_function``).

    Returns:
        1-D ``np.int64`` array ``[bos_id, id_0, id_1, ...]``. Uses the
        notebook-level ``tokenizer`` and ``bos_id`` current at call time.
    """
    encoded_text = tokenizer.encode(text.numpy().decode())
    # Force int64: np.concatenate promotes to float64 when the encoding is
    # empty (e.g. a blank line), and NumPy's default integer width is
    # platform dependent, while the caller declares Tout=tf.int64.
    return np.concatenate(([bos_id], encoded_text)).astype(np.int64)

def encoded_map_fn(text):
    """Dataset map function: run the Python-side ``encode`` on one element.

    Args:
        text: the dataset element passed to ``tf.py_function`` as its input.

    Returns:
        int64 tensor whose static shape is set to 1-D with unknown length.
    """
    token_ids = tf.py_function(encode, inp=[text], Tout=tf.int64)
    # Declare the result as rank 1 with an unknown length.
    token_ids.set_shape([None])
    return token_ids

# encode each line -> shuffle over the whole corpus -> pad into batches of 3
# -> prefetch with an auto-tuned buffer.
encoded_ds = (
    ds.map(encoded_map_fn)
    .shuffle(buffer_size=len(sentences))
    .padded_batch(3)
    .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
)
encoded_ds

<PrefetchDataset shapes: (None, None), types: tf.int64>

In [37]:
# Iterate the dataset for three epochs. The three-sentence corpus fits in a
# single batch of 3; the pipeline's shuffle step can reorder rows each epoch.
for epoch in range(3):
    print("epoch :", epoch)
    for batch in encoded_ds:
        # Header line, newline, then the tensor — same text as two prints.
        print("[padded batch]", batch, sep="\n")

epoch : 0
[padded batch]
tf.Tensor(
[[ 1 19  5 20 21 22  6  0  0  0]
 [ 1  7  8  9  5 10 11 12 13  6]
 [ 1 14 15 16 17 18  0  0  0  0]], shape=(3, 10), dtype=int64)
epoch : 1
[padded batch]
tf.Tensor(
[[ 1  7  8  9  5 10 11 12 13  6]
 [ 1 19  5 20 21 22  6  0  0  0]
 [ 1 14 15 16 17 18  0  0  0  0]], shape=(3, 10), dtype=int64)
epoch : 2
[padded batch]
tf.Tensor(
[[ 1  7  8  9  5 10 11 12 13  6]
 [ 1 19  5 20 21 22  6  0  0  0]
 [ 1 14 15 16 17 18  0  0  0  0]], shape=(3, 10), dtype=int64)
