# Dynamic Noising (Masking) for Tensorflow 2.x

## 1. Import libraries

In [3]:
import sys
# Add the parent directory to the module search path
# (presumably so the local meguru_tokenizer checkout is importable — TODO confirm).
sys.path.append('..')

In [14]:
from pprint import pprint
from pathlib import Path
import re
import numpy as np
import neologdn
import tensorflow as tf

import meguru_tokenizer
from meguru_tokenizer.whitespace_tokenizer import WhitespaceTokenizer
from meguru_tokenizer.vocab import Vocab
from meguru_tokenizer.process.noise_tf import Noiser

"tenosrflow", tf.__version__, "tokenizer", meguru_tokenizer.__version__

('tenosrflow', '2.2.0', 'tokenizer', '0.1.0')

## 2. Setup dataset

In [15]:
# Toy corpus used throughout the notebook.
sentences = [
    "Hello, I don't know how to use it?",
    "Tensorflow is awesome!",
    "It is good framework.",
]

# Persist the corpus as a UTF-8 text file, one sentence per line.
with Path("source.txt").open("w", encoding="utf-8") as corpus_file:
    corpus_file.writelines(sentence + "\n" for sentence in sentences)



In [9]:
# Show the file that was just written. Pure Python is used instead of the
# `!cat` shell escape so the cell also runs on kernels without a `cat`
# binary (e.g. Windows); end="" keeps the printed text identical to the file.
print(Path("source.txt").read_text(encoding="utf-8"), end="")

Hello, I don't know how to use it?
Tensorflow is awesome!
It is good framework.


## 3. Setup Tokenizer, Vocabulary

In [16]:
# English whitespace tokenizer configured with lower=True
# (the vocabulary printed further down contains only lowercased tokens).
tokenizer = WhitespaceTokenizer(lower=True, language="en")
# Fresh vocabulary object; it is filled from the corpus in the following cell.
vocab = Vocab()

Build the vocabulary from the corpus

In [17]:
# Feed the tokens of every corpus sentence into the vocabulary.
for sentence in sentences:
    tokens = tokenizer.tokenize(sentence)
    vocab.add_vocabs(tokens)

# Build the word <-> index tables with vocab_size=30
# (presumably an upper bound: this toy corpus ends up with 23 entries).
vocab.build_vocab(vocab_size=30)

# Attach the vocabulary to the tokenizer and hand the same object to the noiser.
tokenizer.vocab = vocab
noiser = Noiser(vocab=tokenizer.vocab)

# Display the resulting word -> index mapping as the cell output.
vocab.w2i


{'<pad>': 0,
 '<s>': 1,
 '</s>': 2,
 '<unk>': 3,
 '<mask>': 4,
 'it': 5,
 'is': 6,
 'hello': 7,
 ',': 8,
 'i': 9,
 'do': 10,
 "n't": 11,
 'know': 12,
 'how': 13,
 'to': 14,
 'use': 15,
 '?': 16,
 'tensorflow': 17,
 'awesome': 18,
 '!': 19,
 'good': 20,
 'framework': 21,
 '.': 22}

Dump & load vocabulary

In [19]:
# Round-trip the vocabulary through a file on disk.
vocab_path = Path("vocab.txt")
vocab.dump_vocab(vocab_path)

# Rebind `vocab` to a new instance restored from that file. Note that
# `tokenizer.vocab` and `noiser` still hold the instance built earlier.
vocab = Vocab()
vocab.load_vocab(vocab_path)

# Display the reloaded word -> index mapping as the cell output.
vocab.w2i

{'<pad>': 0,
 '<s>': 1,
 '</s>': 2,
 '<unk>': 3,
 '<mask>': 4,
 'it': 5,
 'is': 6,
 'hello': 7,
 ',': 8,
 'i': 9,
 'do': 10,
 "n't": 11,
 'know': 12,
 'how': 13,
 'to': 14,
 'use': 15,
 '?': 16,
 'tensorflow': 17,
 'awesome': 18,
 '!': 19,
 'good': 20,
 'framework': 21,
 '.': 22}

## 4. Setup Tensorflow Dataset

In [21]:
# Look up the id of the beginning-of-sentence token; `encode` prepends it
# to every encoded line.
bos_token = tokenizer.vocab.bos
bos_id = tokenizer.vocab.word2idx(bos_token)
print("bos id ", bos_id)

# Dataset that reads the corpus file line by line.
ds = tf.data.TextLineDataset("source.txt")

bos id  1


## 5. Setup text encoder

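80% word masking (blanking): tokens are replaced by `<mask>` with probability 0.8 (`blank_prob=0.8`); dropping, substitution and shuffling are disabled.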

In [24]:
def encode(text):
    """Encode one line of text, apply masking noise and prepend the BOS id.

    Args:
        text: eager string tensor holding a single line; it is converted to
            a Python ``str`` via ``text.numpy().decode()``.

    Returns:
        The ``np.concatenate`` of ``[bos_id]`` and the noised token ids.

    Side effects:
        Prints the noised ids, the raw sentence and the decoded noised
        sequence so the effect of the noise can be inspected.
    """
    raw_sentence = text.numpy().decode()
    token_ids = tokenizer.encode(raw_sentence)

    # Only blanking is active; drop, substitution and shuffle are set to 0.
    noised_ids = noiser.noisy(
        token_ids,
        drop_prob=0.0,
        blank_prob=0.8,
        sub_prob=0.0,
        shuffle_dist=0.0,
    )

    ids_with_bos = np.concatenate(([bos_id], noised_ids))
    print(
        "[noised]\n",
        noised_ids,
        "\n",
        raw_sentence,
        "->",
        tokenizer.decode(ids_with_bos),
    )
    return ids_with_bos

def encoded_map_fn(text):
    """Run the Python-side ``encode`` on a dataset element via ``tf.py_function``.

    Args:
        text: the string tensor for one line of the text dataset.

    Returns:
        An int64 tensor of token ids whose static shape is set to ``[None]``.
    """
    token_ids = tf.py_function(encode, inp=[text], Tout=tf.int64)
    # Declare the result as rank-1 with unknown length before it is returned
    # to the dataset pipeline.
    token_ids.set_shape([None])
    return token_ids

# Encode (and noise) every line, then group the variable-length id
# sequences into padded batches of 3.
encoded_ds = (
    ds
    .map(encoded_map_fn)
    .padded_batch(3)
)

## Example

In [25]:
# Every pass over `encoded_ds` re-runs `encode`, so the masking is drawn
# again on each epoch (compare the printed batches between epochs).
for epoch in range(3):
    print("epoch :", epoch)
    for padded_ids in encoded_ds:
        print("[padded batch]", padded_ids, sep="\n")

epoch : 0
[noised]
 [4 4 4 4 4 4 4 4 4 4 4] 
 Hello, I don't know how to use it? -> <s> <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask>
[noised]
 [4 4 4 4] 
 Tensorflow is awesome! -> <s> <mask> <mask> <mask> <mask>
[noised]
 [ 4  4  4 21  4] 
 It is good framework. -> <s> <mask> <mask> <mask> framework <mask>
[padded batch]
tf.Tensor(
[[ 1  4  4  4  4  4  4  4  4  4  4  4]
 [ 1  4  4  4  4  0  0  0  0  0  0  0]
 [ 1  4  4  4 21  4  0  0  0  0  0  0]], shape=(3, 12), dtype=int64)
epoch : 1
[noised]
 [ 4  4  4 10  4  4  4  4 15  4  4] 
 Hello, I don't know how to use it? -> <s> <mask> <mask> <mask> do <mask> <mask> <mask> <mask> use <mask> <mask>
[noised]
 [4 4 4 4] 
 Tensorflow is awesome! -> <s> <mask> <mask> <mask> <mask>
[noised]
 [4 4 4 4 4] 
 It is good framework. -> <s> <mask> <mask> <mask> <mask> <mask>
[padded batch]
tf.Tensor(
[[ 1  4  4  4 10  4  4  4  4 15  4  4]
 [ 1  4  4  4  4  0  0  0  0  0  0  0]
 [ 1  4  4  4  4  4  0  0  0  0  0  0]], shap