# Intialize

## Load all files

In [1]:
import tensorflow as tf

en_file1 = '../data/train.lang1.no-punctuation/train.lang1'
en_file2 = '../data/train.en.no-punctuation/unaligned.en'
fr_file1 = '../data/train.lang2.no-punctuation/train.lang2'
fr_file2 = '../data/train.fr.no-punctuation/unaligned.fr'
# TODO: exclude the test set

lang_files = [en_file1, en_file2, fr_file1, fr_file2]

# Tokenize

## Tokenize all files (whole strings (TODO: substrings, capitalization))

In [2]:
from collections import Counter

# Add control tokens
my_counter = Counter()
for i in range(2):
    my_counter.update(["<START>", "<STOP>", "<UNK>", "<MASK>", "<SEP>", "<PAD>"])

In [3]:
def string_to_tokens(raw_string):
    return raw_string.split()

line_lengths = []
for lang_file in lang_files:
    with open(lang_file) as f:
        for line in f:
            tokens = string_to_tokens(line)
            line_lengths += [len(tokens)]
            for token in tokens:
                my_counter.update([token])    

## Keep tokens that occur more than once

In [4]:
import numpy as np
tokens = np.array(list(my_counter.keys()))
frequencies = np.array(list(my_counter.values()))

In [5]:
retained_tokens = tokens[frequencies > 1]

In [6]:
print(len(tokens))
print(len(retained_tokens))

127878
86384


## Create lookup table dict: string -> token

In [7]:
indices = np.array(range(len(retained_tokens)))

In [8]:
indices

array([    0,     1,     2, ..., 86381, 86382, 86383])

In [9]:
my_tokenizer_lut = dict(zip(retained_tokens,indices))

## Define functions for tokenizing, padding

In [10]:
def pad_tokens(token_list, max_length):
    if len(token_list) >= max_length:
        token_list = token_list[:max_length]
        token_list[(max_length-1)] = my_tokenizer_lut["<STOP>"]
    else:
        while len(token_list) < max_length:
            token_list = token_list + [my_tokenizer_lut["<PAD>"]]
    return token_list

def tokenize_string(raw_string, max_length=32): # TODO: Better definition of sentence length
    token_list = [my_tokenizer_lut["<START>"]]
    
    for token in string_to_tokens(raw_string):
        if token in my_tokenizer_lut:
            token_list += [my_tokenizer_lut[token]]
        else:
            token_list += [my_tokenizer_lut["<UNK>"]]
    
    token_list += [my_tokenizer_lut["<STOP>"]]
    
    token_list = pad_tokens(token_list, max_length)
    
    return np.array(token_list)[None,:]
        
tokenize_string("The quick brown fox jumped over the lazy dog.")

array([[   0,    2, 2018, 2659, 1169, 8195,   30,    9, 1244,    2,    1,
           5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,
           5,    5,    5,    5,    5,    5,    5,    5,    5,    5]])

In [11]:
def tokenize_file(filename):
    with open(filename) as f:
        tokens = []
        for idx, line in enumerate(f):
            if idx < 2:
                print(line)
            tokens += [tokenize_string(line)]
    return np.concatenate(tokens,axis=0)

x_true = tokenize_file(en_file1)
x_true.shape

so too does the idea that accommodating religious differences is dangerous

mr president ladies and gentlemen the financial perspective outlines the scope of the eu ’s activities over coming years as well as providing a framework for such activities and determining how effective they will be



(11000, 32)

# Define masking strategy

In [12]:
def mask_tokens(true_tokens):
    non_pad_tokens = true_tokens != my_tokenizer_lut["<PAD>"]
    random_masking_seed = np.random.uniform(0,1,(32,)) * non_pad_tokens
    
    masking_targets = 0.85 < random_masking_seed # 15%
    mask_token_targets = np.logical_and(0.85 < random_masking_seed, random_masking_seed < 0.85 + 0.15*0.8) # 80% of 15%
    random_token_targets = np.logical_and(1.0 - 0.1*0.15 < random_masking_seed, random_masking_seed < 1.0) # 10% of 15%
    
    masked_tokens = true_tokens.copy()
    masked_tokens[mask_token_targets] = my_tokenizer_lut["<MASK>"]
    masked_tokens[random_token_targets] = np.random.randint(0,len(my_tokenizer_lut),(random_token_targets.sum(),))
#     masked_tokens[~mask_token_targets] = my_tokenizer_lut["<PAD>"]
    
    return masked_tokens, masking_targets

x_train, targets_train = mask_tokens(x_true)
x_train[0]

array([ 0,  6,  7,  8,  9,  3, 11,  3, 13, 14, 15, 16,  1,  5,  5,  5,  5,
        5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5])

In [13]:
targets_train[:2]

array([[False, False, False, False, False,  True, False,  True, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False],
       [False, False, False, False, False,  True, False,  True, False,
        False, False, False, False, False, False, False, False,  True,
        False, False, False, False,  True, False, False, False,  True,
        False, False, False, False, False]])

In [14]:
attention_mask = x_train != my_tokenizer_lut["<PAD>"]
attention_mask[:2]

array([[ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True]])

In [15]:
x_true[:2]

array([[ 0,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,  1,  5,  5,  5,
         5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5],
       [ 0, 17, 18, 19, 20, 21,  9, 22, 23, 24,  9, 25, 26,  9, 27, 28,
        29, 30, 31, 32, 33, 34, 33, 35, 36, 37, 38, 39, 29, 20, 40,  1]])

# Override model to include masking

In [22]:
from transformers import TFBertForMaskedLM, BertConfig

## Overriding functions

In [16]:
class bert_with_mask(tf.keras.Model):
    def __init__(self, config, onehot_mask):
        super(bert_with_mask, self).__init__()
        self.bert = TFBertForMaskedLM(config)
        self.onehot_mask = onehot_mask

    def call(self, inputs):
        mask = inputs[-1] # unpack mask from inputs
        inputs = inputs[:-1]
        outputs = self.bert(inputs)[0]
        
        outputs = tf.where(mask[:,:,None], outputs, self.onehot_mask[None,None,:])
        
        return (outputs,)

## Define masking function

In [17]:
onehot_mask = np.zeros(len(my_tokenizer_lut), dtype=np.float32)
onehot_mask[my_tokenizer_lut["<PAD>"]] = 1.0

In [18]:
x_true[~targets_train] = my_tokenizer_lut["<PAD>"]

## Test the model

In [23]:
config = BertConfig.from_pretrained('../code/bert_config_tiny.json')
config.vocab_size = len(my_tokenizer_lut)

In [24]:
model2 = bert_with_mask(config, onehot_mask)

optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
# optimizer = tf.keras.optimizers.Adam(learning_rate=0.1, epsilon=1e-07, clipnorm=1.0)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

model2.compile(optimizer=optimizer, loss=loss)

In [27]:
# Train on full dataset
model2.fit((x_train,attention_mask,targets_train), x_true)

Train on 11000 samples


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "




  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "




<tensorflow.python.keras.callbacks.History at 0x7f3b0ee39240>

In [31]:
# my_input = tokenize_string("<MASK> can't be choosers.")
my_input = tokenize_string("I drove to <MASK> .")
my_output = model2((my_input, tf.ones(my_input.shape[:2], dtype=tf.bool) ))[0]

In [32]:
tf.nn.top_k(my_output[0,4,:], 10)
# tf.nn.top_k(my_output[0,1,:], 5)

TopKV2(values=<tf.Tensor: shape=(10,), dtype=float32, numpy=
array([2.5499732, 1.7648752, 1.5802052, 1.561729 , 1.5576662, 1.4899919,
       1.4283481, 1.3566295, 1.3235406, 1.1286547], dtype=float32)>, indices=<tf.Tensor: shape=(10,), dtype=int32, numpy=array([  9,   1,  15, 119,  26, 108,  20,  11,  82, 178], dtype=int32)>)

In [34]:
inv_tokenizer_lut = {v: k for k, v in my_tokenizer_lut.items()}
for i in [  9,   1,  15, 119,  26, 108,  20,  11,  82, 178]:
    print(inv_tokenizer_lut[i])

the
<STOP>
is
to
of
in
and
that
are
not


# TODO: Define a dataset

In [None]:
ds = tf.data.Dataset.from_tensors(x_true)

In [None]:
def mask_tokens_tf(true_tokens):
    non_pad_tokens = true_tokens != my_tokenizer_lut["<PAD>"]
    random_masking_seed = np.random.uniform(0,1,(32,)) * non_pad_tokens
    
    masking_targets = 0.85 < random_masking_seed # 15%
    mask_token_targets = np.logical_and(0.85 < random_masking_seed, random_masking_seed < 0.85 + 0.15*0.8) # 80% of 15%
    random_token_targets = np.logical_and(1.0 - 0.1*0.15 < random_masking_seed, random_masking_seed < 1.0) # 10% of 15%
    
    masked_tokens = true_tokens.clone()
    masked_tokens[mask_token_targets] = my_tokenizer_lut["<MASK>"]
    masked_tokens[random_token_targets] = np.random.randint(0,len(my_tokenizer_lut),(random_token_targets.sum(),))
    
    return masked_tokens, masking_targets

In [None]:
ds.apply(mask_tokens_tf)