# Initialize

## Load all files

In [1]:
# Relative paths to the four corpus files used to build the vocabulary.
# The *_file1 paths point at train.lang1 / train.lang2 and the *_file2 paths at
# the "unaligned" files; presumably lang1 is English and lang2 is French,
# as the variable names suggest -- TODO confirm.
en_file1 = '../data/train.lang1.no-punctuation/train.lang1'
en_file2 = '../data/train.en.no-punctuation/unaligned.en'
fr_file1 = '../data/train.lang2.no-punctuation/train.lang2'
fr_file2 = '../data/train.fr.no-punctuation/unaligned.fr'
# TODO: exclude the test set

# Every file in this list contributes to the token frequency counts below.
lang_files = [en_file1, en_file2, fr_file1, fr_file2]

# Tokenize

## Tokenize all files (whole strings)

In [2]:
from collections import Counter

# Add control tokens.
# Each control token is seeded with a count of 2 so that it survives the
# "frequency > 1" vocabulary filter applied later, and because they are
# inserted first they end up at the lowest vocabulary indices.
my_counter = Counter()
my_counter.update({
    control_token: 2
    for control_token in ("<START>", "<STOP>", "<UNK>", "<MASK>", "<SEP>", "<PAD>")
})

In [3]:
def string_to_tokens(raw_string):
    """Split a raw text line into whitespace-delimited tokens.

    Args:
        raw_string: the text to split (a trailing newline is discarded
            along with any other surrounding whitespace).

    Returns:
        A list of token strings; empty when the input has no
        non-whitespace characters.
    """
    words = raw_string.split()
    return words

# Count token frequencies over every corpus file and record the token count
# of each line.
# NOTE: this cell accumulates into `my_counter`, so re-running it without
# re-running the control-token cell first will double the counts.
line_lengths = []
for lang_file in lang_files:
    # Read as UTF-8 explicitly: the corpus contains non-ASCII characters
    # (e.g. the typographic apostrophe), and the platform default encoding
    # is not guaranteed to be UTF-8.
    with open(lang_file, encoding="utf-8") as f:
        for line in f:
            tokens = string_to_tokens(line)
            line_lengths.append(len(tokens))
            # Counter.update accepts the whole iterable; no need to feed it
            # one single-element list per token.
            my_counter.update(tokens)

## Keep tokens that occur more than once

In [4]:
import numpy as np
# Parallel arrays: tokens[k] is a vocabulary candidate and frequencies[k] is
# how many times it was counted (both follow the Counter's insertion order).
tokens = np.array(list(my_counter))
frequencies = np.array([count for count in my_counter.values()])

In [5]:
# Drop singletons: only tokens counted more than once stay in the vocabulary.
# Anything dropped here is later encoded as <UNK> by tokenize_string.
retained_tokens = tokens[frequencies > 1]

In [6]:
# Vocabulary size before and after the frequency filter.
print(len(tokens))
print(len(retained_tokens))

127878
86384


## Create lookup table dict: token string -> integer index

In [7]:
# One consecutive integer id per retained token, starting at 0.
indices = np.arange(len(retained_tokens))

In [8]:
# Sanity check: ids should run from 0 to len(retained_tokens) - 1.
indices

array([    0,     1,     2, ..., 86381, 86382, 86383])

In [9]:
# Lookup table: token string -> integer id. The id is the token's position in
# `retained_tokens`.
my_tokenizer_lut = {
    token_string: token_id
    for token_string, token_id in zip(retained_tokens, indices)
}

## Define functions for tokenizing, padding

In [10]:
def pad_tokens(token_list, max_length):
    """Force a list of token ids to exactly `max_length` entries.

    Args:
        token_list: list of token ids. It is not modified; a new list is
            returned.
        max_length: required output length.

    Returns:
        A list of length `max_length`. Inputs at least `max_length` long are
        cut to `max_length` and their final entry is overwritten with the
        <STOP> id; shorter inputs are right-padded with the <PAD> id.
    """
    overflow = len(token_list) - max_length
    if overflow >= 0:
        # Slicing copies, so the caller's list is left untouched.
        clipped = token_list[:max_length]
        clipped[(max_length-1)] = my_tokenizer_lut["<STOP>"]
        return clipped
    return token_list + [my_tokenizer_lut["<PAD>"]] * (-overflow)

def tokenize_string(raw_string, max_length=32): # TODO: Better definition of sentence length
    """Encode one line of text as a fixed-length row of token ids.

    The line is split on whitespace, wrapped in <START> ... <STOP>, and then
    truncated or padded to `max_length` by `pad_tokens`. Words that are not
    in `my_tokenizer_lut` are encoded as <UNK>. Control-token strings such as
    "<MASK>" are themselves keys of the lookup table, so they can be written
    literally in `raw_string`.

    Args:
        raw_string: the text to encode.
        max_length: number of ids in the returned row.

    Returns:
        A NumPy array of shape (1, max_length).
    """
    unk_id = my_tokenizer_lut["<UNK>"]
    word_ids = [my_tokenizer_lut.get(word, unk_id) for word in string_to_tokens(raw_string)]
    token_ids = [my_tokenizer_lut["<START>"], *word_ids, my_tokenizer_lut["<STOP>"]]

    fixed_length_ids = pad_tokens(token_ids, max_length)

    # Leading axis of size 1 so rows can be concatenated into a batch.
    return np.array(fixed_length_ids)[np.newaxis, :]
        
# Smoke test. In the output below "The" and "dog." both come back as id 2
# (<UNK>): lookup is an exact string match, so capitalised or punctuated
# words only resolve if that exact form is in the vocabulary.
tokenize_string("The quick brown fox jumped over the lazy dog.")

array([[   0,    2, 2018, 2659, 1169, 8195,   30,    9, 1244,    2,    1,
           5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,
           5,    5,    5,    5,    5,    5,    5,    5,    5,    5]])

In [11]:
def tokenize_file(filename):
    """Encode every line of a text file with `tokenize_string`.

    The first two lines of the file are echoed with `print` as a quick visual
    check of what is being read.

    Args:
        filename: path to a text file with one sentence per line.

    Returns:
        A NumPy array of shape (number_of_lines, 32): one row per line, using
        `tokenize_string`'s default `max_length`.

    Raises:
        ValueError: if the file has no lines (np.concatenate needs at least
            one array).
    """
    rows = []
    # Read as UTF-8 explicitly: the corpus contains non-ASCII characters and
    # the platform default encoding is not guaranteed to be UTF-8.
    with open(filename, encoding="utf-8") as f:
        for idx, line in enumerate(f):
            if idx < 2:
                print(line)
            rows.append(tokenize_string(line))
    return np.concatenate(rows,axis=0)

# Ground-truth token ids for the first corpus file, one row per sentence.
x_true = tokenize_file(en_file1)
x_true.shape

so too does the idea that accommodating religious differences is dangerous

mr president ladies and gentlemen the financial perspective outlines the scope of the eu ’s activities over coming years as well as providing a framework for such activities and determining how effective they will be



(11000, 32)

# Define masking strategy

In [12]:
def mask_tokens(true_tokens, pad_id=None, mask_id=None, vocab_size=None):
    """Corrupt token ids for masked-language-model training.

    About 15% of the non-<PAD> positions are selected as prediction targets.
    Of those, 80% are replaced by <MASK>, 10% by a uniformly random token id,
    and the remaining 10% are left unchanged.

    An independent random number is drawn for every element of
    `true_tokens`. The previous version drew a single vector of length 32 and
    broadcast it, which gave every row of a batch the same masking pattern
    and failed for any sequence length other than 32.

    Args:
        true_tokens: integer array of token ids, any shape
            (typically (batch, sequence_length)). It is not modified.
        pad_id: id of the padding token; defaults to
            my_tokenizer_lut["<PAD>"].
        mask_id: id of the mask token; defaults to
            my_tokenizer_lut["<MASK>"].
        vocab_size: exclusive upper bound for random replacement ids;
            defaults to len(my_tokenizer_lut).

    Returns:
        (masked_tokens, masking_targets): the corrupted copy of
        `true_tokens`, and a boolean array of the same shape that is True at
        every position selected as a prediction target.
    """
    if pad_id is None:
        pad_id = my_tokenizer_lut["<PAD>"]
    if mask_id is None:
        mask_id = my_tokenizer_lut["<MASK>"]
    if vocab_size is None:
        vocab_size = len(my_tokenizer_lut)

    true_tokens = np.asarray(true_tokens)
    non_pad_tokens = true_tokens != pad_id
    # Multiplying by the boolean mask zeroes the draw at <PAD> positions, so
    # they can never exceed the 0.85 threshold below.
    random_masking_seed = np.random.uniform(0, 1, true_tokens.shape) * non_pad_tokens

    masking_targets = 0.85 < random_masking_seed # 15% # TODO: Use masking targets
    mask_token_targets = np.logical_and(0.85 < random_masking_seed, random_masking_seed < 0.85 + 0.15*0.8) # 80% of 15%
    random_token_targets = np.logical_and(1.0 - 0.1*0.15 < random_masking_seed, random_masking_seed < 1.0) # 10% of 15%
    # Draws between the two bands above are targets whose token is kept as is.

    masked_tokens = true_tokens.copy()
    masked_tokens[mask_token_targets] = mask_id
    masked_tokens[random_token_targets] = np.random.randint(0, vocab_size, (random_token_targets.sum(),))

    return masked_tokens, masking_targets

# Corrupted model inputs plus the boolean map of positions to be predicted.
x_train, targets_train = mask_tokens(x_true)
x_train[0]

array([ 0,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,  1,  5,  5,  5,  5,
        5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5])

In [387]:
# True wherever the (masked) input holds something other than <PAD>.
# It is derived from x_train rather than x_true, so a position that
# mask_tokens happened to replace with the <PAD> id would read False here.
attention_mask = x_train != my_tokenizer_lut["<PAD>"]
attention_mask[:2]

array([[ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True]])

In [29]:
# First two ground-truth rows; the second sentence fills all 32 slots, so it
# ends in <STOP> (id 1) with no padding.
x_true[:2]

array([[ 0,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,  1,  5,  5,  5,
         5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5],
       [ 0, 17, 18, 19, 20, 21,  9, 22, 23, 24,  9, 25, 26,  9, 27, 28,
        29, 30, 31, 32, 33, 34, 33, 35, 36, 37, 38, 39, 29, 20, 40,  1]])

# Initialize and train Huggingface models

In [15]:
from transformers import BertConfig, TFBertForMaskedLM
# Read the model hyper-parameters from the project's config file, then
# override the vocabulary size to match the lookup table built above.
config = BertConfig.from_pretrained('../code/bert_config_tiny.json')
config.vocab_size = len(my_tokenizer_lut)
# Built from the config object alone; no checkpoint is loaded in this cell.
model = TFBertForMaskedLM(config)

In [16]:
import tensorflow as tf
# Forward-pass smoke test on a single sentence before any training.
input_ids = tf.constant(tokenize_string("Hello, my dog is cute"))
# The model is called with a 1-tuple of inputs and, as the output below
# shows, returns a 1-tuple holding logits of shape (1, 32, vocab_size).
outputs = model((input_ids,))
outputs

(<tf.Tensor: shape=(1, 32, 86384), dtype=float32, numpy=
 array([[[ 0.07780217, -0.24033514,  0.17138156, ..., -0.2960101 ,
          -0.05275426, -0.13692337],
         [-0.24117428,  0.09692038, -0.12350251, ...,  0.07272068,
          -0.11459488, -0.20949864],
         [-0.02552614, -0.14502458,  0.23920113, ..., -0.03539889,
          -0.23361361, -0.15744278],
         ...,
         [-0.05553946, -0.24148595,  0.06193926, ...,  0.0433122 ,
          -0.10836109,  0.03484118],
         [-0.01022419, -0.00660565, -0.02542009, ...,  0.02850357,
          -0.01954613,  0.09012558],
         [-0.03358864,  0.00769393,  0.03134035, ..., -0.09817515,
           0.09479681, -0.11208723]]], dtype=float32)>,)

In [17]:
# Should equal the last dimension of the logits shown above.
len(my_tokenizer_lut)

86384

In [18]:
# Inspect the encoded test sentence that was fed to the model.
input_ids

<tf.Tensor: shape=(1, 32), dtype=int64, numpy=
array([[   0,    2,   55, 1695,   15, 9325,    1,    5,    5,    5,    5,
           5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,
           5,    5,    5,    5,    5,    5,    5,    5,    5,    5]])>

In [19]:
# Baseline training run: plain sparse cross-entropy on the raw logits.
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

model.compile(optimizer=optimizer, loss=loss)
# Inputs are (token ids, attention mask); the labels are the uncorrupted ids.
# `targets_train` is not passed here, so the loss is taken over every
# position -- including <PAD> and positions that were never masked.
# No epochs/batch_size are given, so Keras defaults apply.
model.fit((x_train,attention_mask), x_true)

Train on 11000 samples


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "




  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "




<tensorflow.python.keras.callbacks.History at 0x7f7224127588>

# Check model performance

In [38]:
# Probe the trained model: can it fill in the masked word?
# `my_true_input` is the unmasked reference; it is not used by the cells
# visible below.
my_true_input = tokenize_string("I drove to the store")
my_input = tokenize_string("I drove to <MASK> store")
# Called with the token ids only (no attention mask is supplied).
my_output = model(my_input)

In [31]:
# The <MASK> id (3) sits at position 4 of the encoded probe.
my_input

array([[    0,     2, 10439,   119,     3,  4831,     1,     5,     5,
            5,     5,     5,     5,     5,     5,     5,     5,     5,
            5,     5,     5,     5,     5,     5,     5,     5,     5,
            5,     5,     5,     5,     5]])

In [22]:
# Highest-scoring token id at position 4, i.e. the <MASK> slot of my_input.
tf.argmax(my_output[0][0,4,:])
# tf.nn.softmax(my_output[0][0,4,:])

<tf.Tensor: shape=(), dtype=int64, numpy=5>

In [23]:
# Reverse lookup table: integer id -> token string.
inv_tokenizer_lut = {
    token_id: token_string
    for token_string, token_id in my_tokenizer_lut.items()
}

In [24]:
# Decode the argmax from above: the baseline model predicts <PAD> at the
# masked slot.
inv_tokenizer_lut[5]

'<PAD>'

In [25]:
# Id of the word that was actually masked out.
my_tokenizer_lut['the']

9

In [26]:
# Logit the model assigns to the correct word ('the', id 9) at the masked slot.
my_output[0][0,4,:][9]

<tf.Tensor: shape=(), dtype=float32, numpy=1.1864281>

In [27]:
# Logits for ids 0-9 at the masked slot; id 5 (<PAD>) is the largest of these.
my_output[0][0,4,:][:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=
array([ 0.9891175 ,  1.7110438 ,  0.6563533 ,  0.23294514, -1.4006706 ,
        2.721237  ,  0.6520706 ,  0.68161255,  0.46826303,  1.1864281 ],
      dtype=float32)>

In [28]:
# Show the masking strategy on a single sentence.
# NOTE(review): this rebinds `tokens`, which earlier held the vocabulary
# array; re-running the "retained_tokens" cell after this one would index
# the wrong object.
tokens = tokenize_string("The quick brown fox jumped over the lazy dog.")
mask_tokens(tokens)

(array([[    0,     2, 81685,  2659,  1169,  8195,    30,     3,  1244,
             2,     1,     5,     5,     5,     5,     5,     5,     5,
             5,     5,     5,     5,     5,     5,     5,     5,     5,
             5,     5,     5,     5,     5]]),
 array([[False, False,  True, False, False, False, False,  True, False,
          True, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False,
         False, False, False, False, False]]))

# Override model to include masking

## Overriding functions

In [388]:
class bert_with_mask(tf.keras.Model):
    """Masked-LM BERT whose output is neutralised outside the target positions.

    The wrapped TFBertForMaskedLM produces per-token vocabulary scores. They
    are turned into probabilities, and every position that is not a
    prediction target has its whole vocabulary row replaced by the constant
    1.0.
    """

    def __init__(self, config):
        """Build the wrapped masked-LM model from a BertConfig."""
        super().__init__()
        self.bert = TFBertForMaskedLM(config)

    def call(self, inputs):
        """Run BERT and blank out the non-target positions.

        Args:
            inputs: a sequence whose LAST element is the boolean target mask,
                indexed as mask[batch, position]; everything before it is
                forwarded unchanged to the wrapped BERT model (e.g. token ids
                and, optionally, the attention mask).

        Returns:
            Softmax probabilities over the vocabulary where the mask is True
            and 1.0 everywhere else.
        """
        mask = inputs[-1] # unpack mask from inputs
        inputs = inputs[:-1]
        outputs = self.bert(inputs)
        # Normalise over the vocabulary, which is always the LAST axis.
        # Elsewhere in this notebook the bare model call returns a 1-tuple
        # (logits,); passing that tuple to softmax converts it to a tensor
        # with an extra leading axis, so the previous `axis=2` landed on the
        # sequence axis instead of the vocabulary axis.
        outputs = tf.nn.softmax(outputs, axis=-1)
        # mask[:, :, None] broadcasts the per-position flag across the
        # vocabulary axis; non-target positions become a constant 1.0.
        masked_outputs = tf.where(mask[:,:,None], outputs, tf.constant(1.0)[None,None,None])
        return masked_outputs

## Define some test input

In [377]:
# Two hand-written sentences and their masked counterparts, used to
# exercise the wrapper model on a tiny batch.
true_input = ["I drove to the store", "The best is yet to come"]
masked_input = ["I drove to <MASK> store", "<MASK> best is yet to come"]

# Each sentence encodes to a (1, 32) row; stack the rows into one batch.
true_tokens = np.concatenate([tokenize_string(sentence) for sentence in true_input], axis=0)
masked_tokens = np.concatenate([tokenize_string(sentence) for sentence in masked_input], axis=0)
masked_tokens.shape

(2, 32)

In [378]:
# Attention mask for the toy batch: True at every non-<PAD> position.
tmp_attention_mask = masked_tokens != my_tokenizer_lut["<PAD>"]
tmp_attention_mask.shape

(2, 32)

In [379]:
# Prediction targets for the toy batch: True exactly where <MASK> was written.
target_masked_tokens = masked_tokens == my_tokenizer_lut["<MASK>"]
target_masked_tokens.shape

(2, 32)

## Define masking function

In [380]:
# Shape of the target mask, for comparison with the predictions below.
target_masked_tokens.shape

(2, 32)

In [381]:
# NOTE(review): `pred_tokens` is not defined in any cell visible in this
# notebook; it presumably came from a deleted or out-of-order cell, so this
# fails on a fresh "Restart & Run All".
pred_tokens.shape

TensorShape([2, 32, 86384])

In [382]:
# Keep the predictions only at <MASK> positions; every other position is
# filled with 0.0 across the whole vocabulary axis.
masked_pred_tokens = tf.where(target_masked_tokens[:,:,None], pred_tokens, tf.constant(0.0)[None,None,None])

In [383]:
# Ground-truth ids for the toy batch.
true_tokens

array([[    0,     2, 10439,   119,     9,  4831,     1,     5,     5,
            5,     5,     5,     5,     5,     5,     5,     5,     5,
            5,     5,     5,     5,     5,     5,     5,     5,     5,
            5,     5,     5,     5,     5],
       [    0,     2,  1335,    15,   456,   119,   710,     1,     5,
            5,     5,     5,     5,     5,     5,     5,     5,     5,
            5,     5,     5,     5,     5,     5,     5,     5,     5,
            5,     5,     5,     5,     5]])

## Test the model

In [384]:
# Instantiate the wrapper model with the same config as the baseline.
model2 = bert_with_mask(config)
# NOTE(review): `fancy_loss` is not defined in any cell visible in this
# notebook, so this fails on a fresh "Restart & Run All".
# The `optimizer` instance is the one already used to train `model`.
model2.compile(optimizer=optimizer, loss=fancy_loss)

In [385]:
# Forward pass on the toy batch. Input order: token ids, attention mask, and
# the target mask last (the wrapper pops the final element as its mask).
out = model2([masked_tokens,tmp_attention_mask,target_masked_tokens])

In [389]:
# Train on full dataset
# Inputs: corrupted ids, attention mask, and the target mask (last);
# labels: the uncorrupted ids.
model2.fit((x_train,attention_mask,targets_train), x_true)

Train on 11000 samples


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "




  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "




<tensorflow.python.keras.callbacks.History at 0x7f71a0480e48>

In [390]:
# Probe the wrapper model with a masked first word.
my_input = tokenize_string("<MASK> can't be choosers.")
# Only two inputs are passed: the last one (all True) is consumed as the
# target mask, so every position is kept, and no attention mask reaches BERT.
my_output = model2((my_input,tf.ones(my_input.shape[:2], dtype=tf.bool)))

In [391]:
# Five highest-scoring token ids at position 1, where <MASK> sits
# (position 0 is <START>).
# NOTE(review): the values shown below exceed 1, so they are not softmax
# probabilities; the executed model2 may predate the class definition shown
# above -- confirm by re-running top to bottom.
tf.nn.top_k(my_output[0][0,1,:], 5)

TopKV2(values=<tf.Tensor: shape=(5,), dtype=float32, numpy=
array([3.2541888, 3.2484477, 2.751284 , 2.621221 , 2.257852 ],
      dtype=float32)>, indices=<tf.Tensor: shape=(5,), dtype=int32, numpy=array([ 9,  1, 26, 20, 36], dtype=int32)>)

In [393]:
# Decode the five ids reported by top_k above (copied by hand from that
# output) back into token strings.
top5_token_ids = (9, 1, 26, 20, 36)
for token_id in top5_token_ids:
    print(inv_tokenizer_lut[token_id])

the
<STOP>
of
and
a
