# Training a WordLevel Tokenizer
### By: Guzman Jose, Moya Alejandro, Reyes Isaac

In [1]:
use strict;
use warnings;
use Data::Dump qw(dump);
use tokenizers;
use aliased 'tokenizers::models::wordlevel::trainer' => 'WordLevelTrainer';

In [2]:
# Tokenizer wrapped around a WordLevel model whose unknown-word token is "[UNK]".
# Uses a direct Class->new(...) call rather than the indirect-object form
# `new Class(...)`, which Perl can misparse and which newer feature bundles disable.
my $wordLevel_tokenizer = Tokenizer->new(model => WordLevel->new(unk_token => "[UNK]"));

Tokenizer=HASH(0x6a29f38)

In [3]:
# Trainer for the WordLevel model, configured with the six special tokens
# in the order given here.
my $trainer = WordLevelTrainer->new(
    special_tokens => [qw([UNK] [CLS] [SEP] [PAD] [MASK] [NUM])],
);

tokenizers::models::wordlevel::trainer=HASH(0x6a2b340)

In [4]:
# Install a normalizer sequence on the tokenizer: NFD, Lowercase and
# StripAccents, in that order.
my @normalizer_steps = (NFD->new(), Lowercase->new(), StripAccents->new());
$wordLevel_tokenizer->with_normalizer(NormalizerSequence->new(\@normalizer_steps));

tokenizers::normalizers::Sequence=HASH(0x6a2dbd0)

In [5]:
# Pre-tokenizer pattern: an optional "##" prefix followed by either a run of
# word characters or a run of characters that are neither word nor whitespace.
my $split_pattern = '(?:##)?(?:\w+|[^\w\s]+)';
$wordLevel_tokenizer->with_pre_tokenizer(Whitespace->new(regex => $split_pattern));

tokenizers::pre_tokenizers::whitespace=HASH(0x6a31cd8)

In [6]:
# Post-processing templates. The strings are single-quoted so that $A and $B
# reach TemplateProcessing literally instead of being interpolated by Perl.
my $single_template = '[CLS] $A [SEP]';
my $pair_template   = '[CLS] $A [SEP] $B:1 [SEP]:1';
# [token, id] pairs for the special tokens named in the templates.
my $special_ids     = [['[CLS]', 1], ['[SEP]', 2]];
$wordLevel_tokenizer->with_post_processor(
    TemplateProcessing->new($single_template, $pair_template, $special_ids));

tokenizers::processors::template::PostProcessor=HASH(0x67553c0)

In [7]:
# Register a WordPieceDecoder (default options) as this tokenizer's decoder.
$wordLevel_tokenizer->with_decoder(WordPieceDecoder->new());

tokenizers::decoders::wordpiece=HASH(0x67382b0)

In [8]:
# Train the tokenizer on the fifteen corpus files data/train1.txt through
# data/train15.txt, using the trainer configured earlier.
my @training_files = map { "data/train$_.txt" } 1 .. 15;
$wordLevel_tokenizer->train_from_files(
    files   => \@training_files,
    trainer => $trainer,
);


1

In [9]:
# Example input; later cells reassign this same lexical.
my $sentence = 'How complex is this?';

How complex is this?

In [10]:
# Encode the example sentence; the result exposes get_ids() (used below).
my $encoding = $wordLevel_tokenizer->encode($sentence);

Encoding=HASH(0x6ac7a48)

In [11]:
# Show the token ids separated by single spaces, then the text decoded from them.
# The second decode() argument is 0 — presumably "do not skip special tokens",
# since the printed result still contains [CLS]/[SEP]; confirm against decode().
print join(' ', @{ $encoding->get_ids() }), "\n";
print $wordLevel_tokenizer->decode($encoding->get_ids(), 0);

1 240 0 1421 241 370 2
[CLS] how [UNK] is this? [SEP]

1

In [12]:
# Dump the model's vocab_r table (id => token, per the output below).
# NOTE(review): this reads the object's internal hash directly rather than
# going through an accessor — confirm no public method exists for this.
print dump $wordLevel_tokenizer->{model}{vocab_r};

{
  "0"    => "[UNK]",
  "1"    => "[CLS]",
  "2"    => "[SEP]",
  "3"    => "[PAD]",
  "4"    => "[MASK]",
  "5"    => "[NUM]",
  "6"    => "##the",
  "7"    => "##s",
  "8"    => ".",
  "9"    => ",",
  "10"   => "##to",
  "11"   => "##in",
  "12"   => "##of",
  "13"   => "##a",
  "14"   => "##ing",
  "15"   => "##and",
  "16"   => "##t",
  "17"   => "##i",
  "18"   => "##tion",
  "19"   => "##er",
  "20"   => "##al",
  "21"   => "##re",
  "22"   => "##be",
  "23"   => "##is",
  "24"   => "##ly",
  "25"   => "##on",
  "26"   => "##for",
  "27"   => "##ed",
  "28"   => "'",
  "29"   => "##that",
  "30"   => "the",
  "31"   => "##an",
  "32"   => "##it",
  "33"   => "##as",
  "34"   => "##con",
  "35"   => "s",
  "36"   => "##at",
  "37"   => "##with",
  "38"   => "##was",
  "39"   => "##de",
  "40"   => "##ter",
  "41"   => "##com",
  "42"   => "\"",
  "43"   => "##l",
  "44"   => "##ty",
  "45"   => "##ex",
  "46"   => "##men",
  "47"   => "##\"",
  "48"   => "##said",
  "49"   => "#

1

In [13]:
# Save the trained model into data/ using the file prefix "temp-wordLevel".
# The call returns an array reference — presumably the written file names; confirm.
$wordLevel_tokenizer->{model}->save(folder => 'data/', prefix => 'temp-wordLevel');

ARRAY(0x6ef8350)

## Loading a WordPiece model from WordLevel vocab.

In [14]:
# Vocabulary file used to build the WordPiece model below.
my $vocab_path = 'data/formato-vocab.json';

# WordPiece tokenizer loaded from that vocabulary, with "[UNK]" as the unknown token.
# Direct Class->new(...) replaces the indirect-object form `new Tokenizer(...)`.
my $wordPiece_tgt = Tokenizer->new(model => WordPiece->new(files => $vocab_path, unk_token => '[UNK]'));

# Normalizer sequence: NFD, Lowercase, StripAccents, in that order.
$wordPiece_tgt->with_normalizer(NormalizerSequence->new([NFD->new(),
                                                         Lowercase->new(),
                                                         StripAccents->new()]));

$wordPiece_tgt->with_pre_tokenizer(PreTokenizerWhitespace->new());

# Templates are single-quoted so $A/$B are passed through literally.
# The special-token ids used here are [CLS] => 2 and [SEP] => 3.
$wordPiece_tgt->with_post_processor(TemplateProcessing->new('[CLS] $A [SEP]',
                                                            '[CLS] $A [SEP] $B:1 [SEP]:1',
                                                            [['[CLS]', 2], ['[SEP]', 3]]));

$wordPiece_tgt->with_decoder(WordPieceDecoder->new());

tokenizers::decoders::wordpiece=HASH(0x6a32068)

In [15]:
# Print the id => token table in ascending id order.
# Perl does not guarantee any hash iteration order, so the ids are sorted
# numerically instead of relying on whatever order each() happens to yield.
my $id_to_token = $wordPiece_tgt->{model}{vocab_r};
for my $id (sort { $a <=> $b } keys %$id_to_token) {
    printf "%s, %s\n", $id, $id_to_token->{$id};
}

0, [UNK]
1, [PAD]
2, [CLS]
3, [SEP]
4, [MASK]
5, .
6, com
7, is
8, ##plex
9, ##ter
10, bet
11, than
12, ##ated
13, ##for
14, ##iful
15, ##ing
16, ##le
17, ##mers
18, ##plic
19, ?
20, are
21, beaut
22, hello
23, how
24, simp
25, train
26, trans
27, u


0

In [16]:
# Reuse the lexicals declared in earlier cells (hence no `my`) for a new example.
$sentence = "Training Transformers is beautiful.";

# Second argument is 1 — presumably "add special tokens"; confirm against encode().
$encoding = $wordPiece_tgt->encode($sentence, 1);

Encoding=HASH(0x6acf078)

In [17]:
# Show the tokens and their ids (space-separated, one line each), followed by
# the text decoded from those ids.
print join(' ', @{ $encoding->get_tokens() }), "\n";
print join(' ', @{ $encoding->get_ids() }), "\n";
print $wordPiece_tgt->decode($encoding->get_ids(), 0);

[CLS] train ##ing trans ##for ##mers is beaut ##iful . [SEP]
2 25 15 26 13 17 7 21 14 5 3
[CLS] training transformers is beautiful. [SEP]

1

## Pretrained Transformer's WordPiece model

In [18]:
# Point $vocab_path at the WMT2014 source vocabulary file.
# NOTE(review): Perl does not expand a leading "~" in a path string by itself;
# this relies on the consumer of $vocab_path expanding it — confirm, or build
# the path from $ENV{HOME} instead.
$vocab_path =  '~/.mxnet/models/WMT2014_src-230ebb81.vocab';

~/.mxnet/models/WMT2014_src-230ebb81.vocab

In [19]:
# WordPiece tokenizer built from the vocabulary at $vocab_path; word pieces
# that continue into the next piece carry the "@@" suffix.
# Direct Class->new(...) replaces the indirect-object form `new Tokenizer(...)`.
my $wordPiece_src = Tokenizer->new(model => WordPiece->new(files => $vocab_path, unk_token => '[UNK]',
                                                           continuing_subword_suffix => '@@'));

# Normalizer sequence: NFD, Lowercase, StripAccents, in that order.
$wordPiece_src->with_normalizer(NormalizerSequence->new([NFD->new(),
                                                         Lowercase->new(),
                                                         StripAccents->new()]));

$wordPiece_src->with_pre_tokenizer(PreTokenizerWhitespace->new());

# NOTE(review): the decode output later in this notebook renders ids 2 and 3 as
# <bos>/<eos>, so this vocabulary apparently names its boundary tokens differently
# from the '[CLS]'/'[SEP]' labels used here — confirm that the template labels
# (and unk_token above) match the vocabulary file.
$wordPiece_src->with_post_processor(TemplateProcessing->new('[CLS] $A [SEP]',
                                                            '[CLS] $A [SEP] $B:1 [SEP]:1',
                                                            [['[CLS]', 2], ['[SEP]', 3]]));



tokenizers::processors::template::PostProcessor=HASH(0x7197da8)

In [20]:
# Decoder configured with suffix '@@', the same marker given as
# continuing_subword_suffix when the model was constructed.
$wordPiece_src->with_decoder(WordPieceDecoder->new(suffix=>'@@'));

tokenizers::decoders::wordpiece=HASH(0x6c77a78)

In [21]:
# $sentence is already declared in an earlier cell; assign to it instead of
# re-declaring it with `my`, which would mask the earlier variable and trigger
# a '"my" variable masks earlier declaration' warning.
$sentence = 'Academically sponsored';

Academically sponsored

In [22]:
# Encode with the "@@"-suffix WordPiece tokenizer, reusing the existing $encoding lexical.
# Second argument is 1 — presumably "add special tokens"; confirm against encode().
$encoding = $wordPiece_src->encode($sentence, 1);

Encoding=HASH(0x6c06558)

In [23]:
# Show the tokens and their ids (space-separated, one line each), followed by
# the text decoded from those ids.
print join(' ', @{ $encoding->get_tokens() }), "\n";
print join(' ', @{ $encoding->get_ids() }), "\n";
print $wordPiece_src->decode($encoding->get_ids(), 0);

[CLS] academ@@ ically spon@@ sored [SEP]
2 11165 20360 27629 27450 3
<bos> academically sponsored <eos> 

1

## Creating a Data Iterator

In [24]:
use d2l;

In [25]:
# Three example tensors with 5 rows each and 4, 5 and 6 columns respectively,
# filled with 0 .. (5 * columns - 1).
my ($A, $B, $C) = map { mx->nd->arange(stop => 5 * $_)->reshape([5, $_]) } 4 .. 6;

<AI::MXNet::NDArray 5x6 @cpu(0)>

In [26]:
# Batch iterator over the three example tensors: batches of 2, no shuffling.
# Direct Class->new(...) replaces the indirect-object form `new d2l::Data_Iter(...)`.
my $train_iter = d2l::Data_Iter->new(tensors     => [$A, $B, $C],
                                     batch_size  => 2,
                                     shuffle     => 0,
                                     num_workers => d2l->get_dataloader_workers,
                                     last_batch  => 'keep'); # keep|discard|rollover
# Report how many batches the iterator will yield.
print "num_batches: ", $train_iter->len();

num_batches: 3

1

In [27]:
# Fetch the next batch; the value is an array reference (unpacked in the next cell).
my $batch = $train_iter->next();

ARRAY(0x7102a48)

In [28]:
# Unpack the batch into its three tensors, one per tensor given to the iterator.
my ($T1, $T2, $T3) = @$batch;
# aspdl is called on each tensor for printing. The format has no separators of
# its own; the stringified values apparently supply the line breaks (see output).
printf "T1: %sT2: %sT3: %s", $T1->aspdl, $T2->aspdl, $T3->aspdl;

T1: 
[
 [0 1 2 3]
 [4 5 6 7]
]
T2: 
[
 [0 1 2 3 4]
 [5 6 7 8 9]
]
T3: 
[
 [ 0  1  2  3  4  5]
 [ 6  7  8  9 10 11]
]


1

In [29]:
# Print every remaining batch, numbering them from 0.
# The loop ends as soon as fetching or unpacking a batch fails inside eval
# (eval then yields undef) or the batch unpacks to zero elements.
# NOTE(review): eval hides the reason for stopping, so a genuine error in
# next() is indistinguishable from the iterator simply being exhausted.
for (my $i = 0; ; $i++) {
    my ($T1, $T2, $T3);
    my $unpacked = eval {
        my $batch = $train_iter->next();
        # In scalar context this list assignment yields the element count of @$batch.
        ($T1, $T2, $T3) = @$batch;
    };
    last unless $unpacked;
    printf "T1[$i]: %sT2[$i]: %sT3[$i]: %s", $T1->aspdl, $T2->aspdl, $T3->aspdl;
}

T1[0]: 
[
 [ 8  9 10 11]
 [12 13 14 15]
]
T2[0]: 
[
 [10 11 12 13 14]
 [15 16 17 18 19]
]
T3[0]: 
[
 [12 13 14 15 16 17]
 [18 19 20 21 22 23]
]
T1[1]: 
[
 [16 17 18 19]
]
T2[1]: 
[
 [20 21 22 23 24]
]
T3[1]: 
[
 [24 25 26 27 28 29]
]


### Data Iterator

In [30]:
 # load_data_nmt(%args) -> ($data_iter, $src_vocab, $tgt_vocab)
 #
 # Reads and preprocesses the translation text at file_path, tokenizes it into
 # source and target sequences, builds one vocabulary per side, converts both
 # sides to arrays plus valid lengths, and wraps the four results in a
 # d2l::Data_Iter.
 #
 # Named arguments (defaults are the ones handed to util->get_arguments):
 #   batch_size   - forwarded to d2l::Data_Iter (default undef)
 #   num_steps    - forwarded to d2l->build_array_nmt (default 10)
 #   num_examples - forwarded to d2l->tokenize_nmt (default 600)
 #   min_freq     - min_freq for both vocabularies (default 2)
 #   is_train     - used as the iterator's shuffle flag (default 0)
 #   file_path    - input text file (default "data/output1.txt")
 #
 # Invoked as a method: the invocant must provide get_dataloader_workers().
 sub load_data_nmt {
     my ($self, %args) = (shift, util->get_arguments(batch_size   => undef,
                                                     num_steps    => 10,
                                                     num_examples => 600,
                                                     min_freq     => 2,
                                                     is_train     => 0,
                                                     file_path    => "data/output1.txt", \@_));

     # Resolve the fallbacks once so an explicit undef is handled the same way
     # for every numeric option (num_steps previously had no fallback).
     my $num_examples = $args{num_examples} // 600;
     my $min_freq     = $args{min_freq}     // 2;
     my $num_steps    = $args{num_steps}    // 10;
     my @reserved     = ('<pad>', '<bos>', '<eos>');

     my $text = d2l->preprocess_nmt(d2l->read_data_nmt($args{file_path}));
     my ($source, $target) = d2l->tokenize_nmt($text, $num_examples);

     # Direct Class->new(...) calls replace the indirect-object `new Class(...)` form.
     # Each vocabulary gets its own copy of the reserved-token list.
     my $src_vocab = d2l::Vocab->new(tokens          => $source,
                                     min_freq        => $min_freq,
                                     reserved_tokens => [@reserved]);
     my $tgt_vocab = d2l::Vocab->new(tokens          => $target,
                                     min_freq        => $min_freq,
                                     reserved_tokens => [@reserved]);

     my ($src_array, $src_valid_len) = d2l->build_array_nmt($source, $src_vocab, $num_steps);
     my ($tgt_array, $tgt_valid_len) = d2l->build_array_nmt($target, $tgt_vocab, $num_steps);

     my $data_iter = d2l::Data_Iter->new(tensors     => [$src_array, $src_valid_len, $tgt_array, $tgt_valid_len],
                                         batch_size  => $args{batch_size},
                                         shuffle     => $args{is_train},
                                         num_workers => $self->get_dataloader_workers());
     return ($data_iter, $src_vocab, $tgt_vocab);
 }


In [31]:
# $batch is already declared in an earlier cell; assign to it rather than
# re-declaring it with `my`, which would mask the earlier variable and warn.
$batch = $train_iter->next();

ARRAY(0x6201708)

In [32]:
# $T1, $T2 and $T3 are already declared in an earlier cell; assign to them
# rather than re-declaring with `my`, which would mask the earlier variables and warn.
($T1, $T2, $T3) = @$batch;
printf "T1: %sT2: %sT3: %s", $T1->aspdl, $T2->aspdl, $T3->aspdl;

T1: 
[
 [0 1 2 3]
 [4 5 6 7]
]
T2: 
[
 [0 1 2 3 4]
 [5 6 7 8 9]
]
T3: 
[
 [ 0  1  2  3  4  5]
 [ 6  7  8  9 10 11]
]


1