# AUTORES: Alejandro Moya, Isaac Reyes, José Guzmán

In [1]:
use strict;
use warnings;
use Data::Dump qw(dump);
use tokenizers;
use aliased 'tokenizers::models::wordlevel::trainer' => 'WordLevelTrainer';

In [2]:
# Build a tokenizer around a WordLevel model, with "[UNK]" configured as the unknown token.
# Direct method-call syntax (Class->new) replaces the indirect-object form (new Class),
# which Perl can mis-parse and which is disabled under `use v5.36`.
my $wordLevel_tokenizer = Tokenizer->new(model => WordLevel->new(unk_token => "[UNK]"));

Tokenizer=HASH(0x6c0e088)

In [3]:
# Trainer for the WordLevel model, configured with the list of special tokens to reserve.
my $trainer   = WordLevelTrainer->new(special_tokens => ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]", "[NUM]"]);

tokenizers::models::wordlevel::trainer=HASH(0x6c0e8b0)

In [4]:
# Normalization pipeline applied in this order: NFD, Lowercase, StripAccents.
$wordLevel_tokenizer->with_normalizer(
    NormalizerSequence->new([
        NFD->new(),
        Lowercase->new(),
        StripAccents->new(),
    ])
);

tokenizers::normalizers::Sequence=HASH(0x6c184b0)

In [5]:
# Pre-tokenizer pattern: an optional "##" prefix followed by either a run of
# word characters or a run of characters that are neither word nor whitespace.
$wordLevel_tokenizer->with_pre_tokenizer(
    Whitespace->new(regex => '(?:##)?(?:\w+|[^\w\s]+)')
);

tokenizers::pre_tokenizers::whitespace=HASH(0x6c0e640)

In [6]:
# Post-processing templates: the first is for a single sequence, the second for
# a pair of sequences. The last argument pairs each special token with its id
# ([CLS] => 1, [SEP] => 2).
$wordLevel_tokenizer->with_post_processor(
    TemplateProcessing->new(
        '[CLS] $A [SEP]',
        '[CLS] $A [SEP] $B:1 [SEP]:1',
        [ ['[CLS]', 1], ['[SEP]', 2] ],
    )
);

tokenizers::processors::template::PostProcessor=HASH(0x692c620)

In [7]:
# Set the tokenizer's decoder to a default-configured WordPieceDecoder.
$wordLevel_tokenizer->with_decoder(WordPieceDecoder->new());

tokenizers::decoders::wordpiece=HASH(0x6910230)

In [8]:
# Train the tokenizer on the two corpus files, using $trainer.
$wordLevel_tokenizer->train_from_files(files => ['data/corpus.txt', 'data/corpus2.txt'], trainer => $trainer);

1

In [9]:
# Sample sentence to encode.
my $sentence = 'How complex is this?';

How complex is this?

In [10]:
# Encode the sentence; the resulting object is queried below through get_tokens() and get_ids().
my $encoding = $wordLevel_tokenizer->encode($sentence);

Encoding=HASH(0x6c26528)

In [11]:
# Show the tokens and ids of the encoding, then the text decoded from those ids.
print join(' ', @{ $encoding->get_tokens() }) . "\n";
print join(' ', @{ $encoding->get_ids() }) . "\n";
print $wordLevel_tokenizer->decode($encoding->get_ids(), 0);

[CLS] how [UNK] is [UNK] ? [SEP]
1 24 0 8 0 20 2
[CLS] how [UNK] is [UNK]? [SEP]

1

In [12]:
# Dump the model's reverse vocabulary (vocab_r), which maps token ids to token strings.
print dump $wordLevel_tokenizer->{model}{vocab_r};

{
  "0"  => "[UNK]",
  "1"  => "[CLS]",
  "2"  => "[SEP]",
  "3"  => "[PAD]",
  "4"  => "[MASK]",
  "5"  => "[NUM]",
  "6"  => ".",
  "7"  => "com",
  "8"  => "is",
  "9"  => "##plex",
  "10" => "##ter",
  "11" => "bet",
  "12" => "than",
  "13" => "##ated",
  "14" => "##for",
  "15" => "##iful",
  "16" => "##ing",
  "17" => "##le",
  "18" => "##mers",
  "19" => "##plic",
  "20" => "?",
  "21" => "are",
  "22" => "beaut",
  "23" => "hello",
  "24" => "how",
  "25" => "simp",
  "26" => "train",
  "27" => "trans",
  "28" => "u",
}

1

## WordLevel Vocabulary

In [15]:
# Save the trained model into data/ using the file prefix 'temp-wordLevel'.
$wordLevel_tokenizer->{model}->save(folder => 'data/', prefix => 'temp-wordLevel');

ARRAY(0x6c55388)

## Loading a pretrained WordPiece model out of a WordLevel vocab.

In [16]:
# Build the target-side WordPiece tokenizer from a vocabulary file on disk.
my $vocab_path = 'data/formato-wordLevel-vocab.json';

# Direct method-call syntax (Class->new) replaces the indirect-object form (new Class).
my $wordPiece_tgt = Tokenizer->new(model => WordPiece->new(files => $vocab_path, unk_token => '[UNK]'));

# Normalization applied in this order: NFD, Lowercase, StripAccents.
$wordPiece_tgt->with_normalizer(NormalizerSequence->new([NFD->new(),
                                                         Lowercase->new(),
                                                         StripAccents->new()]));

$wordPiece_tgt->with_pre_tokenizer(PreTokenizerWhitespace->new());

# Templates for single sequences and sequence pairs; [CLS] and [SEP] are paired with ids 2 and 3.
$wordPiece_tgt->with_post_processor(TemplateProcessing->new('[CLS] $A [SEP]',
                                                            '[CLS] $A [SEP] $B:1 [SEP]:1',
                                                            [['[CLS]', 2], ['[SEP]', 3]]));

$wordPiece_tgt->with_decoder(WordPieceDecoder->new());

tokenizers::decoders::wordpiece=HASH(0x6c19948)

## Showing the WordPiece vocabulary

In [17]:
# print dump $wordPiece_tgt->{model}{vocab_r};
# Print one "id, token" line per entry of the reverse vocabulary.
for my $id (keys %{ $wordPiece_tgt->{model}{vocab_r} }) {
  printf "%s, %s\n", $id, $wordPiece_tgt->{model}{vocab_r}{$id};
}

0, [UNK]
1, [PAD]
2, [CLS]
3, [SEP]
4, [MASK]
5, .
6, com
7, is
8, ##plex
9, ##ter
10, bet
11, than
12, ##ated
13, ##for
14, ##iful
15, ##ing
16, ##le
17, ##mers
18, ##plic
19, ?
20, are
21, beaut
22, hello
23, how
24, simp
25, train
26, trans
27, u


0

In [18]:
# New input sentence for the target-side WordPiece tokenizer.
$sentence = "Training Transformers is beautiful.";

# NOTE(review): the meaning of the second argument (1) is not visible in this file;
# presumably it asks for the special tokens to be added — confirm against Tokenizer's encode().
$encoding = $wordPiece_tgt->encode($sentence, 1);

Encoding=HASH(0x6c68ae8)

In [19]:
# Show the tokens and ids of the encoding, then the text decoded from those ids.
print join(' ', @{ $encoding->get_tokens() }) . "\n";
print join(' ', @{ $encoding->get_ids() }) . "\n";
print $wordPiece_tgt->decode($encoding->get_ids(), 0);

[CLS] train ##ing trans ##for ##mers is beaut ##iful . [SEP]
2 25 15 26 13 17 7 21 14 5 3
[CLS] training transformers is beautiful. [SEP]

1

## Pretrained Transformer's WordPiece model

To be used without modification by the Transformer's Encoder (frozen side) only.

In [20]:
# Path of the pretrained source-side vocabulary.
# NOTE(review): Perl itself does not expand '~' in file names; this relies on the
# code that opens the file expanding it — confirm, or use an absolute path.
$vocab_path =  '~/.mxnet/models/WMT2014_src-230ebb81.vocab';

~/.mxnet/models/WMT2014_src-230ebb81.vocab

In [21]:
# Build the source-side WordPiece tokenizer from the pretrained vocabulary.
# Subword pieces that continue into the next piece carry the suffix '@@'.
# Direct method-call syntax (Class->new) replaces the indirect-object form (new Class).
# NOTE(review): unk_token is '[UNK]' while the vocabulary dump in this notebook shows
# "<unk>" at id 0 — confirm that this mismatch is intended.
my $wordPiece_src = Tokenizer->new(model => WordPiece->new(files => $vocab_path, unk_token => '[UNK]',
                                                           continuing_subword_suffix => '@@'));

# Normalization applied in this order: NFD, Lowercase, StripAccents.
$wordPiece_src->with_normalizer(NormalizerSequence->new([NFD->new(),
                                                         Lowercase->new(),
                                                         StripAccents->new()]));

$wordPiece_src->with_pre_tokenizer(PreTokenizerWhitespace->new());

# Templates for single sequences and sequence pairs; [CLS] and [SEP] are paired with ids 2 and 3.
$wordPiece_src->with_post_processor(TemplateProcessing->new('[CLS] $A [SEP]',
                                                            '[CLS] $A [SEP] $B:1 [SEP]:1',
                                                            [['[CLS]', 2], ['[SEP]', 3]]));



tokenizers::processors::template::PostProcessor=HASH(0x6c63620)

In [22]:
# Decoder configured with '@@' as suffix, the same marker given to the model as continuing_subword_suffix.
$wordPiece_src->with_decoder(WordPieceDecoder->new(suffix=>'@@'));

tokenizers::decoders::wordpiece=HASH(0x6c60cd0)

In [23]:
# my $sentence = 'Academically sponsored';

In [24]:
# Encode the current $sentence with the source-side tokenizer.
$encoding = $wordPiece_src->encode($sentence, 1);

Encoding=HASH(0x7325a48)

In [25]:
# Show the tokens and ids of the encoding, then the text decoded from those ids.
print join(' ', @{ $encoding->get_tokens() }) . "\n";
print join(' ', @{ $encoding->get_ids() }) . "\n";
print $wordPiece_src->decode($encoding->get_ids(), 0);


[CLS] train@@ ing transform@@ ers is@@ beauti@@ ful .@@ [SEP]
2 29197 20762 29221 17283 21112 12971 18519 69 3
<bos> training transformers isbeautiful .<eos> 

1

In [26]:
# Dump the source model's reverse vocabulary (vocab_r): token id => token string.
print dump $wordPiece_src->{model}{vocab_r}

{
  # tied Tie::IxHash
  "0"     => "<unk>",
  "2"     => "<bos>",
  "3"     => "<eos>",
  "4"     => "!",
  "5"     => "\"",
  "6"     => "#",
  "7"     => "\$",
  "8"     => "%",
  "9"     => "&",
  "10"    => "'",
  "11"    => "'\@\@",
  "12"    => "'E\@\@",
  "13"    => "'d",
  "14"    => "'ll",
  "15"    => "'m",
  "16"    => "'re",
  "17"    => "'s",
  "18"    => "'t",
  "19"    => "'ve",
  "20"    => "(",
  "21"    => ")",
  "22"    => "*",
  "23"    => "+",
  "24"    => ",",
  "25"    => ",00",
  "26"    => ",000",
  "27"    => ",5",
  "28"    => ",\@\@",
  "29"    => "-",
  "30"    => "--",
  "31"    => "--------\@\@",
  "32"    => "----\@\@",
  "33"    => "--\@\@",
  "34"    => "-\@\@",
  "35"    => "-B\@\@",
  "36"    => "-Benz",
  "37"    => "-European",
  "38"    => "-F\@\@",
  "39"    => "-Fi",
  "40"    => "-Hotel",
  "41"    => "-S\@\@",
  "42"    => "-Seiten",
  "43"    => "-Spe\@\@",
  "44"    => "-Ster\@\@",
  "45"    => "-Sterne",
  "46"    => "-Sterne-\@\@",
  "47"

1

## Workaround for the ambiguity problem

In [27]:
# Sentence that is already split into subword pieces; a piece ending in '@@' is
# joined to the following piece by get_words_from_ids.
$sentence = "Scho@@ ols urged to focus more on ma@@ ths , sp@@ elling and gram@@ mar";

Scho@@ ols urged to focus more on ma@@ ths , sp@@ elling and gram@@ mar

In [28]:
sub get_tokens_from_sentence{ # tokenize_nmt
  # Split already-segmented text on whitespace.
  #   $tokenizer - accepted for a uniform call signature; not used here.
  #   $sentence  - text to split; may contain several newline-separated lines.
  # Returns an array reference: a flat list of tokens when the text has no
  # newline, otherwise one array reference of tokens per line.
  my ($tokenizer, $sentence) = @_;

  # Single line: flat token list.
  return [split ' ', $sentence] unless $sentence =~ /\n/;

  # Several lines: one token list per line.
  return [map { [ split ' ', $_ ] } split /\n/, $sentence];
}
sub get_tokens_from_ids{
  # Map each id to its token through the tokenizer's reverse vocabulary
  # ($tokenizer->{model}{vocab_r}). Ids without an entry yield undef.
  # Returns an array reference in the same order as @$ids.
  my ($tokenizer, $ids) = @_;
  my @tokens;
  for my $id (@$ids) {
    push @tokens, $tokenizer->{model}{vocab_r}{$id};
  }
  return \@tokens;
}
sub get_ids_from_sentence{
  # Split the sentence on whitespace and look each piece up in the tokenizer's
  # vocabulary ($tokenizer->{model}{vocab}). Pieces without an entry yield undef.
  # Returns an array reference of ids in sentence order.
  my ($tokenizer, $sentence) = @_;
  my @ids;
  for my $piece (split ' ', $sentence) {
    push @ids, $tokenizer->{model}{vocab}{$piece};
  }
  return \@ids;
}
sub get_words_from_ids{
  # Turn ids back into text: look each id up in the reverse vocabulary, join
  # the pieces with single spaces, then delete every "@@ " so that a piece
  # ending in '@@' is glued to the piece that follows it.
  # Returns the resulting string.
  my ($tokenizer, $ids) = @_;
  my @pieces;
  for my $id (@$ids) {
    push @pieces, $tokenizer->{model}{vocab_r}{$id};
  }
  my $text = join ' ', @pieces;
  $text =~ s/\@\@ //g;
  return $text;
}

In [29]:
# $sentence is already declared earlier in this file; assign to it instead of
# re-declaring it with `my`, which masks the earlier declaration (a warning under `use warnings`).
$sentence = "Scho@@ ols urged";
# Three newline-separated lines, to exercise the multi-line branch of the helper.
my $sentences = "Scho@@ ols urged\nto focus more on ma@@ ths, \nsp@@ elling and gram@@ mar";

print dump (get_tokens_from_sentence($wordPiece_src, $sentences));

[
  ["Scho\@\@", "ols", "urged"],
  ["to", "focus", "more", "on", "ma\@\@", "ths,"],
  ["sp\@\@", "elling", "and", "gram\@\@", "mar"],
]

1

In [30]:
# Single-line input: the helper returns a flat list of tokens, printed space-separated.
print "@{get_tokens_from_sentence($wordPiece_src, $sentence)}\n";

Scho@@ ols urged


1

In [31]:
# Round trip: sentence -> ids -> subword tokens -> merged words.
my $ids = get_ids_from_sentence($wordPiece_src, $sentence);
print dump($ids) . "\n";
print dump(get_tokens_from_ids($wordPiece_src, $ids)) . "\n";
print dump(get_words_from_ids($wordPiece_src, $ids)) . "\n";

[8583, 24002, 29881]
["Scho\@\@", "ols", "urged"]
"Schools urged"


1

## Creating a Data Iterator

In [32]:
use d2l;

In [33]:
# Tensors: three arrays built with arange and reshaped to 5 rows each, with 4, 5 and 6 columns.
my $A = mx->nd->arange(stop => 20)->reshape([5, 4]);
my $B = mx->nd->arange(stop => 25)->reshape([5, 5]);
my $C = mx->nd->arange(stop => 30)->reshape([5, 6]);

<AI::MXNet::NDArray 5x6 @cpu(0)>

In [34]:
# Data iterator over the three tensors, two rows per batch, without shuffling.
# Direct method-call syntax (Class->new) replaces the indirect-object form (new Class).
my $train_iter = d2l::Data_Iter->new(tensors     => [$A, $B, $C],
                                     batch_size  => 2,
                                     shuffle     => 0,
                                     num_workers => d2l->get_dataloader_workers,
                                     last_batch  => 'keep'); # keep|discard|rollover
print "num_batches: ", $train_iter->len();

num_batches: 3

1

In [35]:
# Let's get a batch
# The batch is an array reference; it is unpacked into one element per tensor in the next cell.
my $batch = $train_iter->next();

ARRAY(0x6c689c8)

In [36]:
# Let's print the tensors of a batch
# Unpack the batch into its three elements and print each one via aspdl.
my ($T1, $T2, $T3) = @$batch;
printf "T1: %sT2: %sT3: %s", $T1->aspdl, $T2->aspdl, $T3->aspdl;

T1: 
[
 [0 1 2 3]
 [4 5 6 7]
]
T2: 
[
 [0 1 2 3 4]
 [5 6 7 8 9]
]
T3: 
[
 [ 0  1  2  3  4  5]
 [ 6  7  8  9 10 11]
]


1

In [37]:
# Let's iterate all batches.
# The fetch and unpack run inside eval so that an exhausted iterator (undefined
# batch, or an exception from next()) ends the loop instead of aborting.
for (my $i = 0; ; $i++){
  my ($T1, $T2, $T3);
  my $n_tensors = eval {
    my $batch = $train_iter->next();
    ($T1, $T2, $T3) = @$batch;
    scalar @$batch;
  };
  last unless $n_tensors;
  printf "T1[$i]: %sT2[$i]: %sT3[$i]: %s", $T1->aspdl, $T2->aspdl, $T3->aspdl;
}

T1[0]: 
[
 [ 8  9 10 11]
 [12 13 14 15]
]
T2[0]: 
[
 [10 11 12 13 14]
 [15 16 17 18 19]
]
T3[0]: 
[
 [12 13 14 15 16 17]
 [18 19 20 21 22 23]
]
T1[1]: 
[
 [16 17 18 19]
]
T2[1]: 
[
 [20 21 22 23 24]
]
T3[1]: 
[
 [24 25 26 27 28 29]
]


### Processing the vocabulary with regular expressions

In [39]:
# Read the saved WordLevel vocabulary file into a single string.
my $file_path = 'data/temp-wordLevel-vocab.json';

# A lexical filehandle replaces the global bareword FILE.
open(my $vocab_fh, "<", $file_path) or die "Cannot open file $file_path: $!";
# $/ is localized inside the do-block so that slurp mode applies to this one
# read only and does not affect later reads in the file.
my $lines = do { local $/; <$vocab_fh> };
close $vocab_fh;

1

In [40]:
# Show the raw text read from the vocabulary file.
print $lines;

{"type":"WordLevel","unk_token":"[UNK]",    "vocab" : {"[UNK]":0,"[CLS]":1,"[SEP]":2,"[PAD]":3,"[MASK]":4,"[NUM]":5,".":6,"com":7,"is":8,"##plex":9,"##ter":10,"bet":11,"than":12,"##ated":13,"##for":14,"##iful":15,"##ing":16,"##le":17,"##mers":18,"##plic":19,"?":20,"are":21,"beaut":22,"hello":23,"how":24,"simp":25,"train":26,"trans":27,"u":28}}

1

In [41]:
# Capture the inside of the "vocab" object, i.e. the text between `"vocab" : {` and the final `}}`.
# List assignment replaces `my $x = $1 if ...`: declaring a variable with a
# statement modifier has undefined behaviour when the condition is false.
# On no match $vocab_str is simply undef. Whitespace around the colon is optional.
my ($vocab_str) = $lines =~ /"vocab"\s*:\s*\{(.*)\}\}$/;

"[UNK]":0,"[CLS]":1,"[SEP]":2,"[PAD]":3,"[MASK]":4,"[NUM]":5,".":6,"com":7,"is":8,"##plex":9,"##ter":10,"bet":11,"than":12,"##ated":13,"##for":14,"##iful":15,"##ing":16,"##le":17,"##mers":18,"##plic":19,"?":20,"are":21,"beaut":22,"hello":23,"how":24,"simp":25,"train":26,"trans":27,"u":28

In [42]:
# Show the extracted "token":id list.
print $vocab_str;

"[UNK]":0,"[CLS]":1,"[SEP]":2,"[PAD]":3,"[MASK]":4,"[NUM]":5,".":6,"com":7,"is":8,"##plex":9,"##ter":10,"bet":11,"than":12,"##ated":13,"##for":14,"##iful":15,"##ing":16,"##le":17,"##mers":18,"##plic":19,"?":20,"are":21,"beaut":22,"hello":23,"how":24,"simp":25,"train":26,"trans":27,"u":28

1

In [43]:
# Collect every quoted token (quotes included) that is followed by `: <id>`, in file order.
# The quoted-string pattern accepts backslash escapes, so a token containing an
# escaped quote (e.g. "a\"b") is captured whole instead of being cut at the inner quote.
my @vocab = $vocab_str =~ /("(?:[^"\\]|\\.)*")\s*:\s*\d+/g;

"[UNK]""[CLS]""[SEP]""[PAD]""[MASK]""[NUM]"".""com""is""##plex""##ter""bet""than""##ated""##for""##iful""##ing""##le""##mers""##plic""?""are""beaut""hello""how""simp""train""trans""u"

In [44]:
# Print one token per line. A statement-modifier loop is used because map is
# meant for building lists, not for side effects; used this way it only
# produces a throw-away list of print's return values.
print "$_\n" for @vocab;

"[UNK]"
"[CLS]"
"[SEP]"
"[PAD]"
"[MASK]"
"[NUM]"
"."
"com"
"is"
"##plex"
"##ter"
"bet"
"than"
"##ated"
"##for"
"##iful"
"##ing"
"##le"
"##mers"
"##plic"
"?"
"are"
"beaut"
"hello"
"how"
"simp"
"train"
"trans"
"u"


11111111111111111111111111111

In [45]:
# Print each token with its position in @vocab.
# The token is passed as a printf argument rather than interpolated into the
# format string, so a token containing '%' is printed literally.
for my $i (0 .. $#vocab){
  printf " %s, %d\n", $vocab[$i], $i;
}

 "[UNK]", 0
 "[CLS]", 1
 "[SEP]", 2
 "[PAD]", 3
 "[MASK]", 4
 "[NUM]", 5
 ".", 6
 "com", 7
 "is", 8
 "##plex", 9
 "##ter", 10
 "bet", 11
 "than", 12
 "##ated", 13
 "##for", 14
 "##iful", 15
 "##ing", 16
 "##le", 17
 "##mers", 18
 "##plic", 19
 "?", 20
 "are", 21
 "beaut", 22
 "hello", 23
 "how", 24
 "simp", 25
 "train", 26
 "trans", 27
 "u", 28


0

In [46]:
# Collect the leading run of bracketed tokens (quoted text ending in `]"`),
# skipping index 0 and stopping at the first token that is not bracketed.
# An index loop replaces `each @vocab`: leaving an each-loop early with `last`
# does not reset the array's iterator, so a later each/keys on @vocab would
# resume from the middle.
my @reserved_tokens = ();
for my $i (1 .. $#vocab){
  my $token = $vocab[$i];
  last unless $token =~ /\]"$/;
  push @reserved_tokens, $token;
}
print dump @reserved_tokens;

("\"[CLS]\"", "\"[SEP]\"", "\"[PAD]\"", "\"[MASK]\"", "\"[NUM]\"")

1

In [None]:
 sub load_data_nmt{ #@save
    # Return the iterator and the vocabularies of the translation dataset.
    #
    # Named arguments (defaults in parentheses):
    #   batch_size   (undef)       - passed to the data iterator
    #   num_steps    (10)          - passed to build_array_nmt for both sides
    #   num_examples (600)         - passed to tokenize_nmt
    #   min_freq     (2)           - passed to both vocabularies
    #   is_train     (0)           - used as the iterator's shuffle flag
    #   file_path    ("./fra.txt") - dataset file to read
    #
    # Returns the list ($data_iter, $src_vocab, $tgt_vocab).
    #
    # Example:
    #   load_data_nmt(batch_size=>$batch_size, num_steps=>$num_steps, num_examples=>600, min_freq=>2, is_train=>0)
    my ($self, %args) = (shift, util->get_arguments(batch_size   => undef,
                                                    num_steps    => 10,
                                                    num_examples => 600,
                                                    min_freq     => 2,
                                                    is_train     => 0,
                                                    # Path of the sentences: 'data/newstest2014.tok.bpe.32000.src.en'
                                                    # or 'data/newstrain2014.tok.bpe.32000.ref.en'
                                                    file_path    => "./fra.txt", \@_));

    my $text = d2l->preprocess_nmt(d2l->read_data_nmt($args{file_path}));
    my ($source, $target) = d2l->tokenize_nmt($text, $args{num_examples} // 600);

    # Direct method-call syntax (Class->new) replaces the indirect-object form (new Class).
    # $src_vocab plays the role of $wordPiece_src.
    my $src_vocab = d2l::Vocab->new(tokens => $source, min_freq => $args{min_freq} // 2,
                                    reserved_tokens => ['<pad>', '<bos>', '<eos>']);
    # $tgt_vocab plays the role of $wordPiece_tgt.
    my $tgt_vocab = d2l::Vocab->new(tokens => $target, min_freq => $args{min_freq} // 2,
                                    reserved_tokens => ['<pad>', '<bos>', '<eos>']);

    my ($src_array, $src_valid_len) = d2l->build_array_nmt($source, $src_vocab, $args{num_steps});
    my ($tgt_array, $tgt_valid_len) = d2l->build_array_nmt($target, $tgt_vocab, $args{num_steps});

    my $data_iter  = d2l::Data_Iter->new(tensors     => [$src_array, $src_valid_len, $tgt_array, $tgt_valid_len],
                                         batch_size  => $args{batch_size},
                                         shuffle     => $args{is_train},
                                         num_workers => $self->get_dataloader_workers());

    return $data_iter, $src_vocab, $tgt_vocab;
  }



  # Defined in Section Classic 9.5.4. Reading the Dataset

  sub truncate_pad{ #@save
    # Truncate or pad a sequence to exactly $num_steps items.
    #   $line          - array reference holding the sequence
    #   $num_steps     - target length
    #   $padding_token - value appended when the sequence is too short
    # Returns a list (not a reference) of $num_steps items.
    my ($self, $line, $num_steps, $padding_token) = @_;

    my $missing = $num_steps - scalar(@$line);

    # Too long: keep only the first $num_steps items.
    return @{$line}[0 .. $num_steps - 1] if $missing < 0;

    # Otherwise append as many padding tokens as are missing (possibly none).
    return (@$line, ($padding_token) x $missing);
  }

  

  # Defined in Section Classic 9.5.4. Reading the Dataset

  sub build_array_nmt{ #@save
    # Transform text sequences of machine translation into minibatches.
    #   $lines     - array reference of sequences; each is passed to $vocab->getitem
    #                (presumably a list of tokens mapped to a list of ids — confirm against d2l::Vocab)
    #   $vocab     - vocabulary object providing getitem()
    #   $num_steps - length every sequence is truncated or padded to
    # Returns ($array, $valid_len): the padded id array and, per row, the count
    # of entries that differ from the '<pad>' id.
    my ($self, $lines, $vocab, $num_steps) = @_;

    # Look the special-token ids up once rather than once per line.
    # '<eos>' keeps the list context of the original call site.
    my ($eos_id) = $vocab->getitem('<eos>');
    my $pad_id   = $vocab->getitem('<pad>');

    # Convert each line to ids, append '<eos>', then truncate or pad to $num_steps.
    my @rows = map {
        my $ids = $vocab->getitem($_);
        [ $self->truncate_pad([@$ids, $eos_id], $num_steps, $pad_id) ];
    } @$lines;

    my $array = mx->nd->array(\@rows);
    my $valid_len = ($array != $pad_id)->astype('int32')->sum(1)->squeeze();

    return ($array, $valid_len);
  }