# Attention-based Neural Machine Translation

**Reference:** Luong, Thang, Hieu Pham and Christopher D. Manning. "Effective Approaches to Attention-based Neural Machine Translation." In Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing, pp. 1412-1421. 2015.

* https://www.aclweb.org/anthology/D15-1166/ (main paper reference)
* https://arxiv.org/abs/1508.04025 (alternative paper url)
* https://github.com/tensorflow/nmt (main code reference)
* https://www.tensorflow.org/beta/tutorials/text/nmt_with_attention (alternative code reference)
* https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py:2449,2103 (attention implementation)

In [1]:
using Knet, Test, Base.Iterators, Printf, LinearAlgebra, Random, CuArrays, IterTools

## Code and data from previous projects

Please copy or include the following types and related functions from previous projects:
`Vocab`, `TextReader`, `MTData`, `Embed`, `Linear`, `mask!`, `loss`, `int2str`,
`bleu`.

In [2]:
struct Vocab
    w2i::Dict{String,Int}
    i2w::Vector{String}
    unk::Int
    eos::Int
    tokenizer
end

function Vocab(file::String; tokenizer=split, vocabsize=Inf, mincount=1, unk="<unk>", eos="<s>")
    word_count = Dict{String,Int}()
    w2i = Dict{String,Int}()
    i2w = Vector{String}()
    int_eos = get!(w2i, eos, 1+length(w2i))
    int_unk = get!(w2i, unk, 1+length(w2i))
    for line in eachline(file)
        line = tokenizer(line)
        for word in line
            if haskey(word_count, word)
                word_count[word] += 1
            else
                word_count[word] = 1
            end
        end
    end
    word_count = collect(word_count)
    sort!(word_count, rev=true, by=x->x[2])
    # constructing w2i
    for pair in word_count
        if pair[2] >= mincount
            get!(w2i, pair[1], 1+length(w2i))
            if length(w2i) >= vocabsize
                break
            end
        end
    end
    w2i_array = collect(w2i)
    sort!(w2i_array, by=x->x[2])
    for pair in w2i_array
        push!(i2w, pair[1])
    end
    return Vocab(w2i, i2w, int_unk, int_eos, tokenizer)
end

Vocab

In [3]:
struct TextReader
    file::String
    vocab::Vocab
end

function Base.iterate(r::TextReader, s=nothing)
    s == nothing && (s = open(r.file))
    if eof(s)
        close(s)
        return nothing
    end
    line = readline(s)
    line = r.vocab.tokenizer(line)
    line_inds = Int[]
    for word in line
        push!(line_inds, get(r.vocab.w2i, word, r.vocab.unk))
    end
    return line_inds, s
end

Base.IteratorSize(::Type{TextReader}) = Base.SizeUnknown()
Base.IteratorEltype(::Type{TextReader}) = Base.HasEltype()
Base.eltype(::Type{TextReader}) = Vector{Int}

In [4]:
struct Embed; w; end

function Embed(vocabsize::Int, embedsize::Int)
    return Embed(param(embedsize, vocabsize, atype = Knet.atype()))
end

function (l::Embed)(x)
    l.w[:,x]
end

In [5]:
struct Linear; w; b; end

function Linear(inputsize::Int, outputsize::Int)
    return Linear(param(outputsize, inputsize, atype = Knet.atype()), param0(outputsize, atype = Knet.atype()))
end

function (l::Linear)(x)
    l.w * x .+ l.b
end

In [6]:
function mask!(a,pad)
    num_rows, num_cols = size(a)
    for row in 1:num_rows
        for column in num_cols:-1:1
            if a[row, column] != pad || a[row, column-1] != pad
                break
            else
                a[row, column] = 0
            end
        end
    end
    return a
end

mask! (generic function with 1 method)

In [7]:
struct MTData
    src::TextReader        # reader for source language data
    tgt::TextReader        # reader for target language data
    batchsize::Int         # desired batch size
    maxlength::Int         # skip if source sentence above maxlength
    batchmajor::Bool       # batch dims (B,T) if batchmajor=false (default) or (T,B) if true.
    bucketwidth::Int       # batch sentences with length within bucketwidth of each other
    buckets::Vector        # sentences collected in separate arrays called buckets for each length range
    batchmaker::Function   # function that turns a bucket into a batch.
end

function MTData(src::TextReader, tgt::TextReader; batchmaker = arraybatch, batchsize = 128, maxlength = typemax(Int),
                batchmajor = false, bucketwidth = 10, numbuckets = min(128, maxlength ÷ bucketwidth))
    buckets = [ [] for i in 1:numbuckets ] # buckets[i] is an array of sentence pairs with similar length
    MTData(src, tgt, batchsize, maxlength, batchmajor, bucketwidth, buckets, batchmaker)
end

Base.IteratorSize(::Type{MTData}) = Base.SizeUnknown()
Base.IteratorEltype(::Type{MTData}) = Base.HasEltype()
Base.eltype(::Type{MTData}) = NTuple{2}

In [8]:
function Base.iterate(d::MTData, state=nothing)
    if state == nothing
        for i in 1:length(d.buckets)
            d.buckets[i] = []
        end
    end
      
    while true
        if state == nothing
            s_data,s_state = iterate(d.src)
            t_data,t_state = iterate(d.tgt)
        elseif state == "eof"
             #if there are no more half filled batches, return nothing
            for i in 1:length(d.buckets)
                if length(d.buckets[i]) > 0
                    bucket = d.buckets[i]
                    d.buckets[i] = []
                    return (d.batchmaker(d,bucket),"eof")
                end
            end
            
            #no more batches
            return nothing
        else
            src = iterate(d.src,state[1]) 
            tgt = iterate(d.tgt,state[2])
            
            #TextReader returns a "nothing" value when it finishes, so i need to turn it into a Tuple for 
            #my code to run as intended
            if src == nothing; src = (nothing,nothing);end
            if tgt == nothing; tgt = (nothing,nothing);end

            s_data,s_state = src
            t_data,t_state = tgt
        end
        
        
        #if we reached the end of the src or tgt files, return half filled buckets
        if (s_state == nothing) | (t_state ==  nothing)
            state = "eof"
            continue
        end
        
        #if source sentence above d.maxlength
        if length(s_data) > d.maxlength
            state = (s_state,t_state)
            continue
        #if src sentence larger than a condition specified by Deniz
        elseif length(s_data) >=  length(d.buckets)*d.bucketwidth
            
            push!(d.buckets[length(d.buckets)],(s_data,t_data))
            if length(d.buckets[length(d.buckets)]) == d.batchsize
                bucket = d.buckets[length(d.buckets)]
                d.buckets[length(d.buckets)] = []
                return (d.batchmaker(d,bucket),(s_state,t_state))
            end
        #of if src sentence in range
            
        else
            
            for i in 1:length(d.buckets)
                if length(s_data) in ((i-1)*d.bucketwidth+1):(i*d.bucketwidth)
                    push!(d.buckets[i],(s_data,t_data))
                    if length(d.buckets[i]) == d.batchsize
                        bucket = d.buckets[i]
                        d.buckets[i] = []
                        return (d.batchmaker(d,bucket),(s_state,t_state))
                    end
                end
            end
            
        end
        state = (s_state,t_state)
    end
end

In [9]:
function arraybatch(d::MTData, bucket)
    x = [] 
    y = []
    
    srclength = max([length(pair[1]) for pair in bucket]...)
    tgtlength = max([length(pair[2]) for pair in bucket]...)
    
    for pair in bucket
        src_sentence, tgt_sentence = pair
        src_sen = []
        tgt_sen = []
                
        tgt_eos = d.tgt.vocab.eos
        push!(tgt_sen, tgt_eos)
        for w in tgt_sentence
            push!(tgt_sen, w) 
        end
        while length(tgt_sen) != (tgtlength + 2)
            push!(tgt_sen, tgt_eos)
        end
        
        src_eos = d.src.vocab.eos
        eos_num = srclength - length(src_sentence)
        i = 0
        while i!= eos_num
            push!(src_sen, src_eos)
            i += 1
        end
        for w in src_sentence
            push!(src_sen, w) 
        end        
        push!(x, src_sen)
        push!(y, tgt_sen)
    end
    
    return Matrix(hcat(x...)'), Matrix(hcat(y...)')
end

arraybatch (generic function with 1 method)

In [10]:
# Utility to convert int arrays to sentence strings
function int2str(y,vocab)
    y = vec(y)
    ysos = findnext(w->!isequal(w,vocab.eos), y, 1)
    ysos == nothing && return ""
    yeos = something(findnext(isequal(vocab.eos), y, ysos), 1+length(y))
    join(vocab.i2w[y[ysos:yeos-1]], " ")
end

int2str (generic function with 1 method)

In [11]:
function bleu(s2s,d::MTData)
    d = MTData(d.src,d.tgt,batchsize=1)
    reffile = d.tgt.file
    hypfile,hyp = mktemp()
    for (x,y) in progress(collect(d))
        g = s2s(x)
        for i in 1:size(y,1)
            println(hyp, int2str(g[i,:], d.tgt.vocab))
        end
    end
    close(hyp)
    isfile("multi-bleu.perl") || download("https://github.com/moses-smt/mosesdecoder/raw/master/scripts/generic/multi-bleu.perl", "multi-bleu.perl")
    run(pipeline(`cat $hypfile`,`perl multi-bleu.perl $reffile`))
    return hypfile
end

bleu (generic function with 1 method)

In [12]:
function loss(model, data; average=true)
    loss = 0
    count = 0
    Σloss = 0
    Nloss = 0
    for (x, y) in data
        loss, count = model(x, y, average=false)
        Σloss += loss
        Nloss += count
    end
    
    if average
        return Σloss/Nloss
    else
        return Σloss, Nloss
    end
end

loss (generic function with 1 method)

## S2S: Sequence to sequence model with attention

In this project we will define, train and evaluate a sequence to sequence encoder-decoder
model with attention for Turkish-English machine translation. The model has two extra
fields compared to `S2S_v1`: the `memory` layer computes keys and values from the encoder,
the `attention` layer computes the attention vector for the decoder.

In [13]:
struct Memory; w; end

struct Attention; wquery; wattn; scale; end

struct S2S
    srcembed::Embed       # encinput(B,Tx) -> srcembed(Ex,B,Tx)
    encoder::RNN          # srcembed(Ex,B,Tx) -> enccell(Dx*H,B,Tx)
    memory::Memory        # enccell(Dx*H,B,Tx) -> keys(H,Tx,B), vals(Dx*H,Tx,B)
    tgtembed::Embed       # decinput(B,Ty) -> tgtembed(Ey,B,Ty)
    decoder::RNN          # tgtembed(Ey,B,Ty) . attnvec(H,B,Ty)[t-1] = (Ey+H,B,Ty) -> deccell(H,B,Ty)
    attention::Attention  # deccell(H,B,Ty), keys(H,Tx,B), vals(Dx*H,Tx,B) -> attnvec(H,B,Ty)
    projection::Linear    # attnvec(H,B,Ty) -> proj(Vy,B,Ty)
    dropout::Real         # dropout probability
    srcvocab::Vocab       # source language vocabulary
    tgtvocab::Vocab       # target language vocabulary
end

## Part 1. Model constructor

The `S2S` constructor takes the following arguments:
* `hidden`: size of the hidden vectors for both the encoder and the decoder
* `srcembsz`, `tgtembsz`: size of the source/target language embedding vectors
* `srcvocab`, `tgtvocab`: the source/target language vocabulary
* `layers=1`: number of layers
* `bidirectional=false`: whether the encoder is bidirectional
* `dropout=0`: dropout probability

Hints:
* You can find the vocabulary size with `length(vocab.i2w)`.
* If the encoder is bidirectional `layers` must be even and the encoder should have `layers÷2` layers.
* The decoder will use "input feeding", i.e. it will concatenate its previous output to its input. Therefore the input size for the decoder should be `tgtembsz+hidden`.
* Only `numLayers`, `dropout`, and `bidirectional` keyword arguments should be used for RNNs, leave everything else default.
* The memory parameter `w` is used to convert encoder states to keys. If the encoder is bidirectional initialize it to a `(hidden,2*hidden)` parameter, otherwise set it to the constant 1.
* The attention parameter `wquery` is used to transform the query, set it to the constant 1 for this project.
* The attention parameter `scale` is used to scale the attention scores before softmax, set it to a parameter of size 1.
* The attention parameter `wattn` is used to transform the concatenation of the decoder output and the context vector to the attention vector. It should be a parameter of size `(hidden,2*hidden)` if unidirectional, `(hidden,3*hidden)` if bidirectional.

In [14]:
function S2S(hidden::Int, srcembsz::Int, tgtembsz::Int, srcvocab::Vocab, tgtvocab::Vocab;
             layers=1, bidirectional=false, dropout=0)
    
    srcembed = Embed(length(srcvocab.i2w), srcembsz)
    encoder = RNN(srcembsz, hidden, numLayers=(bidirectional ? Integer(layers/2) : layers), bidirectional=bidirectional, dropout=dropout, dataType=Float32, usegpu=(gpu()>=0))
    memory = (bidirectional ? Memory(param(hidden,2*hidden)) : Memory(1))
    tgtembed = Embed(length(tgtvocab.i2w), tgtembsz)
    decoder = RNN(tgtembsz+hidden, hidden, numLayers=layers, dropout=dropout, dataType=Float32, usegpu=(gpu()>=0))
    attention = Attention(1, (bidirectional ? param(hidden,3*hidden) : param(hidden,2*hidden)), param(1))
    projection = Linear(hidden, length(tgtvocab.i2w))
    
    return S2S(srcembed, encoder, memory, tgtembed, decoder, attention,
        projection, dropout, srcvocab, tgtvocab)
end

S2S

## Load pretrained model and data

We will load a pretrained model (16.20 bleu) for code testing.  The data should be loaded
with the vocabulary from the pretrained model for word id consistency.

In [15]:
if !isdefined(Main, :pretrained) || pretrained === nothing
    @info "Loading reference model"
    isfile("s2smodel.jld2") || download("http://people.csail.mit.edu/deniz/comp542/s2smodel.jld2","s2smodel.jld2")
    pretrained = Knet.load("s2smodel.jld2","model")
end
datadir = "datasets/tr_to_en"
if !isdir(datadir)
    @info "Downloading data"
    download("http://www.phontron.com/data/qi18naacl-dataset.tar.gz", "qi18naacl-dataset.tar.gz")
    run(`tar xzf qi18naacl-dataset.tar.gz`)
end
if !isdefined(Main, :tr_vocab)
    BATCHSIZE, MAXLENGTH = 64, 50
    @info "Reading data"
    tr_vocab = pretrained.srcvocab # Vocab("$datadir/tr.train", mincount=5)
    en_vocab = pretrained.tgtvocab # Vocab("$datadir/en.train", mincount=5)
    tr_train = TextReader("$datadir/tr.train", tr_vocab)
    en_train = TextReader("$datadir/en.train", en_vocab)
    tr_dev = TextReader("$datadir/tr.dev", tr_vocab)
    en_dev = TextReader("$datadir/en.dev", en_vocab)
    tr_test = TextReader("$datadir/tr.test", tr_vocab)
    en_test = TextReader("$datadir/en.test", en_vocab)
    dtrn = MTData(tr_train, en_train, batchsize=BATCHSIZE, maxlength=MAXLENGTH)
    ddev = MTData(tr_dev, en_dev, batchsize=BATCHSIZE)
    dtst = MTData(tr_test, en_test, batchsize=BATCHSIZE)
end

┌ Info: Loading reference model
└ @ Main In[15]:2
┌ Info: Reading data
└ @ Main In[15]:14


MTData(TextReader("datasets/tr_to_en/tr.test", Vocab(Dict("dev" => 1277,"komuta" => 13566,"ellisi" => 25239,"adresini" => 22820,"yüzeyi" => 4051,"paris'te" => 9494,"kafamdaki" => 18790,"yüzeyinde" => 5042,"geçerlidir" => 6612,"kökten" => 7774…), ["<s>", "<unk>", ".", ",", "bir", "ve", "bu", "''", "``", "için"  …  "seçmemiz", "destekleyip", "karşılaştırılabilir", "ördeğin", "gününüzü", "bağışçı", "istismara", "yaşça", "tedci", "fakültesi'nde"], 2, 1, split)), TextReader("datasets/tr_to_en/en.test", Vocab(Dict("middle-income" => 13398,"photosynthesis" => 7689,"polarizing" => 17881,"henry" => 4248,"abducted" => 15691,"rises" => 6225,"hampshire" => 13888,"whiz" => 16835,"cost-benefit" => 13137,"progression" => 5549…), ["<s>", "<unk>", ",", ".", "the", "and", "to", "of", "a", "that"  …  "archaea", "handshake", "brit", "wiper", "heroines", "coca", "exceptionally", "gallbladder", "autopsies", "linguistics"], 2, 1, split)), 64, 9223372036854775807, false, 10, Array{Any,1}[[], [], [], [], [], [

In [16]:
@testset "Testing S2S constructor" begin
    H,Ex,Ey,Vx,Vy,L,Dx,Pdrop = 8,9,10,length(dtrn.src.vocab.i2w),length(dtrn.tgt.vocab.i2w),2,2,0.2
    m = S2S(H,Ex,Ey,dtrn.src.vocab,dtrn.tgt.vocab;layers=L,bidirectional=(Dx==2),dropout=Pdrop)
    @test size(m.srcembed.w) == (Ex,Vx)
    @test size(m.tgtembed.w) == (Ey,Vy)
    @test m.encoder.inputSize == Ex
    @test m.decoder.inputSize == Ey + H
    @test m.encoder.hiddenSize == m.decoder.hiddenSize == H
    @test m.encoder.direction == Dx-1
    @test m.encoder.numLayers == (Dx == 2 ? L÷2 : L)
    @test m.decoder.numLayers == L
    @test m.encoder.dropout == m.decoder.dropout == Pdrop
    @test size(m.projection.w) == (Vy,H)
    @test size(m.memory.w) == (Dx == 2 ? (H,2H) : ())
    @test m.attention.wquery == 1
    @test size(m.attention.wattn) == (Dx == 2 ? (H,3H) : (H,2H))
    @test size(m.attention.scale) == (1,)
    @test m.srcvocab === dtrn.src.vocab
    @test m.tgtvocab === dtrn.tgt.vocab
end

[37m[1mTest Summary:           | [22m[39m[32m[1mPass  [22m[39m[36m[1mTotal[22m[39m
Testing S2S constructor | [32m  16  [39m[36m   16[39m


Test.DefaultTestSet("Testing S2S constructor", Any[], 16, false)

## Part 2. Memory

The memory layer turns the output of the encoder to a pair of tensors that will be used as
keys and values for the attention mechanism. Remember that the encoder RNN output has size
`(H*D,B,Tx)` where `H` is the hidden size, `D` is 1 for unidirectional, 2 for
bidirectional, `B` is the batchsize, and `Tx` is the sequence length. It will be
convenient to store these values in batch major form for the attention mechanism, so
*values* in memory will be a permuted copy of the encoder output with size `(H*D,Tx,B)`
(see `@doc permutedims`). The *keys* in the memory need to have the same first dimension
as the *queries* (i.e. the decoder hidden states). So *values* will be transformed into
*keys* of size `(H,B,Tx)` with `keys = m.w * values` where `m::Memory` is the memory
layer. Note that you will have to do some reshaping to 2-D and back to 3-D for matrix
multiplications. Also note that `m.w` may be a scalar such as `1` e.g. when `D=1` and we
want keys and values to be identical.

In [17]:
function (m::Memory)(x)
    v = permutedims(x,(1,3,2)) # H*D,Tx,B
    k = m.w == 1 ? v : reshape(m.w * reshape(v,size(v,1),:), (:, size(v)[2:end]...))
    return k, v
end

You can use the following helper function for scaling and linear transformations of 3-D tensors:

In [18]:
mmul(w,x) = (w == 1 ? x : w == 0 ? 0 : reshape(w * reshape(x,size(x,1),:), (:, size(x)[2:end]...)))

mmul (generic function with 1 method)

In [19]:
@testset "Testing memory" begin
    H,D,B,Tx = pretrained.encoder.hiddenSize, pretrained.encoder.direction+1, 4, 5
    x = KnetArray(randn(Float32,H*D,B,Tx))
    k,v = pretrained.memory(x)
    @test v == permutedims(x,(1,3,2))
    @test k == mmul(pretrained.memory.w, v)
end

┌ Info: Building the CUDAnative run-time library for your sm_70 device, this might take a while...
└ @ CUDAnative /kuacc/users/mabdullatif18/.julia/packages/CUDAnative/3Jwj2/src/compiler/rtlib.jl:188


[37m[1mTest Summary:  | [22m[39m[32m[1mPass  [22m[39m[36m[1mTotal[22m[39m
Testing memory | [32m   2  [39m[36m    2[39m


Test.DefaultTestSet("Testing memory", Any[], 2, false)

## Part 3. Encoder

`encode()` takes a model `s` and a source language minibatch `src`. It passes the input
through `s.srcembed` and `s.encoder` layers with the `s.encoder` RNN hidden states
initialized to `0` in the beginning, and copied to the `s.decoder` RNN at the end. The
steps so far are identical to `S2S_v1` but there is an extra step: The encoder output is
passed to the `s.memory` layer which returns a `(keys,values)` pair. `encode()` returns
this pair to be used later by the attention mechanism.

In [20]:
function encode(s::S2S, src)
    source_embedding = s.srcembed(src)
    s.encoder.h = 0
    s.encoder.c = 0
    enc_out = s.encoder(source_embedding)
    s.decoder.h = s.encoder.h
    s.decoder.c = s.encoder.c
    k, v = s.memory(enc_out)
    return k, v
end

encode (generic function with 1 method)

In [21]:
@testset "Testing encoder" begin
    src1,tgt1 = first(dtrn)
    key1,val1 = encode(pretrained, src1)
    H,D,B,Tx = pretrained.encoder.hiddenSize, pretrained.encoder.direction+1, size(src1,1), size(src1,2)
    @test size(key1) == (H,Tx,B)
    @test size(val1) == (H*D,Tx,B)
    @test (pretrained.decoder.h,pretrained.decoder.c) === (pretrained.encoder.h,pretrained.encoder.c)
    @test norm(key1) ≈ 1214.4755f0
    @test norm(val1) ≈ 191.10411f0
    @test norm(pretrained.decoder.h) ≈ 48.536964f0
    @test norm(pretrained.decoder.c) ≈ 391.69028f0
end

[37m[1mTest Summary:   | [22m[39m[32m[1mPass  [22m[39m[36m[1mTotal[22m[39m
Testing encoder | [32m   7  [39m[36m    7[39m


Test.DefaultTestSet("Testing encoder", Any[], 7, false)

In [22]:
# The attention parameter wquery is used to transform the query, set it to the constant 1 for this project.

# The attention parameter wattn is used to transform the concatenation of the decoder output 
#     and the context vector to the attention vector. It should be 
#     a parameter of size (hidden,2*hidden) if unidirectional, (hidden,3*hidden) if bidirectional.

# The attention parameter scale is used to scale the attention scores before softmax, 
#     set it to a parameter of size 1.

## Part 4. Attention

The attention layer takes `cell`: the decoder output, and `mem`: a pair of (keys,vals)
from the encoder, and computes and returns the attention vector. First `a.wquery` is used
to linearly transform the cell to the query tensor. The query tensor is reshaped and/or
permuted as appropriate and multiplied with the keys tensor to compute the attention
scores. Please see `@doc bmm` for the batched matrix multiply operation used for this
step. The attention scores are scaled using `a.scale` and normalized along the time
dimension using `softmax`. After the appropriate reshape and/or permutation, the scores
are multiplied with the `vals` tensor (using `bmm` again) to compute the context
tensor. After the appropriate reshape and/or permutation the context vector is
concatenated with the cell and linearly transformed to the attention vector using
`a.wattn`. Please see the paper and code examples for details.

Note: the paper mentions a final `tanh` transform, however the final version of the
reference code does not use `tanh` and gets better results. Therefore we will skip `tanh`.

In [23]:
function (a::Attention)(cell, mem) # deccell(H,B,Ty), ( keys(H,Tx,B), vals(Dx*H,Tx,B) ) -> attnvec(H,B,Ty)
    if a.wquery == 1
        query = cell
    else
        cell_reshaped = reshape(cell, (size(cell,1), size(cell,2)*size(cell,3))) # H, B*Ty
        query = a.wquery * cell_reshaped
        query = reshape(query, (size(query,1), size(cell,2), size(cell,3)))
    end
    k, v = mem
    attn_scores = a.scale .* bmm(permutedims(query, (3,1,2)), k) # Ty,H,B # H,Tx,B --> Ty, Tx, B
    attn_scores = softmax(attn_scores, dims = 2) 
    cntxt = bmm(v, permutedims(attn_scores, (2,1,3))) # Dx*H,Tx,B # Tx, Ty, B  --> Dx*H, Ty, B
    attn_vec = a.wattn * reshape(vcat(cell, permutedims(cntxt, (1,3,2))), (:, size(cell,2)*size(cell,3))) # H, 3H  # 3H, B*Ty
    attn_vec = reshape(attn_vec, (size(cell,1), size(cell,2), size(cell,3)))
    return attn_vec # H,B,Ty
end

In [24]:
@testset "Testing attention" begin
    src1,tgt1 = first(dtrn)
    key1,val1 = encode(pretrained, src1)
    H,B = pretrained.encoder.hiddenSize, size(src1,1)
    Knet.seed!(1)
    x = KnetArray(randn(Float32,H,B,5))
    y = pretrained.attention(x, (key1, val1))
    @test size(y) == size(x)
    @test norm(y) ≈ 808.381f0
end

[37m[1mTest Summary:     | [22m[39m[32m[1mPass  [22m[39m[36m[1mTotal[22m[39m
Testing attention | [32m   2  [39m[36m    2[39m


Test.DefaultTestSet("Testing attention", Any[], 2, false)

## Part 5. Decoder

`decode()` takes a model `s`, a target language minibatch `tgt`, the memory from the
encoder `mem` and the decoder output from the previous time step `prev`. After the input
is passed through the embedding layer, it is concatenated with `prev` (this is called
input feeding). The resulting tensor is passed through `s.decoder`. Finally the
`s.attention` layer takes the decoder output and the encoder memory to compute the
"attention vector" which is returned by `decode()`.

In [25]:
function decode(s::S2S, tgt, mem, prev) # tgtembed(Ey,B,Ty) . attnvec(H,B,Ty)[t-1] = (Ey+H,B,Ty) -> deccell(H,B,Ty)
#     @show prev
    tgt_embedding = s.tgtembed(tgt)
    dec_in = vcat(tgt_embedding, prev)
    deccell = s.decoder(dec_in)
    attn_vec = s.attention(deccell, mem)
#     @show attn_vec
end

decode (generic function with 1 method)

In [26]:
@testset "Testing decoder" begin
    src1,tgt1 = first(dtrn)
    key1,val1 = encode(pretrained, src1)
    H,B = pretrained.encoder.hiddenSize, size(src1,1)
    Knet.seed!(1)
    cell = randn!(similar(key1, size(key1,1), size(key1,3), 1))
    cell = decode(pretrained, tgt1[:,1:1], (key1,val1), cell)
    @test size(cell) == (H,B,1)
    @test norm(cell) ≈ 131.21631f0
end

[37m[1mTest Summary:   | [22m[39m[32m[1mPass  [22m[39m[36m[1mTotal[22m[39m
Testing decoder | [32m   2  [39m[36m    2[39m


Test.DefaultTestSet("Testing decoder", Any[], 2, false)

## Part 6. Loss

The loss function takes source language minibatch `src`, and a target language minibatch
`tgt` and returns `sumloss/numwords` if `average=true` or `(sumloss,numwords)` if
`average=false` where `sumloss` is the total negative log likelihood loss and `numwords` is
the number of words predicted (including a final eos for each sentence). The source is first
encoded using `encode` yielding a `(keys,vals)` pair (memory). Then the decoder is called to
predict each word of `tgt` given the previous word, `(keys,vals)` pair, and the previous
decoder output. The previous decoder output is initialized with zeros for the first
step. The output of the decoder at each step is passed through the projection layer giving
word scores. Losses can be computed from word scores and masked/shifted `tgt`.

In [28]:
function (s::S2S)(src, tgt; average=true) # src(B, Tx), tgt(B, Ty)
    mem = encode(s, src)
    prev = KnetArray{Float32}(zeros(size(mem[1],1), size(src,1), 1))
    all_scores = []
    for i in 1:size(tgt, 2)-1
        prev = decode(s, tgt[:,i:i], mem, prev) # H, B, 1
        scores = s.projection(reshape(prev, size(prev)[1:2])) # H, B --> V, B
        push!(all_scores, scores)
    end
    all_scores = hcat(all_scores...) # V, Ty*B
    answers = tgt[:, 2:end]
    mask!(answers, s.tgtvocab.eos)
    answers = reshape(answers, :)
    return nll(all_scores, answers, average=average)
end

In [29]:
@testset "Testing loss" begin
    src1,tgt1 = first(dtrn)
    @test pretrained(src1,tgt1) ≈ 1.4666592f0
    @test pretrained(src1,tgt1,average=false) == (1949.1901f0, 1329)
end

[37mTesting loss: [39m[91m[1mTest Failed[22m[39m at [39m[1mIn[29]:4[22m
  Expression: pretrained(src1, tgt1, average=false) == (1949.1901f0, 1329)
   Evaluated: (1949.1897f0, 1329) == (1949.1901f0, 1329)
Stacktrace:
 [1] top-level scope at [1mIn[29]:4[22m
 [2] top-level scope at [1m/buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.2/Test/src/Test.jl:1113[22m
 [3] top-level scope at [1mIn[29]:2[22m
[37m[1mTest Summary: | [22m[39m[32m[1mPass  [22m[39m[91m[1mFail  [22m[39m[36m[1mTotal[22m[39m
Testing loss  | [32m   1  [39m[91m   1  [39m[36m    2[39m


TestSetException: Some tests did not pass: 1 passed, 1 failed, 0 errored, 0 broken.

## Part 7. Greedy translator

An `S2S` object can be called with a single argument (source language minibatch `src`, with
size `B,Tx`) to generate translations (target language minibatch with size `B,Ty`). The
keyword argument `stopfactor` determines how much longer the output can be compared to the
input. Similar to the loss function, the source minibatch is encoded yield a `(keys,vals)`
pair (memory). We generate the output one time step at a time by calling the decoder with
the last output, the memory, and the last decoder state. The last output is initialized to
an array of `eos` tokens and the last decoder state is initialized to an array of
zeros. After computing the scores for the next word using the projection layer, the highest
scoring words are selected and appended to the output. The generation stops when all outputs
in the batch have generated `eos` or when the length of the output is `stopfactor` times the
input.

In [30]:
function (s::S2S)(src; stopfactor = 3) # src(B, Tx)
    B = size(src, 1) # batch size
    eos_ind = s.tgtvocab.eos # eos token (we need it for the stopping condition)
    src_len = size(src, 2) # source maximum sentence length
    mem = encode(s, src)
    prev = KnetArray{Float32}(zeros(size(mem[1],1), size(src,1), 1))
    tgtvec = repeat([eos_ind], B, 1)
    tgt = []
    # stopping condition:
    # 1- size(tgt, 2) >= stopfactor * src_len
    # 2- if all batches have generate eos 
    eos_generated = zeros(B) # zero if hasn't generated eos, one if eos has been generated
    while true # for each decoder timestep
        prev = decode(s, tgtvec, mem, prev) # H, B, 1
        # feed it to the projection layer and get scores over vocab # dims: V, B
        scores = s.projection(reshape(prev, size(prev)[1:2])) # H, B --> V, B
        # get the index of the maximum score for each batch element # dims: B
        tgtvec = reshape(map(x -> x[1], argmax(scores, dims=1)), (:,1))
        eoses = findall(x->x==eos_ind, tgtvec)
        eos_generated[eoses] .= 1
        push!(tgt, tgtvec)
        if (length(tgt) >= stopfactor * src_len) || all(eos_generated .== 1)
            break
        end
    end
    tgt = hcat(tgt...)
    return tgt # dims: B, Ty
end

In [31]:
@testset "Testing translator" begin
    src1,tgt1 = first(dtrn)
    tgt2 = pretrained(src1)
    @test size(tgt2) == (64, 41)
    @test tgt2[1:3,1:3] == [14 25 10647; 37 25 1426; 27 5 349]
end

[37m[1mTest Summary:      | [22m[39m[32m[1mPass  [22m[39m[36m[1mTotal[22m[39m
Testing translator | [32m   2  [39m[36m    2[39m


Test.DefaultTestSet("Testing translator", Any[], 2, false)

## Part 8. Training

`trainmodel` creates, trains and returns an `S2S` model. The arguments are described in
comments.

In [32]:
function trainmodel(trn,                  # Training data
                    dev,                  # Validation data, used to determine the best model
                    tst...;               # Zero or more test datasets, their loss will be periodically reported
                    bidirectional = true, # Whether to use a bidirectional encoder
                    layers = 2,           # Number of layers (use `layers÷2` for a bidirectional encoder)
                    hidden = 512,         # Size of the hidden vectors
                    srcembed = 512,       # Size of the source language embedding vectors
                    tgtembed = 512,       # Size of the target language embedding vectors
                    dropout = 0.2,        # Dropout probability
                    epochs = 0,           # Number of epochs (one of epochs or iters should be nonzero for training)
                    iters = 0,            # Number of iterations (one of epochs or iters should be nonzero for training)
                    bleu = false,         # Whether to calculate the BLEU score for the final model
                    save = false,         # Whether to save the final model
                    seconds = 60,         # Frequency of progress reporting
                    )
    @show bidirectional, layers, hidden, srcembed, tgtembed, dropout, epochs, iters, bleu, save; flush(stdout)
    model = S2S(hidden, srcembed, tgtembed, trn.src.vocab, trn.tgt.vocab;
                layers=layers, dropout=dropout, bidirectional=bidirectional)

    epochs == iters == 0 && return model

    (ctrn,cdev,ctst) = collect(trn),collect(dev),collect.(tst)
    traindata = (epochs > 0
                 ? collect(flatten(shuffle!(ctrn) for i in 1:epochs))
                 : shuffle!(collect(take(cycle(ctrn), iters))))

    bestloss, bestmodel = loss(model, cdev), deepcopy(model)
    progress!(adam(model, traindata), seconds=seconds) do y
        devloss = loss(model, cdev)
        tstloss = map(d->loss(model,d), ctst)
        if devloss < bestloss
            bestloss, bestmodel = devloss, deepcopy(model)
        end
        println(stderr)
        (dev=devloss, tst=tstloss, mem=Float32(CuArrays.usage[]))
    end
    save && Knet.save("attn-$(Int(time_ns())).jld2", "model", bestmodel)
    bleu && Main.bleu(bestmodel,dev)
    return bestmodel
end

trainmodel (generic function with 1 method)

Train a model: If your implementation is correct, the first epoch should take about 24
minutes on a v100 and bring the loss from 9.83 to under 4.0. 10 epochs would take about 4
hours on a v100. With other GPUs you may have to use a smaller batch size (if memory is
lower) and longer time (if gpu speed is lower).

In [33]:
# Uncomment the appropriate option for training:
# model = pretrained  # Use reference model
# model = Knet.load("attn-1538395466294882.jld2", "model")  # Load pretrained model
model1 = trainmodel(dtrn,ddev,take(dtrn,20); epochs=10, save=true, bleu=true)  # Train model

(bidirectional, layers, hidden, srcembed, tgtembed, dropout, epochs, iters, bleu, save) = (true, 2, 512, 512, 512, 0.2, 10, 0, true, true)



┣                    ┫ [0.00%, 1/28120, 00:15/118:50:04, 15.21s/i] (dev = 9.83582f0, tst = (9.836524f0,), mem = 3.2419828f10)
┣▏                   ┫ [0.97%, 274/28120, 01:19/02:14:47, 4.29i/s] (dev = 5.412611f0, tst = (5.4172993f0,), mem = 3.302986f10)
┣▍                   ┫ [2.09%, 589/28120, 02:22/01:53:23, 4.95i/s] (dev = 4.998573f0, tst = (5.0214133f0,), mem = 3.1957395f10)
┣▋                   ┫ [3.21%, 902/28120, 03:26/01:47:09, 4.91i/s] (dev = 4.8109946f0, tst = (4.8308487f0,), mem = 3.308382f10)
┣▊                   ┫ [4.30%, 1210/28120, 04:30/01:44:32, 4.84i/s] (dev = 4.6728315f0, tst = (4.677597f0,), mem = 3.2713085f10)
┣█                   ┫ [5.39%, 1515/28120, 05:34/01:43:14, 4.78i/s] (dev = 4.5607166f0, tst = (4.5677443f0,), mem = 3.2973533f10)
┣█▎                  ┫ [6.55%, 1843/28120, 06:37/01:41:03, 5.15i/s] (dev = 4.439365f0, tst = (4.4767203f0,), mem = 3.1693791f10)
┣█▌                  ┫ [7.62%, 2144/28120, 07:41/01:40:49, 4.72i/s] (dev = 4.3662596f0, tst = (4.31489

BLEU = 16.31, 48.3/21.5/11.2/6.1 (BP=1.000, ratio=1.013, hyp_len=83535, ref_len=82502)


It is not advisable to publish scores from multi-bleu.perl.  The scores depend on your tokenizer, which is unlikely to be reproducible from your paper or consistent across research groups.  Instead you should detokenize then use mteval-v14.pl, which has a standard tokenization.  Scores from multi-bleu.perl can still be used for internal purposes when you have a consistent tokenizer.


S2S(Embed(P(KnetArray{Float32,2}(512,38126))), LSTM(input=512,hidden=512,bidirectional,dropout=0.2), Memory(P(KnetArray{Float32,2}(512,1024))), Embed(P(KnetArray{Float32,2}(512,18857))), LSTM(input=1024,hidden=512,layers=2,dropout=0.2), Attention(1, P(KnetArray{Float32,2}(512,1536)), P(KnetArray{Float32,1}(1))), Linear(P(KnetArray{Float32,2}(18857,512)), P(KnetArray{Float32,1}(18857))), 0.2, Vocab(Dict("ağacından" => 35370,"komuta" => 13566,"ellisi" => 25239,"adresini" => 22820,"yüzeyi" => 4051,"paris'te" => 9494,"kafamdaki" => 18790,"yüzeyinde" => 5042,"geçerlidir" => 6612,"kökten" => 7774…), ["<s>", "<unk>", ".", ",", "bir", "ve", "bu", "''", "``", "için"  …  "seçmemiz", "destekleyip", "karşılaştırılabilir", "ördeğin", "gününüzü", "bağışçı", "istismara", "yaşça", "tedci", "fakültesi'nde"], 2, 1, split), Vocab(Dict("middle-income" => 13398,"photosynthesis" => 7689,"polarizing" => 17881,"henry" => 4248,"abducted" => 15691,"rises" => 6225,"hampshire" => 13888,"whiz" => 16835,"cost-benef

Code to sample translations from a dataset

In [34]:
data1 = MTData(tr_dev, en_dev, batchsize=1) |> collect;
function translate_sample(model, data)
    (src,tgt) = rand(data)
    out = model(src)
    println("SRC: ", int2str(src,model.srcvocab))
    println("REF: ", int2str(tgt,model.tgtvocab))
    println("OUT: ", int2str(out,model.tgtvocab))
end

translate_sample (generic function with 1 method)

Generate translations for random instances from the dev set

In [37]:
translate_sample(model1, data1)

SRC: ve sonra , bunun gibi olur — ( <unk> ! ) <unk> ... ( gülüşmeler ) eminim bu şeyin nasıl çalıştığını çözmeye çalışıyorsunuz .
REF: `` and then i 'll kind of be like — ( <unk> ! ) — and then they 're like , `` '' whoa ! '' '' ( laughter ) i 'm sure you 're trying to figure out , `` well , how does this thing work ? '' ''
OUT: and then , it 's like this — ( laughter ) — i 'm sure you 're trying to figure out how to figure out how this thing works .


Code to generate translations from user input

In [38]:
function translate_input(model)
    v = model.srcvocab
    src = [ get(v.w2i, w, v.unk) for w in v.tokenizer(readline()) ]'
    out = model(src)
    println("SRC: ", int2str(src,model.srcvocab))
    println("OUT: ", int2str(out,model.tgtvocab))
end

translate_input (generic function with 1 method)

Generate translations for user input

In [None]:
# translate_input(model1)

## Competition

The reference model `pretrained` has 16.2 bleu. By playing with the optimization algorithm
and hyperparameters, using per-sentence loss, and (most importantly) splitting the Turkish
words I was able to push the performance to 21.0 bleu. I will give extra credit to groups
that can exceed 21.0 bleu in this dataset.

*This notebook was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).*