In [1]:
using ProfileView
using Knet, AutoGrad
using Knet: sigm_dot, tanh_dot
# Check Task 8 notebook for profiling results Profile.init(delay=0.01)

Options

In [2]:
datafiles    = ["input.txt"]  # If provided, use first file for training, second for dev, others for test.
togenerate   = 500            # If non-zero generate given number of characters.
epochs       = 10             # Number of epochs for training.
hidden       = [128]          # Sizes of one or more LSTM layers.
embed        = 168            # Size of the embedding vector.
batchsize    = 128            # Number of sequences to train on in parallel
seqlength    = 20             # Maximum number of steps to unroll the network for bptt. Initial epochs will use the epoch number as bptt length for faster convergence.
seed         = -1             # Random number seed. -1 or 0 is no fixed seed
lr           = 1e-1           # Initial learning rate
gclip        = 3.0            # Value to clip the gradient norm at.
dpout        = 0.0            # Dropout probability.

0.0

In [3]:
seed > 0 && srand(seed)

# read text and report lengths
text = map(readstring, datafiles)
!isempty(text) && info("Chars read: $(map((f,c)->(basename(f),length(c)),datafiles,text))")

[1m[36mINFO: [39m[22m[36mChars read: Tuple{String,Int64}[("input.txt", 105989)]
[39m

# Task-1: Create dictionary by completing createVocabulary function
function createVocabulary takes text::Array{Any,1} that contains the names of datafiles you provided by opts[:datafiles] argument. It returns vocabulary::Dict{Char,Int}() for given text. In this lab, your text array is length of 1. For example the text is ["content of input"]. Note that for the sake of simplicity, we do NOT use validation or test dataset in this lab. You can try it by splitting your data into 3 different set after the lab.

In [4]:
function createVocabulary(text)
    vocab = Dict{Char,Int}()
    # MY CODE STARTS HERE 
    
    for (char_i,unique_character) in enumerate(unique(text[1]))
        vocab[Char(unique_character)] = char_i
    end
    # MY CODE ENDS HERE
    return vocab
end



createVocabulary (generic function with 1 method)

In [5]:
vocab = createVocabulary(text)
info("$(length(vocab)) unique chars.") # The output should be 75 unique chars for input.txt

[1m[36mINFO: [39m[22m[36m75 unique chars.
[39m

# LSTM Network function

In a regular RNN - The core idea is to use past hidden weights, present input to calculate the next set of hidden state weights. i.e st = (Ux/t + W/st-1) . 

LSTM equations looks scary and there are a lot of them forget gate, ingate , output gate and change . But it's basically just another way to calculate the hidden states (except in this scenario the vanishing gradients is not an issue). 

In a LSTM network you have initialize four sets of weights Wf (Forget gate ft) Wi - input gage, Wo - Output, Wc - Change gate weights and the corresponding biases. So instead of creating new variables looks like this cell code is just creating one giant weights/gates and then split it into parts of equal width(columnwise)

In [6]:
function lstm(weight,bias,hidden,cell,input)
    gates   = hcat(input,hidden) * weight .+ bias
    hsize   = size(hidden,2)
    forget  = sigm_dot(gates[:,1:hsize])
    ingate  = sigm_dot(gates[:,1+hsize:2hsize])
    outgate = sigm_dot(gates[:,1+2hsize:3hsize])
    change  = tanh_dot(gates[:,1+3hsize:end])
    cell    = cell .* forget + ingate .* change
    hidden  = outgate .* tanh_dot(cell)
    return (hidden,cell)
end

lstm (generic function with 1 method)

# Task-2: Create Initial weights
initweights creates the weights and biases for the model. We are using LSTM network. We provide init function(for weights) and bias function(for bias)

First we have to initialize weights from the placeholder here x=embed and embed being 168 looks like the input vector is 168 
long vector. The first part of this model is the relationshp betwen concatted inputs and hidden weights .This belongs to the embedding layer. Also for multiple hidden layers, y1=cell(x) , y2= cell(y1).  

In [7]:
function initweights(hidden, vocab, embed)
    init(d...) = xavier(d...)
    bias(d...) = zeros(d...)
    model = Vector{Any}(2*length(hidden)+3)
    X = embed
    for k = 1:length(hidden)
        # MY CODE STARTS HERE
        #Concatted input and hidden layer weights
        num_nodes_hidden = hidden[k]
        model[2k-1]=init(X+num_nodes_hidden,4*num_nodes_hidden) #Because we have to initialize 4w's - wf,wi,wo,wc
        model[2k]=bias(1,4*num_nodes_hidden)
        X = num_nodes_hidden
        # MY CODE ENDS HERE
    end
    model[end-2] = init(vocab,embed)
    model[end-1] = init(hidden[end],vocab)
    model[end] = bias(1,vocab)
    return model
end

initweights (generic function with 1 method)

# Task-3: Create Initial state
At each time step, we take the hidden state from previous time step as input. To be able to do that,first we need to initialize hidden state. We also store updated hidden states in array created here. We initialize state as a zero matrix.

In [8]:
let blank = nothing; global initstate
    function initstate(model, batch)
        nlayers = div(length(model)-3,2)
        state = Vector{Any}(2*nlayers)
        for k = 1:nlayers
            bias = model[2k]
            hidden = div(length(bias),4)
            if typeof(blank)!=typeof(bias) || size(blank)!=(batch,hidden)
                blank = fill!(similar(bias, batch, hidden),0)
            end
            state[2k-1] = state[2k] = blank
        end
        return state
    end
end

initstate (generic function with 1 method)

# Task-4: Create Predict function
predict is a function that takes w(model) created in initweights, s(state) created in initstate and input whose size is batchsize vocabulary You need to implement predict function for LSTM. You must use lstm function here. LSTM function is provided above.

In [9]:
function predict(model, state, input; pdrop=0)
    nlayers = div(length(model)-3,2)
    newstate = similar(state)
    for k = 1:nlayers
        # MY CODE STARTS HERE
        #newstate[2k-1] is the hidden layer (look at the initweights for more explanation)
        input = dropout(input, pdrop)
        (newstate[2k-1],newstate[2k])=lstm(model[2k-1],model[2k],state[2k-1],state[2k],input)
        input = newstate[2k-1]
        # MY CODE ENDS HERE
    end
    return input,newstate
end

predict (generic function with 1 method)

# Generate and Sample function
Generate function is a function we use to create some text that is similar to our training data. We provide sample function to you. You can predict the next character by using sample function once you calculate the probabilities given the input. index to char is the same dictionary as you created with createdictionary function but it works in the reverse direction. It gives you the character given the index.

In [10]:
function generate(model, tok2int, nchar)
    int2tok = Vector{Char}(length(tok2int))
    for (k,v) in tok2int; int2tok[v] = k; end
    input = tok2int[' ']
    state = initstate(model, 1)
    for t in 1:nchar
        embed = model[end-2][[input],:]
        ypred,state = predict(model,state,embed)
        ypred = ypred * model[end-1] .+ model[end]
        input = sample(exp.(logp(ypred)))
        print(int2tok[input])
    end
    println()
end

function sample(p)
    p = convert(Array,p)
    r = rand()
    for c = 1:length(p)
        r -= p[c]
        r < 0 && return c
    end
end

sample (generic function with 1 method)

# Now, Let's generate some random sample

In [11]:
model = initweights(hidden, length(vocab), embed)
state = initstate(model,1)

println("########## RANDOM MODEL OUTPUT ############")
generate(model, vocab, togenerate) ## change togenerate if you want longer sample text

########## RANDOM MODEL OUTPUT ############
9SdL2oK2qaxnd&!k3hhrq:Ph'KAbR&}mV;HqL5P\8r}CqRDwNu',odc&HnYVA-TKVaRM
5
5TcTc*IJrTBFfD vPU2ktKYH\aFzP-\-4YG{6iECRy4S\sw8*RFhggo8I:mxMm8S-sm.&w9265ka03FA
!*}:1rA2rJDwUsJ9D1Jax&riWh13cztPt;cnfhn? Q1VF3PvYnRIU 5KSl3e37k,BEG}a6\d.9BfE:e'!i:ATfgkGt
g-8?Jac5D}a\:IkAJ9B9fgt9ep&Fvkj-?8dNP44ffJ:fQ?EJL*91tmoxN&nI
R8.s-hS2\SEywliCn&-'8Do7:IfVLDQim?RLqrRB*yM5:W*Jbo95hSdVwpaUuKON}kAeEW9DLhM!JB\q3jMlj TWNd:HUUdH3yoY-!JiqA.58phrp&7E0MBac?Qt&3v-pkm
nG&HRnNr*HN0SHOJiiMOEcr58UGxu4:v4g0J8!9K?x
4nHxT,JU':YGUo-IRy2ou


We provide minibatch function for you. You do not have to do it for this lab. But we suggest you to understand the idea since you need to do it in your own project and future labs

In [12]:
function minibatch(chars, tok2int, batch_size)
    chars = collect(chars)
    nbatch = div(length(chars), batch_size)
    data = [zeros(Int,batch_size) for i=1:nbatch ]
    for n = 1:nbatch
        for b = 1:batch_size
            char = chars[(b-1)*nbatch + n]
            data[n][b] = tok2int[char]
        end
    end
    return data
end

minibatch (generic function with 1 method)

# Task-5: Create loss function

In [13]:
function loss(model, state, sequence, range=1:length(sequence)-1; newstate=nothing, pdrop=0)
    preds = []
    for t in range
        input = model[end-2][sequence[t],:]
        pred,state = predict(model,state,input; pdrop=pdrop)
        push!(preds,pred)
    end
    if newstate != nothing
        copy!(newstate, map(AutoGrad.getval,state))
    end
    pred0 = vcat(preds...)
    pred1 = dropout(pred0,pdrop)
    pred2 = pred1 * model[end-1]
    pred3 = pred2 .+ model[end]
    logp1 = logp(pred3,2)
    nrows,ncols = size(pred3)
    golds = vcat(sequence[range[1]+1:range[end]+1]...)
    index = similar(golds)
    @inbounds for i=1:length(golds)
        index[i] = i + (golds[i]-1)*nrows
    end
    logp2 = logp1[index]
    logp3 = sum(logp2)
    return -logp3 / length(golds)
end

# Knet magic
lossgradient = grad(loss)

function avgloss(model, sequence, S)
    T = length(sequence)
    B = length(sequence[1])
    state = initstate(model, B)
    total = count = 0
    for i in 1:S:T-1
        j = min(i+S-1,T-1)
        n = j-i+1
        total += n * loss(model, state, sequence, i:j; newstate=state)
        count += n
    end
    return total / count
end

avgloss (generic function with 1 method)

# Task-6: Create Train function¶
Implement bptt(Backpropagation through time) function for training. You need to fill up only 3 lines(or even small numbers). You need use lossgradient function and update! function.

In [14]:
function train(model, sequence, optim, S; pdrop=0)
    T = length(sequence)
    B = length(sequence[1])
    state = initstate(model, B)
    for i in 1:S:T-1
        # MY CODE STARTS HERE
        end_seq = 1+S-1
        if end_seq > T-1
            end_seq = T-1
        end
        gradient_loss = lossgradient(model,state,sequence,1:end_seq,newstate=state,pdrop=pdrop)
        update!(model,gradient_loss,optim)
        # MY CODE ENDS HERE
    end
end

train (generic function with 1 method)

# Now we are ready. First let's see the initial loss¶

In [15]:
data =  map(t->minibatch(t, vocab, batchsize), text)
# Print the loss of randomly initialized model.
losses = map(d->avgloss(model,d,100), data)
println((:epoch,0,:loss,losses...))

(:epoch, 0, :loss, 4.317637547848029)


# Below is the training part of RNN(with Adam)¶

In [16]:
optim = map(x->Adam(lr=lr, gclip=gclip), model)
# MAIN LOOP
function trainingloop()
    for epoch=1:epochs
        @time train(model, data[1], optim, min(epoch,seqlength); pdrop=dpout)
        # Calculate and print the losses after each epoch
        losses = map(d->avgloss(model,d,100),data)
        println((:epoch,epoch,:loss,losses...))
    end
end
trainingloop()
Profile.clear_malloc_data()
trainingloop()
# Profile.clear()
# @profile trainingloop()

 16.154698 seconds (3.76 M allocations: 13.204 GiB, 9.28% gc time)
(:epoch, 1, :loss, 6.4339138128636755)
 26.365740 seconds (1.82 M allocations: 10.573 GiB, 5.04% gc time)
(:epoch, 2, :loss, 16.779430804354813)
 27.217983 seconds (1.63 M allocations: 9.716 GiB, 5.92% gc time)
(:epoch, 3, :loss, 17.627699673150754)
 25.884419 seconds (1.56 M allocations: 9.288 GiB, 7.03% gc time)
(:epoch, 4, :loss, 15.686559137198309)
 30.129568 seconds (1.56 M allocations: 9.053 GiB, 18.97% gc time)
(:epoch, 5, :loss, 14.844173007429832)
 26.656316 seconds (1.56 M allocations: 8.861 GiB, 16.26% gc time)
(:epoch, 6, :loss, 13.489816137159679)
 28.251888 seconds (1.58 M allocations: 8.792 GiB, 21.83% gc time)
(:epoch, 7, :loss, 12.161539188062907)
 33.450087 seconds (1.58 M allocations: 8.689 GiB, 24.93% gc time)
(:epoch, 8, :loss, 10.425839088980421)
 32.550340 seconds (1.58 M allocations: 8.576 GiB, 26.77% gc time)
(:epoch, 9, :loss, 9.403993777537233)
 31.899722 seconds (1.58 M allocations: 8.540 GiB

LoadError: [91mUndefVarError: clear_malloc_data not defined[39m

# If you have checked the loss decreasing, let's create some text with our model

In [41]:
println("########## FINAL  MODEL OUTPUT ############")
state = initstate(model,1)
generate(model, vocab, togenerate)

########## FINAL  MODEL OUTPUT ############
Variables:
  #self#::#generate
  model::Array{Any,1}
  tok2int::Dict{Char,Int64}
  nchar::Int64
  k::Char
  v::Int64
  #temp#@_7::Int64
  #temp#@_8::Int64
  t::Int64
  embed[1m[91m::Any[39m[22m
  #temp#@_11::Int64
  ypred[1m[91m::Any[39m[22m
  #temp#@_13::Int64
  int2tok::Array{Char,1}
  input[1m[91m::Any[39m[22m
  state::Array{Any,1}
  i::Int64
  index::Int64
  #temp#@_19::Int64

Body:
  begin 
      NewvarNode(:(input[1m[91m::Any[39m[22m))
      NewvarNode(:(state::Array{Any,1}))
      SSAValue(11) = (Core.getfield)(tok2int::Dict{Char,Int64}, :count)::Int64
      int2tok::Array{Char,1} = $(Expr(:foreigncall, :(:jl_alloc_array_1d), Array{Char,1}, svec(Any, Int64), Array{Char,1}, 0, SSAValue(11), 0)) # line 3:
      $(Expr(:inbounds, false))
      # meta: location dict.jl start 574
      i::Int64 = $(Expr(:invoke, MethodInstance for skip_deleted(::Dict{Char,Int64}, ::Int64), :(Base.skip_deleted), :(tok2int), :((Core.getfield)(

In [37]:
# open("cpu_profile.bin", "w") do f serialize(f, Profile.retrieve()) end

before your program finished. To profile for longer runs, call Profile.init
with a larger buffer and/or larger delay.[39m
