# IMDB Training Notebook

In [1]:
using Knet
# Hyperparameters for the LSTM classifier
EPOCHS = 3
BATCHSIZE = 64
EMBEDSIZE = 125       # row count of the embedding matrix; also the RNN input size
NUMHIDDEN = 100       # RNN hidden size
# Adam optimizer settings (passed as lr, beta1, beta2, eps)
LR = 0.0001
BETA_1 = 0.9
BETA_2 = 0.999
EPS = 1e-08
MAXLEN = 150          # maximum size of the word sequence
MAXFEATURES = 30000   # vocabulary size
DROPOUT = 0.35        # probability handed to dropout() during training
SEED = 1311194        # seed handed to the imdb() data loader
gpu(0)                # presumably selects GPU device 0 - confirm against the Knet docs
# Store arrays as KnetArray when gpu() reports a non-negative id, else as a plain Array.
atype = gpu() >= 0 ? KnetArray{Float32} : Array{Float32}

Knet.KnetArray{Float32,N} where N

In [2]:
# define model
# initmodel() creates freshly initialised parameters and returns
# (rnnSpec, (rnnWeights, inputMatrix, outputMatrix)):
#   - an LSTM built by rnninit with input size EMBEDSIZE and hidden size NUMHIDDEN,
#   - an EMBEDSIZE x MAXFEATURES xavier-initialised embedding matrix,
#   - a 2 x NUMHIDDEN xavier-initialised output matrix (two output scores per example).
# Both matrices are converted to the global array type `atype`.
function initmodel()
    spec, lstmParams = rnninit(EMBEDSIZE, NUMHIDDEN; rnnType=:lstm)
    embedding = atype(xavier(Float32, EMBEDSIZE, MAXFEATURES))
    classifier = atype(xavier(Float32, 2, NUMHIDDEN))
    return spec, (lstmParams, embedding, classifier)
end

initmodel (generic function with 1 method)

In [11]:
# Write the model to `localfile` (default "model_imdb.jld") by calling save with
# the parameters under the key "weights" and the RNN descriptor under "rnnSpec".
# Returns whatever save returns.
savemodel(weights, rnnSpec; localfile="model_imdb.jld") =
    save(localfile, "weights", weights, "rnnSpec", rnnSpec)

savemodel (generic function with 1 method)

In [4]:
# define loss and its gradient
# Forward pass: embed the token ids, run the RNN, and project the final time
# step to two class scores.
#   weights : (rnnWeights, inputMatrix, outputMatrix) with shapes (1,1,W), (X,V), (2,H)
#   inputs  : collection of equal-length index vectors, one per example
#   rnnSpec : RNN descriptor passed through to rnnforw
#   train   : when true, dropout with probability DROPOUT is applied to the RNN
#             input and to the RNN output
# Returns a (2,B) score matrix.
function predict(weights, inputs, rnnSpec; train=false)
    lstmParams, embedding, classifier = weights
    tokens = hcat(inputs...)'                            # (B,T)
    embedded = embedding[:, tokens]                      # (X,B,T)
    train && (embedded = dropout(embedded, DROPOUT))
    hidden = rnnforw(rnnSpec, lstmParams, embedded)[1]   # (H,B,T)
    train && (hidden = dropout(hidden, DROPOUT))
    # Only the last time step feeds the classifier: (2,H) * (H,B) = (2,B)
    return classifier * hidden[:, :, end]
end

# nll of the predicted scores against the labels y.
# w = weights, x = input batch, r = RNN descriptor; `train` is forwarded to predict.
function loss(w, x, y, r; train=false)
    scores = predict(w, x, r; train=train)
    return nll(scores, y)
end
# Gradient function built from loss by grad; called with the same arguments as loss.
lossgradient = grad(loss);

In [5]:
# load data
include("imdb.jl")
@time (xtrn, ytrn, xtst, ytst, imdbdict) = imdb(maxlen=MAXLEN, maxval=MAXFEATURES, seed=SEED)
# Print a one-line summary of each returned array.
for dataset in (xtrn, ytrn, xtst, ytst)
    println(summary(dataset))
end
# Invert imdbdict (word => index) into an index => word lookup array.
# NOTE(review): 88584 is presumably the largest index in imdbdict - confirm.
imdbarray = Array{String}(88584)
for (word, idx) in imdbdict
    imdbarray[idx] = word
end

[1m[36mINFO: [39m[22m[36mDownloading IMDB...
[39m  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 16.6M  100 16.6M    0     0   758k      0  0:00:22  0:00:22 --:--:--  671k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1602k  100 1602k    0     0   819k      0  0:00:01  0:00:01 --:--:--  819k


 94.515819 seconds (27.42 M allocations: 1.331 GiB, 1.89% gc time)
25000-element Array{Array{Int32,1},1}
25000-element Array{Int8,1}
25000-element Array{Array{Int32,1},1}
25000-element Array{Int8,1}


[1m[36mINFO: [39m[22m[36mLoading IMDB...
[39m

In [6]:
# Pick a random training example, map its word indices back to words through
# imdbarray, and print the text followed by its label.
rnd = rand(1:length(xtrn))
println("Sample review:\n", join((imdbarray[i] for i in xtrn[rnd]), " "), "\n")
println("Classification: ", string(ytrn[rnd]))

Sample review:
reilly reilly reilly reilly reilly reilly reilly 5hrs to start with i have to point out the fact that you're gonna feel completely lost for more than half an hour yeah some things happen but you don't know why or what for when you finally figure things out you just realize that it's nothing but a twisted spastic opera dealing with mature prostitutes dead mothers illegitimate sons the characters are rather poor and the actors specially the young ones don't help that much to spastic look credible only marisa spastic stands out but she's a superb actress no matter if the movie is pure rubbish br br the only positive things to say about spastic sol de spastic is that spastic pablo spastic seems to have good intentions and he's filmed a couple of scenes that are quite intense well maybe the next time br br my rate 4 10

Classification: 1


In [7]:
# prepare for training
# Drop the reference to any weights from a previous run, then call knetgc()
# to reclaim their memory.
weights = nothing
knetgc()
rnnSpec, weights = initmodel()
# Adam optimizer state for the weights, configured from the hyperparameters.
optim = optimizers(weights, Adam; lr=LR, beta1=BETA_1, beta2=BETA_2, eps=EPS);

In [8]:
# Author's timing note: 29s
info("Training...")
# Outer @time reports the whole run; inner @time reports each epoch.
@time for epoch in 1:EPOCHS
    @time for (x, y) in minibatch(xtrn, ytrn, BATCHSIZE; shuffle=true)
        # Gradient of the training-mode loss, applied in place through optim.
        update!(weights, lossgradient(weights, x, y, rnnSpec; train=true), optim)
    end
end

[1m[36mINFO: [39m[22m[36mTraining...
[39m

 13.892884 seconds (2.06 M allocations: 136.613 MiB, 8.12% gc time)
  3.815176 seconds (401.21 k allocations: 46.601 MiB, 27.58% gc time)
  3.774615 seconds (401.94 k allocations: 46.613 MiB, 27.07% gc time)
 21.487306 seconds (2.87 M allocations: 229.977 MiB, 14.90% gc time)


In [9]:
# Evaluate on the test split. predict is called with its default train=false,
# so the dropout branches are skipped. The value of the accuracy call is the
# cell's displayed result.
info("Testing...")
@time accuracy(weights, minibatch(xtst,ytst,BATCHSIZE), (w,x)->predict(w,x,rnnSpec))

[1m[36mINFO: [39m[22m[36mTesting...
[39m

  3.328266 seconds (884.90 k allocations: 77.894 MiB, 2.24% gc time)


0.8670272435897436

In [12]:
# Save the trained weights and RNN descriptor to the default file "model_imdb.jld".
savemodel(weights,rnnSpec)