# Sequence classification model for IMDB Sentiment Analysis

In [None]:
# JuliaBox fix: when running on JuliaBox (recognised by its HOME directory), redirect Pkg.dir to
# the v0.6 package tree under /home/jrun. HOME is read with a default value so that the check is
# simply false, instead of raising a KeyError, on systems where HOME is not set.
if get(ENV, "HOME", "") == "/mnt/juliabox"
    Pkg.dir(path...) = joinpath("/home/jrun/.julia/v0.6", path...)
end

In [None]:
using Knet, JLD
# Notebook display setting for printed arrays.
ENV["COLUMNS"]=80                     # column width for array printing
# Call Knet's gpu(); being the last expression, its return value is shown as the cell output.
# NOTE(review): presumably this reports the active GPU device id — confirm against the Knet docs.
gpu()

In [None]:
# Hyperparameters shared by the cells below.
EPOCHS=3          # Number of training epochs (passes over the training set)
# NOTE(review): BATCHSIZE is reassigned to 100 in a later cell before its first use,
# so the value 64 set here never takes effect when the notebook is run top to bottom.
BATCHSIZE=64      # Number of instances in a minibatch
EMBEDSIZE=125     # Word embedding size
NUMHIDDEN=100     # Hidden layer size
MAXLEN=150        # maximum size of the word sequence, pad shorter sequences, truncate longer ones
MAXFEATURES=30000 # maximum vocabulary size, keep the most frequent 30K, map the rest to UNK token
# NOTE(review): DROPOUT is not referenced by any of the visible model or training code.
DROPOUT=0.2       # Dropout rate
LR=0.001          # Learning rate
BETA_1=0.9        # Adam optimization parameter
BETA_2=0.999      # Adam optimization parameter
EPS=1e-08         # Adam optimization parameter

In [None]:
# Load the helper script that sits next to this notebook.
include("imdb.jl")   # defines imdb loader and minibatcher
# Display the documentation attached to `imdb`.
@doc imdb

In [None]:
# Load the dataset (timed), passing MAXLEN and MAXFEATURES as the `maxlen` / `maxval` limits.
# The result is unpacked into training inputs/labels, test inputs/labels and the dictionary;
# the trailing `;` suppresses the cell output.
@time (xtrn,ytrn,xtst,ytst,imdbdict)=imdb(maxlen=MAXLEN,maxval=MAXFEATURES);

In [None]:
# Print a one-line summary (type and size) of every object returned by the loader.
foreach((xtrn, ytrn, xtst, ytst, imdbdict)) do dataset
    println(summary(dataset))
end

In [None]:
# Display the raw training inputs.
xtrn

In [None]:
# Invert the imdb dictionary (word => index) into an index => word lookup array,
# then display every training review with its indices replaced by the words.
imdbvocab = Array{String}(length(imdbdict))   # slots are filled one by one below
for (word, id) in imdbdict
    imdbvocab[id] = word
end
map(ids -> imdbvocab[ids], xtrn)

In [None]:
# Length of every training sequence, transposed into a single row for compact display.
length.(xtrn)'

In [None]:
# Training labels, transposed into a single row for compact display.
ytrn'

In [None]:
# Display the documentation attached to `minibatch`.
@doc minibatch

In [None]:
# NOTE(review): this overrides the BATCHSIZE=64 set with the other hyperparameters; every
# later cell (training and evaluation) therefore runs with a batch size of 100.
BATCHSIZE=100
# Split the training set into shuffled minibatches of BATCHSIZE instances.
data = minibatch(xtrn,ytrn,BATCHSIZE;shuffle=true)
# Cell output: length of `data` (presumably the number of minibatches — confirm in imdb.jl).
length(data)

In [None]:
# Take the first minibatch and unpack it into inputs `x` and labels `y`; these two globals are
# reused by later cells as a sample batch.
x,y = first(data)
# Cell output: type/size summary of the inputs and of the labels.
map(summary, (x,y))

In [None]:
# Model constructor.
# Builds a GRU spec with EMBEDSIZE inputs and NUMHIDDEN hidden units plus two xavier-initialised
# Float32 KnetArray matrices: an EMBEDSIZE x MAXFEATURES embedding table and a 2 x NUMHIDDEN
# output layer. Returns (spec, (gruWeights, embedding, classifier)).
function initmodel()
    spec, gruWeights = rnninit(EMBEDSIZE, NUMHIDDEN; rnnType=:gru)
    embedding = KnetArray(xavier(Float32, EMBEDSIZE, MAXFEATURES))
    classifier = KnetArray(xavier(Float32, 2, NUMHIDDEN))
    return spec, (gruWeights, embedding, classifier)
end

# Build a throw-away model: `r` is the RNN spec and `w` the weight tuple used by the demo cells.
r,w = initmodel()
# Cell output: type/size summary of each weight array.
map(summary, w)

In [None]:
# Sequence classifier.
# `weights` is the (rnn weights, embedding, output layer) tuple, `inputs` a minibatch of index
# sequences and `rnnSpec` the RNN spec. Returns one column of 2 class scores per instance.
# Shape notes use X=embedding size, V=vocabulary, H=hidden size, B=batch size, T=sequence length.
function predict(weights, inputs, rnnSpec) # inputs are B=100 Int32 arrays of length T=150
    gruWeights, embedding, classifier = weights       # (1,1,W), (X,V), (2,H)
    tokenIds = hcat(inputs...)'                       # (B,T): one row per instance
    embedded = embedding[:, tokenIds]                 # (X,B,T): embedding lookup
    hidden = rnnforw(rnnSpec, gruWeights, embedded)[1] # (H,B,T): first value returned by rnnforw
    lastStep = hidden[:, :, end]                      # (H,B): hidden state at the final time step
    return classifier * lastStep                      # (2,H) * (H,B) = (2,B)
end

# Score the sample minibatch with the untrained model and convert the result to a regular Array
# for display: a 2x100 score matrix for positive/negative sentiment of 100 instances.
Array(predict(w, x, r))

In [None]:
# Loss: Knet's `nll` applied to the predicted scores and the target labels.
function loss(weights, inputs, outputs, rnnSpec)
    scores = predict(weights, inputs, rnnSpec)
    return nll(scores, outputs)
end

# Gradient function derived from `loss`; the training loop pairs its result with `weights`.
lossgradient = grad(loss)

# Cell output: loss of the untrained sample model on the sample minibatch.
loss(w, x, y, r)

In [None]:
# initial accuracy
# Test-set accuracy of the untrained weights `w`. The prediction callback uses `r`, the RNN spec
# created together with `w`; the stray `global rnnSpec` declaration was removed because the
# callback never used that variable and it is not defined yet at this point in the notebook.
accuracy(w, minibatch(xtst,ytst,BATCHSIZE), (w,x)->predict(w,x,r))

In [None]:
# train model
# Obtain a trained model in the globals `rnnSpec` and `weights`: train from scratch and cache the
# result in imdbmodel.jld, or reuse the cache when that file already exists.
weights = nothing; knetgc(); # Reclaim memory from previous run
if !isfile("imdbmodel.jld")
    info("Training...")  # logged only on the path that actually trains
    rnnSpec,weights = initmodel()
    optim = optimizers(weights, Adam; lr=LR, beta1=BETA_1, beta2=BETA_2, eps=EPS)
    @time for epoch in 1:EPOCHS  # 29s
        # One pass over freshly shuffled minibatches: compute gradients, then update in place.
        @time for (x,y) in minibatch(xtrn,ytrn,BATCHSIZE;shuffle=true)
            grads = lossgradient(weights,x,y,rnnSpec)
            update!(weights, grads, optim)
        end
    end
    @save "imdbmodel.jld" rnnSpec weights
else
    info("Loading model from imdbmodel.jld...")
    # NOTE(review): @load is wrapped in @eval presumably so the macro is expanded only when this
    # branch runs, i.e. when the file is known to exist — confirm against the JLD documentation.
    @eval (@load "imdbmodel.jld")
end

In [None]:
# loss drops
# Loss of the trained (or loaded) model on the minibatch currently held in the globals `x`, `y`,
# for comparison with the loss printed before training.
loss(weights,x,y,rnnSpec)

In [None]:
# final accuracy
# Test-set accuracy of the trained (or loaded) weights; the prediction callback closes over the
# global `rnnSpec` that was produced alongside `weights`.
accuracy(weights, minibatch(xtst,ytst,BATCHSIZE), (w,x)->predict(w,x,rnnSpec))