# Character-based RNN language model trained on Julia code
(c) Deniz Yuret, 2018. Based on http://karpathy.github.io/2015/05/21/rnn-effectiveness.

In [1]:
using Pkg
# Install each tutorial dependency that cannot be located in the active environment.
# `Pkg.installed()` is deprecated, so availability is checked with `Base.find_package`,
# which yields `nothing` when the package cannot be found.
for p in ("Knet", "ProgressMeter")
    Base.find_package(p) === nothing && Pkg.add(p)
end

In [2]:
# Model architecture
RNNTYPE    = :lstm   # forwarded to the RNN constructor as `rnnType`
INPUTSIZE  = 512     # embedding size handed to CharLM
VOCABSIZE  = 128     # character ids above this value are folded into it
HIDDENSIZE = 512     # hidden size handed to CharLM
NUMLAYERS  = 2       # forwarded to the RNN constructor as `numLayers`
DROPOUT    = 0.0     # used both as the RNN `dropout` and as `pdrop` during training

# Minibatch geometry
BATCHSIZE  = 64      # number of contiguous rows the data is reshaped into
SEQLENGTH  = 64      # batch size passed to `minibatch`, i.e. time steps per block

# Adam optimizer settings
LR         = 0.001
BETA_1     = 0.9
BETA_2     = 0.999
EPS        = 1e-8

EPOCHS     = 10      # NOTE(review): not referenced by the code visible in this notebook
ENV["COLUMNS"] = 92; # presumably widens the displayed output; confirm

## Load and minibatch data

In [3]:
# Concatenate every `.jl` file found under Julia's shared data directory into `text`.
base = joinpath(Sys.BINDIR, Base.DATAROOTDIR, "julia")
text = let buf = IOBuffer()
    # Accumulating in a buffer avoids re-copying the whole corpus on every append and
    # does not rely on assigning to a global from inside a top-level loop.
    for (root, _, files) in walkdir(base)
        for f in files
            # `endswith` is safe for names shorter than three bytes or with multi-byte
            # characters near the end, where `f[end-2:end]` would throw.
            endswith(f, ".jl") || continue
            write(buf, read(joinpath(root, f)))
        end
    end
    String(take!(buf))
end
length(text)

9131265

In [4]:
# Count how often each character occurs in the corpus
charcnt = Dict{Char,Int}()
for ch in text
    charcnt[ch] = get(charcnt, ch, 0) + 1
end

# Characters ordered from most to least frequent; a character's position is its id
chars = sort!(collect(keys(charcnt)); by=ch -> charcnt[ch], rev=true)
charid = Dict{Char,Int}(ch => idx for (idx, ch) in enumerate(chars))

# Two-column table of characters and their counts
hcat(chars, [charcnt[ch] for ch in chars])

3642×2 Array{Any,2}:
 ' '   1971836
 'e'    548012
 't'    477724
 'n'    343215
 'r'    338122
 'i'    329419
 's'    325865
 'a'    316561
 'o'    275999
 '\n'   265652
 'l'    203478
 ','    200306
 ')'    194094
 ⋮            
 'ה'         1
 '🍢'         1
 '𝗾'         1
 '𝔔'         1
 'É'         1
 '𝓟'         1
 '𝚿'         1
 '𝕨'         1
 'ɛ'         1
 'Χ'         1
 '🕙'         1
 'ℚ'         1

In [5]:
# Encode the corpus as character ids; every id above VOCABSIZE is folded into VOCABSIZE
data = [min(charid[c], VOCABSIZE) for c in text]

# The first 2^19 ids are held out as the test split, the rest is the training split
ntst = 2^19
tst = data[1:ntst]
trn = data[ntst+1:end]
map(length, (data, trn, tst))

(9131265, 8606977, 524288)

In [6]:
# Show a randomly positioned 1001-character excerpt of the training split
r = rand(1:(length(trn) - 1000))
println(join(chars[id] for id in trn[r:r+1000]))

abstract_call(iteratef, (), Any[Const(iteratef), itertype, statetype], vtypes, sv)
        stateordonet = widenconst(stateordonet)
    end
    if stateordonet === Nothing
        return ret
    end
    while valtype !== Any
        nounion = typesubtract(stateordonet, Nothing)
        if !isa(nounion, DataType) || !(nounion <: Tuple) || isvatuple(nounion) || length(nounion.parameters) != 2
            valtype = Any
            break
        end
        if nounion.parameters[1] <: valtype && nounion.parameters[2] <: statetype
            break
        end
        valtype = tmerge(valtype, nounion.parameters[1])
        statetype = tmerge(statetype, nounion.parameters[2])
        stateordonet = abstract_call(iteratef, (), Any[Const(iteratef), itertype, statetype], vtypes, sv)
        stateordonet = widenconst(stateordonet)
    end
    push!(ret, Vararg{valtype})
    return ret
end

# do apply(af, fargs...), where af is a function value
function abstract_apply(@nospecialize(aft), fargs::U

In [7]:
# Minibatch data
using Knet: minibatch
# Turn the id sequence `a` into minibatches of (input, target) blocks.
# The sequence is cut into BATCHSIZE contiguous rows (any remainder is dropped); the
# targets are the same rows shifted one position ahead of the inputs.
function mb(a)
    steps = length(a) ÷ BATCHSIZE
    # (steps,B) column-major reshape, then adjoint -> (B,steps) with contiguous rows
    rows = adjoint(reshape(a[1:steps*BATCHSIZE], steps, BATCHSIZE))
    inputs = rows[:, 1:steps-1]
    targets = rows[:, 2:steps]
    return minibatch(inputs, targets, SEQLENGTH)  # split into (B,T) blocks
end
# Minibatch both splits and report how many batches each one yields
dtrn, dtst = map(mb, (trn, tst))
map(length, (dtrn, dtst))

(2101, 127)

In [8]:
summary.(first(dtrn))  # each x and y have dimensions (BATCHSIZE,SEQLENGTH)

("64×64 Array{Int64,2}", "64×64 Array{Int64,2}")

## Define and initialize model

In [9]:
using Knet: param, param0, RNN, dropout

In [10]:
# Character language model: an input layer, an RNN and an output layer applied in order.
struct CharLM
    input
    rnn
    output
end

# Build the three layers from their sizes; keyword arguments are forwarded to `RNN`.
function CharLM(vocab::Int, input::Int, hidden::Int; o...)
    return CharLM(Embed(vocab, input), RNN(input, hidden; o...), Linear(hidden, vocab))
end

# Forward pass. `pdrop` is the dropout probability applied after the input layer and
# after the RNN; `hidden` is handed to the RNN unchanged.
function (c::CharLM)(x; pdrop=0, hidden=nothing)
    embedded = dropout(c.input(x), pdrop)                    # (B,T)->(X,B,T)
    states = dropout(c.rnn(embedded, hidden=hidden), pdrop)  # (H,B,T)
    flat = reshape(states, size(states, 1), :)               # (H,B*T)
    return c.output(flat)                                    # (V,B*T)
end

In [11]:
# Embedding layer: column `i` of `w` is the vector for id `i`.
struct Embed
    w
end

# Create an embedding whose weight is `param(embed, vocab)`.
function Embed(vocab::Int, embed::Int)
    return Embed(param(embed, vocab))
end

# Look up the columns of `w` selected by `x`.
function (e::Embed)(x)
    return e.w[:, x]
end

In [12]:
# Affine layer holding a weight `w` and a bias `b`.
struct Linear
    w
    b
end

# Create a layer with weight `param(output, input)` and bias `param0(output)`.
function Linear(input::Int, output::Int)
    return Linear(param(output, input), param0(output))
end

# Multiply by the weight, then broadcast-add the bias.
function (l::Linear)(x)
    projected = l.w * x
    return projected .+ l.b
end

In [13]:
# For running experiments
using Knet; import ProgressMeter
# Build a stateful callback that tracks exponential moving averages of the values it is
# given, with `alpha` as the smoothing factor. Each call updates a progress display and
# returns `true` while the smoothed change is non-positive.
# NOTE(review): presumably `train!` keeps going while the callback returns true; confirm.
function converge(alpha = 0.001)
    smoothed = Inf    # moving average of the observed values, seeded by the first one
    trend = -0.1      # moving average of (value - previous smoothed value)
    nupdates = 0
    meter = ProgressMeter.ProgressThresh(-1.0, 2.0)
    function callback(x)
        nupdates += 1
        current = value(x)
        smoothed == Inf && (smoothed = current)
        delta = current - smoothed
        smoothed = alpha * current + (1 - alpha) * smoothed
        trend = alpha * delta + (1 - alpha) * trend
        ProgressMeter.update!(meter, smoothed; showvalues=[(:updates, nupdates)])
        return trend <= 0.0
    end
    return callback
end
# Return a `(model, chars)` pair, either trained here or loaded from `file`.
#
# The user is asked on stdin whether to train from scratch. A reply starting with 'y'
# trains `model` on `dtrn` with Adam and a `converge()` callback, then saves the model
# and `chars` to `file`. Any other reply loads both from `file`, downloading it first
# when it is not present locally.
function trainresults(file, model, chars)
    print("Train from scratch? ")
    # `startswith` tolerates an empty reply (bare Enter or closed stdin), for which
    # `readline()[1]` would throw a BoundsError; such a reply is treated as "no".
    if startswith(readline(), "y")
        opt = Adam(lr=LR, beta1=BETA_1, beta2=BETA_2, eps=EPS)
        callback = converge()
        train!(model, dtrn; callback=callback, optimizer=opt, pdrop=DROPOUT, hidden=[])
        Knet.gc()
        Knet.save(file, "model", model, "chars", chars)
    else
        isfile(file) || download("http://people.csail.mit.edu/deniz/models/tutorial/$file",file)
        model, chars = Knet.load(file, "model", "chars")
    end
    return model, chars
end

trainresults (generic function with 1 method)

In [14]:
# Train or load the language model together with its character table
clm, chars = trainresults(
    "juliacharlm.jld2",
    CharLM(
        VOCABSIZE, INPUTSIZE, HIDDENSIZE;
        rnnType=RNNTYPE, numLayers=NUMLAYERS, dropout=DROPOUT,
    ),
    chars,
);

Train from scratch? stdin> n


In [15]:
using Knet: nll
exp(nll(clm,dtst))  # Perplexity on the test minibatches: exp of the nll of clm over dtst

4.802427f0

In [16]:
# Sample from trained model

# Print `n` characters sampled one at a time from `model`, followed by a newline.
# Generation starts from id 1; each sampled id is printed via `chars` and fed back as
# the next input.
function generate(model, chars, n)
    # Draw an index with probability proportional to exp.(y).
    function sample(y)
        p = Array(exp.(y))
        r = rand() * sum(p)
        for j in eachindex(p)
            (r -= p[j]) < 0 && return j
        end
        # Rounding can leave r >= 0 after the last subtraction. Without this fallback
        # the loop falls through and returns `nothing`, which breaks the caller.
        return lastindex(p)
    end
    x = 1
    # Passed as `hidden` on every step; presumably the RNN keeps its state in this
    # array so context carries over between characters (confirm against Knet's RNN).
    h = []
    for _ in 1:n
        y = model([x], hidden=h)
        x = sample(y)
        print(chars[x])
    end
    println()
end;

In [17]:
# Print 1000 characters sampled from the model returned by trainresults
generate(clm,chars,1000)

           = 1^34
        # log full part and a flat in the off cardableing the task to be finding total flower here, we don't wrap for binary first
        # but implement dimension of Each -c FailedMutable of duest
        data = Bidx
        for i in base
        end
        stelms = start_diff[start]
        for i = line_inferred(mt.lastparsezeris(buffer))
            return linfo
        end
        bittest(io) = false
    end
    return one1
end

function doubly_declared_floor(x::Float64, i::Float32, z::Float64, Float64, s)
    x = BigFloat(y) # for Unicode cycle
    # A UnitRanges numbers (upon multiple, how the convert to undef a core.neg*r.r): b
    # least real zero if the sqrt integral the one made number v, a, but internal that i 100-24-big22, and the point rounds
    w = f - mts[r.r]
    if y !== z
        # TODO: returns w throw is not representable
        # put must match data has time conflicting
        symsp = rand(rt, bt[r[], 0)]
        c <= s8 || break
    end

  