In [1]:
# import Pkg; Pkg.add("CorpusLoaders")

In [2]:
using Knet, Random, Plots, IterTools
using Statistics: mean
using Base.Iterators: flatten
using Embeddings

In [3]:
const w2v = load_embeddings(Word2Vec)

Embeddings.EmbeddingTable{Array{Float32,2},Array{String,1}}(Float32[0.06731994 0.05295623 … -0.21142985 0.013637338; -0.05344657 0.06545979 … -0.0087888 -0.07428761; … ; -0.0073346887 0.010894641 … -0.0040515745 0.015611163; -0.0051456536 -0.047072206 … -0.034157887 0.039655942], ["</s>", "in", "for", "that", "is", "on", "##", "The", "with", "said"  …  "#-###-PA-PARKS", "Lackmeyer", "PERVEZ", "KUNDI", "Budhadeb", "Nautsch", "Antuane", "tricorne", "VISIONPAD", "RAFFAELE"])

In [4]:
S = 30 # max sentence length
# V = length(w2v.vocab) # vocab size
B = 50 # batch size
D = 100 # embedding size
C = 6 # number of classes
Ci = 1 # number of input channels
Ks = [3,4,5] # kernel_sizes
Co = 100 # output channels per kernel size
dp = 0.5; # dropout probability

In [5]:
const get_word_index = Dict(word=>ii for (ii,word) in enumerate(w2v.vocab));

# function get_embedding(word)
#     ind = get_word_index[word]
#     emb = w2v.embeddings[:,ind]
#     return emb
# end

In [6]:
yw2i = Dict{String,Int}()
unk = get_word_index["UNK"]
pad = get_word_index["PAD"]

31875

In [7]:
function readdata(trnf, tstf, S)
    ixtrn = []
    ixtst = []
    
    # read lines
    trnlines = readlines(trnf)
    
    # split label from sentence 
    ytrn = [split(i)[1] for i in trnlines]
    ytrn = [split(i, ":")[1] for i in ytrn]
    xtrn = [split(i)[2:end] for i in trnlines]
    
    # encode label
    ytrn = [get!(yw2i, w, 1+length(yw2i)) for w in ytrn]
    
    
    tstlines = readlines(tstf)
    ytst = [split(i)[1] for i in tstlines]
    ytst = [split(i, ":")[1] for i in ytst]
    ytst = [get(yw2i, w, 0) for w in ytst]
    xtst = [split(i)[2:end] for i in tstlines]
    
    # encode sentences and pad or truncate
    for (i, line) in enumerate(xtrn)
        words = [get(get_word_index, w, unk) for w in line]
        # pad or truncate
        if length(words) >= S
            words = words[1:S]
        else
            words = [words; repeat([pad], S-length(words))]
        end
        push!(ixtrn, words)
    end
    
    for (i, line) in enumerate(xtst)
        words = [get(get_word_index, w, unk) for w in line]
        # pad or truncate
        if length(words) >= S
            words = words[1:S]
        else
            words = [words; repeat([pad], S-length(words))]
        end
        push!(ixtst, words)
    end
    
    # concatenate to one big matrix
    ixtrn, ixtst = hcat(ixtrn...), hcat(ixtst...)
#     @show size(ixtst)
    
    # reduce the size of the 
    unq = [] # unique words in the training and test sets
    push!(unq, unique(ixtrn)...)
    push!(unq, unique(ixtst)...)
    unq = unique(unq)
    sort!(unq)
#     @show size(unq)
    trans = Dict(j => i for (i,j) in enumerate(unq)) 
    ixtrn = [get(trans, i, 0) for i in ixtrn]
#     @show size(ixtrn)
    ixtst = [get(trans, i, 0) for i in ixtst]
    embeddings = KnetArray(w2v.embeddings[:,unq])
    global V = size(embeddings, 2)
    # create iterators
    dtrn, dtst = minibatch(ixtrn, ytrn, B, shuffle=true), minibatch(ixtst, ytst, B, shuffle=true)
    return dtrn, dtst, embeddings
end

readdata (generic function with 1 method)

In [8]:
dtrn, dtst, embedding = readdata("./TREC/train_5500.label.txt", "./TREC/TREC_10.label.txt", S)

(Knet.Data{Tuple{Array{Int64,2},Array{Int64,1}}}([988 423 … 423 423; 88 1788 … 4 1699; … ; 6647 6647 … 6647 6647; 6647 6647 … 6647 6647], [1 2 … 5 2], 50, 5452, false, 5403, 1:5452, true, (30, 5452), (5452,), Array{Int64,2}, Array{Int64,1}), Knet.Data{Tuple{Array{Int64,2},Array{Int64,1}}}([988 423 … 423 423; 313 787 … 4 4; … ; 6647 6647 … 6647 6647; 6647 6647 … 6647 6647], [5 6 … 2 1], 50, 500, false, 451, 1:500, true, (30, 500), (500,), Array{Int64,2}, Array{Int64,1}), K32(300,8970)[0.05295623⋯])

In [9]:
# Embed layer 
struct Embed; w; w2; end
function (l::Embed)(x)
    x1 = reshape(l.w[:,x], (size(l.w,1),size(x,1),1,size(x,2))) # E,S,Cx,B
    x2 = reshape(l.w2[:,x], (size(l.w2,1),size(x,1),1,size(x,2))) # E,S,Cx,B
    return x1, x2
end

In [10]:
struct Dense; w; b; f; p; end
(d::Dense)(x) = d.f.(d.w * mat(dropout(x,d.p)) .+ d.b) # mat reshapes 4-D tensor to 2-D matrix so we can use matmul
Dense(i::Int,o::Int,f=relu;pdrop=0) = Dense(param(o,i), param0(o), f, pdrop)

Dense

In [11]:
struct Conv; w; b; f; p; end
function (c::Conv)(x)
    conved = conv4(c.w, dropout(x,c.p)) .+ c.b
    return c.f.(pool(conved; window=(size(conved, 1), size(conved, 2))))
end
Conv(w1::Int,w2::Int,cx::Int,cy::Int,f=relu;pdrop=0) = Conv(param(w1,w2,cx,cy), param0(1,1,cy,1), f, pdrop)

Conv

In [12]:
struct cnntext
    V # vocab size
    D # embedding size
    C # number of classes
    Ci # one
    Ks # kernel_sizes
    Co # number of each of the sizes
    dp # dropout probability
    embed::Embed # embedding layer
    fc::Dense 
    conv1::Conv
    conv2::Conv
    conv3::Conv
end

function (c::cnntext)(x)
    x, xc = c.embed(x)  # E, S, Cx (1), B
    x1, x2, x3 = c.conv1(x), c.conv2(x), c.conv3(x) 
    x4, x5, x6 = c.conv1(xc), c.conv2(xc), c.conv3(xc)
    x1 = x1 + x4
    x2 = x2 + x5
    x3 = x3 + x6
    x = cat(x1,x2,x3, dims=1)
    x = c.fc(x)
end

(c::cnntext)(x,y) = nll(c(x),y)

(c::cnntext)(d::Knet.Data) = mean(c(x,y) for (x,y) in d)

function cnntext(V, D, C, Ci, Ks, Co, dp)  ##############
    cnntext(V, D, C, Ci, Ks, Co, dp, Embed(Param(copy(embedding)), embedding), Dense(length(Ks)*Co,C,identity,pdrop=dp), Conv(D,Ks[1],Ci,Co), Conv(D,Ks[3],Ci,Co), Conv(D,Ks[3],Ci,Co))
end

cnntext

In [13]:
model = cnntext(V, D, C, Ci, Ks, Co, dp)

cnntext(8970, 100, 6, 1, [3, 4, 5], 100, 0.5, Embed(P(KnetArray{Float32,2}(300,8970)), K32(300,8970)[0.05295623⋯]), Dense(P(KnetArray{Float32,2}(6,300)), P(KnetArray{Float32,1}(6)), identity, 0.5), Conv(P(KnetArray{Float32,4}(100,3,1,100)), P(KnetArray{Float32,4}(1,1,100,1)), NNlib.relu, 0), Conv(P(KnetArray{Float32,4}(100,5,1,100)), P(KnetArray{Float32,4}(1,1,100,1)), NNlib.relu, 0), Conv(P(KnetArray{Float32,4}(100,5,1,100)), P(KnetArray{Float32,4}(1,1,100,1)), NNlib.relu, 0))

In [14]:
first(dtrn)[1]

30×50 Array{Int64,2}:
 1935   423   988  1935   423   423  …   423   423   423   423   423   423
    4   169  1322  5383  3867    56        4     4     9     4    55     4
   10    10   123  7906  7457  7906       10    10    10  7906    40  7906
  401   467    10    19   139   108     1238   869  2917   320   281  7628
 7906  5284  4951   164  7906    88      160     2  7906  6200  7906  7906
 8782   956    15  7906  1466  6338  …   397  3221  5286    57  7906  6647
 7906  7906    58  7906  7906  8456     7906  7906  4885  5538  7735  6647
 6647  6647   287  4853   670  1921     7906  6647  6198  3075  7906  6647
 6647  6647  8477  8298  7906  7906     7906  6647   102  7906     1  6647
 6647  6647  8464  7906  6647  6647     3910  6647  7906  6647  5042  6647
 6647  6647  8385  4938  6647  6647  …  7906  6647  7906  6647  7906  6647
 6647  6647  7906  7906  6647  6647     6647  6647  6647  6647  6647  6647
 6647  6647  6647    19  6647  6647     6647  6647  6647  6647  6647  6647
   

In [15]:
model(first(dtrn)[1])

6×50 KnetArray{Float32,2}:
  0.0477929   0.0320577   0.0359198  …   0.0479496   0.0546228   0.0420403
  0.0332731   0.0322033   0.0251398      0.0390904   0.0385036   0.0352652
 -0.0889302  -0.0835455  -0.0874306     -0.0769347  -0.0957419  -0.0864299
 -0.0406678  -0.037506   -0.0393853     -0.0336487  -0.0266294  -0.0444744
 -0.035897   -0.0398759  -0.0347919     -0.0291215  -0.0336546  -0.0322867
  0.0492987   0.0432989   0.0459299  …   0.0430649   0.0657053   0.0451848

In [16]:
nepochs = 3

3

In [17]:
function trainresults(file,model; o...)
    if (print("Train from scratch? "); readline()[1]=='y')
        r = ((model(dtrn), model(dtst), accuracy(model,dtrn), accuracy(model,dtst))
             for x in takenth(progress(adam(model,ncycle(dtrn,nepochs))),length(dtrn)))
        r = reshape(collect(Float32,flatten(r)),(4,:))
        Knet.save(file,"results",r)
        Knet.gc() # To save gpu memory
    else
        isfile(file) || return
        r = Knet.load(file,"results")
    end
    println(minimum(r,dims=2))
    return r
end

trainresults (generic function with 1 method)

In [18]:
cnn = trainresults("cnntextTREC.jld2", model);

Train from scratch? stdin> y


┣███████████████████▉┫ [100.00%, 327/327, 27:28/27:28, 5.04s/i] 


Float32[0.2219617; 0.33521825; 0.7293578; 0.742]


In [19]:
cnn

4×3 Array{Float32,2}:
 0.851961  0.434623  0.221962
 0.931804  0.503326  0.335218
 0.729358  0.863486  0.95156 
 0.742     0.806     0.894   

In [20]:
# This program has requested access to the data dependency word2vec 300d.
# which is not currently installed. It can be installed automatically, and you will not see this message again.

# Pretrained Word2Vec Word emeddings
# Website: https://code.google.com/archive/p/word2vec/
# Author: Mikolov et al.
# Year: 2013

# Pre-trained vectors trained on part of Google News dataset (about 100 billion words). The model contains 300-dimensional vectors for 3 million words and phrases.

# Paper:
#     Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean. Distributed Representations of Words and Phrases and their Compositionality. In Proceedings of NIPS, 2013.



# Do you want to download the dataset from https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz to "/home/saif/.julia/datadeps/word2vec 300d"?
# [y/n]
# stdin> y
# const w2v = load_embeddings(Word2Vec)

In [21]:
# l = length.(xtrn)

# using StatsBase;
# a = countmap(l);

# b = hcat([[key, val] for (key, val) in a]...)';

# sort(collect(a), by=x->x[2]);