In [1]:
# import Pkg; Pkg.add("CorpusLoaders")

In [1]:
using Knet, Random, Plots, IterTools
using Statistics: mean
using Base.Iterators: flatten
using Embeddings

In [3]:
const w2v = load_embeddings(Word2Vec)

Embeddings.EmbeddingTable{Array{Float32,2},Array{String,1}}(Float32[0.06731994 0.05295623 … -0.21142985 0.013637338; -0.05344657 0.06545979 … -0.0087888 -0.07428761; … ; -0.0073346887 0.010894641 … -0.0040515745 0.015611163; -0.0051456536 -0.047072206 … -0.034157887 0.039655942], ["</s>", "in", "for", "that", "is", "on", "##", "The", "with", "said"  …  "#-###-PA-PARKS", "Lackmeyer", "PERVEZ", "KUNDI", "Budhadeb", "Nautsch", "Antuane", "tricorne", "VISIONPAD", "RAFFAELE"])

In [19]:
S = 30 # max sentence length
V = length(w2v.vocab) # vocab size
B = 50 # batch size
D = 100 # embedding size
C = 6 # number of classes
Ci = 1 # number of input channels
Ks = [3,4,5] # kernel_sizes
Co = 100 # output channels per kernel size
dp = 0.5; # dropout probability

In [5]:
const get_word_index = Dict(word=>ii for (ii,word) in enumerate(w2v.vocab))

# function get_embedding(word)
#     ind = get_word_index[word]
#     emb = w2v.embeddings[:,ind]
#     return emb
# end

get_embedding (generic function with 1 method)

In [6]:
yw2i = Dict{String,Int}()
unk = get_word_index["UNK"]
pad = get_word_index["PAD"]

31875

In [7]:
# l = length.(xtrn)

# using StatsBase;
# a = countmap(l);

# b = hcat([[key, val] for (key, val) in a]...)';

# sort(collect(a), by=x->x[2]);

In [8]:
function readdata(trnf, tstf, S)
    ixtrn = []
    ixtst = []
    
    # read lines
    trnlines = readlines(trnf)
    
    # split label from sentence 
    ytrn = [split(i)[1] for i in trnlines]
    ytrn = [split(i, ":")[1] for i in ytrn]
    xtrn = [split(i)[2:end] for i in trnlines]
    
    # encode label
    ytrn = [get!(yw2i, w, 1+length(yw2i)) for w in ytrn]
    
    
    tstlines = readlines(tstf)
    ytst = [split(i)[1] for i in tstlines]
    ytst = [split(i, ":")[1] for i in ytst]
    ytst = [get!(yw2i, w, 1+length(yw2i)) for w in ytst]
    xtst = [split(i)[2:end] for i in tstlines]
    
    # encode sentences and pad or truncate
    for (i, line) in enumerate(xtrn)
        words = [get(get_word_index, w, unk) for w in line]
        # pad or truncate
        if length(words) >= S
            words = words[1:S]
        else
            words = [words; repeat([pad], S-length(words))]
        end
        push!(ixtrn, words)
    end
    
    for (i, line) in enumerate(xtst)
        words = [get(get_word_index, w, unk) for w in line]
        # pad or truncate
        if length(words) >= S
            words = words[1:S]
        else
            words = [words; repeat([pad], S-length(words))]
        end
        push!(ixtst, words)
    end
    
    # concatenate to one big matrix
    ixtrn, ixtst = hcat(ixtrn...), hcat(ixtst...)
    
    # create iterators
    dtrn, dtst = minibatch(ixtrn, ytrn, B, shuffle=true), minibatch(ixtst, ytst, B, shuffle=true)
    return dtrn, dtst
end

readdata (generic function with 1 method)

In [9]:
dtrn, dtst = readdata("./TREC/train_5500.label.txt", "./TREC/TREC_10.label.txt", S)

(Knet.Data{Tuple{Array{Int64,2},Array{Int64,1}}}([1186 469 … 469 469; 93 2461 … 5 2303; … ; 31875 31875 … 31875 31875; 31875 31875 … 31875 31875], [1 2 … 5 2], 50, 5452, false, 5403, 1:5452, true, (30, 5452), (5452,), Array{Int64,2}, Array{Int64,1}), Knet.Data{Tuple{Array{Int64,2},Array{Int64,1}}}([1186 469 … 469 469; 353 904 … 5 5; … ; 31875 31875 … 31875 31875; 31875 31875 … 31875 31875], [5 6 … 2 1], 50, 500, false, 451, 1:500, true, (30, 500), (500,), Array{Int64,2}, Array{Int64,1}))

In [10]:
# Embed layer # may load pretrained embeddings instead
struct Embed; w; end
(l::Embed)(x) = reshape(l.w[:,x], (size(l.w,1),size(x,1),1,size(x,2))) # E,S,Cx,B

In [11]:
struct Dense; w; b; f; p; end
(d::Dense)(x) = d.f.(d.w * mat(dropout(x,d.p)) .+ d.b) # mat reshapes 4-D tensor to 2-D matrix so we can use matmul
Dense(i::Int,o::Int,f=relu;pdrop=0) = Dense(param(o,i), param0(o), f, pdrop)

Dense

In [12]:
struct Conv; w; b; f; p; end
function (c::Conv)(x)
    conved = conv4(c.w, dropout(x,c.p)) .+ c.b
    return c.f.(pool(conved; window=(size(conved, 1), size(conved, 2))))
end
Conv(w1::Int,w2::Int,cx::Int,cy::Int,f=relu;pdrop=0) = Conv(param(w1,w2,cx,cy), param0(1,1,cy,1), f, pdrop)

Conv

In [13]:
struct cnntext
    V # vocab size
    D # embedding size
    C # number of classes
    Ci # one
    Ks # kernel_sizes
    Co # number of each of the sizes
    dp # dropout probability
    embed::Embed # embedding layer
    fc::Dense 
    conv1::Conv
    conv2::Conv
    conv3::Conv
end

function (c::cnntext)(x)
    x = c.embed(x)  # E, S, Cx (1), B
    x1, x2, x3 = c.conv1(x), c.conv2(x), c.conv3(x) # 
    x = cat(x1,x2,x3, dims=1)
    x = c.fc(x)
end

(c::cnntext)(x,y) = nll(c(x),y)

(c::cnntext)(d::Knet.Data) = mean(c(x,y) for (x,y) in d)

function cnntext(V, D, C, Ci, Ks, Co, dp)  ##############
    cnntext(V, D, C, Ci, Ks, Co, dp, Embed(KnetArray(w2v.embeddings)), Dense(length(Ks)*Co,C,identity,pdrop=dp), Conv(D,Ks[1],Ci,Co), Conv(D,Ks[3],Ci,Co), Conv(D,Ks[3],Ci,Co))
end

cnntext

In [14]:
model = cnntext(V, D, C, Ci, Ks, Co, dp)

cnntext(929022, 100, 6, 1, [3, 4, 5], 100, 0.5, Embed(K32(300,929022)[0.06731994⋯]), Dense(P(KnetArray{Float32,2}(6,300)), P(KnetArray{Float32,1}(6)), identity, 0.5), Conv(P(KnetArray{Float32,4}(100,3,1,100)), P(KnetArray{Float32,4}(1,1,100,1)), NNlib.relu, 0), Conv(P(KnetArray{Float32,4}(100,5,1,100)), P(KnetArray{Float32,4}(1,1,100,1)), NNlib.relu, 0), Conv(P(KnetArray{Float32,4}(100,5,1,100)), P(KnetArray{Float32,4}(1,1,100,1)), NNlib.relu, 0))

In [15]:
first(dtrn)[1]

30×50 Array{Int64,2}:
  1866   1186     469    305   3592  …    2735   1195    469   1186   1186
    20    132    3214     11     51          5  20376   1739    132    132
    86  41315    7602     12     21      19160     11    933     69   2837
   132   1141     263    390    751     279493  75252   4128      2    697
 27678  75252      12  75252  75252      75252   2226    172   8354     20
   985    933    7402  76413  22695  …   31875  75252  18135  75252     73
 75252    184   75252   4810      6      31875     84   2094  31875      2
 31875  75252  189872  12936     12      31875    199  75252  31875     12
 31875   4572   75252  75252   1476      31875  75252   6543  31875     80
 31875  75252   31875  31875   3404      31875  31875  16955  31875  75252
 31875    393   31875  31875      3  …   31875  31875      6  31875  31875
 31875  11641   31875  31875    127      31875  31875  75252  31875  31875
 31875  75252   31875  31875   1207      31875  31875  34068  31875  31875
   

In [16]:
model(first(dtrn)[1])

6×50 KnetArray{Float32,2}:
 -0.0213892   -0.024592    -0.0224811   …  -0.0250941   -0.0294992 
 -0.0699811   -0.0681097   -0.0691206      -0.0680566   -0.0708236 
  0.0112531    0.0101576    0.0159278       0.0149379    0.00856043
 -0.00319843  -0.00451462  -0.00596846     -0.00662423  -0.00621198
  0.00986023   0.00986065   0.00481109      0.00233057   0.00811242
  0.0760324    0.0596563    0.0725875   …   0.0703006    0.0674989 

In [17]:
nepochs = 2

2

In [18]:
function trainresults(file,model; o...)
    if (print("Train from scratch? "); readline()[1]=='y')
        r = ((model(dtrn), model(dtst), accuracy(model,dtrn), accuracy(model,dtst))
             for x in takenth(progress(adam(model,ncycle(dtrn,nepochs))),length(dtrn)))
        r = reshape(collect(Float32,flatten(r)),(4,:))
        Knet.save(file,"results",r)
        Knet.gc() # To save gpu memory
    else
        isfile(file) || return
        r = Knet.load(file,"results")
    end
    println(minimum(r,dims=2))
    return r
end

trainresults (generic function with 1 method)

In [20]:
cnn = trainresults("cnntextTREC.jld2", model);

Train from scratch? stdin> y


┣███████████████████▉┫ [100.00%, 218/218, 09:41/09:41, 2.66s/i] 


Float32[0.9396954; 0.96873415; 0.5623853; 0.502]


In [21]:
cnn

4×2 Array{Float32,2}:
 1.30894   0.939695
 1.31543   0.968734
 0.562385  0.724037
 0.502     0.754   

In [23]:
# This program has requested access to the data dependency word2vec 300d.
# which is not currently installed. It can be installed automatically, and you will not see this message again.

# Pretrained Word2Vec Word emeddings
# Website: https://code.google.com/archive/p/word2vec/
# Author: Mikolov et al.
# Year: 2013

# Pre-trained vectors trained on part of Google News dataset (about 100 billion words). The model contains 300-dimensional vectors for 3 million words and phrases.

# Paper:
#     Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean. Distributed Representations of Words and Phrases and their Compositionality. In Proceedings of NIPS, 2013.



# Do you want to download the dataset from https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz to "/home/saif/.julia/datadeps/word2vec 300d"?
# [y/n]
# stdin> y
# const w2v = load_embeddings(Word2Vec)