In [1]:
#=
# This is an implementation of
# Glove: Global vectors for word representation
# J Pennington, R Socher, C Manning
# Proceedings of the 2014 conference on empirical methods in natural language
# https://nlp.stanford.edu/pubs/glove.pdf
# (Made from the paper without reference to the source code)
#
# Pennington et. al's implementation is ~1K lines of  C
#=#


LoadError: [91msyntax: incomplete: unterminated multi-line comment #= ... =#[39m

In [None]:
using CorpusLoaders
using MLDataUtils
using StringInterning
using DataStructures
using Optim
#using CatViews

In [None]:
wikidata = collect(Iterators.take(CorpusLoaders.load_wikicorpus(), 10_000_000))

In [None]:
function coocurs(data, hw=5)
    coocurs = DefaultDict{Tuple{InternedString,InternedString}, Float32}(0f0)
    distance_weights = [1f0/abs(d-hw) for d in 0:2hw  if d!=hw]
    for (word_, window) in slidingwindow(i->[i-hw:i-1; i+1:i+hw], data, 1, stride=1)
        word = first(word_)
        for (weight, coword) in zip(distance_weights, window)
            coocurs[(word,coword)]+=weight
        end
    end

    encoding = labelenc(last.(collect(keys(coocurs))))
    coocurs_mat = spzeros(Float32, nlabel(encoding), nlabel(encoding))
    for (coocurance, score) in coocurs
        inds = convertlabel.(LabelEnc.Indices(nlabel(encoding)), coocurance, encoding)
        coocurs_mat[inds...] = score
    end
    coocurs_mat, encoding
end

In [None]:
f(x, xmax=100f0, α=3/4)::Float32 = x>xmax ? 1f0 : (x/xmax)^α


In [None]:

function glove(data, ndim=300, halfwindow=5)
    xco, encoding = coocurs(data, halfwindow)
    # sum f.(xco)

    nwords = nlabel(encoding)

    params = Float32[]
    mm = Int[]
    mm(ii) = (ii-1)*(2*ndim+2)+1
    getw(params, i) = begin @inbounds x=@view params[mm(i) : mm(i)+ndim-1]; x end
    getv(params, i) = begin @inbounds x=@view params[mm(i)+ndim : mm(i)+2ndim-1]; x end
    getb(params, i) = begin @inbounds x=params[mm(i)+2ndim]; x end
    getc(params, i) = begin @inbounds x=params[mm(i)+2ndim+1]; x end

    for ii in 1:nwords
        vals = randn(Float32, 2ndim+2)# 2ndim+2)
        append!(params, vals)
        #Base.Test.@test vals == [getw(params,ii); getv(params,ii); getb(params,ii); getc(params,ii)]
    end

    xco_ijx = collect(zip(findnz(xco)...))
    
    function loss(params)
        loss = 0f0
        @inbounds for (i, j, x) in xco_ijx
            wi = getw(params, i)
            vj = getv(params, j)
            bi = getb(params, i)
            cj = getc(params, j)
            loss += f(x)*(wi⋅vj + bi + cj - log(x))^2
        end
        loss
    end

    
    optimize(loss, params; show_every=1, show_trace=true)
    getw.(params, 1:nlabels), encoding
end

In [None]:
wes, enc = glove(wikidata, 30)

In [None]:
wes