The code in this notebook is based on [the following example](http://sujitpal.blogspot.nl/2016/04/predicting-movie-tags-from-plots-using.html).

In [13]:
import gensim
import json

In [148]:
def read_corpus(fname, tokens_only=False):
    """Yield the documents of a topics JSON file in a Doc2Vec-ready form.

    The file ``../data_resources/topics/<fname>.json`` is parsed in full
    before the first item is yielded. Each entry must provide a
    ``'content'`` key and, unless ``tokens_only`` is set, a ``'main_topic'``
    key.

    Args:
        fname: Base name of the JSON file, without directory or extension.
        tokens_only: When true, yield only the token list of each document;
            otherwise yield a ``TaggedDocument`` tagged with the entry's
            ``'main_topic'``.

    Yields:
        Either the output of ``gensim.utils.simple_preprocess`` for the
        entry's content, or a ``gensim.models.doc2vec.TaggedDocument``
        wrapping those tokens with ``[main_topic]`` as its tags.
    """
    path = '../data_resources/topics/{}.json'.format(fname)

    # Use a context manager so the file handle is released as soon as the
    # JSON is parsed, instead of staying open for the generator's lifetime.
    with open(path) as handle:
        data = json.load(handle)

    for obj in data:
        tokens = gensim.utils.simple_preprocess(obj['content'])
        if tokens_only:
            yield tokens
        else:
            yield gensim.models.doc2vec.TaggedDocument(tokens, [obj['main_topic']])

In [149]:
# Materialise the generator into a list so the corpus can be measured with
# len() and sliced into train/test parts in the following cells.
corpus = list(read_corpus('vng_training_cleaned_no_news'))

In [150]:
# Positional 90/10 split: the first 90% of the documents (in file order, no
# shuffling) are used for training, the remainder is held out for testing.
TRAIN_FRACTION = 0.9
split = int(TRAIN_FRACTION * len(corpus))
train_corpus, test_corpus = corpus[:split], corpus[split:]

In [151]:
# Display the first held-out document to eyeball its tokens and tag.
test_corpus[:1]

[TaggedDocument(words=['wilt', 'als', 'burgemeester', 'of', 'wethouder', 'meebepalen', 'over', 'de', 'strategie', 'van', 'ons', 'gemeentelijk', 'bestuurlijk', 'energienetwerk', 'en', 'zet', 'samen', 'met', 'ons', 'de', 'koers', 'uit', 'kortom', 'pakt', 'de', 'lokale', 'opgaven', 'bij', 'de', 'kop', 'meld', 'dan', 'aan', 'voor', 'de', 'energiek', 'decentraal', 'donderdag', 'mei', 'thema', 'tijdens', 'deze', 'bijeenkomst', 'geeft', 'met', 'uw', 'collega', 'bestuurders', 'en', 'de', 'genodigde', 'essentiële', 'partners', 'op', 'de', 'verschillende', 'thema', 'vorm', 'aan', 'de', 'strategische', 'lijnen', 'voor', 'de', 'lange', 'termijn', 'wat', 'is', 'de', 'toekomstagenda', 'van', 'gemeenten', 'op', 'het', 'thema', 'energie', 'waar', 'zit', 'de', 'gezamenlijkheid', 'en', 'de', 'wens', 'voor', 'versnelling', 'wat', 'betekent', 'dit', 'voor', 'de', 'ondersteunende', 'en', 'strategische', 'rol', 'en', 'de', 'agenda', 'van', 'de', 'vng', 'positie', 'versterken', 'het', 'doel', 'is', 'om', 'sa

In [152]:
# Doc2Vec model with 50-dimensional vectors, ignoring words seen fewer than
# 2 times, configured for 100 training iterations.
# NOTE(review): `size` and `iter` are the older gensim keyword names; newer
# releases appear to call them `vector_size` and `epochs` — confirm against
# the installed gensim version before upgrading.
# NOTE(review): no seed is set here, so results may differ between runs.
model = gensim.models.doc2vec.Doc2Vec(size=50, min_count=2, iter=100)

In [153]:
# Build the model's vocabulary from the training documents only; this has to
# happen before model.train() is called.
model.build_vocab(train_corpus)

In [154]:
# Train on the training split and report the wall/CPU time. The number of
# examples and epochs are read back from the model (constructed with iter=100).
%time model.train(train_corpus, total_examples=model.corpus_count, epochs=model.iter)

CPU times: user 1min 51s, sys: 1.53 s, total: 1min 52s
Wall time: 43.6 s


42024350

In [155]:
def tokenize_text(text):
    """Return the lower-cased word tokens of ``text`` that have 2+ characters.

    The text is first split into sentences with ``nltk.sent_tokenize`` and
    each sentence into words with ``nltk.word_tokenize``; tokens shorter
    than two characters are dropped.
    """
    # NOTE(review): `nltk` is not imported in any visible cell of this
    # notebook — confirm it is imported before this function is called.
    return [
        word.lower()
        for sentence in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sentence)
        if len(word) >= 2
    ]
    
def tokenize_tags(label):
    """Split a ``"::"``-delimited label string into a list of marked tags.

    Args:
        label: String holding one or more tags separated by ``"::"``.

    Returns:
        A list with ``mark_tag`` applied to every tag, in input order.
    """
    # Return a real list: under Python 3, map() would hand back a one-shot
    # iterator that cannot be indexed, measured with len(), or re-iterated.
    return [mark_tag(tok) for tok in label.split("::")]

def jaccard_similarity(labels, preds):
    """Return the Jaccard similarity between two collections of labels.

    Both arguments are converted to sets, so duplicates and ordering are
    ignored. The result is ``|labels & preds| / |labels | preds|``.

    Args:
        labels: Iterable of hashable ground-truth labels.
        preds: Iterable of hashable predicted labels.

    Returns:
        A float in ``[0.0, 1.0]``. When both inputs are empty the sets are
        identical, so ``1.0`` is returned instead of dividing by zero.
    """
    lset = set(labels)
    pset = set(preds)
    union_size = len(lset | pset)
    if union_size == 0:
        # Two empty sets: avoid ZeroDivisionError and treat them as equal.
        return 1.0
    # float() keeps true division even if the cell is run under Python 2.
    return len(lset & pset) / float(union_size)

def mark_tag(s):
    """Prefix ``s`` with an underscore and turn every space into an underscore."""
    underscored = "_".join(s.split(" "))
    return "_" + underscored
    
def unmark_tag(s):
    """Drop the first character of ``s`` and turn every underscore into a space."""
    without_prefix = s[1:]
    return " ".join(without_prefix.split("_"))

In [156]:
from random import shuffle
import numpy as np

In [157]:
# Evaluate on the held-out split: infer a vector for every test document,
# take the single closest tag vector, and score it against the real tags.
tot_sim = 0.0
for test_document in test_corpus:
    pred_vec = model.infer_vector(test_document.words)
    pred_tags = model.docvecs.most_similar([pred_vec], topn=1)
    # most_similar returns (tag, score) pairs; only the tag names are compared.
    predicted_labels = [tag for tag, _score in pred_tags]
    tot_sim += jaccard_similarity(test_document.tags, predicted_labels)

average_similarity = tot_sim / len(test_corpus)
print("Average Similarity on Test Set: {}".format(average_similarity))

Average Similarity on Test Set: 0.42996742671009774


In [158]:
# print out random test result
for i in range(10):
    docid = np.random.randint(len(train_corpus))
    pred_vec = model.infer_vector(train_corpus[docid].words)
#     actual_tags = map(lambda x: unmark_tag(x), train_corpus[docid].tags)
    actual_tags = train_corpus[docid].tags
    pred_tags = model.docvecs.most_similar([pred_vec], topn=5)
    
    print("Actual tags: {}".format(", ".join(actual_tags)))
    print("Predicted tags: {}".format(pred_tags))
    print('--')

Actual tags: internationaal
Predicted tags: [('internationaal', 0.6653393507003784), ('europa', 0.545626163482666), ('economie', 0.5283413529396057), ('belastingen', 0.4950311779975891), ('recht', 0.4909015893936157)]
--
Actual tags: asiel
Predicted tags: [('asiel', 0.6957429647445679), ('openbare-gezondheid', 0.5622708797454834), ('gemeentefinancien', 0.4978879392147064), ('belastingen', 0.46325263381004333), ('sociaal-domein', 0.4590296447277069)]
--
Actual tags: arbeidsvoorwaarden-en-personeelsbeleid
Predicted tags: [('gemeentefinancien', 0.5479937195777893), ('recht', 0.5477897524833679), ('openbare-gezondheid', 0.5427666306495667), ('economie', 0.5147227644920349), ('werk-en-inkomen', 0.505023717880249)]
--
Actual tags: asiel
Predicted tags: [('openbare-gezondheid', 0.6164271831512451), ('economie', 0.5500040650367737), ('asiel', 0.5383455157279968), ('recht', 0.5342265963554382), ('gemeentefinancien', 0.524900496006012)]
--
Actual tags: milieu-en-mobiliteit
Predicted tags: [('ope